diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..54d4577
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,106 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy cache and JetBrains IDE (.idea) settings
+.mypy_cache/
+.idea/
+
diff --git a/LICENSE b/LICENSE
new file mode 100755
index 0000000..04849fe
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,203 @@
+Copyright 2018-2023 OpenMMLab, astra-vision, valeoai. All rights reserved.
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2018-2023 OpenMMLab.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/configs/PODA/cityscapes_detection.py b/configs/PODA/cityscapes_detection.py
new file mode 100755
index 0000000..b66211c
--- /dev/null
+++ b/configs/PODA/cityscapes_detection.py
@@ -0,0 +1,57 @@
+# Dataset settings for Cityscapes detection: pipelines, data splits and evaluation.
+dataset_type = 'CityscapesDataset'
+data_root = 'data/cityscapes/'
+img_norm_cfg = dict(  # NOTE(review): looks like CLIP preprocessing statistics on a 0-255 scale - confirm
+    mean=[122.771, 116.746, 104.094], std=[68.501, 66.632, 70.323], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2048, 1024),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=1,
+    workers_per_gpu=2,
+    train=dict(
+        type='RepeatDataset',
+        times=8,  # the train split is wrapped in RepeatDataset with times=8
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root +
+            'annotations/instancesonly_filtered_gtFine_train.json',
+            img_prefix=data_root + 'leftImg8bit/train/',
+            pipeline=train_pipeline)),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root +
+        'annotations/instancesonly_filtered_gtFine_val.json',
+        img_prefix=data_root + 'leftImg8bit/val/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root +
+        'annotations/instancesonly_filtered_gtFine_test.json',
+        img_prefix=data_root + 'leftImg8bit/test/',
+        pipeline=test_pipeline))
+# Alternative without the classwise / iou_thrs options: evaluation = dict(interval=1, metric='bbox')
+evaluation = dict(interval=1, classwise=True, iou_thrs=[0.5], metric='bbox')
\ No newline at end of file
diff --git a/configs/PODA/diverse_weather_detection.py b/configs/PODA/diverse_weather_detection.py
new file mode 100755
index 0000000..faf6acb
--- /dev/null
+++ b/configs/PODA/diverse_weather_detection.py
@@ -0,0 +1,63 @@
+# --------------------------------------------------------
+# PODA: Prompt-driven Zero-shot Domain Adaptation
+# Copyright (c) 2024 valeo.ai, astra-vision
+#
+# Written by Tuan-Hung Vu
+# --------------------------------------------------------
+
+# Dataset settings: train on the daytime_clear_new train list; val and test both read its test.txt list.
+dataset_type = 'DiverseWeatherDataset'
+data_root = 'data/diverse_weather/'
+img_norm_cfg = dict(  # NOTE(review): looks like CLIP preprocessing statistics on a 0-255 scale - confirm
+    mean=[122.771, 116.746, 104.094], std=[68.501, 66.632, 70.323], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2048, 1024),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=1,
+    workers_per_gpu=2,
+    train=dict(
+        type='RepeatDataset',
+        times=1,  # times=1, i.e. the wrapped dataset is not repeated
+        dataset=dict(
+            type=dataset_type,
+            min_size=1,
+            ann_file=[
+                data_root + 'daytime_clear_new/VOC2007/ImageSets/Main/train.txt',
+            ],
+            img_prefix=[data_root + 'daytime_clear_new/VOC2007/'],
+            pipeline=train_pipeline)),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'daytime_clear_new/VOC2007/ImageSets/Main/test.txt',
+        img_prefix=data_root + 'daytime_clear_new/VOC2007/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'daytime_clear_new/VOC2007/ImageSets/Main/test.txt',
+        img_prefix=data_root + 'daytime_clear_new/VOC2007/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='mAP')
diff --git a/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3.py b/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3.py
new file mode 100755
index 0000000..283d991
--- /dev/null
+++ b/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3.py
@@ -0,0 +1,65 @@
+# --------------------------------------------------------
+# PODA: Prompt-driven Zero-shot Domain Adaptation
+# Copyright (c) 2024 valeo.ai, astra-vision
+#
+# Written by Tuan-Hung Vu
+# --------------------------------------------------------
+# Detector config on the diverse-weather daytime_clear_new data: depth-101 backbone, frozen_stages=4, lr=0.004.
+_base_ = [
+    './faster_rcnn_r50_fpn_pretrainedCLIP.py',  # R50 base config; the backbone is overridden to depth=101 below
+    './diverse_weather_detection.py',
+    '../_base_/default_runtime.py'
+]
+custom_hooks = [
+    dict(type='CheckInvalidLossHook', interval=1)
+]
+model = dict(
+    backbone=dict(
+        type='ModifiedResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=4,  # NOTE(review): presumably freezes all 4 backbone stages - confirm in ModifiedResNet
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='./checkpoints/clip_visual_encoder_resnet101_bare.pth')),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=7,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))))
+# optimizer
+# lr is set for a batch size of 8
+optimizer = dict(type='SGD', lr=0.004, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# Alternative without gradient clipping: optimizer_config = dict(grad_clip=None)
+# learning policy: 'step' with step=[8]; the warmup options below are disabled
+lr_config = dict(
+    policy='step',
+    step=[8])
+    # warmup='linear',
+    # warmup_iters=2000,
+    # warmup_ratio=0.001)
+    # NOTE(review): older remark "[7] yields higher performance than [6]" - presumably about step values; the active value is [8]
+    
+runner = dict(
+    type='EpochBasedRunner', max_epochs=20)  # 20 epochs; the inherited dataset config sets RepeatDataset times=1
+log_config = dict(interval=1)
+# Initialize from the CLIP ResNet-101 visual-encoder checkpoint (going by the file name; not a COCO checkpoint)
+load_from = './checkpoints/clip_visual_encoder_resnet101.pth'  # noqa
+
+# NOTE: `auto_scale_lr` (automatic LR scaling) is disabled here: the setting
+# below is commented out.
+# NOTE(review): the original formula "(8 GPUs) x (1 samples per GPU)" does not match base_batch_size=2 - confirm before enabling.
+# auto_scale_lr = dict(base_batch_size=2)
\ No newline at end of file
diff --git a/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP.py b/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP.py
new file mode 100755
index 0000000..ff85cc9
--- /dev/null
+++ b/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP.py
@@ -0,0 +1,65 @@
+# --------------------------------------------------------
+# PODA: Prompt-driven Zero-shot Domain Adaptation
+# Copyright (c) 2024 valeo.ai, astra-vision
+#
+# Written by Tuan-Hung Vu
+# --------------------------------------------------------
+# Continues from the lr4e-3 run's latest checkpoint with lr=0.0004 and frozen_stages=2 on the same dataset config.
+_base_ = [
+    './faster_rcnn_r50_fpn_pretrainedCLIP.py',  # R50 base config; the backbone is overridden to depth=101 below
+    './diverse_weather_detection.py',
+    '../_base_/default_runtime.py'
+]
+custom_hooks = [
+    dict(type='CheckInvalidLossHook', interval=1)
+]
+model = dict(
+    backbone=dict(
+        type='ModifiedResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=2,  # the lr4e-3 config that produced the loaded checkpoint uses frozen_stages=4
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=None),  # no backbone-specific init; weights presumably come from load_from below
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=7,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))))
+# optimizer
+# lr is set for a batch size of 8
+optimizer = dict(type='SGD', lr=0.0004, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# Alternative without gradient clipping: optimizer_config = dict(grad_clip=None)
+# learning policy: 'step' with step=[8]; the warmup options below are disabled
+lr_config = dict(
+    policy='step',
+    step=[8])
+    # warmup='linear',
+    # warmup_iters=2000,
+    # warmup_ratio=0.001)
+    # NOTE(review): older remark "[7] yields higher performance than [6]" - presumably about step values; the active value is [8]
+    
+runner = dict(
+    type='EpochBasedRunner', max_epochs=20)  # 20 epochs; the inherited dataset config sets RepeatDataset times=1
+log_config = dict(interval=1)
+# Initialize from the latest checkpoint in the work dir of the lr4e-3 config (not a COCO checkpoint)
+load_from = './work_dirs/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3/latest.pth'  # noqa
+
+# NOTE: `auto_scale_lr` (automatic LR scaling) is disabled here: the setting
+# below is commented out.
+# NOTE(review): the original formula "(8 GPUs) x (1 samples per GPU)" does not match base_batch_size=2 - confirm before enabling.
+# auto_scale_lr = dict(base_batch_size=2)
\ No newline at end of file
diff --git a/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP_latest_PODA_dayfog_lr4e-4_finetuneCLIP.py b/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP_latest_PODA_dayfog_lr4e-4_finetuneCLIP.py
new file mode 100755
index 0000000..d8504f6
--- /dev/null
+++ b/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP_latest_PODA_dayfog_lr4e-4_finetuneCLIP.py
@@ -0,0 +1,84 @@
+# --------------------------------------------------------
+# PODA: Prompt-driven Zero-shot Domain Adaptation
+# Copyright (c) 2024 valeo.ai, astra-vision
+#
+# Written by Tuan-Hung Vu
+# --------------------------------------------------------
+# Continues from the source-only finetuneCLIP run with target_domain='diverse_dayfog_101'; val/test read daytime_foggy.
+_base_ = [
+    './faster_rcnn_r50_fpn_pretrainedCLIP.py',  # R50 base config; the backbone is overridden to depth=101 below
+    './diverse_weather_detection.py',
+    '../_base_/default_runtime.py'
+]
+custom_hooks = [
+    dict(type='CheckInvalidLossHook', interval=1)
+]
+model = dict(
+    backbone=dict(
+        type='ModifiedResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=2,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        target_domain = 'diverse_dayfog_101',  # NOTE(review): target_domain / augmented_layer / mixing_style are ModifiedResNet options defined outside this view
+        augmented_layer = 1,
+        mixing_style = False,
+        init_cfg=None),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=7,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))))
+# optimizer
+# lr is set for a batch size of 8
+optimizer = dict(type='SGD', lr=0.0004, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# Alternative without gradient clipping: optimizer_config = dict(grad_clip=None)
+# learning policy: 'step' with step=[8]; the warmup options below are disabled
+lr_config = dict(
+    policy='step',
+    step=[8])
+    # warmup='linear',
+    # warmup_iters=2000,
+    # warmup_ratio=0.001)
+    # NOTE(review): older remark "[7] yields higher performance than [6]" - presumably about step values; the active value is [8]
+    
+runner = dict(
+    type='EpochBasedRunner', max_epochs=20)  # 20 epochs; the inherited dataset config sets RepeatDataset times=1
+log_config = dict(interval=1)
+# Initialize from the latest checkpoint in the work dir of the srconly finetuneCLIP config (not a COCO checkpoint)
+load_from = './work_dirs/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP/latest.pth'  # noqa
+
+# NOTE: `auto_scale_lr` (automatic LR scaling) is disabled here: the setting
+# below is commented out.
+# NOTE(review): the original formula "(8 GPUs) x (1 samples per GPU)" does not match base_batch_size=2 - confirm before enabling.
+# auto_scale_lr = dict(base_batch_size=2)
+
+## Override val/test to read the daytime_foggy target domain; data.train is not redefined here.
+dataset_type = 'DiverseWeatherDataset'
+data_root_target = 'data/diverse_weather/daytime_foggy/'
+data = dict(
+    samples_per_gpu=8,  # the base dataset config sets samples_per_gpu=1
+    workers_per_gpu=8,  # the base dataset config sets workers_per_gpu=2
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root_target + 'VOC2007/ImageSets/Main/train.txt',  # target-domain train list; alternative: data_root_target + 'VOC2007/ImageSets/Main/test.txt'
+        img_prefix=data_root_target + 'VOC2007/'),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root_target + 'VOC2007/ImageSets/Main/train.txt',  # target-domain train list; alternative: data_root_target + 'VOC2007/ImageSets/Main/test.txt'
+        img_prefix=data_root_target + 'VOC2007/'))
+# Unused alternative: evaluation=dict(classwise=True, iou_thrs=[0.5], metric='bbox')
\ No newline at end of file
diff --git a/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP_latest_PODA_duskrain_lr4e-4_finetuneCLIP.py b/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP_latest_PODA_duskrain_lr4e-4_finetuneCLIP.py
new file mode 100755
index 0000000..cd183e7
--- /dev/null
+++ b/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP_latest_PODA_duskrain_lr4e-4_finetuneCLIP.py
@@ -0,0 +1,83 @@
+# --------------------------------------------------------
+# PODA: Prompt-driven Zero-shot Domain Adaptation
+# Copyright (c) 2024 valeo.ai, astra-vision
+#
+# Written by Tuan-Hung Vu
+# --------------------------------------------------------
+# Continues from the source-only finetuneCLIP run with target_domain='diverse_duskrain_101'; val/test read dusk_rainy.
+_base_ = [
+    './faster_rcnn_r50_fpn_pretrainedCLIP.py',  # R50 base config; the backbone is overridden to depth=101 below
+    './diverse_weather_detection.py',
+    '../_base_/default_runtime.py'
+]
+custom_hooks = [
+    dict(type='CheckInvalidLossHook', interval=1)
+]
+model = dict(
+    backbone=dict(
+        type='ModifiedResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=2,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        target_domain = 'diverse_duskrain_101',  # NOTE(review): target_domain / augmented_layer / mixing_style are ModifiedResNet options defined outside this view
+        augmented_layer = 1,
+        mixing_style = False,
+        init_cfg=None),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=7,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))))
+# optimizer
+# lr is set for a batch size of 8
+optimizer = dict(type='SGD', lr=0.0004, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# Alternative without gradient clipping: optimizer_config = dict(grad_clip=None)
+# learning policy: 'step' with step=[8]; the warmup options below are disabled
+lr_config = dict(
+    policy='step',
+    step=[8])
+    # warmup='linear',
+    # warmup_iters=2000,
+    # warmup_ratio=0.001)
+    # NOTE(review): older remark "[7] yields higher performance than [6]" - presumably about step values; the active value is [8]
+    
+runner = dict(
+    type='EpochBasedRunner', max_epochs=20)  # 20 epochs; the inherited dataset config sets RepeatDataset times=1
+log_config = dict(interval=1)
+# Initialize from the latest checkpoint in the work dir of the srconly finetuneCLIP config (not a COCO checkpoint)
+load_from = './work_dirs/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP/latest.pth'  # noqa
+
+# NOTE: `auto_scale_lr` (automatic LR scaling) is disabled here: the setting
+# below is commented out.
+# NOTE(review): the original formula "(8 GPUs) x (1 samples per GPU)" does not match base_batch_size=2 - confirm before enabling.
+# auto_scale_lr = dict(base_batch_size=2)
+
+## Override val/test to read the dusk_rainy target domain; data.train is not redefined here.
+dataset_type = 'DiverseWeatherDataset'
+data_root_target = 'data/diverse_weather/dusk_rainy/'
+data = dict(
+    samples_per_gpu=8,  # the base dataset config sets samples_per_gpu=1
+    workers_per_gpu=8,  # the base dataset config sets workers_per_gpu=2
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root_target + 'VOC2007/ImageSets/Main/train.txt',  # target-domain train list
+        img_prefix=data_root_target + 'VOC2007/'),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root_target + 'VOC2007/ImageSets/Main/train.txt',  # target-domain train list
+        img_prefix=data_root_target + 'VOC2007/'))
\ No newline at end of file
diff --git a/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP_latest_PODA_night_lr4e-4_finetuneCLIP.py b/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP_latest_PODA_night_lr4e-4_finetuneCLIP.py
new file mode 100755
index 0000000..35505bc
--- /dev/null
+++ b/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP_latest_PODA_night_lr4e-4_finetuneCLIP.py
@@ -0,0 +1,83 @@
+# --------------------------------------------------------
+# PODA: Prompt-driven Zero-shot Domain Adaptation
+# Copyright (c) 2024 valeo.ai, astra-vision 
+#
+# Written by Tuan-Hung Vu
+# --------------------------------------------------------
+
+_base_ = [
+    './faster_rcnn_r50_fpn_pretrainedCLIP.py',
+    './diverse_weather_detection.py',
+    '../_base_/default_runtime.py'
+]
+custom_hooks = [
+    dict(type='CheckInvalidLossHook', interval=1)
+]
+model = dict(
+    backbone=dict(
+        type='ModifiedResNet',
+        depth=101,  # the r50 base config uses depth=50
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=2,  # the base config uses frozen_stages=4
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        target_domain='diverse_night_101',  # these 3 keys: not in base config
+        augmented_layer=1,
+        mixing_style=False,
+        init_cfg=None),  # base config sets a 'Pretrained' init_cfg
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=7,  # base config uses num_classes=80
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))))
+# optimizer
+# lr is set for a batch size of 8
+optimizer = dict(type='SGD', lr=0.0004, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    step=[8])
+# No warmup keys are set in this lr_config; the commented-out settings, kept
+# for reference, were: warmup='linear', warmup_iters=2000, warmup_ratio=0.001.
+# NOTE(review): the earlier remark "[7] yields higher performance than [6]"
+# does not match step=[8] above -- presumably left from a template; confirm.
+    
+runner = dict(
+    type='EpochBasedRunner', max_epochs=20)  # train for at most 20 epochs
+log_config = dict(interval=1)
+# Initialize from the latest checkpoint of the "srconly" run (not from COCO).
+load_from = './work_dirs/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP/latest.pth'  # noqa
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (1 samples per GPU)
+# auto_scale_lr = dict(base_batch_size=2)
+
+# Eval override: val and test both list images from the target 'train.txt'.
+dataset_type = 'DiverseWeatherDataset'
+data_root_target = 'data/diverse_weather/Night-Sunny/'
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root_target + 'ImageSets/Main/train.txt',  # or 'ImageSets/Main/test.txt'
+        img_prefix=data_root_target),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root_target + 'ImageSets/Main/train.txt',  # or 'ImageSets/Main/test.txt'
+        img_prefix=data_root_target))
\ No newline at end of file
diff --git a/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP_latest_PODA_nightrain_lr4e-4_finetuneCLIP.py b/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP_latest_PODA_nightrain_lr4e-4_finetuneCLIP.py
new file mode 100755
index 0000000..89e25bc
--- /dev/null
+++ b/configs/PODA/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP_latest_PODA_nightrain_lr4e-4_finetuneCLIP.py
@@ -0,0 +1,83 @@
+# --------------------------------------------------------
+# PODA: Prompt-driven Zero-shot Domain Adaptation
+# Copyright (c) 2024 valeo.ai, astra-vision 
+#
+# Written by Tuan-Hung Vu
+# --------------------------------------------------------
+
+_base_ = [
+    './faster_rcnn_r50_fpn_pretrainedCLIP.py',
+    './diverse_weather_detection.py',
+    '../_base_/default_runtime.py'
+]
+custom_hooks = [
+    dict(type='CheckInvalidLossHook', interval=1)
+]
+model = dict(
+    backbone=dict(
+        type='ModifiedResNet',
+        depth=101,  # the r50 base config uses depth=50
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=2,  # the base config uses frozen_stages=4
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        target_domain='diverse_nightrain_101',  # 3 keys not in base config
+        augmented_layer=1,
+        mixing_style=False,
+        init_cfg=None),  # base config sets a 'Pretrained' init_cfg
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=7,  # base config uses num_classes=80
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))))
+# optimizer
+# lr is set for a batch size of 8
+optimizer = dict(type='SGD', lr=0.0004, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    step=[8])
+# No warmup keys are set in this lr_config; the commented-out settings, kept
+# for reference, were: warmup='linear', warmup_iters=2000, warmup_ratio=0.001.
+# NOTE(review): the earlier remark "[7] yields higher performance than [6]"
+# does not match step=[8] above -- presumably left from a template; confirm.
+    
+runner = dict(
+    type='EpochBasedRunner', max_epochs=20)  # train for at most 20 epochs
+log_config = dict(interval=1)
+# Initialize from the "srconly" run's checkpoint; the path is repo-relative.
+load_from = './work_dirs/faster_rcnn_r101_fpn_1x_pretrainedCLIP_diverseweather_dayclearnew_lr4e-3_srconly_lr4e-4_finetuneCLIP/latest.pth'  # noqa
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (1 samples per GPU)
+# auto_scale_lr = dict(base_batch_size=2)
+
+## override for testing
+dataset_type = 'DiverseWeatherDataset'
+data_root_target = 'data/diverse_weather/night_rainy/'
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root_target + 'VOC2007/ImageSets/Main/train.txt',
+        img_prefix=data_root_target + 'VOC2007/'),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root_target + 'VOC2007/ImageSets/Main/train.txt',
+        img_prefix=data_root_target + 'VOC2007/'))
\ No newline at end of file
diff --git a/configs/PODA/faster_rcnn_r50_fpn_1x_pretrainedCLIP_cityscapes.py b/configs/PODA/faster_rcnn_r50_fpn_1x_pretrainedCLIP_cityscapes.py
new file mode 100755
index 0000000..28d8a7c
--- /dev/null
+++ b/configs/PODA/faster_rcnn_r50_fpn_1x_pretrainedCLIP_cityscapes.py
@@ -0,0 +1,71 @@
+# --------------------------------------------------------
+# PODA: Prompt-driven Zero-shot Domain Adaptation
+# Copyright (c) 2024 valeo.ai, astra-vision 
+#
+# Written by Tuan-Hung Vu
+# --------------------------------------------------------
+
+_base_ = [
+    './faster_rcnn_r50_fpn_pretrainedCLIP.py',
+    './cityscapes_detection.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(
+    backbone=dict(init_cfg=None),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=8,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))))
+# optimizer
+# lr is set for a batch size of 8
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    # [7] yields higher performance than [6]
+    step=[7])
+runner = dict(
+    type='EpochBasedRunner', max_epochs=8)  # actual epoch = 8 * 8 = 64
+log_config = dict(interval=100)
+# Initialize from the CLIP visual encoder (ResNet-50) checkpoint, not COCO.
+load_from = './checkpoints/clip_visual_encoder_resnet50.pth'  # noqa
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (1 samples per GPU)
+auto_scale_lr = dict(base_batch_size=8)
+
+# Eval override: Cityscapes val annotations paired with foggy val images.
+dataset_type = 'CityscapesDataset'
+data_root = 'data/cityscapes/'
+data_root_target = 'data/target_domains/cityscapes_foggy/val/'
+data = dict(
+    samples_per_gpu=1,
+    workers_per_gpu=2,
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root +
+        'annotations/instancesonly_filtered_gtFine_val.json',
+        img_prefix=data_root_target),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root +
+        'annotations/instancesonly_filtered_gtFine_val.json',
+        img_prefix=data_root_target))
+
+evaluation = dict(classwise=True, iou_thrs=[0.5], metric='bbox')
\ No newline at end of file
diff --git a/configs/PODA/faster_rcnn_r50_fpn_1x_pretrainedCLIP_cityscapes_PODA_fog.py b/configs/PODA/faster_rcnn_r50_fpn_1x_pretrainedCLIP_cityscapes_PODA_fog.py
new file mode 100755
index 0000000..4303f61
--- /dev/null
+++ b/configs/PODA/faster_rcnn_r50_fpn_1x_pretrainedCLIP_cityscapes_PODA_fog.py
@@ -0,0 +1,80 @@
+# --------------------------------------------------------
+# PODA: Prompt-driven Zero-shot Domain Adaptation
+# Copyright (c) 2024 valeo.ai, astra-vision 
+#
+# Written by Tuan-Hung Vu
+# --------------------------------------------------------
+
+_base_ = [
+    './faster_rcnn_r50_fpn_pretrainedCLIP.py',
+    './cityscapes_detection.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(
+    backbone=dict(type='ModifiedResNet',
+                  depth=50,
+                  num_stages=4,
+                  out_indices=(0, 1, 2, 3),
+                  frozen_stages=4,
+                  norm_cfg=dict(type='BN', requires_grad=True),
+                  norm_eval=True,
+                  style='pytorch',
+                  target_domain='fog',  # these 3 keys: not in base config
+                  augmented_layer=1,
+                  mixing_style=False,
+                  init_cfg=None),  # base config sets a 'Pretrained' init_cfg
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=8,  # base config uses num_classes=80
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))))
+# optimizer
+# lr is set for a batch size of 8
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[7])
+runner = dict(
+    type='EpochBasedRunner', max_epochs=8) 
+log_config = dict(interval=100)
+load_from = './work_dirs/faster_rcnn_r50_fpn_1x_pretrainedCLIP_cityscapes/latest.pth'  # noqa
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (1 samples per GPU)
+auto_scale_lr = dict(base_batch_size=8)
+
+# Eval override: Cityscapes val annotations paired with foggy val images.
+dataset_type = 'CityscapesDataset'
+data_root = 'data/cityscapes/'
+data_root_target = 'data/target_domains/cityscapes_foggy/val/'
+data = dict(
+    samples_per_gpu=1,
+    workers_per_gpu=2,
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root +
+        'annotations/instancesonly_filtered_gtFine_val.json',
+        img_prefix=data_root_target),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root +
+        'annotations/instancesonly_filtered_gtFine_val.json',
+        img_prefix=data_root_target))
+
+evaluation = dict(classwise=True, iou_thrs=[0.5], metric='bbox')
\ No newline at end of file
diff --git a/configs/PODA/faster_rcnn_r50_fpn_pretrainedCLIP.py b/configs/PODA/faster_rcnn_r50_fpn_pretrainedCLIP.py
new file mode 100755
index 0000000..4819ac9
--- /dev/null
+++ b/configs/PODA/faster_rcnn_r50_fpn_pretrainedCLIP.py
@@ -0,0 +1,114 @@
+# --------------------------------------------------------
+# PODA: Prompt-driven Zero-shot Domain Adaptation
+# Copyright (c) 2024 valeo.ai, astra-vision 
+#
+# Written by Tuan-Hung Vu
+# --------------------------------------------------------
+
+model = dict(
+    type='FasterRCNN',
+    backbone=dict(
+        type='ModifiedResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=4,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='./checkpoints/clip_visual_encoder_resnet50.pth')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)
+        # soft-nms is also supported for rcnn testing
+        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
+    ))
diff --git a/configs/_base_/datasets/cityscapes_detection.py b/configs/_base_/datasets/cityscapes_detection.py
new file mode 100755
index 0000000..e341b59
--- /dev/null
+++ b/configs/_base_/datasets/cityscapes_detection.py
@@ -0,0 +1,56 @@
+# dataset settings
+dataset_type = 'CityscapesDataset'
+data_root = 'data/cityscapes/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2048, 1024),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=1,
+    workers_per_gpu=2,
+    train=dict(
+        type='RepeatDataset',
+        times=8,
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root +
+            'annotations/instancesonly_filtered_gtFine_train.json',
+            img_prefix=data_root + 'leftImg8bit/train/',
+            pipeline=train_pipeline)),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root +
+        'annotations/instancesonly_filtered_gtFine_val.json',
+        img_prefix=data_root + 'leftImg8bit/val/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root +
+        'annotations/instancesonly_filtered_gtFine_test.json',
+        img_prefix=data_root + 'leftImg8bit/test/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='bbox')
diff --git a/configs/_base_/datasets/cityscapes_instance.py b/configs/_base_/datasets/cityscapes_instance.py
new file mode 100755
index 0000000..4e3c34e
--- /dev/null
+++ b/configs/_base_/datasets/cityscapes_instance.py
@@ -0,0 +1,56 @@
+# dataset settings
+dataset_type = 'CityscapesDataset'
+data_root = 'data/cityscapes/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2048, 1024),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=1,
+    workers_per_gpu=2,
+    train=dict(
+        type='RepeatDataset',
+        times=8,
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root +
+            'annotations/instancesonly_filtered_gtFine_train.json',
+            img_prefix=data_root + 'leftImg8bit/train/',
+            pipeline=train_pipeline)),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root +
+        'annotations/instancesonly_filtered_gtFine_val.json',
+        img_prefix=data_root + 'leftImg8bit/val/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root +
+        'annotations/instancesonly_filtered_gtFine_test.json',
+        img_prefix=data_root + 'leftImg8bit/test/',
+        pipeline=test_pipeline))
+evaluation = dict(metric=['bbox', 'segm'])
diff --git a/configs/_base_/datasets/coco_detection.py b/configs/_base_/datasets/coco_detection.py
new file mode 100755
index 0000000..149f590
--- /dev/null
+++ b/configs/_base_/datasets/coco_detection.py
@@ -0,0 +1,49 @@
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='bbox')
diff --git a/configs/_base_/datasets/coco_instance.py b/configs/_base_/datasets/coco_instance.py
new file mode 100755
index 0000000..9901a85
--- /dev/null
+++ b/configs/_base_/datasets/coco_instance.py
@@ -0,0 +1,49 @@
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(metric=['bbox', 'segm'])
diff --git a/configs/_base_/datasets/coco_instance_semantic.py b/configs/_base_/datasets/coco_instance_semantic.py
new file mode 100755
index 0000000..6c8bf07
--- /dev/null
+++ b/configs/_base_/datasets/coco_instance_semantic.py
@@ -0,0 +1,54 @@
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='SegRescale', scale_factor=1 / 8),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='Collect',
+        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip', flip_ratio=0.5),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        seg_prefix=data_root + 'stuffthingmaps/train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(metric=['bbox', 'segm'])
diff --git a/configs/_base_/datasets/coco_panoptic.py b/configs/_base_/datasets/coco_panoptic.py
new file mode 100755
index 0000000..dbade7c
--- /dev/null
+++ b/configs/_base_/datasets/coco_panoptic.py
@@ -0,0 +1,59 @@
+# dataset settings
+dataset_type = 'CocoPanopticDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadPanopticAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        with_seg=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='SegRescale', scale_factor=1 / 4),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='Collect',
+        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/panoptic_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        seg_prefix=data_root + 'annotations/panoptic_train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/panoptic_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        seg_prefix=data_root + 'annotations/panoptic_val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/panoptic_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        seg_prefix=data_root + 'annotations/panoptic_val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric=['PQ'])
diff --git a/configs/_base_/datasets/deepfashion.py b/configs/_base_/datasets/deepfashion.py
new file mode 100755
index 0000000..308b4b2
--- /dev/null
+++ b/configs/_base_/datasets/deepfashion.py
@@ -0,0 +1,53 @@
+# dataset settings
+dataset_type = 'DeepFashionDataset'
+data_root = 'data/DeepFashion/In-shop/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', img_scale=(750, 1101), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(750, 1101),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,  # replaces the deprecated 'imgs_per_gpu' key
+    workers_per_gpu=1,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/DeepFashion_segmentation_query.json',
+        img_prefix=data_root + 'Img/',
+        pipeline=train_pipeline,
+        data_root=data_root),  # NOTE(review): paths already carry data_root
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/DeepFashion_segmentation_query.json',
+        img_prefix=data_root + 'Img/',
+        pipeline=test_pipeline,
+        data_root=data_root),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root +
+        'annotations/DeepFashion_segmentation_gallery.json',
+        img_prefix=data_root + 'Img/',
+        pipeline=test_pipeline,
+        data_root=data_root))
+evaluation = dict(interval=5, metric=['bbox', 'segm'])
diff --git a/configs/_base_/datasets/lvis_v0.5_instance.py b/configs/_base_/datasets/lvis_v0.5_instance.py
new file mode 100755
index 0000000..207e005
--- /dev/null
+++ b/configs/_base_/datasets/lvis_v0.5_instance.py
@@ -0,0 +1,24 @@
+# dataset settings: LVIS v0.5 instance segmentation on top of coco_instance.py
+_base_ = 'coco_instance.py'
+dataset_type = 'LVISV05Dataset'
+data_root = 'data/lvis_v0.5/'
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        _delete_=True,  # TODO confirm: drop the inherited train dict, not merge
+        type='ClassBalancedDataset',
+        oversample_thr=1e-3,
+        dataset=dict(  # the LVIS dataset wrapped by ClassBalancedDataset
+            type=dataset_type,
+            ann_file=data_root + 'annotations/lvis_v0.5_train.json',
+            img_prefix=data_root + 'train2017/')),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/lvis_v0.5_val.json',
+        img_prefix=data_root + 'val2017/'),
+    test=dict(  # same annotation file and image folder as val
+        type=dataset_type,
+        ann_file=data_root + 'annotations/lvis_v0.5_val.json',
+        img_prefix=data_root + 'val2017/'))
+evaluation = dict(metric=['bbox', 'segm'])  # box and mask metrics
diff --git a/configs/_base_/datasets/lvis_v1_instance.py b/configs/_base_/datasets/lvis_v1_instance.py
new file mode 100755
index 0000000..be791ed
--- /dev/null
+++ b/configs/_base_/datasets/lvis_v1_instance.py
@@ -0,0 +1,24 @@
+# dataset settings: LVIS v1 instance segmentation on top of coco_instance.py
+_base_ = 'coco_instance.py'
+dataset_type = 'LVISV1Dataset'
+data_root = 'data/lvis_v1/'
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        _delete_=True,  # TODO confirm: drop the inherited train dict, not merge
+        type='ClassBalancedDataset',
+        oversample_thr=1e-3,
+        dataset=dict(  # the LVIS dataset wrapped by ClassBalancedDataset
+            type=dataset_type,
+            ann_file=data_root + 'annotations/lvis_v1_train.json',
+            img_prefix=data_root)),  # prefix is data_root itself, no sub-folder
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/lvis_v1_val.json',
+        img_prefix=data_root),
+    test=dict(  # same annotation file and image prefix as val
+        type=dataset_type,
+        ann_file=data_root + 'annotations/lvis_v1_val.json',
+        img_prefix=data_root))
+evaluation = dict(metric=['bbox', 'segm'])  # box and mask metrics
diff --git a/configs/_base_/datasets/objects365v1_detection.py b/configs/_base_/datasets/objects365v1_detection.py
new file mode 100755
index 0000000..8989b6f
--- /dev/null
+++ b/configs/_base_/datasets/objects365v1_detection.py
@@ -0,0 +1,49 @@
+# dataset settings: Objects365 v1, box detection only (metric='bbox')
+dataset_type = 'Objects365V1Dataset'
+data_root = 'data/Objects365/Obj365_v1/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [  # transform chain referenced by data.train
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [  # transform chain referenced by data.val and data.test
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # same scale as the training Resize
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/objects365_train.json',
+        img_prefix=data_root + 'train/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/objects365_val.json',
+        img_prefix=data_root + 'val/',
+        pipeline=test_pipeline),
+    test=dict(  # same annotation file and image folder as val
+        type=dataset_type,
+        ann_file=data_root + 'annotations/objects365_val.json',
+        img_prefix=data_root + 'val/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='bbox')
diff --git a/configs/_base_/datasets/objects365v2_detection.py b/configs/_base_/datasets/objects365v2_detection.py
new file mode 100755
index 0000000..99942c1
--- /dev/null
+++ b/configs/_base_/datasets/objects365v2_detection.py
@@ -0,0 +1,49 @@
+# dataset settings: Objects365 v2, box detection only (metric='bbox')
+dataset_type = 'Objects365V2Dataset'
+data_root = 'data/Objects365/Obj365_v2/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [  # transform chain referenced by data.train
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [  # transform chain referenced by data.val and data.test
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # same scale as the training Resize
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/zhiyuan_objv2_train.json',
+        img_prefix=data_root + 'train/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/zhiyuan_objv2_val.json',
+        img_prefix=data_root + 'val/',
+        pipeline=test_pipeline),
+    test=dict(  # same annotation file and image folder as val
+        type=dataset_type,
+        ann_file=data_root + 'annotations/zhiyuan_objv2_val.json',
+        img_prefix=data_root + 'val/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='bbox')
diff --git a/configs/_base_/datasets/openimages_detection.py b/configs/_base_/datasets/openimages_detection.py
new file mode 100755
index 0000000..a65d306
--- /dev/null
+++ b/configs/_base_/datasets/openimages_detection.py
@@ -0,0 +1,65 @@
+# dataset settings: Open Images detection (CSV annotations, metric='mAP')
+dataset_type = 'OpenImagesDataset'
+data_root = 'data/OpenImages/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [  # transform chain referenced by data.train
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, denorm_bbox=True),
+    dict(type='Resize', img_scale=(1024, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [  # transform chain referenced by data.val and data.test
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1024, 800),  # same scale as the training Resize
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ],
+    ),
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=0,  # workers_per_gpu > 0 may cause out-of-memory errors
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/oidv6-train-annotations-bbox.csv',
+        img_prefix=data_root + 'OpenImages/train/',
+        label_file=data_root + 'annotations/class-descriptions-boxable.csv',
+        hierarchy_file=data_root +
+        'annotations/bbox_labels_600_hierarchy.json',
+        pipeline=train_pipeline),
+    val=dict(  # adds meta_file and image_level_ann_file on top of train's keys
+        type=dataset_type,
+        ann_file=data_root + 'annotations/validation-annotations-bbox.csv',
+        img_prefix=data_root + 'OpenImages/validation/',
+        label_file=data_root + 'annotations/class-descriptions-boxable.csv',
+        hierarchy_file=data_root +
+        'annotations/bbox_labels_600_hierarchy.json',
+        meta_file=data_root + 'annotations/validation-image-metas.pkl',
+        image_level_ann_file=data_root +
+        'annotations/validation-annotations-human-imagelabels-boxable.csv',
+        pipeline=test_pipeline),
+    test=dict(  # identical settings to val (validation split)
+        type=dataset_type,
+        ann_file=data_root + 'annotations/validation-annotations-bbox.csv',
+        img_prefix=data_root + 'OpenImages/validation/',
+        label_file=data_root + 'annotations/class-descriptions-boxable.csv',
+        hierarchy_file=data_root +
+        'annotations/bbox_labels_600_hierarchy.json',
+        meta_file=data_root + 'annotations/validation-image-metas.pkl',
+        image_level_ann_file=data_root +
+        'annotations/validation-annotations-human-imagelabels-boxable.csv',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='mAP')
diff --git a/configs/_base_/datasets/voc0712.py b/configs/_base_/datasets/voc0712.py
new file mode 100755
index 0000000..ae09acd
--- /dev/null
+++ b/configs/_base_/datasets/voc0712.py
@@ -0,0 +1,55 @@
+# dataset settings: train on VOC2007+VOC2012 trainval, test on VOC2007 test
+dataset_type = 'VOCDataset'
+data_root = 'data/VOCdevkit/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [  # transform chain referenced by data.train
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1000, 600), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [  # transform chain referenced by data.val and data.test
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1000, 600),  # same scale as the training Resize
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(  # the VOC dataset wrapped by RepeatDataset
+            type=dataset_type,
+            ann_file=[  # one list entry per VOC year, paired with img_prefix
+                data_root + 'VOC2007/ImageSets/Main/trainval.txt',
+                data_root + 'VOC2012/ImageSets/Main/trainval.txt'
+            ],
+            img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'],
+            pipeline=train_pipeline)),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt',
+        img_prefix=data_root + 'VOC2007/',
+        pipeline=test_pipeline),
+    test=dict(  # same image-set file and image prefix as val
+        type=dataset_type,
+        ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt',
+        img_prefix=data_root + 'VOC2007/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='mAP')
diff --git a/configs/_base_/datasets/wider_face.py b/configs/_base_/datasets/wider_face.py
new file mode 100755
index 0000000..d1d649b
--- /dev/null
+++ b/configs/_base_/datasets/wider_face.py
@@ -0,0 +1,63 @@
+# dataset settings: WIDER FACE at img_scale (300, 300); no `evaluation` here
+dataset_type = 'WIDERFaceDataset'
+data_root = 'data/WIDERFace/'
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)
+train_pipeline = [  # transform chain referenced by data.train
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],  # reuses the normalization mean
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', img_scale=(300, 300), keep_ratio=False),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [  # transform chain referenced by data.val and data.test
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(300, 300),  # same scale as the training Resize
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=False),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=60,
+    workers_per_gpu=2,
+    train=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(  # the WIDER FACE dataset wrapped by RepeatDataset
+            type=dataset_type,
+            ann_file=data_root + 'train.txt',
+            img_prefix=data_root + 'WIDER_train/',
+            min_size=17,
+            pipeline=train_pipeline)),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'val.txt',
+        img_prefix=data_root + 'WIDER_val/',
+        pipeline=test_pipeline),
+    test=dict(  # same annotation file and image folder as val
+        type=dataset_type,
+        ann_file=data_root + 'val.txt',
+        img_prefix=data_root + 'WIDER_val/',
+        pipeline=test_pipeline))
diff --git a/configs/_base_/default_runtime.py b/configs/_base_/default_runtime.py
new file mode 100755
index 0000000..5b0b145
--- /dev/null
+++ b/configs/_base_/default_runtime.py
@@ -0,0 +1,27 @@
+checkpoint_config = dict(interval=1)  # checkpoint-saving interval
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')  (left disabled)
+    ])
+# yapf:enable
+custom_hooks = [dict(type='NumClassCheckHook')]
+
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None  # no checkpoint is loaded by default
+resume_from = None  # no run is resumed by default
+workflow = [('train', 1)]
+
+# disable OpenCV multithreading to avoid overloading the system
+opencv_num_threads = 0
+# set the multi-process start method to `fork` to speed up training
+mp_start_method = 'fork'
+
+# Default settings for automatic learning-rate scaling
+#   - `enable`: whether automatic LR scaling is turned on
+#       (off by default).
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/configs/_base_/models/ascend_retinanet_r50_fpn.py b/configs/_base_/models/ascend_retinanet_r50_fpn.py
new file mode 100755
index 0000000..9a18fd7
--- /dev/null
+++ b/configs/_base_/models/ascend_retinanet_r50_fpn.py
@@ -0,0 +1,60 @@
+# model settings: RetinaNet (ResNet-50 + FPN) with Ascend* head/assigner
+model = dict(
+    type='RetinaNet',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,  # matches bbox_head.in_channels below
+        start_level=1,
+        add_extra_convs='on_input',
+        num_outs=5),
+    bbox_head=dict(
+        type='AscendRetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),  # five strides; neck num_outs=5
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='AscendMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.5),
+        max_per_img=100))
diff --git a/configs/_base_/models/ascend_ssd300.py b/configs/_base_/models/ascend_ssd300.py
new file mode 100755
index 0000000..1f91304
--- /dev/null
+++ b/configs/_base_/models/ascend_ssd300.py
@@ -0,0 +1,56 @@
+# model settings: SSD300 on SSDVGG (depth 16) with Ascend* head/assigner
+input_size = 300  # forwarded to the SSD anchor generator below
+model = dict(
+    type='SingleStageDetector',
+    backbone=dict(
+        type='SSDVGG',
+        depth=16,
+        with_last_pool=False,
+        ceil_mode=True,
+        out_indices=(3, 4),
+        out_feature_indices=(22, 34),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://vgg16_caffe')),
+    neck=dict(
+        type='SSDNeck',
+        in_channels=(512, 1024),
+        out_channels=(512, 1024, 512, 256, 256, 256),
+        level_strides=(2, 2, 1, 1),
+        level_paddings=(1, 1, 0, 0),
+        l2_norm_scale=20),
+    bbox_head=dict(
+        type='AscendSSDHead',
+        in_channels=(512, 1024, 512, 256, 256, 256),  # = neck out_channels
+        num_classes=80,
+        anchor_generator=dict(
+            type='SSDAnchorGenerator',
+            scale_major=False,
+            input_size=input_size,
+            basesize_ratio_range=(0.15, 0.9),
+            strides=[8, 16, 32, 64, 100, 300],  # six entries, like in_channels
+            ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2])),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='AscendMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.5,
+            min_pos_iou=0.,
+            ignore_iof_thr=-1,
+            gt_max_assign_all=False),
+        smoothl1_beta=1.,
+        allowed_border=-1,
+        pos_weight=-1,
+        neg_pos_ratio=3,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        nms=dict(type='nms', iou_threshold=0.45),
+        min_bbox_size=0,
+        score_thr=0.02,
+        max_per_img=200))
+cudnn_benchmark = True
diff --git a/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py b/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py
new file mode 100755
index 0000000..2902cca
--- /dev/null
+++ b/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py
@@ -0,0 +1,196 @@
+# model settings: 3-stage Cascade R-CNN with a mask head (ResNet-50 + FPN)
+model = dict(
+    type='CascadeRCNN',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),  # five strides; neck num_outs=5
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        type='CascadeRoIHead',
+        num_stages=3,
+        stage_loss_weights=[1, 0.5, 0.25],  # three weights, as num_stages=3
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=[  # three entries, matching num_stages=3
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),  # half of entry 1
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=80,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=[  # three entries; IoU thresholds rise 0.5 -> 0.6 -> 0.7
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.5,
+                    min_pos_iou=0.5,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.6,
+                    min_pos_iou=0.6,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.7,
+                    neg_iou_thr=0.7,
+                    min_pos_iou=0.7,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False)
+        ]),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100,
+            mask_thr_binary=0.5)))
diff --git a/configs/_base_/models/cascade_rcnn_r50_fpn.py b/configs/_base_/models/cascade_rcnn_r50_fpn.py
new file mode 100755
index 0000000..42f74ae
--- /dev/null
+++ b/configs/_base_/models/cascade_rcnn_r50_fpn.py
@@ -0,0 +1,179 @@
+# model settings: 3-stage Cascade R-CNN, box heads only (ResNet-50 + FPN)
+model = dict(
+    type='CascadeRCNN',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),  # five strides; neck num_outs=5
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        type='CascadeRoIHead',
+        num_stages=3,
+        stage_loss_weights=[1, 0.5, 0.25],  # three weights, as num_stages=3
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=[  # three entries, matching num_stages=3
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),  # half of entry 1
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ]),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=[  # three entries; IoU thresholds rise 0.5 -> 0.6 -> 0.7
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.5,
+                    min_pos_iou=0.5,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.6,
+                    min_pos_iou=0.6,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.7,
+                    neg_iou_thr=0.7,
+                    min_pos_iou=0.7,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                pos_weight=-1,
+                debug=False)
+        ]),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)))
diff --git a/configs/_base_/models/fast_rcnn_r50_fpn.py b/configs/_base_/models/fast_rcnn_r50_fpn.py
new file mode 100755
index 0000000..9982fe0
--- /dev/null
+++ b/configs/_base_/models/fast_rcnn_r50_fpn.py
@@ -0,0 +1,62 @@
+# model settings: Fast R-CNN with a ResNet-50 backbone and FPN neck (no RPN head)
+model = dict(
+    type='FastRCNN',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)))
diff --git a/configs/_base_/models/faster_rcnn_r50_caffe_c4.py b/configs/_base_/models/faster_rcnn_r50_caffe_c4.py
new file mode 100755
index 0000000..dbf965a
--- /dev/null
+++ b/configs/_base_/models/faster_rcnn_r50_caffe_c4.py
@@ -0,0 +1,117 @@
+# model settings: Faster R-CNN, caffe-style ResNet-50 C4 (single stride-16 feature map)
+norm_cfg = dict(type='BN', requires_grad=False)
+model = dict(
+    type='FasterRCNN',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=3,
+        strides=(1, 2, 2),
+        dilations=(1, 1, 1),
+        out_indices=(2, ),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=1024,
+        feat_channels=1024,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[2, 4, 8, 16, 32],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[16]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        shared_head=dict(
+            type='ResLayer',
+            depth=50,
+            stage=3,
+            stride=2,
+            dilation=1,
+            style='caffe',
+            norm_cfg=norm_cfg,
+            norm_eval=True,
+            init_cfg=dict(
+                type='Pretrained',
+                checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=1024,
+            featmap_strides=[16]),
+        bbox_head=dict(
+            type='BBoxHead',
+            with_avg_pool=True,
+            roi_feat_size=7,
+            in_channels=2048,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=12000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=6000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)))
diff --git a/configs/_base_/models/faster_rcnn_r50_caffe_dc5.py b/configs/_base_/models/faster_rcnn_r50_caffe_dc5.py
new file mode 100755
index 0000000..a377a6f
--- /dev/null
+++ b/configs/_base_/models/faster_rcnn_r50_caffe_dc5.py
@@ -0,0 +1,105 @@
+# model settings: Faster R-CNN, caffe-style ResNet-50 DC5 (dilated last stage, stride 16)
+norm_cfg = dict(type='BN', requires_grad=False)
+model = dict(
+    type='FasterRCNN',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        strides=(1, 2, 2, 1),
+        dilations=(1, 1, 1, 2),
+        out_indices=(3, ),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=2048,
+        feat_channels=2048,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[2, 4, 8, 16, 32],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[16]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=2048,
+            featmap_strides=[16]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=2048,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=12000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms=dict(type='nms', iou_threshold=0.7),
+            nms_pre=6000,
+            max_per_img=1000,
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)))
diff --git a/configs/_base_/models/faster_rcnn_r50_fpn.py b/configs/_base_/models/faster_rcnn_r50_fpn.py
new file mode 100755
index 0000000..1ef8e7b
--- /dev/null
+++ b/configs/_base_/models/faster_rcnn_r50_fpn.py
@@ -0,0 +1,108 @@
+# model settings: Faster R-CNN with a ResNet-50 backbone and FPN neck
+model = dict(
+    type='FasterRCNN',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)
+        # soft-nms is also supported for rcnn testing
+        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
+    ))
diff --git a/configs/_base_/models/mask_rcnn_r50_caffe_c4.py b/configs/_base_/models/mask_rcnn_r50_caffe_c4.py
new file mode 100755
index 0000000..122202e
--- /dev/null
+++ b/configs/_base_/models/mask_rcnn_r50_caffe_c4.py
@@ -0,0 +1,125 @@
+# model settings: Mask R-CNN, caffe-style ResNet-50 C4 (single stride-16 feature map)
+norm_cfg = dict(type='BN', requires_grad=False)
+model = dict(
+    type='MaskRCNN',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=3,
+        strides=(1, 2, 2),
+        dilations=(1, 1, 1),
+        out_indices=(2, ),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=1024,
+        feat_channels=1024,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[2, 4, 8, 16, 32],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[16]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        shared_head=dict(
+            type='ResLayer',
+            depth=50,
+            stage=3,
+            stride=2,
+            dilation=1,
+            style='caffe',
+            norm_cfg=norm_cfg,
+            norm_eval=True),
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=1024,
+            featmap_strides=[16]),
+        bbox_head=dict(
+            type='BBoxHead',
+            with_avg_pool=True,
+            roi_feat_size=7,
+            in_channels=2048,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+        mask_roi_extractor=None,
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=0,
+            in_channels=2048,
+            conv_out_channels=256,
+            num_classes=80,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=12000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            mask_size=14,
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=6000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            max_per_img=1000,
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100,
+            mask_thr_binary=0.5)))
diff --git a/configs/_base_/models/mask_rcnn_r50_fpn.py b/configs/_base_/models/mask_rcnn_r50_fpn.py
new file mode 100755
index 0000000..d903e55
--- /dev/null
+++ b/configs/_base_/models/mask_rcnn_r50_fpn.py
@@ -0,0 +1,120 @@
+# model settings: Mask R-CNN with a ResNet-50 backbone and FPN neck
+model = dict(
+    type='MaskRCNN',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=80,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            mask_size=28,
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100,
+            mask_thr_binary=0.5)))
diff --git a/configs/_base_/models/retinanet_r50_fpn.py b/configs/_base_/models/retinanet_r50_fpn.py
new file mode 100755
index 0000000..56e43fa
--- /dev/null
+++ b/configs/_base_/models/retinanet_r50_fpn.py
@@ -0,0 +1,60 @@
+# model settings: RetinaNet with a ResNet-50 backbone and FPN neck
+model = dict(
+    type='RetinaNet',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_input',
+        num_outs=5),
+    bbox_head=dict(
+        type='RetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.5),
+        max_per_img=100))
diff --git a/configs/_base_/models/rpn_r50_caffe_c4.py b/configs/_base_/models/rpn_r50_caffe_c4.py
new file mode 100755
index 0000000..8b32ca9
--- /dev/null
+++ b/configs/_base_/models/rpn_r50_caffe_c4.py
@@ -0,0 +1,58 @@
+# model settings: standalone RPN, caffe-style ResNet-50 C4 (no neck)
+model = dict(
+    type='RPN',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=3,
+        strides=(1, 2, 2),
+        dilations=(1, 1, 1),
+        out_indices=(2, ),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    neck=None,
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=1024,
+        feat_channels=1024,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[2, 4, 8, 16, 32],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[16]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=12000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0)))
diff --git a/configs/_base_/models/rpn_r50_fpn.py b/configs/_base_/models/rpn_r50_fpn.py
new file mode 100755
index 0000000..edaf4d4
--- /dev/null
+++ b/configs/_base_/models/rpn_r50_fpn.py
@@ -0,0 +1,58 @@
+# model settings: standalone RPN with a ResNet-50 backbone and FPN neck
+model = dict(
+    type='RPN',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=2000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0)))
diff --git a/configs/_base_/models/ssd300.py b/configs/_base_/models/ssd300.py
new file mode 100755
index 0000000..f17df01
--- /dev/null
+++ b/configs/_base_/models/ssd300.py
@@ -0,0 +1,56 @@
+# model settings: SSD with a VGG-16 backbone at 300x300 input
+input_size = 300
+model = dict(
+    type='SingleStageDetector',
+    backbone=dict(
+        type='SSDVGG',
+        depth=16,
+        with_last_pool=False,
+        ceil_mode=True,
+        out_indices=(3, 4),
+        out_feature_indices=(22, 34),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://vgg16_caffe')),
+    neck=dict(
+        type='SSDNeck',
+        in_channels=(512, 1024),
+        out_channels=(512, 1024, 512, 256, 256, 256),
+        level_strides=(2, 2, 1, 1),
+        level_paddings=(1, 1, 0, 0),
+        l2_norm_scale=20),
+    bbox_head=dict(
+        type='SSDHead',
+        in_channels=(512, 1024, 512, 256, 256, 256),
+        num_classes=80,
+        anchor_generator=dict(
+            type='SSDAnchorGenerator',
+            scale_major=False,
+            input_size=input_size,
+            basesize_ratio_range=(0.15, 0.9),
+            strides=[8, 16, 32, 64, 100, 300],
+            ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2])),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.5,
+            min_pos_iou=0.,
+            ignore_iof_thr=-1,
+            gt_max_assign_all=False),
+        smoothl1_beta=1.,
+        allowed_border=-1,
+        pos_weight=-1,
+        neg_pos_ratio=3,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        nms=dict(type='nms', iou_threshold=0.45),
+        min_bbox_size=0,
+        score_thr=0.02,
+        max_per_img=200))
+cudnn_benchmark = True
diff --git a/configs/_base_/schedules/schedule_1x.py b/configs/_base_/schedules/schedule_1x.py
new file mode 100755
index 0000000..13b3783
--- /dev/null
+++ b/configs/_base_/schedules/schedule_1x.py
@@ -0,0 +1,11 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy: 500-iter linear warmup, then step decay at epochs 8 and 11 (of 12)
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[8, 11])
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/configs/_base_/schedules/schedule_1x_trafficsign.py b/configs/_base_/schedules/schedule_1x_trafficsign.py
new file mode 100755
index 0000000..f1d470b
--- /dev/null
+++ b/configs/_base_/schedules/schedule_1x_trafficsign.py
@@ -0,0 +1,11 @@
+# optimizer: SGD (lr 0.02, momentum 0.9, weight decay 1e-4)
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# learning policy: step schedule with a 500-iteration linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[2, 6])  # earlier steps than schedule_1x.py, which uses [8, 11]
+runner = dict(type='EpochBasedRunner', max_epochs=12)  # 12-epoch run
diff --git a/configs/_base_/schedules/schedule_20e.py b/configs/_base_/schedules/schedule_20e.py
new file mode 100755
index 0000000..00e8590
--- /dev/null
+++ b/configs/_base_/schedules/schedule_20e.py
@@ -0,0 +1,11 @@
+# optimizer: SGD (lr 0.02, momentum 0.9, weight decay 1e-4)
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# learning policy: step schedule with a 500-iteration linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[16, 19])  # presumably the epochs at which the LR is stepped -- confirm
+runner = dict(type='EpochBasedRunner', max_epochs=20)  # 20-epoch ("20e") run
diff --git a/configs/_base_/schedules/schedule_2x.py b/configs/_base_/schedules/schedule_2x.py
new file mode 100755
index 0000000..69dc9ee
--- /dev/null
+++ b/configs/_base_/schedules/schedule_2x.py
@@ -0,0 +1,11 @@
+# optimizer: SGD (lr 0.02, momentum 0.9, weight decay 1e-4)
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# learning policy: step schedule with a 500-iteration linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[16, 22])  # presumably the epochs at which the LR is stepped -- confirm
+runner = dict(type='EpochBasedRunner', max_epochs=24)  # 24-epoch ("2x") run
diff --git a/configs/albu_example/README.md b/configs/albu_example/README.md
new file mode 100755
index 0000000..9a180f0
--- /dev/null
+++ b/configs/albu_example/README.md
@@ -0,0 +1,31 @@
+# Albu Example
+
+> [Albumentations: fast and flexible image augmentations](https://arxiv.org/abs/1809.06839)
+
+<!-- [OTHERS] -->
+
+## Abstract
+
+Data augmentation is a commonly used technique for increasing both the size and the diversity of labeled training sets by leveraging input transformations that preserve output labels. In computer vision domain, image augmentations have become a common implicit regularization technique to combat overfitting in deep convolutional neural networks and are ubiquitously used to improve performance. While most deep learning frameworks implement basic image transformations, the list is typically limited to some variations and combinations of flipping, rotating, scaling, and cropping. Moreover, the image processing speed varies in existing tools for image augmentation. We present Albumentations, a fast and flexible library for image augmentations with many various image transform operations available, that is also an easy-to-use wrapper around other augmentation libraries. We provide examples of image augmentations for different computer vision tasks and show that Albumentations is faster than other commonly used image augmentation tools on the most of commonly used image transformations.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143870703-74f3ea3f-ae23-4035-9856-746bc3f88464.png" height="400" />
+</div>
+
+## Results and Models
+
+| Backbone |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                         Config                                                         |                                                                                                                                                        Download                                                                                                                                                         |
+| :------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :--------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   | pytorch |   1x    |   4.4    |      16.6      |  38.0  |  34.5   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/albu_example/mask_rcnn_r50_fpn_albu_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/albu_example/mask_rcnn_r50_fpn_albu_1x_coco/mask_rcnn_r50_fpn_albu_1x_coco_20200208-ab203bcd.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/albu_example/mask_rcnn_r50_fpn_albu_1x_coco/mask_rcnn_r50_fpn_albu_1x_coco_20200208_225520.log.json) |
+
+## Citation
+
+```latex
+@article{2018arXiv180906839B,
+  author = {A. Buslaev and A. Parinov and E. Khvedchenya and V.~I. Iglovikov and A.~A. Kalinin},
+  title = "{Albumentations: fast and flexible image augmentations}",
+  journal = {ArXiv e-prints},
+  eprint = {1809.06839},
+  year = 2018
+}
+```
diff --git a/configs/albu_example/mask_rcnn_r50_fpn_albu_1x_coco.py b/configs/albu_example/mask_rcnn_r50_fpn_albu_1x_coco.py
new file mode 100755
index 0000000..b3f879a
--- /dev/null
+++ b/configs/albu_example/mask_rcnn_r50_fpn_albu_1x_coco.py
@@ -0,0 +1,73 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'  # inherit the Mask R-CNN R-50 FPN 1x COCO config
+img_norm_cfg = dict(  # per-channel mean/std consumed by the 'Normalize' step below
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+albu_train_transforms = [  # transform specs handed to the 'Albu' pipeline step below
+    dict(
+        type='ShiftScaleRotate',
+        shift_limit=0.0625,
+        scale_limit=0.0,
+        rotate_limit=0,
+        interpolation=1,
+        p=0.5),
+    dict(
+        type='RandomBrightnessContrast',
+        brightness_limit=[0.1, 0.3],
+        contrast_limit=[0.1, 0.3],
+        p=0.2),
+    dict(
+        type='OneOf',  # group of colour transforms (RGBShift / HueSaturationValue)
+        transforms=[
+            dict(
+                type='RGBShift',
+                r_shift_limit=10,
+                g_shift_limit=10,
+                b_shift_limit=10,
+                p=1.0),
+            dict(
+                type='HueSaturationValue',
+                hue_shift_limit=20,
+                sat_shift_limit=30,
+                val_shift_limit=20,
+                p=1.0)
+        ],
+        p=0.1),
+    dict(type='JpegCompression', quality_lower=85, quality_upper=95, p=0.2),
+    dict(type='ChannelShuffle', p=0.1),
+    dict(
+        type='OneOf',  # group of blur transforms (Blur / MedianBlur)
+        transforms=[
+            dict(type='Blur', blur_limit=3, p=1.0),
+            dict(type='MedianBlur', blur_limit=3, p=1.0)
+        ],
+        p=0.1),
+]
+train_pipeline = [  # assigned to data.train.pipeline at the bottom of this file
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='Pad', size_divisor=32),
+    dict(
+        type='Albu',
+        transforms=albu_train_transforms,
+        bbox_params=dict(
+            type='BboxParams',
+            format='pascal_voc',
+            label_fields=['gt_labels'],
+            min_visibility=0.0,
+            filter_lost_elements=True),
+        keymap={  # pairs the pipeline's result keys with the names used on the Albu side
+            'img': 'image',
+            'gt_masks': 'masks',
+            'gt_bboxes': 'bboxes'
+        },
+        update_pad_shape=False,
+        skip_img_without_anno=True),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='Collect',
+        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'],
+        meta_keys=('filename', 'ori_shape', 'img_shape', 'img_norm_cfg',
+                   'pad_shape', 'scale_factor'))
+]
+data = dict(train=dict(pipeline=train_pipeline))  # only the train pipeline is set here
diff --git a/configs/atss/README.md b/configs/atss/README.md
new file mode 100755
index 0000000..055ed05
--- /dev/null
+++ b/configs/atss/README.md
@@ -0,0 +1,31 @@
+# ATSS
+
+> [Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection](https://arxiv.org/abs/1912.02424)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Object detection has been dominated by anchor-based detectors for several years. Recently, anchor-free detectors have become popular due to the proposal of FPN and Focal Loss. In this paper, we first point out that the essential difference between anchor-based and anchor-free detection is actually how to define positive and negative training samples, which leads to the performance gap between them. If they adopt the same definition of positive and negative samples during training, there is no obvious difference in the final performance, no matter regressing from a box or a point. This shows that how to select positive and negative training samples is important for current object detectors. Then, we propose an Adaptive Training Sample Selection (ATSS) to automatically select positive and negative samples according to statistical characteristics of object. It significantly improves the performance of anchor-based and anchor-free detectors and bridges the gap between them. Finally, we discuss the necessity of tiling multiple anchors per location on the image to detect objects. Extensive experiments conducted on MS COCO support our aforementioned analysis and conclusions. With the newly introduced ATSS, we improve state-of-the-art detectors by a large margin to 50.7% AP without introducing any overhead.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143870776-c81168f5-e8b2-44ee-978b-509e4372c5c9.png"/>
+</div>
+
+## Results and Models
+
+| Backbone |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                Config                                                 |                                                                                                                            Download                                                                                                                             |
+| :------: | :-----: | :-----: | :------: | :------------: | :----: | :---------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   | pytorch |   1x    |   3.7    |      19.7      |  39.4  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/atss/atss_r50_fpn_1x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r50_fpn_1x_coco/atss_r50_fpn_1x_coco_20200209-985f7bd0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r50_fpn_1x_coco/atss_r50_fpn_1x_coco_20200209_102539.log.json) |
+|  R-101   | pytorch |   1x    |   5.6    |      12.3      |  41.5  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/atss/atss_r101_fpn_1x_coco.py) |   [model](https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r101_fpn_1x_coco/atss_r101_fpn_1x_20200825-dfcadd6f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r101_fpn_1x_coco/atss_r101_fpn_1x_20200825-dfcadd6f.log.json)   |
+
+## Citation
+
+```latex
+@article{zhang2019bridging,
+  title   =  {Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection},
+  author  =  {Zhang, Shifeng and Chi, Cheng and Yao, Yongqiang and Lei, Zhen and Li, Stan Z.},
+  journal =  {arXiv preprint arXiv:1912.02424},
+  year    =  {2019}
+}
+```
diff --git a/configs/atss/atss_r101_fpn_1x_coco.py b/configs/atss/atss_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..5225d2a
--- /dev/null
+++ b/configs/atss/atss_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './atss_r50_fpn_1x_coco.py'  # start from the ATSS R-50 config
+model = dict(
+    backbone=dict(
+        depth=101,  # the R-50 base config sets depth=50
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/atss/atss_r50_fpn_1x_coco.py b/configs/atss/atss_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..42ff4c5
--- /dev/null
+++ b/configs/atss/atss_r50_fpn_1x_coco.py
@@ -0,0 +1,62 @@
+_base_ = [  # base configs: COCO detection dataset, 1x schedule, default runtime
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='ATSS',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='ATSSHead',
+        num_classes=80,
+        in_channels=256,  # equals the neck's out_channels
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),  # five strides, same count as the neck's num_outs
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+# optimizer: same SGD settings as schedule_1x.py except lr is 0.01 instead of 0.02
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/atss/metafile.yml b/configs/atss/metafile.yml
new file mode 100755
index 0000000..f4c567e
--- /dev/null
+++ b/configs/atss/metafile.yml
@@ -0,0 +1,60 @@
+Collections:
+  - Name: ATSS
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ATSS
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1912.02424
+      Title: 'Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection'
+    README: configs/atss/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/atss.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: atss_r50_fpn_1x_coco
+    In Collection: ATSS
+    Config: configs/atss/atss_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.7
+      inference time (ms/im):
+        - value: 50.76
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r50_fpn_1x_coco/atss_r50_fpn_1x_coco_20200209-985f7bd0.pth
+
+  - Name: atss_r101_fpn_1x_coco
+    In Collection: ATSS
+    Config: configs/atss/atss_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.6
+      inference time (ms/im):
+        - value: 81.3
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r101_fpn_1x_coco/atss_r101_fpn_1x_20200825-dfcadd6f.pth
diff --git a/configs/autoassign/README.md b/configs/autoassign/README.md
new file mode 100755
index 0000000..1297206
--- /dev/null
+++ b/configs/autoassign/README.md
@@ -0,0 +1,35 @@
+# AutoAssign
+
+> [AutoAssign: Differentiable Label Assignment for Dense Object Detection](https://arxiv.org/abs/2007.03496)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Determining positive/negative samples for object detection is known as label assignment. Here we present an anchor-free detector named AutoAssign. It requires little human knowledge and achieves appearance-aware through a fully differentiable weighting mechanism. During training, to both satisfy the prior distribution of data and adapt to category characteristics, we present Center Weighting to adjust the category-specific prior distributions. To adapt to object appearances, Confidence Weighting is proposed to adjust the specific assign strategy of each instance. The two weighting modules are then combined to generate positive and negative weights to adjust each location's confidence. Extensive experiments on the MS COCO show that our method steadily surpasses other best sampling strategies by large margins with various backbones. Moreover, our best model achieves 52.1% AP, outperforming all existing one-stage detectors. Besides, experiments on other datasets, e.g., PASCAL VOC, Objects365, and WiderFace, demonstrate the broad applicability of AutoAssign.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143870875-33567e44-0584-4470-9a90-0df0fb6c1fe2.png"/>
+</div>
+
+## Results and Models
+
+| Backbone | Style | Lr schd | Mem (GB) | box AP |                                                        Config                                                        |                                                                                                                                                        Download                                                                                                                                                         |
+| :------: | :---: | :-----: | :------: | :----: | :------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   | caffe |   1x    |   4.08   |  40.4  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/autoassign/autoassign_r50_fpn_8x2_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/autoassign/auto_assign_r50_fpn_1x_coco/auto_assign_r50_fpn_1x_coco_20210413_115540-5e17991f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/autoassign/auto_assign_r50_fpn_1x_coco/auto_assign_r50_fpn_1x_coco_20210413_115540-5e17991f.log.json) |
+
+**Note**:
+
+1. We find that the performance is unstable with 1x setting and may fluctuate by about 0.3 mAP. mAP 40.3 ~ 40.6 is acceptable. Such fluctuation can also be found in the original implementation.
+2. You can get more stable results (about 40.6 mAP) with a 13-epoch schedule in which the learning rate is divided by 10 at the 10th and 13th epochs.
+
+## Citation
+
+```latex
+@article{zhu2020autoassign,
+  title={AutoAssign: Differentiable Label Assignment for Dense Object Detection},
+  author={Zhu, Benjin and Wang, Jianfeng and Jiang, Zhengkai and Zong, Fuhang and Liu, Songtao and Li, Zeming and Sun, Jian},
+  journal={arXiv preprint arXiv:2007.03496},
+  year={2020}
+}
+```
diff --git a/configs/autoassign/autoassign_r50_fpn_8x2_1x_coco.py b/configs/autoassign/autoassign_r50_fpn_8x2_1x_coco.py
new file mode 100755
index 0000000..db548dc
--- /dev/null
+++ b/configs/autoassign/autoassign_r50_fpn_8x2_1x_coco.py
@@ -0,0 +1,85 @@
+# We follow the original implementation, which
+# adopts the Caffe pre-trained backbone.
+_base_ = [  # base configs: COCO detection dataset, 1x schedule, default runtime
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='AutoAssign',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs=True,
+        num_outs=5,
+        relu_before_extra_convs=True,
+        init_cfg=dict(type='Caffe2Xavier', layer='Conv2d')),
+    bbox_head=dict(
+        type='AutoAssignHead',
+        num_classes=80,
+        in_channels=256,  # equals the neck's out_channels
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],  # five strides, same count as the neck's num_outs
+        loss_bbox=dict(type='GIoULoss', loss_weight=5.0)),
+    train_cfg=None,
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+img_norm_cfg = dict(  # std of 1.0 and to_rgb=False; goes with the Caffe-style backbone above
+    mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+test_pipeline = [  # shared by the val and test splits below
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # a single scale, with flip disabled
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img'])
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer: sets lr=0.01 and norm_decay_mult=0; no optimizer type is given here
+optimizer = dict(lr=0.01, paramwise_cfg=dict(norm_decay_mult=0.))
+# learning policy: as in schedule_1x.py but with 1000 warmup iterations instead of 500
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=1.0 / 1000,
+    step=[8, 11])
+total_epochs = 12  # NOTE(review): schedule_1x.py already sets runner max_epochs=12 -- confirm this is still needed
diff --git a/configs/autoassign/metafile.yml b/configs/autoassign/metafile.yml
new file mode 100755
index 0000000..f1e9051
--- /dev/null
+++ b/configs/autoassign/metafile.yml
@@ -0,0 +1,33 @@
+Collections:
+  - Name: AutoAssign
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - AutoAssign
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/2007.03496
+      Title: 'AutoAssign: Differentiable Label Assignment for Dense Object Detection'
+    README: configs/autoassign/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/detectors/autoassign.py#L6
+      Version: v2.12.0
+
+Models:
+  - Name: autoassign_r50_fpn_8x2_1x_coco
+    In Collection: AutoAssign
+    Config: configs/autoassign/autoassign_r50_fpn_8x2_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.08
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/autoassign/auto_assign_r50_fpn_1x_coco/auto_assign_r50_fpn_1x_coco_20210413_115540-5e17991f.pth
diff --git a/configs/carafe/README.md b/configs/carafe/README.md
new file mode 100755
index 0000000..803abe0
--- /dev/null
+++ b/configs/carafe/README.md
@@ -0,0 +1,42 @@
+# CARAFE
+
+> [CARAFE: Content-Aware ReAssembly of FEatures](https://arxiv.org/abs/1905.02188)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Feature upsampling is a key operation in a number of modern convolutional network architectures, e.g. feature pyramids. Its design is critical for dense prediction tasks such as object detection and semantic/instance segmentation. In this work, we propose Content-Aware ReAssembly of FEatures (CARAFE), a universal, lightweight and highly effective operator to fulfill this goal. CARAFE has several appealing properties: (1) Large field of view. Unlike previous works (e.g. bilinear interpolation) that only exploit sub-pixel neighborhood, CARAFE can aggregate contextual information within a large receptive field. (2) Content-aware handling. Instead of using a fixed kernel for all samples (e.g. deconvolution), CARAFE enables instance-specific content-aware handling, which generates adaptive kernels on-the-fly. (3) Lightweight and fast to compute. CARAFE introduces little computational overhead and can be readily integrated into modern network architectures. We conduct comprehensive evaluations on standard benchmarks in object detection, instance/semantic segmentation and inpainting. CARAFE shows consistent and substantial gains across all the tasks (1.2%, 1.3%, 1.8%, 1.1db respectively) with negligible computational overhead. It has great potential to serve as a strong building block for future research.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143872016-48225685-0e59-49cf-bd65-a50ee04ca8a2.png"/>
+</div>
+
+## Results and Models
+
+The results on COCO 2017 val are shown in the table below.
+
+|         Method         | Backbone |  Style  | Lr schd | Test Proposal Num | Inf time (fps) | Box AP | Mask AP |                                                        Config                                                        |                                                                                                                                                                         Download                                                                                                                                                                          |
+| :--------------------: | :------: | :-----: | :-----: | :---------------: | :------------: | :----: | :-----: | :------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| Faster R-CNN w/ CARAFE | R-50-FPN | pytorch |   1x    |       1000        |      16.5      |  38.6  |    -    | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/carafe/faster_rcnn_r50_fpn_carafe_1x_coco.py) |     [model](https://download.openmmlab.com/mmdetection/v2.0/carafe/faster_rcnn_r50_fpn_carafe_1x_coco/faster_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.386_20200504_175733-385a75b7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/carafe/faster_rcnn_r50_fpn_carafe_1x_coco/faster_rcnn_r50_fpn_carafe_1x_coco_20200504_175733.log.json)     |
+|           -            |    -     |    -    |    -    |       2000        |                |        |         |                                                                                                                      |                                                                                                                                                                                                                                                                                                                                                           |
+|  Mask R-CNN w/ CARAFE  | R-50-FPN | pytorch |   1x    |       1000        |      14.0      |  39.3  |  35.8   |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/carafe/mask_rcnn_r50_fpn_carafe_1x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/carafe/mask_rcnn_r50_fpn_carafe_1x_coco/mask_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.393__segm_mAP-0.358_20200503_135957-8687f195.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/carafe/mask_rcnn_r50_fpn_carafe_1x_coco/mask_rcnn_r50_fpn_carafe_1x_coco_20200503_135957.log.json) |
+|           -            |    -     |    -    |    -    |       2000        |                |        |         |                                                                                                                      |                                                                                                                                                                                                                                                                                                                                                           |
+
+## Implementation
+
+The CUDA implementation of CARAFE can be found at https://github.com/myownskyW7/CARAFE.
+
+## Citation
+
+We provide config files to reproduce the object detection & instance segmentation results in the ICCV 2019 Oral paper for [CARAFE: Content-Aware ReAssembly of FEatures](https://arxiv.org/abs/1905.02188).
+
+```latex
+@inproceedings{Wang_2019_ICCV,
+    title = {CARAFE: Content-Aware ReAssembly of FEatures},
+    author = {Wang, Jiaqi and Chen, Kai and Xu, Rui and Liu, Ziwei and Loy, Chen Change and Lin, Dahua},
+    booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
+    month = {October},
+    year = {2019}
+}
+```
diff --git a/configs/carafe/faster_rcnn_r50_fpn_carafe_1x_coco.py b/configs/carafe/faster_rcnn_r50_fpn_carafe_1x_coco.py
new file mode 100755
index 0000000..dedac3f
--- /dev/null
+++ b/configs/carafe/faster_rcnn_r50_fpn_carafe_1x_coco.py
@@ -0,0 +1,50 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'  # inherit the Faster R-CNN R-50 FPN 1x COCO config
+model = dict(
+    neck=dict(
+        type='FPN_CARAFE',  # use the FPN_CARAFE neck
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5,
+        start_level=0,
+        end_level=-1,
+        norm_cfg=None,
+        act_cfg=None,
+        order=('conv', 'norm', 'act'),
+        upsample_cfg=dict(  # settings for the 'carafe' upsampling operator
+            type='carafe',
+            up_kernel=5,
+            up_group=1,
+            encoder_kernel=3,
+            encoder_dilation=1,
+            compressed_channels=64)))
+img_norm_cfg = dict(  # per-channel mean/std consumed by the 'Normalize' steps below
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=64),  # size_divisor is 64 here; the albu/autoassign configs use 32
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [  # shared by the val and test splits below
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # a single scale, with flip disabled
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=64),  # same divisor as the train pipeline
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # attach the pipelines above to the train/val/test splits
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/carafe/mask_rcnn_r50_fpn_carafe_1x_coco.py b/configs/carafe/mask_rcnn_r50_fpn_carafe_1x_coco.py
new file mode 100755
index 0000000..668c023
--- /dev/null
+++ b/configs/carafe/mask_rcnn_r50_fpn_carafe_1x_coco.py
@@ -0,0 +1,60 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(  # sets the neck and the mask head's upsampler
+    neck=dict(
+        type='FPN_CARAFE',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5,
+        start_level=0,
+        end_level=-1,
+        norm_cfg=None,
+        act_cfg=None,
+        order=('conv', 'norm', 'act'),
+        upsample_cfg=dict(  # options for the 'carafe' upsampler
+            type='carafe',
+            up_kernel=5,
+            up_group=1,
+            encoder_kernel=3,
+            encoder_dilation=1,
+            compressed_channels=64)),
+    roi_head=dict(
+        mask_head=dict(
+            upsample_cfg=dict(  # same values as the neck, plus scale_factor
+                type='carafe',
+                scale_factor=2,
+                up_kernel=5,
+                up_group=1,
+                encoder_kernel=3,
+                encoder_dilation=1,
+                compressed_channels=64))))
+img_norm_cfg = dict(  # shared by the train and test pipelines below
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [  # wired into data.train at the bottom of the file
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=64),  # NOTE(review): 64 vs 32 elsewhere
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [  # wired into both data.val and data.test below
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # a single scale is listed
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=64),  # same divisor as training
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # only the pipeline keys are set here
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),  # val reuses the test pipeline
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/carafe/metafile.yml b/configs/carafe/metafile.yml
new file mode 100755
index 0000000..b58a3f6
--- /dev/null
+++ b/configs/carafe/metafile.yml
@@ -0,0 +1,55 @@
+Collections:
+  - Name: CARAFE
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RPN
+        - FPN_CARAFE
+        - ResNet
+        - RoIPool
+    Paper:
+      URL: https://arxiv.org/abs/1905.02188
+      Title: 'CARAFE: Content-Aware ReAssembly of FEatures'
+    README: configs/carafe/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/necks/fpn_carafe.py#L11
+      Version: v2.12.0
+
+Models:
+  - Name: faster_rcnn_r50_fpn_carafe_1x_coco
+    In Collection: CARAFE
+    Config: configs/carafe/faster_rcnn_r50_fpn_carafe_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.26
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.6
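+      # NOTE: no Instance Segmentation result is listed for this model.
+      # Its config adds no mask head and the released checkpoint reports
+      # only bbox_mAP-0.386; the former "mask AP: 38.6" entry merely
+      # duplicated the box AP above.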
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/carafe/faster_rcnn_r50_fpn_carafe_1x_coco/faster_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.386_20200504_175733-385a75b7.pth
+
+  - Name: mask_rcnn_r50_fpn_carafe_1x_coco
+    In Collection: CARAFE
+    Config: configs/carafe/mask_rcnn_r50_fpn_carafe_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.31
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 35.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/carafe/mask_rcnn_r50_fpn_carafe_1x_coco/mask_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.393__segm_mAP-0.358_20200503_135957-8687f195.pth
diff --git a/configs/cascade_rcnn/README.md b/configs/cascade_rcnn/README.md
new file mode 100755
index 0000000..5a9e817
--- /dev/null
+++ b/configs/cascade_rcnn/README.md
@@ -0,0 +1,79 @@
+# Cascade R-CNN
+
+> [Cascade R-CNN: High Quality Object Detection and Instance Segmentation](https://arxiv.org/abs/1906.09756)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+In object detection, the intersection over union (IoU) threshold is frequently used to define positives/negatives. The threshold used to train a detector defines its quality. While the commonly used threshold of 0.5 leads to noisy (low-quality) detections, detection performance frequently degrades for larger thresholds. This paradox of high-quality detection has two causes: 1) overfitting, due to vanishing positive samples for large thresholds, and 2) inference-time quality mismatch between detector and test hypotheses. A multi-stage object detection architecture, the Cascade R-CNN, composed of a sequence of detectors trained with increasing IoU thresholds, is proposed to address these problems. The detectors are trained sequentially, using the output of a detector as training set for the next. This resampling progressively improves hypotheses quality, guaranteeing a positive training set of equivalent size for all detectors and minimizing overfitting. The same cascade is applied at inference, to eliminate quality mismatches between hypotheses and detectors. An implementation of the Cascade R-CNN without bells or whistles achieves state-of-the-art performance on the COCO dataset, and significantly improves high-quality detection on generic and specific object detection datasets, including VOC, KITTI, CityPerson, and WiderFace. Finally, the Cascade R-CNN is generalized to instance segmentation, with nontrivial improvements over the Mask R-CNN.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143872197-d99b90e4-4f05-4329-80a4-327ac862a051.png"/>
+</div>
+
+## Results and Models
+
+### Cascade R-CNN
+
+|    Backbone     |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                            Config                                                            |                                                                                                                                                                             Download                                                                                                                                                                              |
+| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :--------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50-FPN     |  caffe  |   1x    |   4.2    |                |  40.4  |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco/cascade_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.404_20200504_174853-b857be87.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco/cascade_rcnn_r50_caffe_fpn_1x_coco_20200504_174853.log.json)   |
+|    R-50-FPN     | pytorch |   1x    |   4.4    |      16.1      |  40.3  |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py)     |                          [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco/cascade_rcnn_r50_fpn_1x_coco_20200316-3dc56deb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco/cascade_rcnn_r50_fpn_1x_coco_20200316_214748.log.json)                          |
+|    R-50-FPN     | pytorch |   20e   |    -     |       -        |  41.0  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco.py)     |             [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco/cascade_rcnn_r50_fpn_20e_coco_bbox_mAP-0.41_20200504_175131-e9872a90.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco/cascade_rcnn_r50_fpn_20e_coco_20200504_175131.log.json)              |
+|    R-101-FPN    |  caffe  |   1x    |   6.2    |                |  42.3  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco/cascade_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.423_20200504_175649-cab8dbd5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco/cascade_rcnn_r101_caffe_fpn_1x_coco_20200504_175649.log.json) |
+|    R-101-FPN    | pytorch |   1x    |   6.4    |      13.5      |  42.0  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco.py)     |                        [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco/cascade_rcnn_r101_fpn_1x_coco_20200317-0b6a2fbf.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco/cascade_rcnn_r101_fpn_1x_coco_20200317_101744.log.json)                        |
+|    R-101-FPN    | pytorch |   20e   |    -     |       -        |  42.5  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco.py)    |           [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco/cascade_rcnn_r101_fpn_20e_coco_bbox_mAP-0.425_20200504_231812-5057dcc5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco/cascade_rcnn_r101_fpn_20e_coco_20200504_231812.log.json)           |
+| X-101-32x4d-FPN | pytorch |   1x    |   7.6    |      10.9      |  43.7  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco.py)  |            [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco/cascade_rcnn_x101_32x4d_fpn_1x_coco_20200316-95c2deb6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco/cascade_rcnn_x101_32x4d_fpn_1x_coco_20200316_055608.log.json)            |
+| X-101-32x4d-FPN | pytorch |   20e   |   7.6    |                |  43.7  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco.py) |      [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco/cascade_rcnn_x101_32x4d_fpn_20e_coco_20200906_134608-9ae0a720.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco/cascade_rcnn_x101_32x4d_fpn_20e_coco_20200906_134608.log.json)       |
+| X-101-64x4d-FPN | pytorch |   1x    |   10.7   |                |  44.7  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco.py)  |        [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_075702-43ce6a30.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_075702.log.json)         |
+| X-101-64x4d-FPN | pytorch |   20e   |   10.7   |                |  44.5  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco.py) |      [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco/cascade_rcnn_x101_64x4d_fpn_20e_coco_20200509_224357-051557b1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco/cascade_rcnn_x101_64x4d_fpn_20e_coco_20200509_224357.log.json)       |
+
+### Cascade Mask R-CNN
+
+|    Backbone     |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                              Config                                                               |                                                                                                                                                                                               Download                                                                                                                                                                                                |
+| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50-FPN     |  caffe  |   1x    |   5.9    |                |  41.2  |  36.0   |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco/cascade_mask_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.412__segm_mAP-0.36_20200504_174659-5004b251.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco/cascade_mask_rcnn_r50_caffe_fpn_1x_coco_20200504_174659.log.json)    |
+|    R-50-FPN     | pytorch |   1x    |   6.0    |      11.2      |  41.2  |  35.9   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py)     |                                  [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203-9d4dcb24.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203_170449.log.json)                                  |
+|    R-50-FPN     | pytorch |   20e   |    -     |       -        |  41.9  |  36.5   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco.py)     |             [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_bbox_mAP-0.419__segm_mAP-0.365_20200504_174711-4af8e66e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_20200504_174711.log.json)             |
+|    R-101-FPN    |  caffe  |   1x    |   7.8    |                |  43.2  |  37.6   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco/cascade_mask_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.432__segm_mAP-0.376_20200504_174813-5c1e9599.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco/cascade_mask_rcnn_r101_caffe_fpn_1x_coco_20200504_174813.log.json) |
+|    R-101-FPN    | pytorch |   1x    |   7.9    |      9.8       |  42.9  |  37.3   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py)     |                                [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco/cascade_mask_rcnn_r101_fpn_1x_coco_20200203-befdf6ee.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco/cascade_mask_rcnn_r101_fpn_1x_coco_20200203_092521.log.json)                                |
+|    R-101-FPN    | pytorch |   20e   |    -     |       -        |  43.4  |  37.8   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco.py)    |           [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco/cascade_mask_rcnn_r101_fpn_20e_coco_bbox_mAP-0.434__segm_mAP-0.378_20200504_174836-005947da.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco/cascade_mask_rcnn_r101_fpn_20e_coco_20200504_174836.log.json)           |
+| X-101-32x4d-FPN | pytorch |   1x    |   9.2    |      8.6       |  44.3  |  38.3   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py)  |                    [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco_20200201-0f411b1f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco_20200201_052416.log.json)                    |
+| X-101-32x4d-FPN | pytorch |   20e   |   9.2    |       -        |  45.0  |  39.0   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco.py) |              [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco_20200528_083917-ed1f4751.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco_20200528_083917.log.json)               |
+| X-101-64x4d-FPN | pytorch |   1x    |   12.2   |      6.7       |  45.3  |  39.2   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco.py)  |                    [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco_20200203-9a2db89d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco_20200203_044059.log.json)                    |
+| X-101-64x4d-FPN | pytorch |   20e   |   12.2   |                |  45.6  |  39.5   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco.py) |              [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco_20200512_161033-bdb5126a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco_20200512_161033.log.json)               |
+
+**Notes:**
+
+- The `20e` schedule in Cascade (Mask) R-CNN means the learning rate is decreased at epochs 16 and 19, for a total of 20 epochs.
+
+## Pre-trained Models
+
+We also train some Cascade Mask R-CNN models with longer schedules and multi-scale training. Users can fine-tune them for downstream tasks.
+
+|    Backbone     |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                                  Config                                                                  |                                                                                                                                                                                                Download                                                                                                                                                                                                |
+| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :--------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50-FPN     |  caffe  |   3x    |   5.7    |                |  44.0  |  38.1   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210707_002651-6e29b3a6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210707_002651.log.json)   |
+|    R-50-FPN     | pytorch |   3x    |   5.9    |                |  44.3  |  38.5   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco_20210628_164719-5bdc3824.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco_20210628_164719.log.json)               |
+|    R-101-FPN    |  caffe  |   3x    |   7.7    |                |  45.4  |  39.5   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210707_002620-a5bd2389.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210707_002620.log.json) |
+|    R-101-FPN    | pytorch |   3x    |   7.8    |                |  45.5  |  39.6   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco_20210628_165236-51a2d363.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco_20210628_165236.log.json)             |
+| X-101-32x4d-FPN | pytorch |   3x    |   9.0    |                |  46.3  |  40.1   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210706_225234-40773067.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210706_225234.log.json) |
+| X-101-32x8d-FPN | pytorch |   3x    |   12.1   |                |  46.1  |  39.9   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210719_180640-9ff7e76f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210719_180640.log.json) |
+| X-101-64x4d-FPN | pytorch |   3x    |   12.0   |                |  46.6  |  40.3   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210719_210311-d3e64ba0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210719_210311.log.json) |
+
+## Citation
+
+```latex
+@article{Cai_2019,
+   title={Cascade R-CNN: High Quality Object Detection and Instance Segmentation},
+   ISSN={1939-3539},
+   url={http://dx.doi.org/10.1109/tpami.2019.2956516},
+   DOI={10.1109/tpami.2019.2956516},
+   journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
+   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
+   author={Cai, Zhaowei and Vasconcelos, Nuno},
+   year={2019},
+   pages={1–1}
+}
+```
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..5ee6231
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,  # the base config named above is the r50 variant
+        init_cfg=dict(  # checkpoint name matches depth=101 (resnet101)
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..1df87fc
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,  # the base config named above is the r50 variant
+        init_cfg=dict(  # checkpoint name matches depth=101 (resnet101)
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..f59c155
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './cascade_mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,  # the base config named above is the r50 variant
+        init_cfg=dict(type='Pretrained',  # torchvision ResNet-101 weights
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco.py
new file mode 100755
index 0000000..45ab7ed
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco.py
@@ -0,0 +1,6 @@
+_base_ = './cascade_mask_rcnn_r50_fpn_20e_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,  # the base config named above is the r50 variant
+        init_cfg=dict(type='Pretrained',  # torchvision ResNet-101 weights
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..1b20f16
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,  # the base config named above is the r50 variant
+        init_cfg=dict(type='Pretrained',  # torchvision ResNet-101 weights
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..12d37ef
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py
@@ -0,0 +1,41 @@
+_base_ = ['./cascade_mask_rcnn_r50_fpn_1x_coco.py']
+
+model = dict(
+    backbone=dict(  # only backbone keys are set in this file
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(  # start from the detectron2 caffe ResNet-50 checkpoint
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+img_norm_cfg = dict(  # to_rgb=False with unit std; used by both pipelines
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [  # wired into data.train at the bottom of the file
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [  # wired into both data.val and data.test below
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # a single scale is listed
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),  # same divisor as training
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # only the pipeline keys are set here
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),  # val reuses the test pipeline
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..9fb817e
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco.py
@@ -0,0 +1,49 @@
+_base_ = ['./cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py']  # base config
+model = dict(  # backbone overrides: caffe style + resnet50_caffe weights
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+
+# use caffe img_norm (std of 1.0, to_rgb=False)
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)],
+# multiscale_mode='range'
+train_pipeline = [  # Normalize step uses img_norm_cfg
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [  # Normalize step uses img_norm_cfg
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+data = dict(  # train pipeline goes under train.dataset; val/test direct
+    train=dict(dataset=dict(pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..49ab539
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [  # model, dataset, schedule and runtime base configs
+    '../_base_/models/cascade_mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco.py
new file mode 100755
index 0000000..1296dc4
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco.py
@@ -0,0 +1,5 @@
+_base_ = [  # model, dataset, 20e schedule and runtime base configs
+    '../_base_/models/cascade_mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_20e.py', '../_base_/default_runtime.py'
+]
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..ed0c6d1
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py
@@ -0,0 +1,4 @@
+_base_ = [  # common mstrain 3x instance config + model base config
+    '../common/mstrain_3x_coco_instance.py',
+    '../_base_/models/cascade_mask_rcnn_r50_fpn.py'
+]
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..06cbbe7
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade_mask_rcnn_r50_fpn_1x_coco.py'  # base config
+model = dict(  # backbone override: ResNeXt-101, 32 groups x width 4
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco.py
new file mode 100755
index 0000000..4e35236
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade_mask_rcnn_r50_fpn_20e_coco.py'  # base config
+model = dict(  # backbone override: ResNeXt-101, 32 groups x width 4
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..7d37d17
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py'  # base config
+model = dict(  # backbone override: ResNeXt-101, 32 groups x width 4
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..eeec1aa
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py
@@ -0,0 +1,60 @@
+_base_ = './cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py'  # base config
+
+model = dict(  # backbone override: ResNeXt-101, 32 groups x width 8
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=8,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnext101_32x8d')))
+
+# The ResNeXt-101-32x8d model was trained with Caffe2 at FB,
+# so the mean and std need to be changed.
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675],
+    std=[57.375, 57.120, 58.395],
+    to_rgb=False)
+
+# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)],
+# multiscale_mode='range'
+train_pipeline = [  # Normalize step uses img_norm_cfg
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [  # Normalize step uses img_norm_cfg
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+data = dict(  # train pipeline goes under train.dataset; val/test direct
+    train=dict(dataset=dict(pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..7dbef5f
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade_mask_rcnn_r50_fpn_1x_coco.py'  # base config
+model = dict(  # backbone override: ResNeXt-101, 64 groups x width 4
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco.py
new file mode 100755
index 0000000..579b1ac
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade_mask_rcnn_r50_fpn_20e_coco.py'  # base config
+model = dict(  # backbone override: ResNeXt-101, 64 groups x width 4
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py b/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..ed6cf4b
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py'  # base config
+model = dict(  # backbone override: ResNeXt-101, 64 groups x width 4
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco.py b/configs/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..1e90f4b
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './cascade_rcnn_r50_caffe_fpn_1x_coco.py'  # base config
+model = dict(  # depth-101 backbone with the resnet101_caffe checkpoint
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/configs/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco.py b/configs/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..5c07776
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './cascade_rcnn_r50_fpn_1x_coco.py'  # base config
+model = dict(  # depth-101 backbone with the torchvision resnet101 weights
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco.py b/configs/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco.py
new file mode 100755
index 0000000..b1719c2
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco.py
@@ -0,0 +1,6 @@
+_base_ = './cascade_rcnn_r50_fpn_20e_coco.py'  # base config
+model = dict(  # depth-101 backbone with the torchvision resnet101 weights
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco.py b/configs/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..696bcfb
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco.py
@@ -0,0 +1,42 @@
+_base_ = './cascade_rcnn_r50_fpn_1x_coco.py'  # base config
+
+model = dict(  # backbone overrides: caffe style + resnet50_caffe weights
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+
+# use caffe img_norm (std of 1.0, to_rgb=False)
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [  # Normalize step uses img_norm_cfg
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [  # Normalize step uses img_norm_cfg
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # val and test both use test_pipeline
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py b/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..87e21fb
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [  # model, dataset, schedule and runtime base configs
+    '../_base_/models/cascade_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
diff --git a/configs/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco.py b/configs/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco.py
new file mode 100755
index 0000000..6f886e1
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco.py
@@ -0,0 +1,4 @@
+_base_ = './cascade_rcnn_r50_fpn_1x_coco.py'  # base config
+# learning policy: max_epochs=20 with lr_config steps at [16, 19]
+lr_config = dict(step=[16, 19])
+runner = dict(type='EpochBasedRunner', max_epochs=20)
diff --git a/configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco.py b/configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..5ac02c1
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade_rcnn_r50_fpn_1x_coco.py'  # base config
+model = dict(  # backbone override: ResNeXt-101, 32 groups x width 4
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco.py b/configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco.py
new file mode 100755
index 0000000..486e45e
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade_rcnn_r50_fpn_20e_coco.py'  # base config
+model = dict(  # backbone override: ResNeXt-101, 32 groups x width 4
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco.py b/configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..78229f0
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco.py
@@ -0,0 +1,15 @@
+_base_ = './cascade_rcnn_r50_fpn_1x_coco.py'  # base config
+model = dict(  # backbone override: ResNeXt-101, 64 groups x width 4
+    type='CascadeRCNN',
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco.py b/configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco.py
new file mode 100755
index 0000000..58812de
--- /dev/null
+++ b/configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco.py
@@ -0,0 +1,15 @@
+_base_ = './cascade_rcnn_r50_fpn_20e_coco.py'  # base config
+model = dict(  # backbone override: ResNeXt-101, 64 groups x width 4
+    type='CascadeRCNN',
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/cascade_rcnn/metafile.yml b/configs/cascade_rcnn/metafile.yml
new file mode 100755
index 0000000..6586325
--- /dev/null
+++ b/configs/cascade_rcnn/metafile.yml
@@ -0,0 +1,545 @@
+Collections:
+  - Name: Cascade R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Cascade R-CNN
+        - FPN
+        - RPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: http://dx.doi.org/10.1109/tpami.2019.2956516
+      Title: 'Cascade R-CNN: Delving into High Quality Object Detection'
+    README: configs/cascade_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/cascade_rcnn.py#L6
+      Version: v2.0.0
+  - Name: Cascade Mask R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Cascade R-CNN
+        - FPN
+        - RPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: http://dx.doi.org/10.1109/tpami.2019.2956516
+      Title: 'Cascade R-CNN: Delving into High Quality Object Detection'
+    README: configs/cascade_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/cascade_rcnn.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: cascade_rcnn_r50_caffe_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.2
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco/cascade_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.404_20200504_174853-b857be87.pth
+
+  - Name: cascade_rcnn_r50_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.4
+      inference time (ms/im):
+        - value: 62.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco/cascade_rcnn_r50_fpn_1x_coco_20200316-3dc56deb.pth
+
+  - Name: cascade_rcnn_r50_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 4.4
+      inference time (ms/im):
+        - value: 62.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco/cascade_rcnn_r50_fpn_20e_coco_bbox_mAP-0.41_20200504_175131-e9872a90.pth
+
+  - Name: cascade_rcnn_r101_caffe_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.2
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco/cascade_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.423_20200504_175649-cab8dbd5.pth
+
+  - Name: cascade_rcnn_r101_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      inference time (ms/im):
+        - value: 74.07
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco/cascade_rcnn_r101_fpn_1x_coco_20200317-0b6a2fbf.pth
+
+  - Name: cascade_rcnn_r101_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      inference time (ms/im):
+        - value: 74.07
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco/cascade_rcnn_r101_fpn_20e_coco_bbox_mAP-0.425_20200504_231812-5057dcc5.pth
+
+  - Name: cascade_rcnn_x101_32x4d_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      inference time (ms/im):
+        - value: 91.74
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco/cascade_rcnn_x101_32x4d_fpn_1x_coco_20200316-95c2deb6.pth
+
+  - Name: cascade_rcnn_x101_32x4d_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco/cascade_rcnn_x101_32x4d_fpn_20e_coco_20200906_134608-9ae0a720.pth
+
+  - Name: cascade_rcnn_x101_64x4d_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.7
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_075702-43ce6a30.pth
+
+  - Name: cascade_rcnn_x101_64x4d_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 10.7
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco/cascade_rcnn_x101_64x4d_fpn_20e_coco_20200509_224357-051557b1.pth
+
+  - Name: cascade_mask_rcnn_r50_caffe_fpn_1x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.9
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco/cascade_mask_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.412__segm_mAP-0.36_20200504_174659-5004b251.pth
+
+  - Name: cascade_mask_rcnn_r50_fpn_1x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 89.29
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  35.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203-9d4dcb24.pth
+
+  - Name: cascade_mask_rcnn_r50_fpn_20e_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 89.29
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_bbox_mAP-0.419__segm_mAP-0.365_20200504_174711-4af8e66e.pth
+
+  - Name: cascade_mask_rcnn_r101_caffe_fpn_1x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco/cascade_mask_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.432__segm_mAP-0.376_20200504_174813-5c1e9599.pth
+
+  - Name: cascade_mask_rcnn_r101_fpn_1x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.9
+      inference time (ms/im):
+        - value: 102.04
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco/cascade_mask_rcnn_r101_fpn_1x_coco_20200203-befdf6ee.pth
+
+  - Name: cascade_mask_rcnn_r101_fpn_20e_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 7.9
+      inference time (ms/im):
+        - value: 102.04
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco/cascade_mask_rcnn_r101_fpn_20e_coco_bbox_mAP-0.434__segm_mAP-0.378_20200504_174836-005947da.pth
+
+  - Name: cascade_mask_rcnn_x101_32x4d_fpn_1x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.2
+      inference time (ms/im):
+        - value: 116.28
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  38.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco_20200201-0f411b1f.pth
+
+  - Name: cascade_mask_rcnn_x101_32x4d_fpn_20e_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 9.2
+      inference time (ms/im):
+        - value: 116.28
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco_20200528_083917-ed1f4751.pth
+
+  - Name: cascade_mask_rcnn_x101_64x4d_fpn_1x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 12.2
+      inference time (ms/im):
+        - value: 149.25
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  39.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco_20200203-9a2db89d.pth
+
+  - Name: cascade_mask_rcnn_x101_64x4d_fpn_20e_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 12.2
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco_20200512_161033-bdb5126a.pth
+
+  - Name: cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.7
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210707_002651-6e29b3a6.pth
+
+  - Name: cascade_mask_rcnn_r50_fpn_mstrain_3x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.9
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco_20210628_164719-5bdc3824.pth
+
+  - Name: cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.7
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210707_002620-a5bd2389.pth
+
+  - Name: cascade_mask_rcnn_r101_fpn_mstrain_3x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco_20210628_165236-51a2d363.pth
+
+  - Name: cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 9.0
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210706_225234-40773067.pth
+
+  - Name: cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 12.1
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210719_180640-9ff7e76f.pth
+
+  - Name: cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 12.0
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210719_210311-d3e64ba0.pth
diff --git a/configs/cascade_rpn/README.md b/configs/cascade_rpn/README.md
new file mode 100755
index 0000000..fb2b482
--- /dev/null
+++ b/configs/cascade_rpn/README.md
@@ -0,0 +1,41 @@
+# Cascade RPN
+
+> [Cascade RPN: Delving into High-Quality Region Proposal Network with Adaptive Convolution](https://arxiv.org/abs/1909.06720)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+This paper considers an architecture referred to as Cascade Region Proposal Network (Cascade RPN) for improving the region-proposal quality and detection performance by systematically addressing the limitation of the conventional RPN that heuristically defines the anchors and aligns the features to the anchors. First, instead of using multiple anchors with predefined scales and aspect ratios, Cascade RPN relies on a single anchor per location and performs multi-stage refinement. Each stage is progressively more stringent in defining positive samples by starting out with an anchor-free metric followed by anchor-based metrics in the ensuing stages. Second, to attain alignment between the features and the anchors throughout the stages, adaptive convolution is proposed that takes the anchors in addition to the image features as its input and learns the sampled features guided by the anchors. A simple implementation of a two-stage Cascade RPN achieves AR 13.4 points higher than that of the conventional RPN, surpassing any existing region proposal methods. When adopting to Fast R-CNN and Faster R-CNN, Cascade RPN can improve the detection mAP by 3.1 and 3.5 points, respectively.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143872368-1580193a-d19c-4723-a579-c7ed2d5da4d1.png"/>
+</div>
+
+## Results and Models
+
+### Region proposal performance
+
+| Method | Backbone | Style | Mem (GB) | Train time (s/iter) | Inf time (fps) | AR 1000 |                                                      Config                                                       |                                                                    Download                                                                    |
+| :----: | :------: | :---: | :------: | :-----------------: | :------------: | :-----: | :---------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------: |
+|  CRPN  | R-50-FPN | caffe |    -     |          -          |       -        |  72.0   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rpn/crpn_r50_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_r50_caffe_fpn_1x_coco/cascade_rpn_r50_caffe_fpn_1x_coco-7aa93cef.pth) |
+
+### Detection performance
+
+|    Method    |  Proposal   | Backbone | Style | Schedule | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP |                                                            Config                                                             |                                                                            Download                                                                             |
+| :----------: | :---------: | :------: | :---: | :------: | :------: | :-----------------: | :------------: | :----: | :---------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  Fast R-CNN  | Cascade RPN | R-50-FPN | caffe |    1x    |    -     |          -          |       -        |  39.9  |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco/crpn_fast_rcnn_r50_caffe_fpn_1x_coco-cb486e66.pth)   |
+| Faster R-CNN | Cascade RPN | R-50-FPN | caffe |    1x    |    -     |          -          |       -        |  40.4  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco/crpn_faster_rcnn_r50_caffe_fpn_1x_coco-c8283cca.pth) |
+
+## Citation
+
+We provide the code for reproducing the experimental results of [Cascade RPN](https://arxiv.org/abs/1909.06720).
+
+```latex
+@inproceedings{vu2019cascade,
+  title={Cascade RPN: Delving into High-Quality Region Proposal Network with Adaptive Convolution},
+  author={Vu, Thang and Jang, Hyunjun and Pham, Trung X and Yoo, Chang D},
+  booktitle={Conference on Neural Information Processing Systems (NeurIPS)},
+  year={2019}
+}
+```
diff --git a/configs/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco.py b/configs/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..29f5d07
--- /dev/null
+++ b/configs/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco.py
@@ -0,0 +1,77 @@
+_base_ = '../fast_rcnn/fast_rcnn_r50_fpn_1x_coco.py'  # parent config this file extends
+model = dict(
+    backbone=dict(  # caffe-style ResNet-50 initialised from a detectron2 checkpoint
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    roi_head=dict(
+        bbox_head=dict(
+            bbox_coder=dict(target_stds=[0.04, 0.04, 0.08, 0.08]),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.5),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rcnn=dict(
+            assigner=dict(  # one IoU threshold (0.65) for pos, neg and min_pos
+                pos_iou_thr=0.65, neg_iou_thr=0.65, min_pos_iou=0.65),
+            sampler=dict(num=256))),
+    test_cfg=dict(rcnn=dict(score_thr=1e-3)))
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(  # mean subtraction only: std is 1.0 and to_rgb is off
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadProposals', num_max_proposals=300),  # NOTE(review): presumably reads the proposal_file set in `data` below - confirm
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadProposals', num_max_proposals=300),  # same proposal cap as in train_pipeline
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='ToTensor', keys=['proposals']),
+            dict(
+                type='ToDataContainer',
+                fields=[dict(key='proposals', stack=False)]),
+            dict(type='Collect', keys=['img', 'proposals']),
+        ])
+]
+data = dict(  # only proposal_file and pipeline are set here for each split
+    train=dict(
+        proposal_file=data_root +
+        'proposals/crpn_r50_caffe_fpn_1x_train2017.pkl',
+        pipeline=train_pipeline),
+    val=dict(
+        proposal_file=data_root +
+        'proposals/crpn_r50_caffe_fpn_1x_val2017.pkl',
+        pipeline=test_pipeline),
+    test=dict(  # test uses the same val2017 proposal file as val
+        proposal_file=data_root +
+        'proposals/crpn_r50_caffe_fpn_1x_val2017.pkl',
+        pipeline=test_pipeline))
+optimizer_config = dict(  # NOTE(review): _delete_=True presumably replaces the inherited dict instead of merging - confirm
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco.py b/configs/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..bad86e6
--- /dev/null
+++ b/configs/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco.py
@@ -0,0 +1,92 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco.py'  # parent config this file extends
+rpn_weight = 0.7  # common factor multiplied into every RPN-stage loss_weight below
+model = dict(
+    rpn_head=dict(
+        _delete_=True,  # NOTE(review): presumably drops the inherited rpn_head instead of merging - confirm
+        type='CascadeRPNHead',
+        num_stages=2,
+        stages=[
+            dict(  # stage 1: regression only (with_cls=False, no loss_cls)
+                type='StageCascadeRPNHead',
+                in_channels=256,
+                feat_channels=256,
+                anchor_generator=dict(  # a single scale and a single ratio
+                    type='AnchorGenerator',
+                    scales=[8],
+                    ratios=[1.0],
+                    strides=[4, 8, 16, 32, 64]),
+                adapt_cfg=dict(type='dilation', dilation=3),
+                bridged_feature=True,
+                sampling=False,
+                with_cls=False,
+                reg_decoded_bbox=True,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=(.0, .0, .0, .0),
+                    target_stds=(0.1, 0.1, 0.5, 0.5)),
+                loss_bbox=dict(
+                    type='IoULoss', linear=True,
+                    loss_weight=10.0 * rpn_weight)),
+            dict(  # stage 2: classification + regression; no anchor_generator of its own
+                type='StageCascadeRPNHead',
+                in_channels=256,
+                feat_channels=256,
+                adapt_cfg=dict(type='offset'),
+                bridged_feature=False,
+                sampling=True,
+                with_cls=True,
+                reg_decoded_bbox=True,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=(.0, .0, .0, .0),
+                    target_stds=(0.05, 0.05, 0.1, 0.1)),
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=True,
+                    loss_weight=1.0 * rpn_weight),
+                loss_bbox=dict(
+                    type='IoULoss', linear=True,
+                    loss_weight=10.0 * rpn_weight))
+        ]),
+    roi_head=dict(
+        bbox_head=dict(
+            bbox_coder=dict(target_stds=[0.04, 0.04, 0.08, 0.08]),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.5),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=[  # two entries; presumably one per RPN stage, in order - confirm
+            dict(
+                assigner=dict(
+                    type='RegionAssigner', center_ratio=0.2, ignore_ratio=0.5),
+                allowed_border=-1,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.7,
+                    neg_iou_thr=0.7,
+                    min_pos_iou=0.3,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=256,
+                    pos_fraction=0.5,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=False),
+                allowed_border=-1,
+                pos_weight=-1,
+                debug=False)
+        ],
+        rpn_proposal=dict(max_per_img=300, nms=dict(iou_threshold=0.8)),  # same values as test_cfg.rpn
+        rcnn=dict(
+            assigner=dict(  # one IoU threshold (0.65) for pos, neg and min_pos
+                pos_iou_thr=0.65, neg_iou_thr=0.65, min_pos_iou=0.65),
+            sampler=dict(type='RandomSampler', num=256))),
+    test_cfg=dict(
+        rpn=dict(max_per_img=300, nms=dict(iou_threshold=0.8)),
+        rcnn=dict(score_thr=1e-3)))
+optimizer_config = dict(  # NOTE(review): _delete_=True presumably replaces the inherited dict instead of merging - confirm
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/cascade_rpn/crpn_r50_caffe_fpn_1x_coco.py b/configs/cascade_rpn/crpn_r50_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..5562e69
--- /dev/null
+++ b/configs/cascade_rpn/crpn_r50_caffe_fpn_1x_coco.py
@@ -0,0 +1,77 @@
+_base_ = '../rpn/rpn_r50_caffe_fpn_1x_coco.py'  # parent config this file extends
+model = dict(
+    rpn_head=dict(
+        _delete_=True,  # NOTE(review): presumably drops the inherited rpn_head instead of merging - confirm
+        type='CascadeRPNHead',
+        num_stages=2,
+        stages=[
+            dict(  # stage 1: regression only (with_cls=False, no loss_cls)
+                type='StageCascadeRPNHead',
+                in_channels=256,
+                feat_channels=256,
+                anchor_generator=dict(  # a single scale and a single ratio
+                    type='AnchorGenerator',
+                    scales=[8],
+                    ratios=[1.0],
+                    strides=[4, 8, 16, 32, 64]),
+                adapt_cfg=dict(type='dilation', dilation=3),
+                bridged_feature=True,
+                sampling=False,
+                with_cls=False,
+                reg_decoded_bbox=True,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=(.0, .0, .0, .0),
+                    target_stds=(0.1, 0.1, 0.5, 0.5)),
+                loss_bbox=dict(type='IoULoss', linear=True, loss_weight=10.0)),
+            dict(  # stage 2: classification + regression; no anchor_generator of its own
+                type='StageCascadeRPNHead',
+                in_channels=256,
+                feat_channels=256,
+                adapt_cfg=dict(type='offset'),
+                bridged_feature=False,
+                sampling=True,
+                with_cls=True,
+                reg_decoded_bbox=True,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=(.0, .0, .0, .0),
+                    target_stds=(0.05, 0.05, 0.1, 0.1)),
+                loss_cls=dict(
+                    type='CrossEntropyLoss', use_sigmoid=True,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='IoULoss', linear=True, loss_weight=10.0))
+        ]),
+    train_cfg=dict(rpn=[  # two entries; presumably one per RPN stage, in order - confirm
+        dict(
+            assigner=dict(
+                type='RegionAssigner', center_ratio=0.2, ignore_ratio=0.5),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.7,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1,
+                iou_calculator=dict(type='BboxOverlaps2D')),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False)
+    ]),
+    test_cfg=dict(
+        rpn=dict(  # nms_pre and max_per_img are both 2000 here
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.8),
+            min_bbox_size=0)))
+optimizer_config = dict(  # NOTE(review): _delete_=True presumably replaces the inherited dict instead of merging - confirm
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/cascade_rpn/metafile.yml b/configs/cascade_rpn/metafile.yml
new file mode 100755
index 0000000..335b2bc
--- /dev/null
+++ b/configs/cascade_rpn/metafile.yml
@@ -0,0 +1,44 @@
+Collections:
+  - Name: Cascade RPN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Cascade RPN
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1909.06720
+      Title: 'Cascade RPN: Delving into High-Quality Region Proposal Network with Adaptive Convolution'
+    README: configs/cascade_rpn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.8.0/mmdet/models/dense_heads/cascade_rpn_head.py#L538
+      Version: v2.8.0
+
+Models:
+  - Name: crpn_fast_rcnn_r50_caffe_fpn_1x_coco
+    In Collection: Cascade RPN
+    Config: configs/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco/crpn_fast_rcnn_r50_caffe_fpn_1x_coco-cb486e66.pth
+
+  - Name: crpn_faster_rcnn_r50_caffe_fpn_1x_coco
+    In Collection: Cascade RPN
+    Config: configs/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco/crpn_faster_rcnn_r50_caffe_fpn_1x_coco-c8283cca.pth
diff --git a/configs/centernet/README.md b/configs/centernet/README.md
new file mode 100755
index 0000000..0f951a0
--- /dev/null
+++ b/configs/centernet/README.md
@@ -0,0 +1,40 @@
+# CenterNet
+
+> [Objects as Points](https://arxiv.org/abs/1904.07850)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Detection identifies objects as axis-aligned boxes in an image. Most successful object detectors enumerate a nearly exhaustive list of potential object locations and classify each. This is wasteful, inefficient, and requires additional post-processing. In this paper, we take a different approach. We model an object as a single point --- the center point of its bounding box. Our detector uses keypoint estimation to find center points and regresses to all other object properties, such as size, 3D location, orientation, and even pose. Our center point based approach, CenterNet, is end-to-end differentiable, simpler, faster, and more accurate than corresponding bounding box based detectors. CenterNet achieves the best speed-accuracy trade-off on the MS COCO dataset, with 28.1% AP at 142 FPS, 37.4% AP at 52 FPS, and 45.1% AP with multi-scale testing at 1.4 FPS. We use the same approach to estimate 3D bounding box in the KITTI benchmark and human pose on the COCO keypoint dataset. Our method performs competitively with sophisticated multi-stage methods and runs in real-time.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143873810-85ffa6e7-915b-46a4-9b8f-709e5d7700bb.png"/>
+</div>
+
+## Results and Models
+
+| Backbone  | DCN | Mem (GB) | Box AP | Flip box AP |                                                         Config                                                          |                                                                                                                                                                 Download                                                                                                                                                                 |
+| :-------: | :-: | :------: | :----: | :---------: | :---------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| ResNet-18 |  N  |   3.45   |  25.9  |    27.3     |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/centernet/centernet_resnet18_140e_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_140e_coco/centernet_resnet18_140e_coco_20210705_093630-bb5b3bf7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_140e_coco/centernet_resnet18_140e_coco_20210705_093630.log.json)             |
+| ResNet-18 |  Y  |   3.47   |  29.5  |    30.9     | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/centernet/centernet_resnet18_dcnv2_140e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_dcnv2_140e_coco/centernet_resnet18_dcnv2_140e_coco_20210702_155131-c8cd631f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_dcnv2_140e_coco/centernet_resnet18_dcnv2_140e_coco_20210702_155131.log.json) |
+
+Note:
+
+- Flip box AP setting is single-scale and `flip=True`.
+- Due to the complex data augmentation, the performance is unstable and may fluctuate by about 0.4 mAP; an mAP of 29.4 ~ 29.8 is acceptable for ResNet-18-DCNv2.
+- Compared to the original source code, we follow [CenterNet-Better](https://github.com/FateScript/CenterNet-better) and make the following changes:
+  - Fix the wrong image mean and standard deviation in image normalization to be compatible with the pre-trained backbone.
+  - Use SGD rather than ADAM optimizer and add warmup and grad clip.
+  - Use DistributedDataParallel as other models in MMDetection rather than using DataParallel.
+
+## Citation
+
+```latex
+@article{zhou2019objects,
+  title={Objects as Points},
+  author={Zhou, Xingyi and Wang, Dequan and Kr{\"a}henb{\"u}hl, Philipp},
+  journal={arXiv preprint arXiv:1904.07850},
+  year={2019}
+}
+```
diff --git a/configs/centernet/centernet_resnet18_140e_coco.py b/configs/centernet/centernet_resnet18_140e_coco.py
new file mode 100755
index 0000000..52c86a5
--- /dev/null
+++ b/configs/centernet/centernet_resnet18_140e_coco.py
@@ -0,0 +1,3 @@
+_base_ = './centernet_resnet18_dcnv2_140e_coco.py'  # parent config this file extends
+
+model = dict(neck=dict(use_dcn=False))  # only change: the parent config sets use_dcn=True in the neck
diff --git a/configs/centernet/centernet_resnet18_dcnv2_140e_coco.py b/configs/centernet/centernet_resnet18_dcnv2_140e_coco.py
new file mode 100755
index 0000000..b8a0bb1
--- /dev/null
+++ b/configs/centernet/centernet_resnet18_dcnv2_140e_coco.py
@@ -0,0 +1,127 @@
+_base_ = [  # parent configs: dataset, schedule and runtime defaults
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    type='CenterNet',
+    backbone=dict(
+        type='ResNet',
+        depth=18,
+        norm_eval=False,
+        norm_cfg=dict(type='BN'),
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(
+        type='CTResNetNeck',
+        in_channel=512,
+        num_deconv_filters=(256, 128, 64),
+        num_deconv_kernels=(4, 4, 4),
+        use_dcn=True),  # centernet_resnet18_140e_coco.py overrides this with False
+    bbox_head=dict(
+        type='CenterNetHead',
+        num_classes=80,
+        in_channel=64,  # equals the last entry of num_deconv_filters in the neck
+        feat_channel=64,
+        loss_center_heatmap=dict(type='GaussianFocalLoss', loss_weight=1.0),
+        loss_wh=dict(type='L1Loss', loss_weight=0.1),
+        loss_offset=dict(type='L1Loss', loss_weight=1.0)),
+    train_cfg=None,
+    test_cfg=dict(topk=100, local_maximum_kernel=3, max_per_img=100))
+
+# The incorrect img_norm_cfg of the source code is fixed here.
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True, color_type='color'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(
+        type='RandomCenterCropPad',
+        crop_size=(512, 512),
+        ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3),
+        mean=[0, 0, 0],
+        std=[1, 1, 1],
+        to_rgb=True,
+        test_pad_mode=None),
+    dict(type='Resize', img_scale=(512, 512), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),  # normalization runs after crop/pad, resize and flip
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(
+        type='MultiScaleFlipAug',
+        scale_factor=1.0,
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(
+                type='RandomCenterCropPad',  # used in test mode here: no ratios, no border
+                ratios=None,
+                border=None,
+                mean=[0, 0, 0],
+                std=[1, 1, 1],
+                to_rgb=True,
+                test_mode=True,
+                test_pad_mode=['logical_or', 31],
+                test_pad_add_pix=1),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='Collect',
+                meta_keys=('filename', 'ori_filename', 'ori_shape',
+                           'img_shape', 'pad_shape', 'scale_factor', 'flip',
+                           'flip_direction', 'img_norm_cfg', 'border'),
+                keys=['img'])
+        ])
+]
+
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+# Use RepeatDataset to speed up training
+data = dict(
+    samples_per_gpu=16,
+    workers_per_gpu=4,
+    train=dict(
+        _delete_=True,  # NOTE(review): presumably drops the inherited train dict instead of merging - confirm
+        type='RepeatDataset',
+        times=5,  # the factor behind the "*5" notes on lr_config and runner below
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root + 'annotations/instances_train2017.json',
+            img_prefix=data_root + 'train2017/',
+            pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+
+# optimizer
+# SGD with the default settings of modern detectors works better than the
+# Adam optimizer of the source code, so the default SGD settings are kept.
+# With Adam and lr=5e-4, the mAP is 29.1.
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
+
+# learning policy
+# Based on the default settings of modern detectors, we added warmup settings.
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=1.0 / 1000,
+    step=[18, 24])  # the real step is [18*5, 24*5]
+runner = dict(max_epochs=28)  # the real epoch is 28*5=140
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (16 samples per GPU)
+auto_scale_lr = dict(base_batch_size=128)
diff --git a/configs/centernet/metafile.yml b/configs/centernet/metafile.yml
new file mode 100755
index 0000000..e86e57b
--- /dev/null
+++ b/configs/centernet/metafile.yml
@@ -0,0 +1,46 @@
+Collections:
+  - Name: CenterNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x TITANXP GPUs
+      Architecture:
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1904.07850
+      Title: 'Objects as Points'
+    README: configs/centernet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.13.0/mmdet/models/detectors/centernet.py#L10
+      Version: v2.13.0
+
+Models:
+  - Name: centernet_resnet18_dcnv2_140e_coco
+    In Collection: CenterNet
+    Config: configs/centernet/centernet_resnet18_dcnv2_140e_coco.py
+    Metadata:
+      Batch Size: 128
+      Training Memory (GB): 3.47
+      Epochs: 140
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 29.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_dcnv2_140e_coco/centernet_resnet18_dcnv2_140e_coco_20210702_155131-c8cd631f.pth
+
+  - Name: centernet_resnet18_140e_coco
+    In Collection: CenterNet
+    Config: configs/centernet/centernet_resnet18_140e_coco.py
+    Metadata:
+      Batch Size: 128
+      Training Memory (GB): 3.45
+      Epochs: 140
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 25.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_140e_coco/centernet_resnet18_140e_coco_20210705_093630-bb5b3bf7.pth
diff --git a/configs/centripetalnet/README.md b/configs/centripetalnet/README.md
new file mode 100755
index 0000000..b01b00a
--- /dev/null
+++ b/configs/centripetalnet/README.md
@@ -0,0 +1,36 @@
+# CentripetalNet
+
+> [CentripetalNet: Pursuing High-quality Keypoint Pairs for Object Detection](https://arxiv.org/abs/2003.09119)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Keypoint-based detectors have achieved pretty-well performance. However, incorrect keypoint matching is still widespread and greatly affects the performance of the detector. In this paper, we propose CentripetalNet which uses centripetal shift to pair corner keypoints from the same instance. CentripetalNet predicts the position and the centripetal shift of the corner points and matches corners whose shifted results are aligned. Combining position information, our approach matches corner points more accurately than the conventional embedding approaches do. Corner pooling extracts information inside the bounding boxes onto the border. To make this information more aware at the corners, we design a cross-star deformable convolution network to conduct feature adaption. Furthermore, we explore instance segmentation on anchor-free detectors by equipping our CentripetalNet with a mask prediction module. On MS-COCO test-dev, our CentripetalNet not only outperforms all existing anchor-free detectors with an AP of 48.0% but also achieves comparable performance to the state-of-the-art instance segmentation approaches with a 40.2% MaskAP.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143873955-42804e0e-3638-4c5b-8bf4-ac8133bbcdc8.png"/>
+</div>
+
+## Results and Models
+
+|     Backbone     |                            Batch Size                            | Step/Total Epochs | Mem (GB) | Inf time (fps) | box AP |                                                                   Config                                                                    |                                                                                                                                                                                                    Download                                                                                                                                                                                                    |
+| :--------------: | :--------------------------------------------------------------: | :---------------: | :------: | :------------: | :----: | :-----------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| HourglassNet-104 | [16 x 6](./centripetalnet_hourglass104_mstest_16x6_210e_coco.py) |      190/210      |   16.7   |      3.7       |  44.8  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco/centripetalnet_hourglass104_mstest_16x6_210e_coco_20200915_204804-3ccc61e5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco/centripetalnet_hourglass104_mstest_16x6_210e_coco_20200915_204804.log.json) |
+
+Note:
+
+- TTA setting is single-scale and `flip=True`.
+- The model we released is the best checkpoint rather than the latest checkpoint (box AP 44.8 vs 44.6 in our experiment).
+
+## Citation
+
+```latex
+@InProceedings{Dong_2020_CVPR,
+author = {Dong, Zhiwei and Li, Guoxuan and Liao, Yue and Wang, Fei and Ren, Pengju and Qian, Chen},
+title = {CentripetalNet: Pursuing High-Quality Keypoint Pairs for Object Detection},
+booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+month = {June},
+year = {2020}
+}
+```
diff --git a/configs/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco.py b/configs/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco.py
new file mode 100755
index 0000000..5281c5b
--- /dev/null
+++ b/configs/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco.py
@@ -0,0 +1,110 @@
+_base_ = [
+    '../_base_/default_runtime.py', '../_base_/datasets/coco_detection.py'
+]
+
+# model settings
+model = dict(
+    type='CornerNet',
+    backbone=dict(
+        type='HourglassNet',
+        downsample_times=5,
+        num_stacks=2,
+        stage_channels=[256, 256, 384, 384, 384, 512],
+        stage_blocks=[2, 2, 2, 2, 2, 4],
+        norm_cfg=dict(type='BN', requires_grad=True)),
+    neck=None,
+    bbox_head=dict(
+        type='CentripetalHead',
+        num_classes=80,
+        in_channels=256,
+        num_feat_levels=2,
+        corner_emb_channels=0,
+        loss_heatmap=dict(
+            type='GaussianFocalLoss', alpha=2.0, gamma=4.0, loss_weight=1),
+        loss_offset=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1),
+        loss_guiding_shift=dict(
+            type='SmoothL1Loss', beta=1.0, loss_weight=0.05),
+        loss_centripetal_shift=dict(
+            type='SmoothL1Loss', beta=1.0, loss_weight=1)),
+    # training and testing settings
+    train_cfg=None,
+    test_cfg=dict(
+        corner_topk=100,
+        local_maximum_kernel=3,
+        distance_threshold=0.5,
+        score_thr=0.05,
+        max_per_img=100,
+        nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian')))
+# data settings
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(
+        type='RandomCenterCropPad',
+        crop_size=(511, 511),
+        ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3),
+        test_mode=False,
+        test_pad_mode=None,
+        **img_norm_cfg),
+    dict(type='Resize', img_scale=(511, 511), keep_ratio=False),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(
+        type='MultiScaleFlipAug',
+        scale_factor=1.0,
+        flip=True,
+        transforms=[
+            dict(type='Resize'),
+            dict(
+                type='RandomCenterCropPad',
+                crop_size=None,
+                ratios=None,
+                border=None,
+                test_mode=True,
+                test_pad_mode=['logical_or', 127],
+                **img_norm_cfg),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(
+                type='Collect',
+                keys=['img'],
+                meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape',
+                           'scale_factor', 'flip', 'img_norm_cfg', 'border')),
+        ])
+]
+data = dict(
+    samples_per_gpu=6,
+    workers_per_gpu=3,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(type='Adam', lr=0.0005)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[190])
+runner = dict(type='EpochBasedRunner', max_epochs=210)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (16 GPUs) x (6 samples per GPU)
+auto_scale_lr = dict(base_batch_size=96)
diff --git a/configs/centripetalnet/metafile.yml b/configs/centripetalnet/metafile.yml
new file mode 100755
index 0000000..61aed3e
--- /dev/null
+++ b/configs/centripetalnet/metafile.yml
@@ -0,0 +1,39 @@
+Collections:
+  - Name: CentripetalNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - Adam
+      Training Resources: 16x V100 GPUs
+      Architecture:
+        - Corner Pooling
+        - Stacked Hourglass Network
+    Paper:
+      URL: https://arxiv.org/abs/2003.09119
+      Title: 'CentripetalNet: Pursuing High-quality Keypoint Pairs for Object Detection'
+    README: configs/centripetalnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.5.0/mmdet/models/detectors/cornernet.py#L9
+      Version: v2.5.0
+
+Models:
+  - Name: centripetalnet_hourglass104_mstest_16x6_210e_coco
+    In Collection: CentripetalNet
+    Config: configs/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco.py
+    Metadata:
+      Batch Size: 96
+      Training Memory (GB): 16.7
+      inference time (ms/im):
+        - value: 270.27
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 210
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco/centripetalnet_hourglass104_mstest_16x6_210e_coco_20200915_204804-3ccc61e5.pth
diff --git a/configs/cityscapes/README.md b/configs/cityscapes/README.md
new file mode 100755
index 0000000..c52a79f
--- /dev/null
+++ b/configs/cityscapes/README.md
@@ -0,0 +1,46 @@
+# Cityscapes
+
+> [The Cityscapes Dataset for Semantic Urban Scene Understanding](https://arxiv.org/abs/1604.01685)
+
+<!-- [DATASET] -->
+
+## Abstract
+
+Visual understanding of complex urban street scenes is an enabling factor for a wide range of applications. Object detection has benefited enormously from large-scale datasets, especially in the context of deep learning. For semantic urban scene understanding, however, no current dataset adequately captures the complexity of real-world urban scenes.
+To address this, we introduce Cityscapes, a benchmark suite and large-scale dataset to train and test approaches for pixel-level and instance-level semantic labeling. Cityscapes is comprised of a large, diverse set of stereo video sequences recorded in streets from 50 different cities. 5000 of these images have high quality pixel-level annotations; 20000 additional images have coarse annotations to enable methods that leverage large volumes of weakly-labeled data. Crucially, our effort exceeds previous attempts in terms of dataset size, annotation richness, scene variability, and complexity. Our accompanying empirical study provides an in-depth analysis of the dataset characteristics, as well as a performance evaluation of several state-of-the-art approaches based on our benchmark.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143874154-db4484a5-9211-41f6-852a-b7f0a8c9ec26.png"/>
+</div>
+
+## Common settings
+
+- All baselines were trained using 8 GPUs with a batch size of 8 (1 image per GPU), using the [linear scaling rule](https://arxiv.org/abs/1706.02677) to scale the learning rate.
+- All models were trained on `cityscapes_train`, and tested on `cityscapes_val`.
+- 1x training schedule indicates 64 epochs, which corresponds to slightly less than the 24k iterations reported in the original schedule from the [Mask R-CNN paper](https://arxiv.org/abs/1703.06870).
+- COCO pre-trained weights are used to initialize.
+- A conversion [script](../../tools/dataset_converters/cityscapes.py) is provided to convert Cityscapes into COCO format. Please refer to the [dataset preparation guide](../../docs/1_exist_data_model.md#prepare-datasets) for details.
+- `CityscapesDataset` implements three evaluation methods. `bbox` and `segm` are the standard COCO bbox/mask AP. `cityscapes` is the official Cityscapes evaluation, whose results may be slightly higher than the COCO-style ones.
+
+### Faster R-CNN
+
+| Backbone |  Style  | Lr schd |  Scale   | Mem (GB) | Inf time (fps) | box AP |                                                         Config                                                          |                                                                                                                          Download                                                                                                                           |
+| :------: | :-----: | :-----: | :------: | :------: | :------------: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R-50-FPN | pytorch |   1x    | 800-1024 |   5.2    |       -        |  40.3  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cityscapes/faster_rcnn_r50_fpn_1x_cityscapes.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cityscapes/faster_rcnn_r50_fpn_1x_cityscapes_20200502-829424c0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cityscapes/faster_rcnn_r50_fpn_1x_cityscapes_20200502_114915.log.json) |
+
+### Mask R-CNN
+
+| Backbone |  Style  | Lr schd |  Scale   | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                        Config                                                         |                                                                                                                                                            Download                                                                                                                                                            |
+| :------: | :-----: | :-----: | :------: | :------: | :------------: | :----: | :-----: | :-------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R-50-FPN | pytorch |   1x    | 800-1024 |   5.3    |       -        |  40.9  |  36.4   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes/mask_rcnn_r50_fpn_1x_cityscapes_20201211_133733-d2858245.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes/mask_rcnn_r50_fpn_1x_cityscapes_20201211_133733.log.json) |
+
+## Citation
+
+```latex
+@inproceedings{Cordts2016Cityscapes,
+   title={The Cityscapes Dataset for Semantic Urban Scene Understanding},
+   author={Cordts, Marius and Omran, Mohamed and Ramos, Sebastian and Rehfeld, Timo and Enzweiler, Markus and Benenson, Rodrigo and Franke, Uwe and Roth, Stefan and Schiele, Bernt},
+   booktitle={Proc. of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+   year={2016}
+}
+```
diff --git a/configs/cityscapes/faster_rcnn_r50_fpn_1x_cityscapes.py b/configs/cityscapes/faster_rcnn_r50_fpn_1x_cityscapes.py
new file mode 100755
index 0000000..ca636bd
--- /dev/null
+++ b/configs/cityscapes/faster_rcnn_r50_fpn_1x_cityscapes.py
@@ -0,0 +1,44 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/cityscapes_detection.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(
+    backbone=dict(init_cfg=None),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=8,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))))
+# optimizer
+# lr is set for a batch size of 8
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    # [7] yields higher performance than [6]
+    step=[7])
+runner = dict(
+    type='EpochBasedRunner', max_epochs=8)  # actual epoch = 8 * 8 = 64
+log_config = dict(interval=100)
+# For better, more stable performance initialize from COCO
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'  # noqa
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (1 sample per GPU)
+auto_scale_lr = dict(base_batch_size=8)
diff --git a/configs/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes.py b/configs/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes.py
new file mode 100755
index 0000000..83ea058
--- /dev/null
+++ b/configs/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes.py
@@ -0,0 +1,51 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/cityscapes_instance.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    backbone=dict(init_cfg=None),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=8,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=8,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))))
+# optimizer
+# lr is set for a batch size of 8
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    # [7] yields higher performance than [6]
+    step=[7])
+runner = dict(
+    type='EpochBasedRunner', max_epochs=8)  # actual epoch = 8 * 8 = 64
+log_config = dict(interval=100)
+# For better, more stable performance initialize from COCO
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth'  # noqa
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (1 sample per GPU)
+auto_scale_lr = dict(base_batch_size=8)
diff --git a/configs/common/lsj_100e_coco_instance.py b/configs/common/lsj_100e_coco_instance.py
new file mode 100755
index 0000000..cacf23d
--- /dev/null
+++ b/configs/common/lsj_100e_coco_instance.py
@@ -0,0 +1,90 @@
+_base_ = '../_base_/default_runtime.py'
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+image_size = (1024, 1024)
+
+file_client_args = dict(backend='disk')
+# uncomment the code below to use a different file client
+# file_client_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=image_size,
+        ratio_range=(0.1, 2.0),
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_type='absolute_range',
+        crop_size=image_size,
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=image_size),  # padding to image_size leads 0.5+ mAP
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+# Use RepeatDataset to speed up training
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type='RepeatDataset',
+        times=4,  # simply change this from 2 to 16 for 50e - 400e training.
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root + 'annotations/instances_train2017.json',
+            img_prefix=data_root + 'train2017/',
+            pipeline=train_pipeline)),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=5, metric=['bbox', 'segm'])
+
+# optimizer assumes bs=64
+optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00004)
+optimizer_config = dict(grad_clip=None)
+
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.067,
+    step=[22, 24])
+runner = dict(type='EpochBasedRunner', max_epochs=25)
diff --git a/configs/common/mstrain-poly_3x_coco_instance.py b/configs/common/mstrain-poly_3x_coco_instance.py
new file mode 100755
index 0000000..c22ed94
--- /dev/null
+++ b/configs/common/mstrain-poly_3x_coco_instance.py
@@ -0,0 +1,80 @@
+_base_ = '../_base_/default_runtime.py'
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)],
+# multiscale_mode='range'
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+# Use RepeatDataset to speed up training
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root + 'annotations/instances_train2017.json',
+            img_prefix=data_root + 'train2017/',
+            pipeline=train_pipeline)),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric=['bbox', 'segm'])
+
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+
+# learning policy
+# Experiments show that using step=[9, 11] has higher performance
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[9, 11])
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/configs/common/mstrain_3x_coco.py b/configs/common/mstrain_3x_coco.py
new file mode 100755
index 0000000..80ec8b8
--- /dev/null
+++ b/configs/common/mstrain_3x_coco.py
@@ -0,0 +1,76 @@
+_base_ = '../_base_/default_runtime.py'
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)],
+# multiscale_mode='range'
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+# Use RepeatDataset to speed up training
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root + 'annotations/instances_train2017.json',
+            img_prefix=data_root + 'train2017/',
+            pipeline=train_pipeline)),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='bbox')
+
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+
+# learning policy
+# Experiments show that using step=[9, 11] has higher performance
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[9, 11])
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/configs/common/mstrain_3x_coco_instance.py b/configs/common/mstrain_3x_coco_instance.py
new file mode 100755
index 0000000..50f39be
--- /dev/null
+++ b/configs/common/mstrain_3x_coco_instance.py
@@ -0,0 +1,76 @@
+_base_ = '../_base_/default_runtime.py'
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)],
+# multiscale_mode='range'
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+# Use RepeatDataset to speed up training
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root + 'annotations/instances_train2017.json',
+            img_prefix=data_root + 'train2017/',
+            pipeline=train_pipeline)),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric=['bbox', 'segm'])
+
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+
+# learning policy
+# Experiments show that using step=[9, 11] has higher performance
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[9, 11])
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/configs/common/ssj_270k_coco_instance.py b/configs/common/ssj_270k_coco_instance.py
new file mode 100755
index 0000000..851098f
--- /dev/null
+++ b/configs/common/ssj_270k_coco_instance.py
@@ -0,0 +1,91 @@
+_base_ = '../_base_/default_runtime.py'
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+image_size = (1024, 1024)
+
+file_client_args = dict(backend='disk')
+
+# Standard Scale Jittering (SSJ) resizes and crops an image
+# with a resize range of 0.8 to 1.25 of the original image size.
+train_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=image_size,
+        ratio_range=(0.8, 1.25),
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_type='absolute_range',
+        crop_size=image_size,
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=image_size),  # padding to image_size leads 0.5+ mAP
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+
+evaluation = dict(interval=6000, metric=['bbox', 'segm'])
+
+# optimizer assumes batch_size = (32 GPUs) x (2 samples per GPU)
+optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00004)
+optimizer_config = dict(grad_clip=None)
+
+# lr steps at [0.9, 0.95, 0.975] of the maximum iterations
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[243000, 256500, 263250])
+checkpoint_config = dict(interval=6000)
+# The model is trained for 270k iterations with batch_size 64,
+# which is roughly equivalent to 144 epochs.
+runner = dict(type='IterBasedRunner', max_iters=270000)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/common/ssj_scp_270k_coco_instance.py b/configs/common/ssj_scp_270k_coco_instance.py
new file mode 100755
index 0000000..540839f
--- /dev/null
+++ b/configs/common/ssj_scp_270k_coco_instance.py
@@ -0,0 +1,97 @@
+_base_ = '../_base_/default_runtime.py'
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+image_size = (1024, 1024)
+
+file_client_args = dict(backend='disk')
+
+# Standard Scale Jittering (SSJ) resizes and crops an image
+# with a resize range of 0.8 to 1.25 of the original image size.
+load_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=image_size,
+        ratio_range=(0.8, 1.25),
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_type='absolute_range',
+        crop_size=image_size,
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Pad', size=image_size),
+]
+train_pipeline = [
+    dict(type='CopyPaste', max_num_pasted=100),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type='MultiImageMixDataset',
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root + 'annotations/instances_train2017.json',
+            img_prefix=data_root + 'train2017/',
+            pipeline=load_pipeline),
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+
+evaluation = dict(interval=6000, metric=['bbox', 'segm'])
+
+# optimizer assumes batch_size = (32 GPUs) x (2 samples per GPU)
+optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00004)
+optimizer_config = dict(grad_clip=None)
+
+# lr steps at [0.9, 0.95, 0.975] of the maximum iterations
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[243000, 256500, 263250])
+checkpoint_config = dict(interval=6000)
+# The model is trained for 270k iterations with batch_size 64,
+# which is roughly equivalent to 144 epochs.
+runner = dict(type='IterBasedRunner', max_iters=270000)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/convnext/README.md b/configs/convnext/README.md
new file mode 100755
index 0000000..edf72e8
--- /dev/null
+++ b/configs/convnext/README.md
@@ -0,0 +1,40 @@
+# ConvNeXt
+
+> [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545)
+
+## Abstract
+
+The "Roaring 20s" of visual recognition began with the introduction of Vision Transformers (ViTs), which quickly superseded ConvNets as the state-of-the-art image classification model. A vanilla ViT, on the other hand, faces difficulties when applied to general computer vision tasks such as object detection and semantic segmentation. It is the hierarchical Transformers (e.g., Swin Transformers) that reintroduced several ConvNet priors, making Transformers practically viable as a generic vision backbone and demonstrating remarkable performance on a wide variety of vision tasks. However, the effectiveness of such hybrid approaches is still largely credited to the intrinsic superiority of Transformers, rather than the inherent inductive biases of convolutions. In this work, we reexamine the design spaces and test the limits of what a pure ConvNet can achieve. We gradually "modernize" a standard ResNet toward the design of a vision Transformer, and discover several key components that contribute to the performance difference along the way. The outcome of this exploration is a family of pure ConvNet models dubbed ConvNeXt. Constructed entirely from standard ConvNet modules, ConvNeXts compete favorably with Transformers in terms of accuracy and scalability, achieving 87.8% ImageNet top-1 accuracy and outperforming Swin Transformers on COCO detection and ADE20K segmentation, while maintaining the simplicity and efficiency of standard ConvNets.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/8370623/148624004-e9581042-ea4d-4e10-b3bd-42c92b02053b.png" width="90%"/>
+</div>
+
+## Results and models
+
+|       Method       |  Backbone  |  Pretrain   | Lr schd | Multi-scale crop | FP16 | Mem (GB) | box AP | mask AP |                                         Config                                          |                                                                                                                                                                                                                                            Download                                                                                                                                                                                                                                             |
+| :----------------: | :--------: | :---------: | :-----: | :--------------: | :--: | :------: | :----: | :-----: | :-------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|     Mask R-CNN     | ConvNeXt-T | ImageNet-1K |   3x    |       yes        | yes  |   7.3    |  46.2  |  41.7   |           [config](./mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco.py)            |                                           [model](https://download.openmmlab.com/mmdetection/v2.0/convnext/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco_20220426_154953-050731f4.pth)  \| [log](https://download.openmmlab.com/mmdetection/v2.0/convnext/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco_20220426_154953.log.json)                                           |
+| Cascade Mask R-CNN | ConvNeXt-T | ImageNet-1K |   3x    |       yes        | yes  |   9.0    |  50.3  |  43.6   | [config](./cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/convnext/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220509_204200-8f07c40b.pth)  \| [log](https://download.openmmlab.com/mmdetection/v2.0/convnext/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220509_204200.log.json) |
+| Cascade Mask R-CNN | ConvNeXt-S | ImageNet-1K |   3x    |       yes        | yes  |   12.3   |  51.8  |  44.8   | [config](./cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/convnext/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220510_201004-3d24f5a4.pth)  \| [log](https://download.openmmlab.com/mmdetection/v2.0/convnext/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220510_201004.log.json) |
+
+**Note**:
+
+- The ConvNeXt backbone requires [MMClassification](https://github.com/open-mmlab/mmclassification), which provides abundant backbones for downstream tasks, to be installed first.
+
+```shell
+pip install "mmcls>=0.22.0"
+```
+
+- The performance is unstable. `Cascade Mask R-CNN` may fluctuate by about 0.2 mAP.
+
+## Citation
+
+```bibtex
+@article{liu2022convnet,
+  title={A ConvNet for the 2020s},
+  author={Liu, Zhuang and Mao, Hanzi and Wu, Chao-Yuan and Feichtenhofer, Christoph and Darrell, Trevor and Xie, Saining},
+  journal={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year={2022}
+}
+```
diff --git a/configs/convnext/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py b/configs/convnext/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py
new file mode 100755
index 0000000..0ccc31d
--- /dev/null
+++ b/configs/convnext/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py
@@ -0,0 +1,32 @@
+_base_ = './cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py'  # noqa
+
+# please install mmcls>=0.22.0
+# import mmcls.models to trigger register_module in mmcls
+custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False)
+checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-small_3rdparty_32xb128-noema_in1k_20220301-303e75e3.pth'  # noqa
+
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='mmcls.ConvNeXt',
+        arch='small',
+        out_indices=[0, 1, 2, 3],
+        drop_path_rate=0.6,
+        layer_scale_init_value=1.0,
+        gap_before_final_norm=False,
+        init_cfg=dict(
+            type='Pretrained', checkpoint=checkpoint_file,
+            prefix='backbone.')))
+
+optimizer = dict(
+    _delete_=True,
+    constructor='LearningRateDecayOptimizerConstructor',
+    type='AdamW',
+    lr=0.0002,
+    betas=(0.9, 0.999),
+    weight_decay=0.05,
+    paramwise_cfg={
+        'decay_rate': 0.7,
+        'decay_type': 'layer_wise',
+        'num_layers': 12
+    })
diff --git a/configs/convnext/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py b/configs/convnext/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py
new file mode 100755
index 0000000..93304c0
--- /dev/null
+++ b/configs/convnext/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py
@@ -0,0 +1,149 @@
+_base_ = [
+    '../_base_/models/cascade_mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# please install mmcls>=0.22.0
+# import mmcls.models to trigger register_module in mmcls
+custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False)
+checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-tiny_3rdparty_32xb128-noema_in1k_20220301-795e9634.pth'  # noqa
+
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='mmcls.ConvNeXt',
+        arch='tiny',
+        out_indices=[0, 1, 2, 3],
+        drop_path_rate=0.4,
+        layer_scale_init_value=1.0,
+        gap_before_final_norm=False,
+        init_cfg=dict(
+            type='Pretrained', checkpoint=checkpoint_file,
+            prefix='backbone.')),
+    neck=dict(in_channels=[96, 192, 384, 768]),
+    roi_head=dict(bbox_head=[
+        dict(
+            type='ConvFCBBoxHead',
+            num_shared_convs=4,
+            num_shared_fcs=1,
+            in_channels=256,
+            conv_out_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            reg_decoded_bbox=True,
+            norm_cfg=dict(type='SyncBN', requires_grad=True),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='GIoULoss', loss_weight=10.0)),
+        dict(
+            type='ConvFCBBoxHead',
+            num_shared_convs=4,
+            num_shared_fcs=1,
+            in_channels=256,
+            conv_out_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.05, 0.05, 0.1, 0.1]),
+            reg_class_agnostic=False,
+            reg_decoded_bbox=True,
+            norm_cfg=dict(type='SyncBN', requires_grad=True),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='GIoULoss', loss_weight=10.0)),
+        dict(
+            type='ConvFCBBoxHead',
+            num_shared_convs=4,
+            num_shared_fcs=1,
+            in_channels=256,
+            conv_out_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.033, 0.033, 0.067, 0.067]),
+            reg_class_agnostic=False,
+            reg_decoded_bbox=True,
+            norm_cfg=dict(type='SyncBN', requires_grad=True),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='GIoULoss', loss_weight=10.0))
+    ]))
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# augmentation strategy originates from DETR / Sparse RCNN
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='AutoAugment',
+        policies=[[
+            dict(
+                type='Resize',
+                img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                           (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                           (736, 1333), (768, 1333), (800, 1333)],
+                multiscale_mode='value',
+                keep_ratio=True)
+        ],
+                  [
+                      dict(
+                          type='Resize',
+                          img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+                          multiscale_mode='value',
+                          keep_ratio=True),
+                      dict(
+                          type='RandomCrop',
+                          crop_type='absolute_range',
+                          crop_size=(384, 600),
+                          allow_negative_crop=True),
+                      dict(
+                          type='Resize',
+                          img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                     (576, 1333), (608, 1333), (640, 1333),
+                                     (672, 1333), (704, 1333), (736, 1333),
+                                     (768, 1333), (800, 1333)],
+                          multiscale_mode='value',
+                          override=True,
+                          keep_ratio=True)
+                  ]]),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+data = dict(train=dict(pipeline=train_pipeline), persistent_workers=True)
+
+optimizer = dict(
+    _delete_=True,
+    constructor='LearningRateDecayOptimizerConstructor',
+    type='AdamW',
+    lr=0.0002,
+    betas=(0.9, 0.999),
+    weight_decay=0.05,
+    paramwise_cfg={
+        'decay_rate': 0.7,
+        'decay_type': 'layer_wise',
+        'num_layers': 6
+    })
+
+lr_config = dict(warmup_iters=1000, step=[27, 33])
+runner = dict(max_epochs=36)
+
+# you need to set mode='dynamic' if you are using pytorch<=1.5.0
+fp16 = dict(loss_scale=dict(init_scale=512))
diff --git a/configs/convnext/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco.py b/configs/convnext/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco.py
new file mode 100755
index 0000000..e8a283f
--- /dev/null
+++ b/configs/convnext/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco.py
@@ -0,0 +1,90 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# please install mmcls>=0.22.0
+# import mmcls.models to trigger register_module in mmcls
+custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False)
+checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-tiny_3rdparty_32xb128-noema_in1k_20220301-795e9634.pth'  # noqa
+
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='mmcls.ConvNeXt',
+        arch='tiny',
+        out_indices=[0, 1, 2, 3],
+        drop_path_rate=0.4,
+        layer_scale_init_value=1.0,
+        gap_before_final_norm=False,
+        init_cfg=dict(
+            type='Pretrained', checkpoint=checkpoint_file,
+            prefix='backbone.')),
+    neck=dict(in_channels=[96, 192, 384, 768]))
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# augmentation strategy originates from DETR / Sparse RCNN
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='AutoAugment',
+        policies=[[
+            dict(
+                type='Resize',
+                img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                           (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                           (736, 1333), (768, 1333), (800, 1333)],
+                multiscale_mode='value',
+                keep_ratio=True)
+        ],
+                  [
+                      dict(
+                          type='Resize',
+                          img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+                          multiscale_mode='value',
+                          keep_ratio=True),
+                      dict(
+                          type='RandomCrop',
+                          crop_type='absolute_range',
+                          crop_size=(384, 600),
+                          allow_negative_crop=True),
+                      dict(
+                          type='Resize',
+                          img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                     (576, 1333), (608, 1333), (640, 1333),
+                                     (672, 1333), (704, 1333), (736, 1333),
+                                     (768, 1333), (800, 1333)],
+                          multiscale_mode='value',
+                          override=True,
+                          keep_ratio=True)
+                  ]]),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+data = dict(train=dict(pipeline=train_pipeline), persistent_workers=True)
+
+optimizer = dict(
+    _delete_=True,
+    constructor='LearningRateDecayOptimizerConstructor',
+    type='AdamW',
+    lr=0.0001,
+    betas=(0.9, 0.999),
+    weight_decay=0.05,
+    paramwise_cfg={
+        'decay_rate': 0.95,
+        'decay_type': 'layer_wise',
+        'num_layers': 6
+    })
+
+lr_config = dict(warmup_iters=1000, step=[27, 33])
+runner = dict(max_epochs=36)
+
+# you need to set mode='dynamic' if you are using pytorch<=1.5.0
+fp16 = dict(loss_scale=dict(init_scale=512))
diff --git a/configs/convnext/metafile.yml b/configs/convnext/metafile.yml
new file mode 100755
index 0000000..84e50e8
--- /dev/null
+++ b/configs/convnext/metafile.yml
@@ -0,0 +1,93 @@
+Models:
+  - Name: mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/convnext/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.3
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Mixed Precision Training
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - ConvNeXt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/convnext/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco_20220426_154953-050731f4.pth
+    Paper:
+      URL: https://arxiv.org/abs/2201.03545
+      Title: 'A ConvNet for the 2020s'
+    README: configs/convnext/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465
+      Version: v2.25.0
+
+  - Name: cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/convnext/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py
+    Metadata:
+      Training Memory (GB): 9.0
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Mixed Precision Training
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - ConvNeXt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 43.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/convnext/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220509_204200-8f07c40b.pth
+    Paper:
+      URL: https://arxiv.org/abs/2201.03545
+      Title: 'A ConvNet for the 2020s'
+    README: configs/convnext/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465
+      Version: v2.25.0
+
+  - Name: cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/convnext/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py
+    Metadata:
+      Training Memory (GB): 12.3
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Mixed Precision Training
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - ConvNeXt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 51.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 44.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/convnext/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220510_201004-3d24f5a4.pth
+    Paper:
+      URL: https://arxiv.org/abs/2201.03545
+      Title: 'A ConvNet for the 2020s'
+    README: configs/convnext/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465
+      Version: v2.25.0
diff --git a/configs/cornernet/README.md b/configs/cornernet/README.md
new file mode 100755
index 0000000..d0b9e98
--- /dev/null
+++ b/configs/cornernet/README.md
@@ -0,0 +1,43 @@
+# CornerNet
+
+> [Cornernet: Detecting objects as paired keypoints](https://arxiv.org/abs/1808.01244)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We propose CornerNet, a new approach to object detection where we detect an object bounding box as a pair of keypoints, the top-left corner and the bottom-right corner, using a single convolution neural network. By detecting objects as paired keypoints, we eliminate the need for designing a set of anchor boxes commonly used in prior single-stage detectors. In addition to our novel formulation, we introduce corner pooling, a new type of pooling layer that helps the network better localize corners. Experiments show that CornerNet achieves a 42.2% AP on MS COCO, outperforming all existing one-stage detectors.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143876061-4de20768-c812-4b97-b089-944d8db91ca2.png"/>
+</div>
+
+## Results and Models
+
+|     Backbone     |                         Batch Size                          | Step/Total Epochs | Mem (GB) | Inf time (fps) | box AP |                                                              Config                                                               |                                                                                                                                                                                     Download                                                                                                                                                                                     |
+| :--------------: | :---------------------------------------------------------: | :---------------: | :------: | :------------: | :----: | :-------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| HourglassNet-104 | [10 x 5](./cornernet_hourglass104_mstest_10x5_210e_coco.py) |      180/210      |   13.9   |      4.2       |  41.2  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco/cornernet_hourglass104_mstest_10x5_210e_coco_20200824_185720-5fefbf1c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco/cornernet_hourglass104_mstest_10x5_210e_coco_20200824_185720.log.json) |
+| HourglassNet-104 |  [8 x 6](./cornernet_hourglass104_mstest_8x6_210e_coco.py)  |      180/210      |   15.9   |      4.2       |  41.2  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco/cornernet_hourglass104_mstest_8x6_210e_coco_20200825_150618-79b44c30.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco/cornernet_hourglass104_mstest_8x6_210e_coco_20200825_150618.log.json)   |
+| HourglassNet-104 | [32 x 3](./cornernet_hourglass104_mstest_32x3_210e_coco.py) |      180/210      |   9.5    |      3.9       |  40.4  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco/cornernet_hourglass104_mstest_32x3_210e_coco_20200819_203110-1efaea91.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco/cornernet_hourglass104_mstest_32x3_210e_coco_20200819_203110.log.json) |
+
+Note:
+
+- TTA setting is single-scale and `flip=True`.
+- Experiments with `samples_per_gpu=6` are conducted on Tesla V100-SXM2-32GB; those with `samples_per_gpu=3` are conducted on GeForce GTX 1080 Ti.
+- Here are the descriptions of each experiment setting:
+  - 10 x 5: 10 GPUs with 5 images per gpu. This is the same setting as that reported in the original paper.
+  - 8 x 6: 8 GPUs with 6 images per gpu. The total batch size is similar to that of the paper, and only 1 node is needed to train.
+  - 32 x 3: 32 GPUs with 3 images per gpu. This is the default setting for the 1080 Ti and needs 4 nodes to train.
+
+## Citation
+
+```latex
+@inproceedings{law2018cornernet,
+  title={Cornernet: Detecting objects as paired keypoints},
+  author={Law, Hei and Deng, Jia},
+  booktitle={15th European Conference on Computer Vision, ECCV 2018},
+  pages={765--781},
+  year={2018},
+  organization={Springer Verlag}
+}
+```
diff --git a/configs/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco.py b/configs/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco.py
new file mode 100755
index 0000000..6cb05a7
--- /dev/null
+++ b/configs/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco.py
@@ -0,0 +1,110 @@
+_base_ = [
+    '../_base_/default_runtime.py', '../_base_/datasets/coco_detection.py'
+]
+
+# model settings: CornerNet with a 2-stack HourglassNet backbone and no neck
+model = dict(
+    type='CornerNet',
+    backbone=dict(
+        type='HourglassNet',
+        downsample_times=5,
+        num_stacks=2,
+        stage_channels=[256, 256, 384, 384, 384, 512],
+        stage_blocks=[2, 2, 2, 2, 2, 4],
+        norm_cfg=dict(type='BN', requires_grad=True)),
+    neck=None,
+    bbox_head=dict(
+        type='CornerHead',
+        num_classes=80,
+        in_channels=256,
+        num_feat_levels=2,
+        corner_emb_channels=1,
+        loss_heatmap=dict(
+            type='GaussianFocalLoss', alpha=2.0, gamma=4.0, loss_weight=1),
+        loss_embedding=dict(
+            type='AssociativeEmbeddingLoss',
+            pull_weight=0.10,
+            push_weight=0.10),
+        loss_offset=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1)),
+    # training and testing settings (train_cfg left as None, soft-NMS at test)
+    train_cfg=None,
+    test_cfg=dict(
+        corner_topk=100,
+        local_maximum_kernel=3,
+        distance_threshold=0.5,
+        score_thr=0.05,
+        max_per_img=100,
+        nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian')))
+# data settings: pipelines set here for the train/val/test splits
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(
+        type='RandomCenterCropPad',
+        crop_size=(511, 511),
+        ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3),
+        test_mode=False,
+        test_pad_mode=None,
+        **img_norm_cfg),
+    dict(type='Resize', img_scale=(511, 511), keep_ratio=False),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(
+        type='MultiScaleFlipAug',
+        scale_factor=1.0,
+        flip=True,
+        transforms=[
+            dict(type='Resize'),
+            dict(
+                type='RandomCenterCropPad',
+                crop_size=None,
+                ratios=None,
+                border=None,
+                test_mode=True,
+                test_pad_mode=['logical_or', 127],
+                **img_norm_cfg),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(
+                type='Collect',
+                keys=['img'],
+                meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape',
+                           'scale_factor', 'flip', 'img_norm_cfg', 'border')),
+        ])
+]
+data = dict(
+    samples_per_gpu=5,
+    workers_per_gpu=3,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer: Adam (lr=5e-4) with gradient clipping at L2 norm 35
+optimizer = dict(type='Adam', lr=0.0005)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy: linear warmup over 500 iters, LR step at 180 of 210 epochs
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[180])
+runner = dict(type='EpochBasedRunner', max_epochs=210)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (10 GPUs) x (5 samples per GPU)
+auto_scale_lr = dict(base_batch_size=50)
diff --git a/configs/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco.py b/configs/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco.py
new file mode 100755
index 0000000..f539cdb
--- /dev/null
+++ b/configs/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco.py
@@ -0,0 +1,110 @@
+_base_ = [
+    '../_base_/default_runtime.py', '../_base_/datasets/coco_detection.py'
+]
+
+# model settings: CornerNet with a 2-stack HourglassNet backbone and no neck
+model = dict(
+    type='CornerNet',
+    backbone=dict(
+        type='HourglassNet',
+        downsample_times=5,
+        num_stacks=2,
+        stage_channels=[256, 256, 384, 384, 384, 512],
+        stage_blocks=[2, 2, 2, 2, 2, 4],
+        norm_cfg=dict(type='BN', requires_grad=True)),
+    neck=None,
+    bbox_head=dict(
+        type='CornerHead',
+        num_classes=80,
+        in_channels=256,
+        num_feat_levels=2,
+        corner_emb_channels=1,
+        loss_heatmap=dict(
+            type='GaussianFocalLoss', alpha=2.0, gamma=4.0, loss_weight=1),
+        loss_embedding=dict(
+            type='AssociativeEmbeddingLoss',
+            pull_weight=0.10,
+            push_weight=0.10),
+        loss_offset=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1)),
+    # training and testing settings (train_cfg left as None, soft-NMS at test)
+    train_cfg=None,
+    test_cfg=dict(
+        corner_topk=100,
+        local_maximum_kernel=3,
+        distance_threshold=0.5,
+        score_thr=0.05,
+        max_per_img=100,
+        nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian')))
+# data settings: pipelines set here for the train/val/test splits
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(
+        type='RandomCenterCropPad',
+        crop_size=(511, 511),
+        ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3),
+        test_mode=False,
+        test_pad_mode=None,
+        **img_norm_cfg),
+    dict(type='Resize', img_scale=(511, 511), keep_ratio=False),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(
+        type='MultiScaleFlipAug',
+        scale_factor=1.0,
+        flip=True,
+        transforms=[
+            dict(type='Resize'),
+            dict(
+                type='RandomCenterCropPad',
+                crop_size=None,
+                ratios=None,
+                border=None,
+                test_mode=True,
+                test_pad_mode=['logical_or', 127],
+                **img_norm_cfg),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(
+                type='Collect',
+                keys=['img'],
+                meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape',
+                           'scale_factor', 'flip', 'img_norm_cfg', 'border')),
+        ])
+]
+data = dict(
+    samples_per_gpu=3,
+    workers_per_gpu=3,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer: Adam (lr=5e-4) with gradient clipping at L2 norm 35
+optimizer = dict(type='Adam', lr=0.0005)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy: linear warmup over 500 iters, LR step at 180 of 210 epochs
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[180])
+runner = dict(type='EpochBasedRunner', max_epochs=210)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (3 samples per GPU)
+auto_scale_lr = dict(base_batch_size=96)
diff --git a/configs/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco.py b/configs/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco.py
new file mode 100755
index 0000000..9b115d7
--- /dev/null
+++ b/configs/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco.py
@@ -0,0 +1,110 @@
+_base_ = [
+    '../_base_/default_runtime.py', '../_base_/datasets/coco_detection.py'
+]
+
+# model settings: CornerNet with a 2-stack HourglassNet backbone and no neck
+model = dict(
+    type='CornerNet',
+    backbone=dict(
+        type='HourglassNet',
+        downsample_times=5,
+        num_stacks=2,
+        stage_channels=[256, 256, 384, 384, 384, 512],
+        stage_blocks=[2, 2, 2, 2, 2, 4],
+        norm_cfg=dict(type='BN', requires_grad=True)),
+    neck=None,
+    bbox_head=dict(
+        type='CornerHead',
+        num_classes=80,
+        in_channels=256,
+        num_feat_levels=2,
+        corner_emb_channels=1,
+        loss_heatmap=dict(
+            type='GaussianFocalLoss', alpha=2.0, gamma=4.0, loss_weight=1),
+        loss_embedding=dict(
+            type='AssociativeEmbeddingLoss',
+            pull_weight=0.10,
+            push_weight=0.10),
+        loss_offset=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1)),
+    # training and testing settings (train_cfg left as None, soft-NMS at test)
+    train_cfg=None,
+    test_cfg=dict(
+        corner_topk=100,
+        local_maximum_kernel=3,
+        distance_threshold=0.5,
+        score_thr=0.05,
+        max_per_img=100,
+        nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian')))
+# data settings: pipelines set here for the train/val/test splits
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(
+        type='RandomCenterCropPad',
+        crop_size=(511, 511),
+        ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3),
+        test_mode=False,
+        test_pad_mode=None,
+        **img_norm_cfg),
+    dict(type='Resize', img_scale=(511, 511), keep_ratio=False),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(
+        type='MultiScaleFlipAug',
+        scale_factor=1.0,
+        flip=True,
+        transforms=[
+            dict(type='Resize'),
+            dict(
+                type='RandomCenterCropPad',
+                crop_size=None,
+                ratios=None,
+                border=None,
+                test_mode=True,
+                test_pad_mode=['logical_or', 127],
+                **img_norm_cfg),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(
+                type='Collect',
+                keys=['img'],
+                meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape',
+                           'scale_factor', 'flip', 'img_norm_cfg', 'border')),
+        ])
+]
+data = dict(
+    samples_per_gpu=6,
+    workers_per_gpu=3,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer: Adam (lr=5e-4) with gradient clipping at L2 norm 35
+optimizer = dict(type='Adam', lr=0.0005)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy: linear warmup over 500 iters, LR step at 180 of 210 epochs
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[180])
+runner = dict(type='EpochBasedRunner', max_epochs=210)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (6 samples per GPU)
+auto_scale_lr = dict(base_batch_size=48)
diff --git a/configs/cornernet/metafile.yml b/configs/cornernet/metafile.yml
new file mode 100755
index 0000000..c2f6143
--- /dev/null
+++ b/configs/cornernet/metafile.yml
@@ -0,0 +1,83 @@
+Collections:
+  - Name: CornerNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - Adam
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Corner Pooling
+        - Stacked Hourglass Network
+    Paper:
+      URL: https://arxiv.org/abs/1808.01244
+      Title: 'CornerNet: Detecting Objects as Paired Keypoints'
+    README: configs/cornernet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.3.0/mmdet/models/detectors/cornernet.py#L9
+      Version: v2.3.0
+
+Models:
+  - Name: cornernet_hourglass104_mstest_10x5_210e_coco
+    In Collection: CornerNet
+    Config: configs/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco.py
+    Metadata:
+      Training Resources: 10x V100 GPUs
+      Batch Size: 50
+      Training Memory (GB): 13.9
+      inference time (ms/im):
+        - value: 238.1
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 210
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco/cornernet_hourglass104_mstest_10x5_210e_coco_20200824_185720-5fefbf1c.pth
+
+  - Name: cornernet_hourglass104_mstest_8x6_210e_coco
+    In Collection: CornerNet
+    Config: configs/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco.py
+    Metadata:
+      Batch Size: 48
+      Training Memory (GB): 15.9
+      inference time (ms/im):
+        - value: 238.1
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 210
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco/cornernet_hourglass104_mstest_8x6_210e_coco_20200825_150618-79b44c30.pth
+
+  - Name: cornernet_hourglass104_mstest_32x3_210e_coco
+    In Collection: CornerNet
+    Config: configs/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco.py
+    Metadata:
+      Training Resources: 32x V100 GPUs
+      Batch Size: 96
+      Training Memory (GB): 9.5
+      inference time (ms/im):
+        - value: 256.41
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 210
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco/cornernet_hourglass104_mstest_32x3_210e_coco_20200819_203110-1efaea91.pth
diff --git a/configs/dcn/README.md b/configs/dcn/README.md
new file mode 100755
index 0000000..745b01c
--- /dev/null
+++ b/configs/dcn/README.md
@@ -0,0 +1,48 @@
+# DCN
+
+> [Deformable Convolutional Networks](https://arxiv.org/abs/1703.06211)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Convolutional neural networks (CNNs) are inherently limited to model geometric transformations due to the fixed geometric structures in its building modules. In this work, we introduce two new modules to enhance the transformation modeling capacity of CNNs, namely, deformable convolution and deformable RoI pooling. Both are based on the idea of augmenting the spatial sampling locations in the modules with additional offsets and learning the offsets from target tasks, without additional supervision. The new modules can readily replace their plain counterparts in existing CNNs and can be easily trained end-to-end by standard back-propagation, giving rise to deformable convolutional networks. Extensive experiments validate the effectiveness of our approach on sophisticated vision tasks of object detection and semantic segmentation.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143876246-c4985e25-e286-4511-9b7c-97af2857461e.png"/>
+</div>
+
+## Results and Models
+
+|    Backbone     |    Model     |  Style  |     Conv     | Pool  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                               Config                                                                |                                                                                                                                                                                       Download                                                                                                                                                                                       |
+| :-------------: | :----------: | :-----: | :----------: | :---: | :-----: | :------: | :------------: | :----: | :-----: | :---------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50-FPN     |    Faster    | pytorch | dconv(c3-c5) |   -   |   1x    |   4.0    |      17.8      |  41.3  |         |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py)        |                      [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-d68aed1e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130_212941.log.json)                       |
+|    R-50-FPN     |    Faster    | pytorch |      -       | dpool |   1x    |   5.0    |      17.2      |  38.9  |         |          [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/faster_rcnn_r50_fpn_dpool_1x_coco.py)           |                                  [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dpool_1x_coco/faster_rcnn_r50_fpn_dpool_1x_coco_20200307-90d3c01d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dpool_1x_coco/faster_rcnn_r50_fpn_dpool_1x_coco_20200307_203250.log.json)                                   |
+|    R-101-FPN    |    Faster    | pytorch | dconv(c3-c5) |   -   |   1x    |   6.0    |      12.5      |  42.7  |         |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py)       |                    [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203-1377f13d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203_230019.log.json)                     |
+| X-101-32x4d-FPN |    Faster    | pytorch | dconv(c3-c5) |   -   |   1x    |   7.3    |      10.0      |  44.5  |         |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py)    |        [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco_20200203-4f85c69c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco_20200203_001325.log.json)         |
+|    R-50-FPN     |     Mask     | pytorch | dconv(c3-c5) |   -   |   1x    |   4.5    |      15.4      |  41.8  |  37.4   |        [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py)         |                          [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200203-4d9ad43b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200203_061339.log.json)                           |
+|    R-101-FPN    |     Mask     | pytorch | dconv(c3-c5) |   -   |   1x    |   6.5    |      11.7      |  43.5  |  38.9   |        [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py)        |                        [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200216-a71f5bce.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200216_191601.log.json)                         |
+|    R-50-FPN     |   Cascade    | pytorch | dconv(c3-c5) |   -   |   1x    |   4.5    |      14.6      |  43.8  |         |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py)       |                    [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-2f1fca44.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130_220843.log.json)                     |
+|    R-101-FPN    |   Cascade    | pytorch | dconv(c3-c5) |   -   |   1x    |   6.4    |      11.0      |  45.0  |         |      [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py)       |                  [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203-3b2f0594.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203_224829.log.json)                   |
+|    R-50-FPN     | Cascade Mask | pytorch | dconv(c3-c5) |   -   |   1x    |   6.0    |      10.0      |  44.4  |  38.6   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py)     |          [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200202-42e767a2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200202_010309.log.json)           |
+|    R-101-FPN    | Cascade Mask | pytorch | dconv(c3-c5) |   -   |   1x    |   8.0    |      8.6       |  45.8  |  39.7   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py)    |        [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200204-df0c5f10.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200204_134006.log.json)         |
+| X-101-32x4d-FPN | Cascade Mask | pytorch | dconv(c3-c5) |   -   |   1x    |   9.2    |                |  47.3  |  41.1   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco-e75f90c8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco-20200606_183737.log.json) |
+| R-50-FPN (FP16) |     Mask     | pytorch | dconv(c3-c5) |   -   |   1x    |   3.0    |                |  41.9  |  37.5   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fp16/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco.py)      |            [model](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco_20210520_180247-c06429d2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco_20210520_180247.log.json)            |
+
+**Notes:**
+
+- `dconv` denotes deformable convolution, `c3-c5` means adding dconv in resnet stage 3 to 5. `dpool` denotes deformable roi pooling.
+- The dcn ops are modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch, which should be more memory efficient and slightly faster.
+- (\*) For R-50-FPN (dg=4), dg is short for deformable_group. This model is trained and tested on Amazon EC2 p3dn.24xlarge instance.
+- **The memory and train/inference time figures are outdated.**
+
+## Citation
+
+```latex
+@inproceedings{dai2017deformable,
+  title={Deformable Convolutional Networks},
+  author={Dai, Jifeng and Qi, Haozhi and Xiong, Yuwen and Li, Yi and Zhang, Guodong and Hu, Han and Wei, Yichen},
+  booktitle={Proceedings of the IEEE international conference on computer vision},
+  year={2017}
+}
+```
diff --git a/configs/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..081b998
--- /dev/null
+++ b/configs/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))  # dconv in c3-c5 only
diff --git a/configs/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..3b3683a
--- /dev/null
+++ b/configs/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/configs/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..daaa472
--- /dev/null
+++ b/configs/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/configs/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..a01df33
--- /dev/null
+++ b/configs/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/configs/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..aa664bd
--- /dev/null
+++ b/configs/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/configs/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..f5fee7e
--- /dev/null
+++ b/configs/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../faster_rcnn/faster_rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/configs/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..8787088
--- /dev/null
+++ b/configs/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/configs/dcn/faster_rcnn_r50_fpn_dpool_1x_coco.py b/configs/dcn/faster_rcnn_r50_fpn_dpool_1x_coco.py
new file mode 100755
index 0000000..1b695f0
--- /dev/null
+++ b/configs/dcn/faster_rcnn_r50_fpn_dpool_1x_coco.py
@@ -0,0 +1,12 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(
+                _delete_=True,
+                type='DeformRoIPoolPack',
+                output_size=7,
+                output_channels=256),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32])))
diff --git a/configs/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..e3bea19
--- /dev/null
+++ b/configs/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..cb34002
--- /dev/null
+++ b/configs/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/configs/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..ababe58
--- /dev/null
+++ b/configs/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/configs/dcn/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco.py b/configs/dcn/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..ee5cca7
--- /dev/null
+++ b/configs/dcn/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
+
+fp16 = dict(loss_scale=512.)
diff --git a/configs/dcn/metafile.yml b/configs/dcn/metafile.yml
new file mode 100755
index 0000000..36f3887
--- /dev/null
+++ b/configs/dcn/metafile.yml
@@ -0,0 +1,272 @@
+Collections:
+  - Name: Deformable Convolutional Networks
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Deformable Convolution
+    Paper:
+      URL: https://arxiv.org/abs/1703.06211
+      Title: "Deformable Convolutional Networks"
+    README: configs/dcn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/dcn/deform_conv.py#L15
+      Version: v2.0.0
+
+Models:
+  - Name: faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.0
+      inference time (ms/im):
+        - value: 56.18
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-d68aed1e.pth
+
+  - Name: faster_rcnn_r50_fpn_dpool_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/faster_rcnn_r50_fpn_dpool_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      inference time (ms/im):
+        - value: 58.14
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dpool_1x_coco/faster_rcnn_r50_fpn_dpool_1x_coco_20200307-90d3c01d.pth
+
+  - Name: faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 80
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203-1377f13d.pth
+
+  - Name: faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.3
+      inference time (ms/im):
+        - value: 100
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco_20200203-4f85c69c.pth
+
+  - Name: mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.5
+      inference time (ms/im):
+        - value: 64.94
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200203-4d9ad43b.pth
+
+  - Name: mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco.py
+    Metadata:
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+        - Mixed Precision Training
+      Training Memory (GB): 3.0
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco_20210520_180247-c06429d2.pth
+
+  - Name: mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.5
+      inference time (ms/im):
+        - value: 85.47
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200216-a71f5bce.pth
+
+  - Name: cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.5
+      inference time (ms/im):
+        - value: 68.49
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-2f1fca44.pth
+
+  - Name: cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      inference time (ms/im):
+        - value: 90.91
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203-3b2f0594.pth
+
+  - Name: cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 100
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200202-42e767a2.pth
+
+  - Name: cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.0
+      inference time (ms/im):
+        - value: 116.28
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200204-df0c5f10.pth
+
+  - Name: cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.2
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco-e75f90c8.pth
diff --git a/configs/dcnv2/README.md b/configs/dcnv2/README.md
new file mode 100755
index 0000000..d230f20
--- /dev/null
+++ b/configs/dcnv2/README.md
@@ -0,0 +1,37 @@
+# DCNv2
+
+> [Deformable ConvNets v2: More Deformable, Better Results](https://arxiv.org/abs/1811.11168)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+The superior performance of Deformable Convolutional Networks arises from its ability to adapt to the geometric variations of objects. Through an examination of its adaptive behavior, we observe that while the spatial support for its neural features conforms more closely than regular ConvNets to object structure, this support may nevertheless extend well beyond the region of interest, causing features to be influenced by irrelevant image content. To address this problem, we present a reformulation of Deformable ConvNets that improves its ability to focus on pertinent image regions, through increased modeling power and stronger training. The modeling power is enhanced through a more comprehensive integration of deformable convolution within the network, and by introducing a modulation mechanism that expands the scope of deformation modeling. To effectively harness this enriched modeling capability, we guide network training via a proposed feature mimicking scheme that helps the network to learn features that reflect the object focus and classification power of RCNN features. With the proposed contributions, this new version of Deformable ConvNets yields significant performance gains over the original model and produces leading results on the COCO benchmark for object detection and instance segmentation.
+
+## Results and Models
+
+|     Backbone      | Model  |  Style  |     Conv      |  Pool  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                              Config                                                              |                                                                                                                                                                                 Download                                                                                                                                                                                  |
+| :---------------: | :----: | :-----: | :-----------: | :----: | :-----: | :------: | :------------: | :----: | :-----: | :------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|     R-50-FPN      | Faster | pytorch | mdconv(c3-c5) |   -    |   1x    |   4.1    |      17.6      |  41.4  |         |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200130-d099253b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200130_222144.log.json)               |
+| \*R-50-FPN (dg=4) | Faster | pytorch | mdconv(c3-c5) |   -    |   1x    |   4.2    |      17.4      |  41.5  |         | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco_20200130-01262257.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco_20200130_222058.log.json) |
+|     R-50-FPN      | Faster | pytorch |       -       | mdpool |   1x    |   5.8    |      16.6      |  38.7  |         |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcnv2/faster_rcnn_r50_fpn_mdpool_1x_coco.py)        |                           [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco/faster_rcnn_r50_fpn_mdpool_1x_coco_20200307-c0df27ff.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco/faster_rcnn_r50_fpn_mdpool_1x_coco_20200307_203304.log.json)                           |
+|     R-50-FPN      |  Mask  | pytorch | mdconv(c3-c5) |   -    |   1x    |   4.5    |      15.1      |  41.5  |  37.1   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcnv2/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py)      |                   [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200203-ad97591f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200203_063443.log.json)                   |
+|  R-50-FPN (FP16)  |  Mask  | pytorch | mdconv(c3-c5) |   -    |   1x    |   3.1    |                |  42.0  |  37.6   |   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcnv2/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco.py)   |    [model](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco_20210520_180434-cf8fefa5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco_20210520_180434.log.json)     |
+
+**Notes:**
+
+- `mdconv` denotes modulated deformable convolution; `c3-c5` means adding mdconv in ResNet stages 3 to 5. `mdpool` denotes modulated deformable RoI pooling.
+- The dcn ops are modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch, which should be more memory efficient and slightly faster.
+- (\*) For R-50-FPN (dg=4), dg is short for deformable_group. This model is trained and tested on Amazon EC2 p3dn.24xlarge instance.
+- **The memory and train/inference time figures are outdated.**
+
+## Citation
+
+```latex
+@article{zhu2018deformable,
+  title={Deformable ConvNets v2: More Deformable, Better Results},
+  author={Zhu, Xizhou and Hu, Han and Lin, Stephen and Dai, Jifeng},
+  journal={arXiv preprint arXiv:1811.11168},
+  year={2018}
+}
+```
diff --git a/configs/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py b/configs/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..d1bcf3c
--- /dev/null
+++ b/configs/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/configs/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco.py b/configs/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco.py
new file mode 100755
index 0000000..d0ab89c
--- /dev/null
+++ b/configs/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deform_groups=4, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/configs/dcnv2/faster_rcnn_r50_fpn_mdpool_1x_coco.py b/configs/dcnv2/faster_rcnn_r50_fpn_mdpool_1x_coco.py
new file mode 100755
index 0000000..ad7b034
--- /dev/null
+++ b/configs/dcnv2/faster_rcnn_r50_fpn_mdpool_1x_coco.py
@@ -0,0 +1,12 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(
+                _delete_=True,
+                type='ModulatedDeformRoIPoolPack',
+                output_size=7,
+                output_channels=256),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32])))
diff --git a/configs/dcnv2/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco.py b/configs/dcnv2/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..7e21454
--- /dev/null
+++ b/configs/dcnv2/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
+
+fp16 = dict(loss_scale=512.)
diff --git a/configs/dcnv2/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py b/configs/dcnv2/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..5ca2a67
--- /dev/null
+++ b/configs/dcnv2/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/configs/dcnv2/metafile.yml b/configs/dcnv2/metafile.yml
new file mode 100755
index 0000000..f6d5381
--- /dev/null
+++ b/configs/dcnv2/metafile.yml
@@ -0,0 +1,123 @@
+Collections:
+  - Name: Deformable Convolutional Networks v2
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Deformable Convolution
+    Paper:
+      URL: https://arxiv.org/abs/1811.11168
+      Title: "Deformable ConvNets v2: More Deformable, Better Results"
+    README: configs/dcnv2/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/dcn/deform_conv.py#L15
+      Version: v2.0.0
+
+Models:
+  - Name: faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks v2
+    Config: configs/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.1
+      inference time (ms/im):
+        - value: 56.82
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200130-d099253b.pth
+
+  - Name: faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco
+    In Collection: Deformable Convolutional Networks v2
+    Config: configs/dcnv2/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.2
+      inference time (ms/im):
+        - value: 57.47
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco_20200130-01262257.pth
+
+  - Name: faster_rcnn_r50_fpn_mdpool_1x_coco
+    In Collection: Deformable Convolutional Networks v2
+    Config: configs/dcnv2/faster_rcnn_r50_fpn_mdpool_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.8
+      inference time (ms/im):
+        - value: 60.24
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco/faster_rcnn_r50_fpn_mdpool_1x_coco_20200307-c0df27ff.pth
+
+  - Name: mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks v2
+    Config: configs/dcnv2/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.5
+      inference time (ms/im):
+        - value: 66.23
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200203-ad97591f.pth
+
+  - Name: mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks v2
+    Config: configs/dcnv2/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.1
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+        - Mixed Precision Training
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco_20210520_180434-cf8fefa5.pth
diff --git a/configs/ddod/README.md b/configs/ddod/README.md
new file mode 100755
index 0000000..9ab1f48
--- /dev/null
+++ b/configs/ddod/README.md
@@ -0,0 +1,31 @@
+# DDOD
+
+> [Disentangle Your Dense Object Detector](https://arxiv.org/pdf/2107.02963.pdf)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Deep learning-based dense object detectors have achieved great success in the past few years and have been applied to numerous multimedia applications such as video understanding. However, the current training pipeline for dense detectors is compromised to lots of conjunctions that may not hold. In this paper, we investigate three such important conjunctions: 1) only samples assigned as positive in classification head are used to train the regression head; 2) classification and regression share the same input feature and computational fields defined by the parallel head architecture; and 3) samples distributed in different feature pyramid layers are treated equally when computing the loss. We first carry out a series of pilot experiments to show disentangling such conjunctions can lead to persistent performance improvement. Then, based on these findings, we propose Disentangled Dense Object Detector(DDOD), in which simple and effective disentanglement mechanisms are designed and integrated into the current state-of-the-art dense object detectors. Extensive experiments on MS COCO benchmark show that our approach can lead to 2.0 mAP, 2.4 mAP and 2.2 mAP absolute improvements on RetinaNet, FCOS, and ATSS baselines with negligible extra overhead. Notably, our best model reaches 55.0 mAP on the COCO test-dev set and 93.5 AP on the hard subset of WIDER FACE, achieving new state-of-the-art performance on these two competitive benchmarks. Code is available at https://github.com/zehuichen123/DDOD.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/17425982/159212920-2e99d433-82c9-46cf-8f3a-32fdf3c566f5.png"/>
+</div>
+
+## Results and Models
+
+|   Model   | Backbone |  Style  | Lr schd | Mem (GB) | box AP |                                                Config                                                |                                                                                                                                Download                                                                                                                                |
+| :-------: | :------: | :-----: | :-----: | :------: | :----: | :--------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| DDOD-ATSS |   R-50   | pytorch |   1x    |   3.4    |  41.7  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ddod/ddod_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ddod/ddod_r50_fpn_1x_coco/ddod_r50_fpn_1x_coco_20220523_223737-29b2fc67.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ddod/ddod_r50_fpn_1x_coco/ddod_r50_fpn_1x_coco_20220523_223737.log.json) |
+
+## Citation
+
+```latex
+@inproceedings{chen2021disentangle,
+title={Disentangle Your Dense Object Detector},
+author={Chen, Zehui and Yang, Chenhongyi and Li, Qiaofei and Zhao, Feng and Zha, Zheng-Jun and Wu, Feng},
+booktitle={Proceedings of the 29th ACM International Conference on Multimedia},
+pages={4939--4948},
+year={2021}
+}
+```
diff --git a/configs/ddod/ddod_r50_fpn_1x_coco.py b/configs/ddod/ddod_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..02dd2fe
--- /dev/null
+++ b/configs/ddod/ddod_r50_fpn_1x_coco.py
@@ -0,0 +1,67 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    type='DDOD',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='DDODHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        loss_iou=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    train_cfg=dict(
+        # `assigner` is the classification assigner (cls_assigner)
+        assigner=dict(type='ATSSAssigner', topk=9, alpha=0.8),
+        reg_assigner=dict(type='ATSSAssigner', topk=9, alpha=0.5),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+# This `persistent_workers` is only valid when PyTorch>=1.7.0
+data = dict(persistent_workers=True)
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/ddod/metafile.yml b/configs/ddod/metafile.yml
new file mode 100755
index 0000000..c223950
--- /dev/null
+++ b/configs/ddod/metafile.yml
@@ -0,0 +1,33 @@
+Collections:
+  - Name: DDOD
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - DDOD
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/pdf/2107.02963.pdf
+      Title: 'Disentangle Your Dense Object Detector'
+    README: configs/ddod/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.25.0/mmdet/models/detectors/ddod.py#L6
+      Version: v2.25.0
+
+Models:
+  - Name: ddod_r50_fpn_1x_coco
+    In Collection: DDOD
+    Config: configs/ddod/ddod_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.4
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ddod/ddod_r50_fpn_1x_coco/ddod_r50_fpn_1x_coco_20220523_223737-29b2fc67.pth
diff --git a/configs/deepfashion/README.md b/configs/deepfashion/README.md
new file mode 100755
index 0000000..45daec0
--- /dev/null
+++ b/configs/deepfashion/README.md
@@ -0,0 +1,70 @@
+# DeepFashion
+
+> [DeepFashion: Powering Robust Clothes Recognition and Retrieval With Rich Annotations](https://openaccess.thecvf.com/content_cvpr_2016/html/Liu_DeepFashion_Powering_Robust_CVPR_2016_paper.html)
+
+<!-- [DATASET] -->
+
+## Abstract
+
+Recent advances in clothes recognition have been driven by the construction of clothes datasets. Existing datasets are limited in the amount of annotations and are difficult to cope with the various challenges in real-world applications. In this work, we introduce DeepFashion, a large-scale clothes dataset with comprehensive annotations. It contains over 800,000 images, which are richly annotated with massive attributes, clothing landmarks, and correspondence of images taken under different scenarios including store, street snapshot, and consumer. Such rich annotations enable the development of powerful algorithms in clothes recognition and facilitating future researches. To demonstrate the advantages of DeepFashion, we propose a new deep model, namely FashionNet, which learns clothing features by jointly predicting clothing attributes and landmarks. The estimated landmarks are then employed to pool or gate the learned features. It is optimized in an iterative manner. Extensive experiments demonstrate the effectiveness of FashionNet and the usefulness of DeepFashion.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143876310-08470a6a-ea3a-4ec1-a6f2-8ec5df36a8a0.png"/>
+</div>
+
+## Introduction
+
+[MMFashion](https://github.com/open-mmlab/mmfashion) develops a "fashion parsing and segmentation" module
+based on the dataset
+[DeepFashion-Inshop](https://drive.google.com/drive/folders/0B7EVK8r0v71pVDZFQXRsMDZCX1E?usp=sharing).
+Its annotation follows COCO style.
+To use it, you need to first download the data. Note that we only use "img_highres" in this task.
+The file tree should be like this:
+
+```sh
+mmdetection
+├── mmdet
+├── tools
+├── configs
+├── data
+│   ├── DeepFashion
+│   │   ├── In-shop
+│   │   ├── Anno
+│   │   │   ├── segmentation
+│   │   │   │   ├── DeepFashion_segmentation_train.json
+│   │   │   │   ├── DeepFashion_segmentation_query.json
+│   │   │   │   ├── DeepFashion_segmentation_gallery.json
+│   │   │   ├── list_bbox_inshop.txt
+│   │   │   ├── list_description_inshop.json
+│   │   │   ├── list_item_inshop.txt
+│   │   │   └── list_landmarks_inshop.txt
+│   │   ├── Eval
+│   │   │   └── list_eval_partition.txt
+│   │   ├── Img
+│   │   │   ├── img
+│   │   │   │   ├──XXX.jpg
+│   │   │   └── img_highres
+│   │   │       ├──XXX.jpg
+
+```
+
+After that you can train Mask R-CNN R-50 on the DeepFashion-In-shop dataset by launching training with the `mask_rcnn_r50_fpn_15e_deepfashion.py` config
+or by creating your own config file.
+
+## Results and Models
+
+| Backbone | Model type |       Dataset       | bbox detection Average Precision | segmentation Average Precision |                                                          Config                                                          |                                                                                                                                       Download (Google)                                                                                                                                       |
+| :------: | :--------: | :-----------------: | :------------------------------: | :----------------------------: | :----------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| ResNet50 | Mask RCNN  | DeepFashion-In-shop |              0.599               |             0.584              | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/deepfashion/mask_rcnn_r50_fpn_15e_deepfashion.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/deepfashion/mask_rcnn_r50_fpn_15e_deepfashion/mask_rcnn_r50_fpn_15e_deepfashion_20200329_192752.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/deepfashion/mask_rcnn_r50_fpn_15e_deepfashion/20200329_192752.log.json) |
+
+## Citation
+
+```latex
+@inproceedings{liuLQWTcvpr16DeepFashion,
+   author = {Liu, Ziwei and Luo, Ping and Qiu, Shi and Wang, Xiaogang and Tang, Xiaoou},
+   title = {DeepFashion: Powering Robust Clothes Recognition and Retrieval with Rich Annotations},
+   booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+   month = {June},
+   year = {2016}
+}
+```
diff --git a/configs/deepfashion/mask_rcnn_r50_fpn_15e_deepfashion.py b/configs/deepfashion/mask_rcnn_r50_fpn_15e_deepfashion.py
new file mode 100755
index 0000000..c4e8638
--- /dev/null
+++ b/configs/deepfashion/mask_rcnn_r50_fpn_15e_deepfashion.py
@@ -0,0 +1,10 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/deepfashion.py', '../_base_/schedules/schedule_1x.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(num_classes=15), mask_head=dict(num_classes=15)))
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=15)
diff --git a/configs/deformable_detr/README.md b/configs/deformable_detr/README.md
new file mode 100755
index 0000000..378e1f2
--- /dev/null
+++ b/configs/deformable_detr/README.md
@@ -0,0 +1,41 @@
+# Deformable DETR
+
+> [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+DETR has been recently proposed to eliminate the need for many hand-designed components in object detection while demonstrating good performance. However, it suffers from slow convergence and limited feature spatial resolution, due to the limitation of Transformer attention modules in processing image feature maps. To mitigate these issues, we proposed Deformable DETR, whose attention modules only attend to a small set of key sampling points around a reference. Deformable DETR can achieve better performance than DETR (especially on small objects) with 10 times less training epochs. Extensive experiments on the COCO benchmark demonstrate the effectiveness of our approach.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143877617-ad9b24fd-77ce-46aa-9689-1a44b5594132.png"/>
+</div>
+
+## Results and Models
+
+| Backbone |                Model                | Lr schd | box AP |                                                                    Config                                                                    |                                                                                                                                                                                                         Download                                                                                                                                                                                                          |
+| :------: | :---------------------------------: | :-----: | :----: | :------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   |           Deformable DETR           |   50e   |  44.5  |         [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/deformable_detr/deformable_detr_r50_16x2_50e_coco.py)         |                                 [model](https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_r50_16x2_50e_coco/deformable_detr_r50_16x2_50e_coco_20210419_220030-a12b9512.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_r50_16x2_50e_coco/deformable_detr_r50_16x2_50e_coco_20210419_220030-a12b9512.log.json)                                 |
+|   R-50   | + iterative bounding box refinement |   50e   |  46.1  |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco.py)      |                   [model](https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco/deformable_detr_refine_r50_16x2_50e_coco_20210419_220503-5f5dff21.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco/deformable_detr_refine_r50_16x2_50e_coco_20210419_220503-5f5dff21.log.json)                   |
+|   R-50   |    ++ two-stage Deformable DETR     |   50e   |  46.8  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco/deformable_detr_twostage_refine_r50_16x2_50e_coco_20210419_220613-9d28ab72.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco/deformable_detr_twostage_refine_r50_16x2_50e_coco_20210419_220613-9d28ab72.log.json) |
+
+## NOTE
+
+1. All models are trained with batch size 32.
+2. The performance is unstable. `Deformable DETR` and `iterative bounding box refinement` may fluctuate by about 0.3 mAP. `two-stage Deformable DETR` may fluctuate by about 0.2 mAP.
+
+## Citation
+
+We provide the config files for Deformable DETR: [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159).
+
+```latex
+@inproceedings{
+zhu2021deformable,
+title={Deformable DETR: Deformable Transformers for End-to-End Object Detection},
+author={Xizhou Zhu and Weijie Su and Lewei Lu and Bin Li and Xiaogang Wang and Jifeng Dai},
+booktitle={International Conference on Learning Representations},
+year={2021},
+url={https://openreview.net/forum?id=gZ9hCDWe6ke}
+}
+```
diff --git a/configs/deformable_detr/deformable_detr_r50_16x2_50e_coco.py b/configs/deformable_detr/deformable_detr_r50_16x2_50e_coco.py
new file mode 100755
index 0000000..c64d09f
--- /dev/null
+++ b/configs/deformable_detr/deformable_detr_r50_16x2_50e_coco.py
@@ -0,0 +1,177 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='DeformableDETR',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='ChannelMapper',
+        in_channels=[512, 1024, 2048],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        norm_cfg=dict(type='GN', num_groups=32),
+        num_outs=4),
+    bbox_head=dict(
+        type='DeformableDETRHead',
+        num_query=300,
+        num_classes=80,
+        in_channels=2048,
+        sync_cls_avg_factor=True,
+        as_two_stage=False,
+        transformer=dict(
+            type='DeformableDetrTransformer',
+            encoder=dict(
+                type='DetrTransformerEncoder',
+                num_layers=6,
+                transformerlayers=dict(
+                    type='BaseTransformerLayer',
+                    attn_cfgs=dict(
+                        type='MultiScaleDeformableAttention', embed_dims=256),
+                    feedforward_channels=1024,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
+            decoder=dict(
+                type='DeformableDetrTransformerDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=256,
+                            num_heads=8,
+                            dropout=0.1),
+                        dict(
+                            type='MultiScaleDeformableAttention',
+                            embed_dims=256)
+                    ],
+                    feedforward_channels=1024,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        positional_encoding=dict(
+            type='SinePositionalEncoding',
+            num_feats=128,
+            normalize=True,
+            offset=-0.5),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),
+            reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
+    test_cfg=dict(max_per_img=100))
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different
+# from the default setting in mmdet.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='AutoAugment',
+        policies=[
+            [
+                dict(
+                    type='Resize',
+                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                               (576, 1333), (608, 1333), (640, 1333),
+                               (672, 1333), (704, 1333), (736, 1333),
+                               (768, 1333), (800, 1333)],
+                    multiscale_mode='value',
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='Resize',
+                    # The ratio of all images in the train dataset is < 7;
+                    # follow the original implementation
+                    img_scale=[(400, 4200), (500, 4200), (600, 4200)],
+                    multiscale_mode='value',
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='Resize',
+                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                               (576, 1333), (608, 1333), (640, 1333),
+                               (672, 1333), (704, 1333), (736, 1333),
+                               (768, 1333), (800, 1333)],
+                    multiscale_mode='value',
+                    override=True,
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=1),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+# test_pipeline, NOTE the Pad's size_divisor is different from the default
+# setting (size_divisor=32). However, it makes little difference to the
+# performance whether we use the default setting or size_divisor=1.
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=1),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img'])
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(filter_empty_gt=False, pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='AdamW',
+    lr=2e-4,
+    weight_decay=0.0001,
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone': dict(lr_mult=0.1),
+            'sampling_offsets': dict(lr_mult=0.1),
+            'reference_points': dict(lr_mult=0.1)
+        }))
+optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
+# learning policy
+lr_config = dict(policy='step', step=[40])
+runner = dict(type='EpochBasedRunner', max_epochs=50)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (16 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=32)
diff --git a/configs/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco.py b/configs/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco.py
new file mode 100755
index 0000000..01f13df
--- /dev/null
+++ b/configs/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco.py
@@ -0,0 +1,2 @@
+_base_ = 'deformable_detr_r50_16x2_50e_coco.py'
+model = dict(bbox_head=dict(with_box_refine=True))
diff --git a/configs/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco.py b/configs/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco.py
new file mode 100755
index 0000000..2aa840d
--- /dev/null
+++ b/configs/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco.py
@@ -0,0 +1,2 @@
+_base_ = 'deformable_detr_refine_r50_16x2_50e_coco.py'
+model = dict(bbox_head=dict(as_two_stage=True))
diff --git a/configs/deformable_detr/metafile.yml b/configs/deformable_detr/metafile.yml
new file mode 100755
index 0000000..873292d
--- /dev/null
+++ b/configs/deformable_detr/metafile.yml
@@ -0,0 +1,56 @@
+Collections:
+  - Name: Deformable DETR
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Multi Scale Train
+        - Gradient Clip
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+        - Transformer
+    Paper:
+      URL: https://openreview.net/forum?id=gZ9hCDWe6ke
+      Title: 'Deformable DETR: Deformable Transformers for End-to-End Object Detection'
+    README: configs/deformable_detr/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/detectors/deformable_detr.py#L6
+      Version: v2.12.0
+
+Models:
+  - Name: deformable_detr_r50_16x2_50e_coco
+    In Collection: Deformable DETR
+    Config: configs/deformable_detr/deformable_detr_r50_16x2_50e_coco.py
+    Metadata:
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_r50_16x2_50e_coco/deformable_detr_r50_16x2_50e_coco_20210419_220030-a12b9512.pth
+
+  - Name: deformable_detr_refine_r50_16x2_50e_coco
+    In Collection: Deformable DETR
+    Config: configs/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco.py
+    Metadata:
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_refine_r50_16x2_50e_coco/deformable_detr_refine_r50_16x2_50e_coco_20210419_220503-5f5dff21.pth
+
+  - Name: deformable_detr_twostage_refine_r50_16x2_50e_coco
+    In Collection: Deformable DETR
+    Config: configs/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco.py
+    Metadata:
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_twostage_refine_r50_16x2_50e_coco/deformable_detr_twostage_refine_r50_16x2_50e_coco_20210419_220613-9d28ab72.pth
diff --git a/configs/detectors/README.md b/configs/detectors/README.md
new file mode 100755
index 0000000..baa245f
--- /dev/null
+++ b/configs/detectors/README.md
@@ -0,0 +1,69 @@
+# DetectoRS
+
+> [DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable Atrous Convolution](https://arxiv.org/abs/2006.02334)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Many modern object detectors demonstrate outstanding performances by using the mechanism of looking and thinking twice. In this paper, we explore this mechanism in the backbone design for object detection. At the macro level, we propose Recursive Feature Pyramid, which incorporates extra feedback connections from Feature Pyramid Networks into the bottom-up backbone layers. At the micro level, we propose Switchable Atrous Convolution, which convolves the features with different atrous rates and gathers the results using switch functions. Combining them results in DetectoRS, which significantly improves the performances of object detection. On COCO test-dev, DetectoRS achieves state-of-the-art 55.7% box AP for object detection, 48.5% mask AP for instance segmentation, and 50.0% PQ for panoptic segmentation.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143877901-24451581-2c50-4a54-b000-c4cb111e29ad.png"/>
+</div>
+
+## Introduction
+
+DetectoRS requires the COCO and [COCO-stuff](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) datasets for training. You need to download COCO-stuff and extract it in the COCO dataset path.
+The directory should be like this.
+
+```none
+mmdetection
+├── mmdet
+├── tools
+├── configs
+├── data
+│   ├── coco
+│   │   ├── annotations
+│   │   ├── train2017
+│   │   ├── val2017
+│   │   ├── test2017
+│   │   ├── stuffthingmaps
+```
+
+## Results and Models
+
+DetectoRS includes two major components:
+
+- Recursive Feature Pyramid (RFP).
+- Switchable Atrous Convolution (SAC).
+
+They can be used independently.
+Combining them together results in DetectoRS.
+The results on COCO 2017 val are shown in the table below.
+
+|  Method   |      Detector       | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                         Config                                                          |                                                                                                                                                         Download                                                                                                                                                         |
+| :-------: | :-----------------: | :-----: | :------: | :------------: | :----: | :-----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    RFP    | Cascade + ResNet-50 |   1x    |   7.5    |       -        |  44.8  |         |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detectors/cascade_rcnn_r50_rfp_1x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_rfp_1x_coco/cascade_rcnn_r50_rfp_1x_coco-8cf51bfd.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_rfp_1x_coco/cascade_rcnn_r50_rfp_1x_coco_20200624_104126.log.json)             |
+|    SAC    | Cascade + ResNet-50 |   1x    |   5.6    |       -        |  45.0  |         |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detectors/cascade_rcnn_r50_sac_1x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_sac_1x_coco/cascade_rcnn_r50_sac_1x_coco-24bfda62.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_sac_1x_coco/cascade_rcnn_r50_sac_1x_coco_20200624_104402.log.json)             |
+| DetectoRS | Cascade + ResNet-50 |   1x    |   9.9    |       -        |  47.4  |         | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detectors/detectors_cascade_rcnn_r50_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_cascade_rcnn_r50_1x_coco/detectors_cascade_rcnn_r50_1x_coco-32a10ba0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_cascade_rcnn_r50_1x_coco/detectors_cascade_rcnn_r50_1x_coco_20200706_001203.log.json) |
+|    RFP    |   HTC + ResNet-50   |   1x    |   11.2   |       -        |  46.6  |  40.9   |        [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detectors/htc_r50_rfp_1x_coco.py)         |                               [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_rfp_1x_coco/htc_r50_rfp_1x_coco-8ff87c51.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_rfp_1x_coco/htc_r50_rfp_1x_coco_20200624_103053.log.json)                               |
+|    SAC    |   HTC + ResNet-50   |   1x    |   9.3    |       -        |  46.4  |  40.9   |        [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detectors/htc_r50_sac_1x_coco.py)         |                               [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_sac_1x_coco/htc_r50_sac_1x_coco-bfa60c54.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_sac_1x_coco/htc_r50_sac_1x_coco_20200624_103111.log.json)                               |
+| DetectoRS |   HTC + ResNet-50   |   1x    |   13.6   |       -        |  49.1  |  42.6   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detectors/detectors_htc_r50_1x_coco.py)      |                   [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r50_1x_coco/detectors_htc_r50_1x_coco-329b1453.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r50_1x_coco/detectors_htc_r50_1x_coco_20200624_103659.log.json)                   |
+| DetectoRS |  HTC + ResNet-101   |   20e   |   19.6   |                |  50.5  |  43.9   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detectors/detectors_htc_r101_20e_coco.py)     |       [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r101_20e_coco/detectors_htc_r101_20e_coco_20210419_203638-348d533b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r101_20e_coco/detectors_htc_r101_20e_coco_20210419_203638.log.json)       |
+
+*Note*: This is a re-implementation based on MMDetection-V2.
+The original implementation is based on MMDetection-V1.
+
+## Citation
+
+We provide the config files for [DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable Atrous Convolution](https://arxiv.org/pdf/2006.02334.pdf).
+
+```latex
+@article{qiao2020detectors,
+  title={DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable Atrous Convolution},
+  author={Qiao, Siyuan and Chen, Liang-Chieh and Yuille, Alan},
+  journal={arXiv preprint arXiv:2006.02334},
+  year={2020}
+}
+```
diff --git a/configs/detectors/cascade_rcnn_r50_rfp_1x_coco.py b/configs/detectors/cascade_rcnn_r50_rfp_1x_coco.py
new file mode 100755
index 0000000..4430d8a
--- /dev/null
+++ b/configs/detectors/cascade_rcnn_r50_rfp_1x_coco.py
@@ -0,0 +1,28 @@
+_base_ = [
+    '../_base_/models/cascade_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        type='DetectoRS_ResNet',
+        conv_cfg=dict(type='ConvAWS'),
+        output_img=True),
+    neck=dict(
+        type='RFP',
+        rfp_steps=2,
+        aspp_out_channels=64,
+        aspp_dilations=(1, 3, 6, 1),
+        rfp_backbone=dict(
+            rfp_inplanes=256,
+            type='DetectoRS_ResNet',
+            depth=50,
+            num_stages=4,
+            out_indices=(0, 1, 2, 3),
+            frozen_stages=1,
+            norm_cfg=dict(type='BN', requires_grad=True),
+            norm_eval=True,
+            conv_cfg=dict(type='ConvAWS'),
+            pretrained='torchvision://resnet50',
+            style='pytorch')))
diff --git a/configs/detectors/cascade_rcnn_r50_sac_1x_coco.py b/configs/detectors/cascade_rcnn_r50_sac_1x_coco.py
new file mode 100755
index 0000000..ccd9319
--- /dev/null
+++ b/configs/detectors/cascade_rcnn_r50_sac_1x_coco.py
@@ -0,0 +1,12 @@
+_base_ = [
+    '../_base_/models/cascade_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        type='DetectoRS_ResNet',
+        conv_cfg=dict(type='ConvAWS'),
+        sac=dict(type='SAC', use_deform=True),
+        stage_with_sac=(False, True, True, True)))
diff --git a/configs/detectors/detectors_cascade_rcnn_r50_1x_coco.py b/configs/detectors/detectors_cascade_rcnn_r50_1x_coco.py
new file mode 100755
index 0000000..f760404
--- /dev/null
+++ b/configs/detectors/detectors_cascade_rcnn_r50_1x_coco.py
@@ -0,0 +1,32 @@
+_base_ = [
+    '../_base_/models/cascade_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        type='DetectoRS_ResNet',
+        conv_cfg=dict(type='ConvAWS'),
+        sac=dict(type='SAC', use_deform=True),
+        stage_with_sac=(False, True, True, True),
+        output_img=True),
+    neck=dict(
+        type='RFP',
+        rfp_steps=2,
+        aspp_out_channels=64,
+        aspp_dilations=(1, 3, 6, 1),
+        rfp_backbone=dict(
+            rfp_inplanes=256,
+            type='DetectoRS_ResNet',
+            depth=50,
+            num_stages=4,
+            out_indices=(0, 1, 2, 3),
+            frozen_stages=1,
+            norm_cfg=dict(type='BN', requires_grad=True),
+            norm_eval=True,
+            conv_cfg=dict(type='ConvAWS'),
+            sac=dict(type='SAC', use_deform=True),
+            stage_with_sac=(False, True, True, True),
+            pretrained='torchvision://resnet50',
+            style='pytorch')))
diff --git a/configs/detectors/detectors_htc_r101_20e_coco.py b/configs/detectors/detectors_htc_r101_20e_coco.py
new file mode 100755
index 0000000..93d7d2b
--- /dev/null
+++ b/configs/detectors/detectors_htc_r101_20e_coco.py
@@ -0,0 +1,28 @@
+_base_ = '../htc/htc_r101_fpn_20e_coco.py'
+
+model = dict(
+    backbone=dict(
+        type='DetectoRS_ResNet',
+        conv_cfg=dict(type='ConvAWS'),
+        sac=dict(type='SAC', use_deform=True),
+        stage_with_sac=(False, True, True, True),
+        output_img=True),
+    neck=dict(
+        type='RFP',
+        rfp_steps=2,
+        aspp_out_channels=64,
+        aspp_dilations=(1, 3, 6, 1),
+        rfp_backbone=dict(
+            rfp_inplanes=256,
+            type='DetectoRS_ResNet',
+            depth=101,
+            num_stages=4,
+            out_indices=(0, 1, 2, 3),
+            frozen_stages=1,
+            norm_cfg=dict(type='BN', requires_grad=True),
+            norm_eval=True,
+            conv_cfg=dict(type='ConvAWS'),
+            sac=dict(type='SAC', use_deform=True),
+            stage_with_sac=(False, True, True, True),
+            pretrained='torchvision://resnet101',
+            style='pytorch')))
diff --git a/configs/detectors/detectors_htc_r50_1x_coco.py b/configs/detectors/detectors_htc_r50_1x_coco.py
new file mode 100755
index 0000000..0d2fc4f
--- /dev/null
+++ b/configs/detectors/detectors_htc_r50_1x_coco.py
@@ -0,0 +1,28 @@
+_base_ = '../htc/htc_r50_fpn_1x_coco.py'
+
+model = dict(
+    backbone=dict(
+        type='DetectoRS_ResNet',
+        conv_cfg=dict(type='ConvAWS'),
+        sac=dict(type='SAC', use_deform=True),
+        stage_with_sac=(False, True, True, True),
+        output_img=True),
+    neck=dict(
+        type='RFP',
+        rfp_steps=2,
+        aspp_out_channels=64,
+        aspp_dilations=(1, 3, 6, 1),
+        rfp_backbone=dict(
+            rfp_inplanes=256,
+            type='DetectoRS_ResNet',
+            depth=50,
+            num_stages=4,
+            out_indices=(0, 1, 2, 3),
+            frozen_stages=1,
+            norm_cfg=dict(type='BN', requires_grad=True),
+            norm_eval=True,
+            conv_cfg=dict(type='ConvAWS'),
+            sac=dict(type='SAC', use_deform=True),
+            stage_with_sac=(False, True, True, True),
+            pretrained='torchvision://resnet50',
+            style='pytorch')))
diff --git a/configs/detectors/htc_r50_rfp_1x_coco.py b/configs/detectors/htc_r50_rfp_1x_coco.py
new file mode 100755
index 0000000..496104e
--- /dev/null
+++ b/configs/detectors/htc_r50_rfp_1x_coco.py
@@ -0,0 +1,24 @@
+_base_ = '../htc/htc_r50_fpn_1x_coco.py'
+
+model = dict(
+    backbone=dict(
+        type='DetectoRS_ResNet',
+        conv_cfg=dict(type='ConvAWS'),
+        output_img=True),
+    neck=dict(
+        type='RFP',
+        rfp_steps=2,
+        aspp_out_channels=64,
+        aspp_dilations=(1, 3, 6, 1),
+        rfp_backbone=dict(
+            rfp_inplanes=256,
+            type='DetectoRS_ResNet',
+            depth=50,
+            num_stages=4,
+            out_indices=(0, 1, 2, 3),
+            frozen_stages=1,
+            norm_cfg=dict(type='BN', requires_grad=True),
+            norm_eval=True,
+            conv_cfg=dict(type='ConvAWS'),
+            pretrained='torchvision://resnet50',
+            style='pytorch')))
diff --git a/configs/detectors/htc_r50_sac_1x_coco.py b/configs/detectors/htc_r50_sac_1x_coco.py
new file mode 100755
index 0000000..72d4db9
--- /dev/null
+++ b/configs/detectors/htc_r50_sac_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = '../htc/htc_r50_fpn_1x_coco.py'
+
+model = dict(
+    backbone=dict(
+        type='DetectoRS_ResNet',
+        conv_cfg=dict(type='ConvAWS'),
+        sac=dict(type='SAC', use_deform=True),
+        stage_with_sac=(False, True, True, True)))
diff --git a/configs/detectors/metafile.yml b/configs/detectors/metafile.yml
new file mode 100755
index 0000000..4bed569
--- /dev/null
+++ b/configs/detectors/metafile.yml
@@ -0,0 +1,114 @@
+Collections:
+  - Name: DetectoRS
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ASPP
+        - FPN
+        - RFP
+        - RPN
+        - ResNet
+        - RoIAlign
+        - SAC
+    Paper:
+      URL: https://arxiv.org/abs/2006.02334
+      Title: 'DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable Atrous Convolution'
+    README: configs/detectors/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/backbones/detectors_resnet.py#L205
+      Version: v2.2.0
+
+Models:
+  - Name: cascade_rcnn_r50_rfp_1x_coco
+    In Collection: DetectoRS
+    Config: configs/detectors/cascade_rcnn_r50_rfp_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.5
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_rfp_1x_coco/cascade_rcnn_r50_rfp_1x_coco-8cf51bfd.pth
+
+  - Name: cascade_rcnn_r50_sac_1x_coco
+    In Collection: DetectoRS
+    Config: configs/detectors/cascade_rcnn_r50_sac_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.6
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_sac_1x_coco/cascade_rcnn_r50_sac_1x_coco-24bfda62.pth
+
+  - Name: detectors_cascade_rcnn_r50_1x_coco
+    In Collection: DetectoRS
+    Config: configs/detectors/detectors_cascade_rcnn_r50_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.9
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_cascade_rcnn_r50_1x_coco/detectors_cascade_rcnn_r50_1x_coco-32a10ba0.pth
+
+  - Name: htc_r50_rfp_1x_coco
+    In Collection: DetectoRS
+    Config: configs/detectors/htc_r50_rfp_1x_coco.py
+    Metadata:
+      Training Memory (GB): 11.2
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  40.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_rfp_1x_coco/htc_r50_rfp_1x_coco-8ff87c51.pth
+
+  - Name: htc_r50_sac_1x_coco
+    In Collection: DetectoRS
+    Config: configs/detectors/htc_r50_sac_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.3
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  40.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_sac_1x_coco/htc_r50_sac_1x_coco-bfa60c54.pth
+
+  - Name: detectors_htc_r50_1x_coco
+    In Collection: DetectoRS
+    Config: configs/detectors/detectors_htc_r50_1x_coco.py
+    Metadata:
+      Training Memory (GB): 13.6
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 49.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  42.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r50_1x_coco/detectors_htc_r50_1x_coco-329b1453.pth
diff --git a/configs/detr/README.md b/configs/detr/README.md
new file mode 100755
index 0000000..9f2485d
--- /dev/null
+++ b/configs/detr/README.md
@@ -0,0 +1,37 @@
+# DETR
+
+> [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We present a new method that views object detection as a direct set prediction problem. Our approach streamlines the detection pipeline, effectively removing the need for many hand-designed components like a non-maximum suppression procedure or anchor generation that explicitly encode our prior knowledge about the task. The main ingredients of the new framework, called DEtection TRansformer or DETR, are a set-based global loss that forces unique predictions via bipartite matching, and a transformer encoder-decoder architecture. Given a fixed small set of learned object queries, DETR reasons about the relations of the objects and the global image context to directly output the final set of predictions in parallel. The new model is conceptually simple and does not require a specialized library, unlike many other modern detectors. DETR demonstrates accuracy and run-time performance on par with the well-established and highly-optimized Faster RCNN baseline on the challenging COCO object detection dataset. Moreover, DETR can be easily generalized to produce panoptic segmentation in a unified manner. We show that it significantly outperforms competitive baselines.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143878072-0a7434e4-416b-4315-aeea-a8297f4d6453.png"/>
+</div>
+
+## Results and Models
+
+| Backbone | Model | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                 Config                                                 |                                                                                                                                    Download                                                                                                                                    |
+| :------: | :---: | :-----: | :------: | :------------: | :----: | :----------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   | DETR  |  150e   |   7.9    |                |  40.1  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/detr/detr_r50_8x2_150e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detr/detr_r50_8x2_150e_coco/detr_r50_8x2_150e_coco_20201130_194835-2c4b8974.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/detr/detr_r50_8x2_150e_coco/detr_r50_8x2_150e_coco_20201130_194835.log.json) |
+
+## Citation
+
+We provide the config files for DETR: [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872).
+
+```latex
+@inproceedings{detr,
+  author    = {Nicolas Carion and
+               Francisco Massa and
+               Gabriel Synnaeve and
+               Nicolas Usunier and
+               Alexander Kirillov and
+               Sergey Zagoruyko},
+  title     = {End-to-End Object Detection with Transformers},
+  booktitle = {ECCV},
+  year      = {2020}
+}
+```
diff --git a/configs/detr/detr_r50_8x2_150e_coco.py b/configs/detr/detr_r50_8x2_150e_coco.py
new file mode 100755
index 0000000..892447d
--- /dev/null
+++ b/configs/detr/detr_r50_8x2_150e_coco.py
@@ -0,0 +1,150 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='DETR',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3, ),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    bbox_head=dict(
+        type='DETRHead',
+        num_classes=80,
+        in_channels=2048,
+        transformer=dict(
+            type='Transformer',
+            encoder=dict(
+                type='DetrTransformerEncoder',
+                num_layers=6,
+                transformerlayers=dict(
+                    type='BaseTransformerLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=256,
+                            num_heads=8,
+                            dropout=0.1)
+                    ],
+                    feedforward_channels=2048,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
+            decoder=dict(
+                type='DetrTransformerDecoder',
+                return_intermediate=True,
+                num_layers=6,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=dict(
+                        type='MultiheadAttention',
+                        embed_dims=256,
+                        num_heads=8,
+                        dropout=0.1),
+                    feedforward_channels=2048,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')),
+            )),
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True),
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            bg_cls_weight=0.1,
+            use_sigmoid=False,
+            loss_weight=1.0,
+            class_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            cls_cost=dict(type='ClassificationCost', weight=1.),
+            reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
+    test_cfg=dict(max_per_img=100))
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different
+# from the default setting in mmdet.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='AutoAugment',
+        policies=[[
+            dict(
+                type='Resize',
+                img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                           (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                           (736, 1333), (768, 1333), (800, 1333)],
+                multiscale_mode='value',
+                keep_ratio=True)
+        ],
+                  [
+                      dict(
+                          type='Resize',
+                          img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+                          multiscale_mode='value',
+                          keep_ratio=True),
+                      dict(
+                          type='RandomCrop',
+                          crop_type='absolute_range',
+                          crop_size=(384, 600),
+                          allow_negative_crop=True),
+                      dict(
+                          type='Resize',
+                          img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                     (576, 1333), (608, 1333), (640, 1333),
+                                     (672, 1333), (704, 1333), (736, 1333),
+                                     (768, 1333), (800, 1333)],
+                          multiscale_mode='value',
+                          override=True,
+                          keep_ratio=True)
+                  ]]),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=1),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+# test_pipeline, NOTE the Pad's size_divisor is different from the default
+# setting (size_divisor=32). However, there is little effect on performance
+# whether we use the default setting or size_divisor=1.
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=1),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img'])
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='AdamW',
+    lr=0.0001,
+    weight_decay=0.0001,
+    paramwise_cfg=dict(
+        custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}))
+optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
+# learning policy
+lr_config = dict(policy='step', step=[100])
+runner = dict(type='EpochBasedRunner', max_epochs=150)
diff --git a/configs/detr/metafile.yml b/configs/detr/metafile.yml
new file mode 100755
index 0000000..45622cf
--- /dev/null
+++ b/configs/detr/metafile.yml
@@ -0,0 +1,33 @@
+Collections:
+  - Name: DETR
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Multi Scale Train
+        - Gradient Clip
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+        - Transformer
+    Paper:
+      URL: https://arxiv.org/abs/2005.12872
+      Title: 'End-to-End Object Detection with Transformers'
+    README: configs/detr/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/detectors/detr.py#L7
+      Version: v2.7.0
+
+Models:
+  - Name: detr_r50_8x2_150e_coco
+    In Collection: DETR
+    Config: configs/detr/detr_r50_8x2_150e_coco.py
+    Metadata:
+      Training Memory (GB): 7.9
+      Epochs: 150
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/detr/detr_r50_8x2_150e_coco/detr_r50_8x2_150e_coco_20201130_194835-2c4b8974.pth
diff --git a/configs/double_heads/README.md b/configs/double_heads/README.md
new file mode 100755
index 0000000..4a149b5
--- /dev/null
+++ b/configs/double_heads/README.md
@@ -0,0 +1,32 @@
+# Double Heads
+
+> [Rethinking Classification and Localization for Object Detection](https://arxiv.org/abs/1904.06493)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Two head structures (i.e. fully connected head and convolution head) have been widely used in R-CNN based detectors for classification and localization tasks. However, there is a lack of understanding of how does these two head structures work for these two tasks. To address this issue, we perform a thorough analysis and find an interesting fact that the two head structures have opposite preferences towards the two tasks. Specifically, the fully connected head (fc-head) is more suitable for the classification task, while the convolution head (conv-head) is more suitable for the localization task. Furthermore, we examine the output feature maps of both heads and find that fc-head has more spatial sensitivity than conv-head. Thus, fc-head has more capability to distinguish a complete object from part of an object, but is not robust to regress the whole object. Based upon these findings, we propose a Double-Head method, which has a fully connected head focusing on classification and a convolution head for bounding box regression. Without bells and whistles, our method gains +3.5 and +2.8 AP on MS COCO dataset from Feature Pyramid Network (FPN) baselines with ResNet-50 and ResNet-101 backbones, respectively.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143879010-e30f654b-f93e-44b2-a186-c251fdca5bda.png"/>
+</div>
+
+## Results and Models
+
+| Backbone |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                         Config                                                         |                                                                                                                                                        Download                                                                                                                                                         |
+| :------: | :-----: | :-----: | :------: | :------------: | :----: | :--------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R-50-FPN | pytorch |   1x    |   6.8    |      9.5       |  40.0  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/double_heads/dh_faster_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/double_heads/dh_faster_rcnn_r50_fpn_1x_coco/dh_faster_rcnn_r50_fpn_1x_coco_20200130-586b67df.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/double_heads/dh_faster_rcnn_r50_fpn_1x_coco/dh_faster_rcnn_r50_fpn_1x_coco_20200130_220238.log.json) |
+
+## Citation
+
+```latex
+@article{wu2019rethinking,
+    title={Rethinking Classification and Localization for Object Detection},
+    author={Yue Wu and Yinpeng Chen and Lu Yuan and Zicheng Liu and Lijuan Wang and Hongzhi Li and Yun Fu},
+    year={2019},
+    eprint={1904.06493},
+    archivePrefix={arXiv},
+    primaryClass={cs.CV}
+}
+```
diff --git a/configs/double_heads/dh_faster_rcnn_r50_fpn_1x_coco.py b/configs/double_heads/dh_faster_rcnn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..9b8118b
--- /dev/null
+++ b/configs/double_heads/dh_faster_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,23 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        type='DoubleHeadRoIHead',
+        reg_roi_scale_factor=1.3,
+        bbox_head=dict(
+            _delete_=True,
+            type='DoubleConvFCBBoxHead',
+            num_convs=4,
+            num_fcs=2,
+            in_channels=256,
+            conv_out_channels=1024,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=2.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=2.0))))
diff --git a/configs/double_heads/metafile.yml b/configs/double_heads/metafile.yml
new file mode 100755
index 0000000..6fe9b7a
--- /dev/null
+++ b/configs/double_heads/metafile.yml
@@ -0,0 +1,41 @@
+Collections:
+  - Name: Rethinking Classification and Localization for Object Detection
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - RPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/pdf/1904.06493
+      Title: 'Rethinking Classification and Localization for Object Detection'
+    README: configs/double_heads/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/roi_heads/double_roi_head.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: dh_faster_rcnn_r50_fpn_1x_coco
+    In Collection: Rethinking Classification and Localization for Object Detection
+    Config: configs/double_heads/dh_faster_rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.8
+      inference time (ms/im):
+        - value: 105.26
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/double_heads/dh_faster_rcnn_r50_fpn_1x_coco/dh_faster_rcnn_r50_fpn_1x_coco_20200130-586b67df.pth
diff --git a/configs/dyhead/README.md b/configs/dyhead/README.md
new file mode 100755
index 0000000..8e6aed3
--- /dev/null
+++ b/configs/dyhead/README.md
@@ -0,0 +1,52 @@
+# DyHead
+
+> [Dynamic Head: Unifying Object Detection Heads with Attentions](https://arxiv.org/abs/2106.08322)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+The complex nature of combining localization and classification in object detection has resulted in the flourished development of methods. Previous works tried to improve the performance in various object detection heads but failed to present a unified view. In this paper, we present a novel dynamic head framework to unify object detection heads with attentions. By coherently combining multiple self-attention mechanisms between feature levels for scale-awareness, among spatial locations for spatial-awareness, and within output channels for task-awareness, the proposed approach significantly improves the representation ability of object detection heads without any computational overhead. Further experiments demonstrate that the effectiveness and efficiency of the proposed dynamic head on the COCO benchmark. With a standard ResNeXt-101-DCN backbone, we largely improve the performance over popular object detectors and achieve a new state-of-the-art at 54.0 AP. Furthermore, with latest transformer backbone and extra data, we can push current best COCO result to a new record at 60.6 AP.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/42844407/149169448-fcafb6d0-b866-41cc-9422-94de9f1e1761.png" height="300"/>
+</div>
+
+## Results and Models
+
+| Method | Backbone |  Style  |   Setting    | Lr schd | Mem (GB) | Inf time (fps) | box AP |                      Config                      |                                                                                                                                                                                      Download                                                                                                                                                                                      |
+| :----: | :------: | :-----: | :----------: | :-----: | :------: | :------------: | :----: | :----------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  ATSS  |   R-50   |  caffe  | reproduction |   1x    |   5.4    |      13.2      |  42.5  | [config](./atss_r50_caffe_fpn_dyhead_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_for_reproduction_1x_coco/atss_r50_fpn_dyhead_for_reproduction_4x4_1x_coco_20220107_213939-162888e6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_for_reproduction_1x_coco/atss_r50_fpn_dyhead_for_reproduction_4x4_1x_coco_20220107_213939.log.json) |
+|  ATSS  |   R-50   | pytorch |    simple    |   1x    |   4.9    |      13.7      |  43.3  |    [config](./atss_r50_fpn_dyhead_1x_coco.py)    |                               [model](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_4x4_1x_coco/atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314-eaa620c6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_4x4_1x_coco/atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314.log.json)                               |
+
+- We trained the above models with 4 GPUs and 4 `samples_per_gpu`.
+- The `reproduction` setting aims to reproduce the official implementation based on Detectron2.
+- The `simple` setting serves as a minimum example to use DyHead in MMDetection. Specifically,
+  - it adds `DyHead` to `neck` after `FPN`
+  - it sets `stacked_convs=0` to `bbox_head`
+- The `simple` setting achieves higher AP than the original implementation.
+  We have not conducted an ablation study between the two settings.
+  `dict(type='Pad', size_divisor=128)` may further improve AP by favoring spatial alignment across pyramid levels, although large padding reduces efficiency.
+
+We also trained the model with Swin-L backbone. Results are as below.
+
+| Method | Backbone | Style |   Setting    | Lr schd | mstrain  | box AP |                            Config                            |                                                                                                                                                                                    Download                                                                                                                                                                                    |
+| :----: | :------: | :---: | :----------: | :-----: | :------: | :----: | :----------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  ATSS  |  Swin-L  | caffe | reproduction |   2x    | 480~1200 |  56.2  | [config](./atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco_20220509_100315-bc5b6516.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco_20220509_100315.log.json) |
+
+## Relation to Other Methods
+
+- DyHead can be regarded as an improved [SEPC](https://arxiv.org/abs/2005.03101) with [DyReLU modules](https://arxiv.org/abs/2003.10027) and simplified [SE blocks](https://arxiv.org/abs/1709.01507).
+- Xiyang Dai et al., the author team of DyHead, adopt it for [Dynamic DETR](https://openaccess.thecvf.com/content/ICCV2021/html/Dai_Dynamic_DETR_End-to-End_Object_Detection_With_Dynamic_Attention_ICCV_2021_paper.html).
+  The description of Dynamic Encoder in Sec. 3.2 will help you understand DyHead.
+
+## Citation
+
+```latex
+@inproceedings{DyHead_CVPR2021,
+  author    = {Dai, Xiyang and Chen, Yinpeng and Xiao, Bin and Chen, Dongdong and Liu, Mengchen and Yuan, Lu and Zhang, Lei},
+  title     = {Dynamic Head: Unifying Object Detection Heads With Attentions},
+  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year      = {2021}
+}
+```
diff --git a/configs/dyhead/atss_r50_caffe_fpn_dyhead_1x_coco.py b/configs/dyhead/atss_r50_caffe_fpn_dyhead_1x_coco.py
new file mode 100755
index 0000000..223b653
--- /dev/null
+++ b/configs/dyhead/atss_r50_caffe_fpn_dyhead_1x_coco.py
@@ -0,0 +1,112 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='ATSS',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    neck=[
+        dict(
+            type='FPN',
+            in_channels=[256, 512, 1024, 2048],
+            out_channels=256,
+            start_level=1,
+            add_extra_convs='on_output',
+            num_outs=5),
+        dict(
+            type='DyHead',
+            in_channels=256,
+            out_channels=256,
+            num_blocks=6,
+            # disable zero_init_offset to follow official implementation
+            zero_init_offset=False)
+    ],
+    bbox_head=dict(
+        type='ATSSHead',
+        num_classes=80,
+        in_channels=256,
+        pred_kernel_size=1,  # follow DyHead official implementation
+        stacked_convs=0,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128],
+            center_offset=0.5),  # follow DyHead official implementation
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+
+# use caffe img_norm, size_divisor=128, pillow resize
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=(1333, 800),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=128),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True, backend='pillow'),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=128),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py b/configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py
new file mode 100755
index 0000000..8c5109d
--- /dev/null
+++ b/configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py
@@ -0,0 +1,65 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='ATSS',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=[
+        dict(
+            type='FPN',
+            in_channels=[256, 512, 1024, 2048],
+            out_channels=256,
+            start_level=1,
+            add_extra_convs='on_output',
+            num_outs=5),
+        dict(type='DyHead', in_channels=256, out_channels=256, num_blocks=6)
+    ],
+    bbox_head=dict(
+        type='ATSSHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=0,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/dyhead/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco.py b/configs/dyhead/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco.py
new file mode 100755
index 0000000..dc9c328
--- /dev/null
+++ b/configs/dyhead/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco.py
@@ -0,0 +1,164 @@
+_base_ = '../_base_/default_runtime.py'
+
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'  # noqa
+model = dict(
+    type='ATSS',
+    backbone=dict(
+        type='SwinTransformer',
+        pretrain_img_size=384,
+        embed_dims=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=12,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(1, 2, 3),
+        # Please only add indices that are used by the FPN,
+        # otherwise some parameters will not be used
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=[
+        dict(
+            type='FPN',
+            in_channels=[384, 768, 1536],
+            out_channels=256,
+            start_level=0,
+            add_extra_convs='on_output',
+            num_outs=5),
+        dict(
+            type='DyHead',
+            in_channels=256,
+            out_channels=256,
+            num_blocks=6,
+            # disable zero_init_offset to follow official implementation
+            zero_init_offset=False)
+    ],
+    bbox_head=dict(
+        type='ATSSHead',
+        num_classes=80,
+        in_channels=256,
+        pred_kernel_size=1,  # follow DyHead official implementation
+        stacked_convs=0,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128],
+            center_offset=0.5),  # follow DyHead official implementation
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(2000, 480), (2000, 1200)],
+        multiscale_mode='range',
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=128),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2000, 1200),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True, backend='pillow'),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=128),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+# Use RepeatDataset to speed up training: 12 epochs x 2 repeats = 2x schedule
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root + 'annotations/instances_train2017.json',
+            img_prefix=data_root + 'train2017/',
+            pipeline=train_pipeline)),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='bbox')
+
+# optimizer
+optimizer_config = dict(grad_clip=None)
+optimizer = dict(
+    type='AdamW',
+    lr=0.00005,
+    betas=(0.9, 0.999),
+    weight_decay=0.05,
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'relative_position_bias_table': dict(decay_mult=0.),
+            'norm': dict(decay_mult=0.)
+        }))
+
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[8, 11])
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/configs/dyhead/metafile.yml b/configs/dyhead/metafile.yml
new file mode 100755
index 0000000..3fb7370
--- /dev/null
+++ b/configs/dyhead/metafile.yml
@@ -0,0 +1,76 @@
+Collections:
+  - Name: DyHead
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 4x T4 GPUs
+      Architecture:
+        - ATSS
+        - DyHead
+        - FPN
+        - ResNet
+        - Deformable Convolution
+        - Pyramid Convolution
+    Paper:
+      URL: https://arxiv.org/abs/2106.08322
+      Title: 'Dynamic Head: Unifying Object Detection Heads with Attentions'
+    README: configs/dyhead/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/mmdet/models/necks/dyhead.py#L130
+      Version: v2.22.0
+
+Models:
+  - Name: atss_r50_caffe_fpn_dyhead_1x_coco
+    In Collection: DyHead
+    Config: configs/dyhead/atss_r50_caffe_fpn_dyhead_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.4
+      inference time (ms/im):
+        - value: 75.7
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_for_reproduction_1x_coco/atss_r50_fpn_dyhead_for_reproduction_4x4_1x_coco_20220107_213939-162888e6.pth
+
+  - Name: atss_r50_fpn_dyhead_1x_coco
+    In Collection: DyHead
+    Config: configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.9
+      inference time (ms/im):
+        - value: 73.1
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_4x4_1x_coco/atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314-eaa620c6.pth
+
+  - Name: atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco
+    In Collection: DyHead
+    Config: configs/dyhead/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco.py
+    Metadata:
+      Training Memory (GB): 58.4
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 56.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco_20220509_100315-bc5b6516.pth
diff --git a/configs/dynamic_rcnn/README.md b/configs/dynamic_rcnn/README.md
new file mode 100755
index 0000000..0045df7
--- /dev/null
+++ b/configs/dynamic_rcnn/README.md
@@ -0,0 +1,30 @@
+# Dynamic R-CNN
+
+> [Dynamic R-CNN: Towards High Quality Object Detection via Dynamic Training](https://arxiv.org/abs/2004.06002)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Although two-stage object detectors have continuously advanced the state-of-the-art performance in recent years, the training process itself is far from crystal. In this work, we first point out the inconsistency problem between the fixed network settings and the dynamic training procedure, which greatly affects the performance. For example, the fixed label assignment strategy and regression loss function cannot fit the distribution change of proposals and thus are harmful to training high quality detectors. Consequently, we propose Dynamic R-CNN to adjust the label assignment criteria (IoU threshold) and the shape of regression loss function (parameters of SmoothL1 Loss) automatically based on the statistics of proposals during training. This dynamic design makes better use of the training samples and pushes the detector to fit more high quality samples. Specifically, our method improves upon ResNet-50-FPN baseline with 1.9% AP and 5.5% AP90 on the MS COCO dataset with no extra overhead.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143879518-842f5bec-9f65-4454-93a1-9b3b0c42ec3c.png"/>
+</div>
+
+## Results and Models
+
+| Backbone |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                        Config                                                        |                                                                                                                                      Download                                                                                                                                      |
+| :------: | :-----: | :-----: | :------: | :------------: | :----: | :------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   | pytorch |   1x    |   3.8    |                |  38.9  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x/dynamic_rcnn_r50_fpn_1x-62a3f276.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x/dynamic_rcnn_r50_fpn_1x_20200618_095048.log.json) |
+
+## Citation
+
+```latex
+@article{DynamicRCNN,
+    author = {Hongkai Zhang and Hong Chang and Bingpeng Ma and Naiyan Wang and Xilin Chen},
+    title = {Dynamic {R-CNN}: Towards High Quality Object Detection via Dynamic Training},
+    journal = {arXiv preprint arXiv:2004.06002},
+    year = {2020}
+}
+```
diff --git a/configs/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x_coco.py b/configs/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..f2deb99
--- /dev/null
+++ b/configs/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,28 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        type='DynamicRoIHead',
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    train_cfg=dict(
+        rpn_proposal=dict(nms=dict(iou_threshold=0.85)),
+        rcnn=dict(
+            dynamic_rcnn=dict(
+                iou_topk=75,
+                beta_topk=10,
+                update_iter_interval=100,
+                initial_iou=0.4,
+                initial_beta=1.0))),
+    test_cfg=dict(rpn=dict(nms=dict(iou_threshold=0.85))))
diff --git a/configs/dynamic_rcnn/metafile.yml b/configs/dynamic_rcnn/metafile.yml
new file mode 100755
index 0000000..fec43db
--- /dev/null
+++ b/configs/dynamic_rcnn/metafile.yml
@@ -0,0 +1,35 @@
+Collections:
+  - Name: Dynamic R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Dynamic R-CNN
+        - FPN
+        - RPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/pdf/2004.06002
+      Title: 'Dynamic R-CNN: Towards High Quality Object Detection via Dynamic Training'
+    README: configs/dynamic_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/roi_heads/dynamic_roi_head.py#L11
+      Version: v2.2.0
+
+Models:
+  - Name: dynamic_rcnn_r50_fpn_1x_coco
+    In Collection: Dynamic R-CNN
+    Config: configs/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.8
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x/dynamic_rcnn_r50_fpn_1x-62a3f276.pth
diff --git a/configs/efficientnet/README.md b/configs/efficientnet/README.md
new file mode 100755
index 0000000..99b0572
--- /dev/null
+++ b/configs/efficientnet/README.md
@@ -0,0 +1,30 @@
+# EfficientNet
+
+> [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946v5)
+
+<!-- [BACKBONE] -->
+
+## Introduction
+
+Convolutional Neural Networks (ConvNets) are commonly developed at a fixed resource budget, and then scaled up for better accuracy if more resources are available. In this paper, we systematically study model scaling and identify that carefully balancing network depth, width, and resolution can lead to better performance. Based on this observation, we propose a new scaling method that uniformly scales all dimensions of depth/width/resolution using a simple yet highly effective compound coefficient. We demonstrate the effectiveness of this method on scaling up MobileNets and ResNet.
+
+To go even further, we use neural architecture search to design a new baseline network and scale it up to obtain a family of models, called EfficientNets, which achieve much better accuracy and efficiency than previous ConvNets. In particular, our EfficientNet-B7 achieves state-of-the-art 84.3% top-1 accuracy on ImageNet, while being 8.4x smaller and 6.1x faster on inference than the best existing ConvNet. Our EfficientNets also transfer well and achieve state-of-the-art accuracy on CIFAR-100 (91.7%), Flowers (98.8%), and 3 other transfer learning datasets, with an order of magnitude fewer parameters.
+
+## Results and Models
+
+### RetinaNet
+
+|    Backbone     |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                             Config                                                              |                                                                                                                                                                              Download                                                                                                                                                                              |
+| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| Efficientnet-b3 | pytorch |   1x    |    -     |       -        |  40.5  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco/retinanet_effb3_fpn_crop896_8x4_1x_coco_20220322_234806-615a0dda.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco/retinanet_effb3_fpn_crop896_8x4_1x_coco_20220322_234806.log.json) |
+
+## Citation
+
+```latex
+@article{tan2019efficientnet,
+  title={EfficientNet: Rethinking model scaling for convolutional neural networks},
+  author={Tan, Mingxing and Le, Quoc V},
+  journal={arXiv preprint arXiv:1905.11946},
+  year={2019}
+}
+```
diff --git a/configs/efficientnet/metafile.yml b/configs/efficientnet/metafile.yml
new file mode 100755
index 0000000..de40b95
--- /dev/null
+++ b/configs/efficientnet/metafile.yml
@@ -0,0 +1,19 @@
+Models:
+  - Name: retinanet_effb3_fpn_crop896_8x4_1x_coco
+    In Collection: RetinaNet
+    Config: configs/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco/retinanet_effb3_fpn_crop896_8x4_1x_coco_20220322_234806-615a0dda.pth
+    Paper:
+      URL: https://arxiv.org/abs/1905.11946v5
+      Title: 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks'
+    README: configs/efficientnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.23.0/mmdet/models/backbones/efficientnet.py#L159
+      Version: v2.23.0
diff --git a/configs/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py b/configs/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py
new file mode 100755
index 0000000..c90bc16
--- /dev/null
+++ b/configs/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco.py
@@ -0,0 +1,94 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+
+cudnn_benchmark = True
+norm_cfg = dict(type='BN', requires_grad=True)
+checkpoint = 'https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa_in1k_20220119-5b4887a0.pth'  # noqa
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='EfficientNet',
+        arch='b3',
+        drop_path_rate=0.2,
+        out_indices=(3, 4, 5),
+        frozen_stages=0,
+        norm_cfg=dict(
+            type='SyncBN', requires_grad=True, eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        init_cfg=dict(
+            type='Pretrained', prefix='backbone', checkpoint=checkpoint)),
+    neck=dict(
+        in_channels=[48, 136, 384],
+        start_level=0,
+        out_channels=256,
+        relu_before_extra_convs=True,
+        no_norm_on_lateral=True,
+        norm_cfg=norm_cfg),
+    bbox_head=dict(type='RetinaSepBNHead', num_ins=5, norm_cfg=norm_cfg),
+    # training and testing settings
+    train_cfg=dict(assigner=dict(neg_iou_thr=0.5)))
+
+# dataset settings
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+img_size = (896, 896)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=img_size,
+        ratio_range=(0.8, 1.2),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=img_size),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=img_size),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=img_size,
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size=img_size),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer_config = dict(grad_clip=None)
+optimizer = dict(
+    type='SGD',
+    lr=0.04,
+    momentum=0.9,
+    weight_decay=0.0001,
+    paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.1,
+    step=[8, 11])
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=12)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (4 samples per GPU)
+auto_scale_lr = dict(base_batch_size=32)
diff --git a/configs/empirical_attention/README.md b/configs/empirical_attention/README.md
new file mode 100755
index 0000000..fc2620a
--- /dev/null
+++ b/configs/empirical_attention/README.md
@@ -0,0 +1,33 @@
+# Empirical Attention
+
+> [An Empirical Study of Spatial Attention Mechanisms in Deep Networks](https://arxiv.org/abs/1904.05873)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Attention mechanisms have become a popular component in deep neural networks, yet there has been little examination of how different influencing factors and methods for computing attention from these factors affect performance. Toward a better general understanding of attention mechanisms, we present an empirical study that ablates various spatial attention elements within a generalized attention formulation, encompassing the dominant Transformer attention as well as the prevalent deformable convolution and dynamic convolution modules. Conducted on a variety of applications, the study yields significant findings about spatial attention in deep networks, some of which run counter to conventional understanding. For example, we find that the query and key content comparison in Transformer attention is negligible for self-attention, but vital for encoder-decoder attention. A proper combination of deformable convolution with key content only saliency achieves the best accuracy-efficiency tradeoff in self-attention. Our results suggest that there exists much room for improvement in the design of attention mechanisms.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143879619-f1817da9-1573-45c9-891d-cfe55ad54911.png"/>
+</div>
+
+## Results and Models
+
+| Backbone | Attention Component | DCN | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                                    Config                                                                     |                                                                                                                                                                                               Download                                                                                                                                                                                                |
+| :------: | :-----------------: | :-: | :-----: | :------: | :------------: | :----: | :-------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   |        1111         |  N  |   1x    |   8.0    |      13.8      |  40.0  |   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco.py)   |         [model](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco/faster_rcnn_r50_fpn_attention_1111_1x_coco_20200130-403cccba.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco/faster_rcnn_r50_fpn_attention_1111_1x_coco_20200130_210344.log.json)         |
+|   R-50   |        0010         |  N  |   1x    |   4.2    |      18.4      |  39.1  |   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco.py)   |         [model](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco/faster_rcnn_r50_fpn_attention_0010_1x_coco_20200130-7cb0c14d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco/faster_rcnn_r50_fpn_attention_0010_1x_coco_20200130_210125.log.json)         |
+|   R-50   |        1111         |  Y  |   1x    |   8.0    |      12.7      |  42.1  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco_20200130-8b2523a6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco_20200130_204442.log.json) |
+|   R-50   |        0010         |  Y  |   1x    |   4.2    |      17.1      |  42.0  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco_20200130-1a2e831d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco_20200130_210410.log.json) |
+
+## Citation
+
+```latex
+@article{zhu2019empirical,
+  title={An Empirical Study of Spatial Attention Mechanisms in Deep Networks},
+  author={Zhu, Xizhou and Cheng, Dazhi and Zhang, Zheng and Lin, Stephen and Dai, Jifeng},
+  journal={arXiv preprint arXiv:1904.05873},
+  year={2019}
+}
+```
diff --git a/configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco.py b/configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco.py
new file mode 100755
index 0000000..a544e3a
--- /dev/null
+++ b/configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco.py
@@ -0,0 +1,13 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'  # parent config; the keys below are presumably merged over it by the config loader
+model = dict(
+    backbone=dict(plugins=[  # only the backbone's plugin list is set in this file
+        dict(
+            cfg=dict(
+                type='GeneralizedAttention',
+                spatial_range=-1,  # NOTE(review): -1 looks like a "no limit" sentinel; confirm in GeneralizedAttention
+                num_heads=8,
+                attention_type='0010',  # variant listed as "0010" with DCN "N" in this directory's README table
+                kv_stride=2),
+            stages=(False, False, True, True),  # plugin enabled only where the flag is True: the last two of four entries
+            position='after_conv2')  # NOTE(review): presumably inserts the plugin after each block's second conv; confirm
+    ]))
diff --git a/configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco.py b/configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco.py
new file mode 100755
index 0000000..bbefd27
--- /dev/null
+++ b/configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'  # parent config; the keys below are presumably merged over it by the config loader
+model = dict(
+    backbone=dict(  # sets both the attention plugin and the DCN options on the backbone
+        plugins=[
+            dict(
+                cfg=dict(
+                    type='GeneralizedAttention',
+                    spatial_range=-1,  # NOTE(review): -1 looks like a "no limit" sentinel; confirm in GeneralizedAttention
+                    num_heads=8,
+                    attention_type='0010',  # variant listed as "0010" with DCN "Y" in this directory's README table
+                    kv_stride=2),
+                stages=(False, False, True, True),  # plugin enabled only where the flag is True: the last two of four entries
+                position='after_conv2')  # NOTE(review): presumably inserts the plugin after each block's second conv; confirm
+        ],
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),  # the only difference from the non-dcn "0010" config, together with the next line
+        stage_with_dcn=(False, True, True, True)))  # DCN flag is True for the last three of four entries
diff --git a/configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco.py b/configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco.py
new file mode 100755
index 0000000..13a4645
--- /dev/null
+++ b/configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco.py
@@ -0,0 +1,13 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'  # parent config; the keys below are presumably merged over it by the config loader
+model = dict(
+    backbone=dict(plugins=[  # only the backbone's plugin list is set in this file
+        dict(
+            cfg=dict(
+                type='GeneralizedAttention',
+                spatial_range=-1,  # NOTE(review): -1 looks like a "no limit" sentinel; confirm in GeneralizedAttention
+                num_heads=8,
+                attention_type='1111',  # the "1111" variant; metafile.yml in this directory records box AP 40.0 for this config
+                kv_stride=2),
+            stages=(False, False, True, True),  # plugin enabled only where the flag is True: the last two of four entries
+            position='after_conv2')  # NOTE(review): presumably inserts the plugin after each block's second conv; confirm
+    ]))
diff --git a/configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco.py b/configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco.py
new file mode 100755
index 0000000..b1f26c0
--- /dev/null
+++ b/configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'  # parent config; the keys below are presumably merged over it by the config loader
+model = dict(
+    backbone=dict(  # sets both the attention plugin and the DCN options on the backbone
+        plugins=[
+            dict(
+                cfg=dict(
+                    type='GeneralizedAttention',
+                    spatial_range=-1,  # NOTE(review): -1 looks like a "no limit" sentinel; confirm in GeneralizedAttention
+                    num_heads=8,
+                    attention_type='1111',  # variant listed as "1111" with DCN "Y" in this directory's README table
+                    kv_stride=2),
+                stages=(False, False, True, True),  # plugin enabled only where the flag is True: the last two of four entries
+                position='after_conv2')  # NOTE(review): presumably inserts the plugin after each block's second conv; confirm
+        ],
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),  # the only difference from the non-dcn "1111" config, together with the next line
+        stage_with_dcn=(False, True, True, True)))  # DCN flag is True for the last three of four entries
diff --git a/configs/empirical_attention/metafile.yml b/configs/empirical_attention/metafile.yml
new file mode 100755
index 0000000..923bcb2
--- /dev/null
+++ b/configs/empirical_attention/metafile.yml
@@ -0,0 +1,103 @@
+Collections:
+  - Name: Empirical Attention
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Deformable Convolution
+        - FPN
+        - RPN
+        - ResNet
+        - RoIAlign
+        - Spatial Attention
+    Paper:
+      URL: https://arxiv.org/pdf/1904.05873
+      Title: 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks'
+    README: configs/empirical_attention/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/generalized_attention.py#L10
+      Version: v2.0.0
+
+Models:
+  - Name: faster_rcnn_r50_fpn_attention_1111_1x_coco
+    In Collection: Empirical Attention
+    Config: configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.0
+      inference time (ms/im):
+        - value: 72.46
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco/faster_rcnn_r50_fpn_attention_1111_1x_coco_20200130-403cccba.pth
+
+  - Name: faster_rcnn_r50_fpn_attention_0010_1x_coco
+    In Collection: Empirical Attention
+    Config: configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.2
+      inference time (ms/im):
+        - value: 54.35
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco/faster_rcnn_r50_fpn_attention_0010_1x_coco_20200130-7cb0c14d.pth
+
+  - Name: faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco
+    In Collection: Empirical Attention
+    Config: configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.0
+      inference time (ms/im):
+        - value: 78.74
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco_20200130-8b2523a6.pth
+
+  - Name: faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco
+    In Collection: Empirical Attention
+    Config: configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.2
+      inference time (ms/im):
+        - value: 58.48
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco_20200130-1a2e831d.pth
diff --git a/configs/fast_rcnn/README.md b/configs/fast_rcnn/README.md
new file mode 100755
index 0000000..767f76c
--- /dev/null
+++ b/configs/fast_rcnn/README.md
@@ -0,0 +1,73 @@
+# Fast R-CNN
+
+> [Fast R-CNN](https://arxiv.org/abs/1504.08083)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+This paper proposes a Fast Region-based Convolutional Network method (Fast R-CNN) for object detection. Fast R-CNN builds on previous work to efficiently classify object proposals using deep convolutional networks. Compared to previous work, Fast R-CNN employs several innovations to improve training and testing speed while also increasing detection accuracy. Fast R-CNN trains the very deep VGG16 network 9x faster than R-CNN, is 213x faster at test-time, and achieves a higher mAP on PASCAL VOC 2012. Compared to SPPnet, Fast R-CNN trains VGG16 3x faster, tests 10x faster, and is more accurate.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143882189-6258c05c-f2a1-4320-9282-7e2f2d502eb2.png"/>
+</div>
+
+## Introduction
+
+Before training the Fast R-CNN, users should first train an [RPN](../rpn/README.md), and use the RPN to extract the region proposals.
+
+- First, extract the region proposals of the val set with the command below:
+
+```bash
+./tools/dist_test.sh \
+    configs/rpn/rpn_r50_fpn_1x_coco.py \
+    checkpoints/rpn_r50_fpn_1x_coco_20200218-5525fa2e.pth \
+    8 \
+    --out proposals/rpn_r50_fpn_1x_val2017.pkl
+```
+
+- Then, change the `ann_file` and `img_prefix` of `data.test` in the RPN config to the train set as shown below:
+
+```python
+data = dict(
+    test=dict(
+        ann_file='data/coco/annotations/instances_train2017.json',
+        img_prefix='data/coco/train2017/'))
+```
+
+- Next, extract the region proposals of the train set with the command below:
+
+```bash
+./tools/dist_test.sh \
+    configs/rpn/rpn_r50_fpn_1x_coco.py \
+    checkpoints/rpn_r50_fpn_1x_coco_20200218-5525fa2e.pth \
+    8 \
+    --out proposals/rpn_r50_fpn_1x_train2017.pkl
+```
+
+- Modify the path of `proposal_file` in the Fast R-CNN config as shown below:
+
+```python
+data = dict(
+    train=dict(
+        proposal_file='proposals/rpn_r50_fpn_1x_train2017.pkl'),
+    val=dict(
+        proposal_file='proposals/rpn_r50_fpn_1x_val2017.pkl'),
+    test=dict(
+        proposal_file='proposals/rpn_r50_fpn_1x_val2017.pkl'))
+```
+
+Finally, users can start training the Fast R-CNN.
+
+## Results and Models
+
+## Citation
+
+```latex
+@inproceedings{girshick2015fast,
+  title={Fast r-cnn},
+  author={Girshick, Ross},
+  booktitle={Proceedings of the IEEE international conference on computer vision},
+  year={2015}
+}
+```
diff --git a/configs/fast_rcnn/fast_rcnn_r101_caffe_fpn_1x_coco.py b/configs/fast_rcnn/fast_rcnn_r101_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..3ab8e98
--- /dev/null
+++ b/configs/fast_rcnn/fast_rcnn_r101_caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './fast_rcnn_r50_caffe_fpn_1x_coco.py'  # parent config: the R-50 caffe-style Fast R-CNN config in this directory
+model = dict(
+    backbone=dict(  # only the backbone depth and its initial checkpoint are set here
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))  # caffe-style ResNet-101 weights, going by the checkpoint name
diff --git a/configs/fast_rcnn/fast_rcnn_r101_fpn_1x_coco.py b/configs/fast_rcnn/fast_rcnn_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..83852b2
--- /dev/null
+++ b/configs/fast_rcnn/fast_rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './fast_rcnn_r50_fpn_1x_coco.py'  # parent config: the R-50 1x Fast R-CNN config in this directory
+model = dict(
+    backbone=dict(  # only the backbone depth and its initial checkpoint are set here
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))  # ResNet-101 weights, going by the checkpoint name
diff --git a/configs/fast_rcnn/fast_rcnn_r101_fpn_2x_coco.py b/configs/fast_rcnn/fast_rcnn_r101_fpn_2x_coco.py
new file mode 100755
index 0000000..c220885
--- /dev/null
+++ b/configs/fast_rcnn/fast_rcnn_r101_fpn_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './fast_rcnn_r50_fpn_2x_coco.py'  # parent config: the R-50 2x Fast R-CNN config in this directory
+model = dict(
+    backbone=dict(  # only the backbone depth and its initial checkpoint are set here
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))  # ResNet-101 weights, going by the checkpoint name
diff --git a/configs/fast_rcnn/fast_rcnn_r50_caffe_fpn_1x_coco.py b/configs/fast_rcnn/fast_rcnn_r50_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..f1b29ef
--- /dev/null
+++ b/configs/fast_rcnn/fast_rcnn_r50_caffe_fpn_1x_coco.py
@@ -0,0 +1,48 @@
+_base_ = './fast_rcnn_r50_fpn_1x_coco.py'  # parent config: the pytorch-style R-50 1x Fast R-CNN config in this directory
+
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='BN', requires_grad=False),  # NOTE(review): presumably keeps the BN parameters out of training; confirm
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))  # caffe-style ResNet-50 weights, going by the checkpoint name
+
+# use caffe img_norm: the parent's three mean values in reverse order, unit std, to_rgb disabled
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [  # same steps as the parent's train_pipeline; repeated so Normalize receives the img_norm_cfg above
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadProposals', num_max_proposals=2000),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [  # same steps as the parent's test_pipeline; repeated so Normalize receives the img_norm_cfg above
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadProposals', num_max_proposals=None),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # a single scale is listed
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='ToTensor', keys=['proposals']),
+            dict(
+                type='ToDataContainer',
+                fields=[dict(key='proposals', stack=False)]),
+            dict(type='Collect', keys=['img', 'proposals']),
+        ])
+]
+data = dict(  # only the pipelines are set here; proposal files and loader sizes are not touched in this file
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),  # val and test share the same pipeline
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/fast_rcnn/fast_rcnn_r50_fpn_1x_coco.py b/configs/fast_rcnn/fast_rcnn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..d2f080e
--- /dev/null
+++ b/configs/fast_rcnn/fast_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,52 @@
+_base_ = [  # four parent configs: model, dataset, schedule and runtime, going by their paths
+    '../_base_/models/fast_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+dataset_type = 'CocoDataset'  # not referenced again in this file
+data_root = 'data/coco/'  # prefix for the proposal_file paths in `data` below
+img_norm_cfg = dict(  # passed to both Normalize steps via **img_norm_cfg
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [  # training steps; includes proposal loading, which the README says must come from a separately trained RPN
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadProposals', num_max_proposals=2000),  # NOTE(review): presumably caps proposals per image at 2000; confirm in LoadProposals
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),  # proposals are collected next to the image and ground truth
+]
+test_pipeline = [  # used for both val and test in `data` below
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadProposals', num_max_proposals=None),  # no proposal cap is configured here, unlike the 2000 used for training
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # a single scale is listed
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='ToTensor', keys=['proposals']),
+            dict(
+                type='ToDataContainer',
+                fields=[dict(key='proposals', stack=False)]),
+            dict(type='Collect', keys=['img', 'proposals']),  # no ground-truth keys are collected at test time
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_train2017.pkl',  # NOTE(review): the README's --out example writes to 'proposals/...' without data_root; make sure the file ends up here
+        pipeline=train_pipeline),
+    val=dict(
+        proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl',
+        pipeline=test_pipeline),
+    test=dict(
+        proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl',  # same val2017 proposal file as `val`
+        pipeline=test_pipeline))
diff --git a/configs/fast_rcnn/fast_rcnn_r50_fpn_2x_coco.py b/configs/fast_rcnn/fast_rcnn_r50_fpn_2x_coco.py
new file mode 100755
index 0000000..228e856
--- /dev/null
+++ b/configs/fast_rcnn/fast_rcnn_r50_fpn_2x_coco.py
@@ -0,0 +1,5 @@
+_base_ = './fast_rcnn_r50_fpn_1x_coco.py'  # parent config: the 1x Fast R-CNN config in this directory
+
+# learning policy: only the schedule is changed relative to the parent config
+lr_config = dict(step=[16, 22])  # both step values fall inside the 24-epoch run set on the next line
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/faster_rcnn/README.md b/configs/faster_rcnn/README.md
new file mode 100755
index 0000000..47c8ec7
--- /dev/null
+++ b/configs/faster_rcnn/README.md
@@ -0,0 +1,88 @@
+# Faster R-CNN
+
+> [Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks](https://arxiv.org/abs/1506.01497)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. Advances like SPPnet and Fast R-CNN have reduced the running time of these detection networks, exposing region proposal computation as a bottleneck. In this work, we introduce a Region Proposal Network (RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals. An RPN is a fully convolutional network that simultaneously predicts object bounds and objectness scores at each position. The RPN is trained end-to-end to generate high-quality region proposals, which are used by Fast R-CNN for detection. We further merge RPN and Fast R-CNN into a single network by sharing their convolutional features---using the recently popular terminology of neural networks with 'attention' mechanisms, the RPN component tells the unified network where to look. For the very deep VGG-16 model, our detection system has a frame rate of 5fps (including all steps) on a GPU, while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007, 2012, and MS COCO datasets with only 300 proposals per image. In ILSVRC and COCO 2015 competitions, Faster R-CNN and RPN are the foundations of the 1st-place winning entries in several tracks.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143881188-ab87720f-5059-4b4e-a928-b540fb8fb84d.png" height="300"/>
+</div>
+
+## Results and Models
+
+|    Backbone     |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                          Config                                                           |                                                                                                                                                                          Download                                                                                                                                                                           |
+| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|     R-50-C4     |  caffe  |   1x    |    -     |       -        |  35.6  |  [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco.py)  |            [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco/faster_rcnn_r50_caffe_c4_1x_coco_20220316_150152-3f885b85.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco/faster_rcnn_r50_caffe_c4_1x_coco_20220316_150152.log.json)             |
+|    R-50-DC5     |  caffe  |   1x    |    -     |       -        |  37.2  | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco.py)  |          [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco/faster_rcnn_r50_caffe_dc5_1x_coco_20201030_151909-531f0f43.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco/faster_rcnn_r50_caffe_dc5_1x_coco_20201030_151909.log.json)           |
+|    R-50-FPN     |  caffe  |   1x    |   3.8    |                |  37.8  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco/faster_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.378_20200504_180032-c5925ee5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco/faster_rcnn_r50_caffe_fpn_1x_coco_20200504_180032.log.json)   |
+|    R-50-FPN     | pytorch |   1x    |   4.0    |      21.4      |  37.4  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py)     |                          [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130_204655.log.json)                          |
+| R-50-FPN (FP16) | pytorch |   1x    |   3.4    |      28.8      |  37.5  |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_fp16_1x_coco.py)  |                       [model](https://download.openmmlab.com/mmdetection/v2.0/fp16/faster_rcnn_r50_fpn_fp16_1x_coco/faster_rcnn_r50_fpn_fp16_1x_coco_20200204-d4dc1471.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fp16/faster_rcnn_r50_fpn_fp16_1x_coco/faster_rcnn_r50_fpn_fp16_1x_coco_20200204_143530.log.json)                       |
+|    R-50-FPN     | pytorch |   2x    |    -     |       -        |  38.4  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_2x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_20200504_210434.log.json)               |
+|    R-101-FPN    |  caffe  |   1x    |   5.7    |                |  39.8  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco/faster_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.398_20200504_180057-b269e9dd.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco/faster_rcnn_r101_caffe_fpn_1x_coco_20200504_180057.log.json) |
+|    R-101-FPN    | pytorch |   1x    |   6.0    |      15.6      |  39.4  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r101_fpn_1x_coco.py)    |                        [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_1x_coco/faster_rcnn_r101_fpn_1x_coco_20200130-f513f705.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_1x_coco/faster_rcnn_r101_fpn_1x_coco_20200130_204655.log.json)                        |
+|    R-101-FPN    | pytorch |   2x    |    -     |       -        |  39.8  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r101_fpn_2x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_20200504_210455.log.json)             |
+| X-101-32x4d-FPN | pytorch |   1x    |   7.2    |      13.8      |  41.2  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco.py) |            [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco/faster_rcnn_x101_32x4d_fpn_1x_coco_20200203-cff10310.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco/faster_rcnn_x101_32x4d_fpn_1x_coco_20200203_000520.log.json)            |
+| X-101-32x4d-FPN | pytorch |   2x    |    -     |       -        |  41.2  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco/faster_rcnn_x101_32x4d_fpn_2x_coco_bbox_mAP-0.412_20200506_041400-64a12c0b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco/faster_rcnn_x101_32x4d_fpn_2x_coco_20200506_041400.log.json) |
+| X-101-64x4d-FPN | pytorch |   1x    |   10.3   |      9.4       |  42.1  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco.py) |            [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco/faster_rcnn_x101_64x4d_fpn_1x_coco_20200204-833ee192.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco/faster_rcnn_x101_64x4d_fpn_1x_coco_20200204_134340.log.json)            |
+| X-101-64x4d-FPN | pytorch |   2x    |    -     |       -        |  41.6  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco.py) |        [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco/faster_rcnn_x101_64x4d_fpn_2x_coco_20200512_161033-5961fa95.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco/faster_rcnn_x101_64x4d_fpn_2x_coco_20200512_161033.log.json)         |
+
+## Different regression loss
+
+We trained with an R-50-FPN PyTorch-style backbone on the 1x schedule.
+
+| Backbone |   Loss type    | Mem (GB) | Inf time (fps) | box AP |                                                             Config                                                             |                                                                                                                                                             Download                                                                                                                                                             |
+| :------: | :------------: | :------: | :------------: | :----: | :----------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R-50-FPN |     L1Loss     |   4.0    |      21.4      |  37.4  |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py)       |            [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130_204655.log.json)             |
+| R-50-FPN |    IoULoss     |          |                |  37.9  |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco.py)     | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco/faster_rcnn_r50_fpn_iou_1x_coco_20200506_095954-938e81f0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco/faster_rcnn_r50_fpn_iou_1x_coco_20200506_095954.log.json) |
+| R-50-FPN |    GIoULoss    |          |                |  37.6  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_giou_1x_coco.py)     |            [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_giou_1x_coco-0eada910.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_giou_1x_coco_20200505_161120.log.json)            |
+| R-50-FPN | BoundedIoULoss |          |                |  37.4  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_bounded_iou_1x_coco.py) |     [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_bounded_iou_1x_coco-98ad993b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_bounded_iou_1x_coco_20200505_160738.log.json)     |
+
+## Pre-trained Models
+
+We also trained some models with longer schedules and multi-scale training. Users can fine-tune them for downstream tasks.
+
+|                              Backbone                              |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                              Config                                                               |                                                                                                                                                                                        Download                                                                                                                                                                                         |
+| :----------------------------------------------------------------: | :-----: | :-----: | :------: | :------------: | :----: | :-------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|      [R-50-C4](./faster_rcnn_r50_caffe_c4_mstrain_1x_coco.py)      |  caffe  |   1x    |    -     |                |  35.9  |  [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco.py)  |          [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco/faster_rcnn_r50_caffe_c4_mstrain_1x_coco_20220316_150527-db276fed.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco/faster_rcnn_r50_caffe_c4_mstrain_1x_coco_20220316_150527.log.json)           |
+|     [R-50-DC5](./faster_rcnn_r50_caffe_dc5_mstrain_1x_coco.py)     |  caffe  |   1x    |    -     |                |  37.4  | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco.py)  |        [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco_20201028_233851-b33d21b9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco_20201028_233851.log.json)         |
+|     [R-50-DC5](./faster_rcnn_r50_caffe_dc5_mstrain_3x_coco.py)     |  caffe  |   3x    |    -     |                |  38.7  | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco.py)  |        [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco_20201028_002107-34a53b2c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco_20201028_002107.log.json)         |
+|     [R-50-FPN](./faster_rcnn_r50_caffe_fpn_mstrain_2x_coco.py)     |  caffe  |   2x    |   3.7    |                |  39.7  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco_bbox_mAP-0.397_20200504_231813-10b2de58.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco_20200504_231813.log.json) |
+|     [R-50-FPN](./faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py)     |  caffe  |   3x    |   3.7    |                |  39.9  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py)  |        [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210526_095054-1f77628b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210526_095054.log.json)         |
+|        [R-50-FPN](./faster_rcnn_r50_fpn_mstrain_3x_coco.py)        | pytorch |   3x    |   3.9    |                |  40.3  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco.py)     |                    [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco/faster_rcnn_r50_fpn_mstrain_3x_coco_20210524_110822-e10bd31c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco/faster_rcnn_r50_fpn_mstrain_3x_coco_20210524_110822.log.json)                     |
+|    [R-101-FPN](./faster_rcnn_r101_caffe_fpn_mstrain_3x_coco.py)    |  caffe  |   3x    |   5.6    |                |  42.0  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco.py) |      [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210526_095742-a7ae426d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210526_095742.log.json)       |
+|       [R-101-FPN](./faster_rcnn_r101_fpn_mstrain_3x_coco.py)       | pytorch |   3x    |   5.8    |                |  41.8  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco.py)    |                  [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco/faster_rcnn_r101_fpn_mstrain_3x_coco_20210524_110822-4d4d2ca8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco/faster_rcnn_r101_fpn_mstrain_3x_coco_20210524_110822.log.json)                   |
+| [X-101-32x4d-FPN](./faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py) | pytorch |   3x    |   7.0    |                |  42.5  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py) |      [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210524_124151-16b9b260.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210524_124151.log.json)       |
+| [X-101-32x8d-FPN](./faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py) | pytorch |   3x    |   10.1   |                |  42.4  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py) |      [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210604_182954-002e082a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210604_182954.log.json)       |
+| [X-101-64x4d-FPN](./faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py) | pytorch |   3x    |   10.0   |                |  43.1  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py) |      [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210524_124528-26c63de6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210524_124528.log.json)       |
+
+We further finetune some pre-trained models on COCO subsets, which contain only a few of the 80 categories.
+
+| Backbone                                                                      | Style | Class name         | Pre-traind model                                                    | Mem (GB) | box AP | Config                                                                      | Download                                                                                                                                                                                                                                                                                                                                                                                     |
+| ----------------------------------------------------------------------------- | ----- | ------------------ | ------------------------------------------------------------------- | -------- | ------ | --------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [R-50-FPN](./faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py)             | caffe | person             | [R-50-FPN-Caffe-3x](./faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py) | 3.7      | 55.8   | [config](./faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py)             | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929.log.json)                                                 |
+| [R-50-FPN](./faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person-bicycle-car.py) | caffe | person-bicycle-car | [R-50-FPN-Caffe-3x](./faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py) | 3.7      | 44.1   | [config](./faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person-bicycle-car.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car_20201216_173117-6eda6d92.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car_20201216_173117.log.json) |
+
+## Torchvision New Recipe (TNR)
+
+Torchvision released its high-precision ResNet models. The training details can be found on the [PyTorch website](https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/). Here, we have done grid searches on learning rate and weight decay and found the optimal hyper-parameters on the detection task.
+
+|                         Backbone                          |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                             Config                                                              |                                                                                                                                                                               Download                                                                                                                                                                               |
+| :-------------------------------------------------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [R-50-TNR](./faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py) | pytorch |   1x    |    -     |                |  40.2  | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco_20220320_085147-efedfda4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco_20220320_085147.log.json) |
+
+## Citation
+
+```latex
+@article{Ren_2017,
+   title={Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks},
+   journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
+   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
+   author={Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian},
+   year={2017},
+   month={Jun},
+}
+```
diff --git a/configs/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..c6f078c
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './faster_rcnn_r50_caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/configs/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco.py b/configs/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..6a13fe9
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco.py
@@ -0,0 +1,49 @@
+_base_ = 'faster_rcnn_r50_fpn_mstrain_3x_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
+
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+data = dict(
+    train=dict(dataset=dict(pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/faster_rcnn/faster_rcnn_r101_fpn_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..1de53a6
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/faster_rcnn/faster_rcnn_r101_fpn_2x_coco.py b/configs/faster_rcnn/faster_rcnn_r101_fpn_2x_coco.py
new file mode 100755
index 0000000..0d41599
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r101_fpn_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './faster_rcnn_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco.py b/configs/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..0b498bb
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = 'faster_rcnn_r50_fpn_mstrain_3x_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco.py
new file mode 100755
index 0000000..b071962
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco.py
@@ -0,0 +1,39 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_caffe_c4.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco.py
new file mode 100755
index 0000000..f4d83e6
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco.py
@@ -0,0 +1,38 @@
+_base_ = './faster_rcnn_r50_caffe_c4_1x_coco.py'
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco.py
new file mode 100755
index 0000000..ee2010c
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco.py
@@ -0,0 +1,37 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_caffe_dc5.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco.py
new file mode 100755
index 0000000..14eaef2
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco.py
@@ -0,0 +1,42 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_caffe_dc5.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco.py
new file mode 100755
index 0000000..403747f
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './faster_rcnn_r50_caffe_dc5_mstrain_1x_coco.py'
+# learning policy
+lr_config = dict(step=[28, 34])
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..56c01bd
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco.py
@@ -0,0 +1,41 @@
+_base_ = './faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_90k_coco.py b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_90k_coco.py
new file mode 100755
index 0000000..b5aea6a
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_90k_coco.py
@@ -0,0 +1,15 @@
+_base_ = 'faster_rcnn_r50_caffe_fpn_1x_coco.py'
+
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[60000, 80000])
+
+# Runner type
+runner = dict(_delete_=True, type='IterBasedRunner', max_iters=90000)
+
+checkpoint_config = dict(interval=10000)
+evaluation = dict(interval=10000, metric='bbox')
diff --git a/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person-bicycle-car.py b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person-bicycle-car.py
new file mode 100755
index 0000000..4f1f376
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person-bicycle-car.py
@@ -0,0 +1,9 @@
+_base_ = './faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py'
+model = dict(roi_head=dict(bbox_head=dict(num_classes=3)))
+classes = ('person', 'bicycle', 'car')
+data = dict(
+    train=dict(classes=classes),
+    val=dict(classes=classes),
+    test=dict(classes=classes))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_bbox_mAP-0.398_20200504_163323-30042637.pth'  # noqa
diff --git a/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py
new file mode 100755
index 0000000..b5dfb4f
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py
@@ -0,0 +1,9 @@
+_base_ = './faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py'
+model = dict(roi_head=dict(bbox_head=dict(num_classes=1)))
+classes = ('person', )
+data = dict(
+    train=dict(classes=classes),
+    val=dict(classes=classes),
+    test=dict(classes=classes))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_bbox_mAP-0.398_20200504_163323-30042637.pth'  # noqa
diff --git a/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py
new file mode 100755
index 0000000..f807a19
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py
@@ -0,0 +1,46 @@
+_base_ = './faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco.py
new file mode 100755
index 0000000..df58973
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py'
+# learning policy
+lr_config = dict(step=[16, 23])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..9eeaace
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py
@@ -0,0 +1,47 @@
+_base_ = 'faster_rcnn_r50_fpn_mstrain_3x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+data = dict(
+    train=dict(dataset=dict(pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_90k_coco.py b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_90k_coco.py
new file mode 100755
index 0000000..74dca24
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_90k_coco.py
@@ -0,0 +1,15 @@
+_base_ = 'faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py'
+
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[60000, 80000])
+
+# Runner type
+runner = dict(_delete_=True, type='IterBasedRunner', max_iters=90000)
+
+checkpoint_config = dict(interval=10000)
+evaluation = dict(interval=10000, metric='bbox')
diff --git a/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..009bd93
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
diff --git a/configs/faster_rcnn/faster_rcnn_r50_fpn_2x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_fpn_2x_coco.py
new file mode 100755
index 0000000..e77a7fa
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_fpn_2x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
diff --git a/configs/faster_rcnn/faster_rcnn_r50_fpn_bounded_iou_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_fpn_bounded_iou_1x_coco.py
new file mode 100755
index 0000000..648081f
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_fpn_bounded_iou_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(
+            reg_decoded_bbox=True,
+            loss_bbox=dict(type='BoundedIoULoss', loss_weight=10.0))))
diff --git a/configs/faster_rcnn/faster_rcnn_r50_fpn_ciou_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_fpn_ciou_1x_coco.py
new file mode 100755
index 0000000..886d566
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_fpn_ciou_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(
+            reg_decoded_bbox=True,
+            loss_bbox=dict(type='CIoULoss', loss_weight=12.0))))
diff --git a/configs/faster_rcnn/faster_rcnn_r50_fpn_fp16_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_fpn_fp16_1x_coco.py
new file mode 100755
index 0000000..acd4040
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_fpn_fp16_1x_coco.py
@@ -0,0 +1,3 @@
+_base_ = './faster_rcnn_r50_fpn_1x_coco.py'
+# fp16 settings
+fp16 = dict(loss_scale=512.)
diff --git a/configs/faster_rcnn/faster_rcnn_r50_fpn_giou_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_fpn_giou_1x_coco.py
new file mode 100755
index 0000000..5556c49
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_fpn_giou_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(
+            reg_decoded_bbox=True,
+            loss_bbox=dict(type='GIoULoss', loss_weight=10.0))))
diff --git a/configs/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco.py
new file mode 100755
index 0000000..ddf663e
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(
+            reg_decoded_bbox=True,
+            loss_bbox=dict(type='IoULoss', loss_weight=10.0))))
diff --git a/configs/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..faf8f92
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco.py
@@ -0,0 +1,3 @@
+_base_ = [
+    '../common/mstrain_3x_coco.py', '../_base_/models/faster_rcnn_r50_fpn.py'
+]
diff --git a/configs/faster_rcnn/faster_rcnn_r50_fpn_ohem_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_fpn_ohem_1x_coco.py
new file mode 100755
index 0000000..f897e7c
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_fpn_ohem_1x_coco.py
@@ -0,0 +1,2 @@
+_base_ = './faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(train_cfg=dict(rcnn=dict(sampler=dict(type='OHEMSampler'))))
diff --git a/configs/faster_rcnn/faster_rcnn_r50_fpn_soft_nms_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_fpn_soft_nms_1x_coco.py
new file mode 100755
index 0000000..759ae3a
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_fpn_soft_nms_1x_coco.py
@@ -0,0 +1,12 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='soft_nms', iou_threshold=0.5),
+            max_per_img=100)))
diff --git a/configs/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py
new file mode 100755
index 0000000..ecbfb92
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py
@@ -0,0 +1,17 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+checkpoint = 'https://download.pytorch.org/models/resnet50-11ad3fa6.pth'
+model = dict(
+    backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=checkpoint)))
+
+# `lr` and `weight_decay` were selected via hyperparameter search.
+optimizer = dict(
+    _delete_=True,
+    type='AdamW',
+    lr=0.0001,
+    weight_decay=0.1,
+    paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True))
diff --git a/configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco.py b/configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..3808c9f
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco.py b/configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco.py
new file mode 100755
index 0000000..e93f5d8
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './faster_rcnn_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py b/configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..f55985d
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py
@@ -0,0 +1,16 @@
+_base_ = [
+    '../common/mstrain_3x_coco.py', '../_base_/models/faster_rcnn_r50_fpn.py'
+]
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py b/configs/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..a5d5aeb
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py
@@ -0,0 +1,62 @@
+_base_ = [
+    '../common/mstrain_3x_coco.py', '../_base_/models/faster_rcnn_r50_fpn.py'
+]
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=8,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnext101_32x8d')))
+
+# The ResNeXt-101-32x8d weights were trained with Caffe2 at FB,
+# so the normalization mean and std need to be changed to match.
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675],
+    std=[57.375, 57.120, 58.395],
+    to_rgb=False)
+
+# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)],
+# multiscale_mode='range'
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+# Use RepeatDataset to speed up training
+data = dict(
+    train=dict(dataset=dict(pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco.py b/configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..8bf2b65
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco.py b/configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco.py
new file mode 100755
index 0000000..7ea9b2d
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './faster_rcnn_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py b/configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..80397f4
--- /dev/null
+++ b/configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py
@@ -0,0 +1,16 @@
+_base_ = [
+    '../common/mstrain_3x_coco.py', '../_base_/models/faster_rcnn_r50_fpn.py'
+]
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/faster_rcnn/metafile.yml b/configs/faster_rcnn/metafile.yml
new file mode 100755
index 0000000..3011b15
--- /dev/null
+++ b/configs/faster_rcnn/metafile.yml
@@ -0,0 +1,452 @@
+Collections:
+  - Name: Faster R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - RPN
+        - ResNet
+        - RoIPool
+    Paper:
+      URL: https://arxiv.org/abs/1506.01497
+      Title: "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"
+    README: configs/faster_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/faster_rcnn.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: faster_rcnn_r50_caffe_c4_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 35.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco/faster_rcnn_r50_caffe_c4_1x_coco_20220316_150152-3f885b85.pth
+
+  - Name: faster_rcnn_r50_caffe_c4_mstrain_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 35.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco/faster_rcnn_r50_caffe_c4_mstrain_1x_coco_20220316_150527-db276fed.pth
+
+  - Name: faster_rcnn_r50_caffe_dc5_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco/faster_rcnn_r50_caffe_dc5_1x_coco_20201030_151909-531f0f43.pth
+
+  - Name: faster_rcnn_r50_caffe_fpn_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.8
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco/faster_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.378_20200504_180032-c5925ee5.pth
+
+  - Name: faster_rcnn_r50_fpn_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.0
+      inference time (ms/im):
+        - value: 46.73
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth
+
+  - Name: faster_rcnn_r50_fpn_fp16_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_fpn_fp16_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.4
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+        - Mixed Precision Training
+      inference time (ms/im):
+        - value: 34.72
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP16
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/faster_rcnn_r50_fpn_fp16_1x_coco/faster_rcnn_r50_fpn_fp16_1x_coco_20200204-d4dc1471.pth
+
+  - Name: faster_rcnn_r50_fpn_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 4.0
+      inference time (ms/im):
+        - value: 46.73
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth
+
+  - Name: faster_rcnn_r101_caffe_fpn_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.7
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco/faster_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.398_20200504_180057-b269e9dd.pth
+
+  - Name: faster_rcnn_r101_fpn_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 64.1
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_1x_coco/faster_rcnn_r101_fpn_1x_coco_20200130-f513f705.pth
+
+  - Name: faster_rcnn_r101_fpn_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r101_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 64.1
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth
+
+  - Name: faster_rcnn_x101_32x4d_fpn_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.2
+      inference time (ms/im):
+        - value: 72.46
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco/faster_rcnn_x101_32x4d_fpn_1x_coco_20200203-cff10310.pth
+
+  - Name: faster_rcnn_x101_32x4d_fpn_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.2
+      inference time (ms/im):
+        - value: 72.46
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco/faster_rcnn_x101_32x4d_fpn_2x_coco_bbox_mAP-0.412_20200506_041400-64a12c0b.pth
+
+  - Name: faster_rcnn_x101_64x4d_fpn_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.3
+      inference time (ms/im):
+        - value: 106.38
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco/faster_rcnn_x101_64x4d_fpn_1x_coco_20200204-833ee192.pth
+
+  - Name: faster_rcnn_x101_64x4d_fpn_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.3
+      inference time (ms/im):
+        - value: 106.38
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco/faster_rcnn_x101_64x4d_fpn_2x_coco_20200512_161033-5961fa95.pth
+
+  - Name: faster_rcnn_r50_fpn_iou_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.9
+    # re-release
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco/faster_rcnn_r50_fpn_iou_1x_coco_20200506_095954-938e81f0.pth
+
+  - Name: faster_rcnn_r50_fpn_giou_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_fpn_giou_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_giou_1x_coco-0eada910.pth
+
+  - Name: faster_rcnn_r50_fpn_bounded_iou_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_fpn_bounded_iou_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_bounded_iou_1x_coco-98ad993b.pth
+
+  - Name: faster_rcnn_r50_caffe_dc5_mstrain_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco_20201028_233851-b33d21b9.pth
+
+  - Name: faster_rcnn_r50_caffe_dc5_mstrain_3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco_20201028_002107-34a53b2c.pth
+
+  - Name: faster_rcnn_r50_caffe_fpn_mstrain_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco.py
+    Metadata:
+      Training Memory (GB): 4.3
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco_bbox_mAP-0.397_20200504_231813-10b2de58.pth
+
+  - Name: faster_rcnn_r50_caffe_fpn_mstrain_3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 3.7
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210526_095054-1f77628b.pth
+
+  - Name: faster_rcnn_r50_fpn_mstrain_3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 3.9
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco/faster_rcnn_r50_fpn_mstrain_3x_coco_20210524_110822-e10bd31c.pth
+
+  - Name: faster_rcnn_r101_caffe_fpn_mstrain_3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.6
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210526_095742-a7ae426d.pth
+
+  - Name: faster_rcnn_r101_fpn_mstrain_3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.8
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco/faster_rcnn_r101_fpn_mstrain_3x_coco_20210524_110822-4d4d2ca8.pth
+
+  - Name: faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210524_124151-16b9b260.pth
+
+  - Name: faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 10.1
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210604_182954-002e082a.pth
+
+  - Name: faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 10.0
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210524_124528-26c63de6.pth
+
+  - Name: faster_rcnn_r50_fpn_tnr-pretrain_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.0
+      inference time (ms/im):
+        - value: 46.73
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco_20220320_085147-efedfda4.pth
diff --git a/configs/fcos/README.md b/configs/fcos/README.md
new file mode 100755
index 0000000..76be365
--- /dev/null
+++ b/configs/fcos/README.md
@@ -0,0 +1,45 @@
+# FCOS
+
+> [FCOS: Fully Convolutional One-Stage Object Detection](https://arxiv.org/abs/1904.01355)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We propose a fully convolutional one-stage object detector (FCOS) to solve object detection in a per-pixel prediction fashion, analogue to semantic segmentation. Almost all state-of-the-art object detectors such as RetinaNet, SSD, YOLOv3, and Faster R-CNN rely on pre-defined anchor boxes. In contrast, our proposed detector FCOS is anchor box free, as well as proposal free. By eliminating the predefined set of anchor boxes, FCOS completely avoids the complicated computation related to anchor boxes such as calculating overlapping during training. More importantly, we also avoid all hyper-parameters related to anchor boxes, which are often very sensitive to the final detection performance. With the only post-processing non-maximum suppression (NMS), FCOS with ResNeXt-64x4d-101 achieves 44.7% in AP with single-model and single-scale testing, surpassing previous one-stage detectors with the advantage of being much simpler. For the first time, we demonstrate a much simpler and flexible detection framework achieving improved detection accuracy. We hope that the proposed FCOS framework can serve as a simple and strong alternative for many other instance-level tasks.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143882011-45b234bc-d04b-4bbe-a822-94bec057ac86.png"/>
+</div>
+
+## Results and Models
+
+| Backbone | Style | GN  | MS train | Tricks | DCN | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                                         Config                                                                          |                                                                                                                                                                                          Download                                                                                                                                                                                          |
+| :------: | :---: | :-: | :------: | :----: | :-: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   | caffe |  Y  |    N     |   N    |  N  |   1x    |   3.6    |      22.7      |  36.6  |                   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco.py)                    |                                                        [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco/fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco/20201227_180009.log.json)                                                         |
+|   R-50   | caffe |  Y  |    N     |   Y    |  N  |   1x    |   3.7    |       -        |  38.7  |   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py)   |       [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco-0a0d75a8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco/20210105_135818.log.json)       |
+|   R-50   | caffe |  Y  |    N     |   Y    |  Y  |   1x    |   3.8    |       -        |  42.3  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco-ae4d8b3d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco/20210105_224556.log.json) |
+|  R-101   | caffe |  Y  |    N     |   N    |  N  |   1x    |   5.5    |      17.3      |  39.1  |                   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco.py)                   |                                                       [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco/fcos_r101_caffe_fpn_gn-head_1x_coco-0e37b982.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco/20210103_155046.log.json)                                                       |
+
+| Backbone |  Style  | GN  | MS train | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                               Config                                                                |                                                                                                                                                            Download                                                                                                                                                            |
+| :------: | :-----: | :-: | :------: | :-----: | :------: | :------------: | :----: | :---------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   |  caffe  |  Y  |    Y     |   2x    |   2.6    |      22.9      |  38.5  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py)  |  [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco-d92ceeea.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco/20201227_161900.log.json)   |
+|  R-101   |  caffe  |  Y  |    Y     |   2x    |   5.5    |      17.3      |  40.8  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco-511424d6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco/20210103_155046.log.json) |
+|  X-101   | pytorch |  Y  |    Y     |   2x    |   10.0   |      9.7       |  42.6  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco-ede514a8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco/20210114_133041.log.json) |
+
+**Notes:**
+
+- The X-101 backbone is X-101-64x4d.
+- Tricks means setting `norm_on_bbox`, `centerness_on_reg`, `center_sampling` as `True`.
+- DCN means using `DCNv2` in both backbone and head.
+
+## Citation
+
+```latex
+@article{tian2019fcos,
+  title={FCOS: Fully Convolutional One-Stage Object Detection},
+  author={Tian, Zhi and Shen, Chunhua and Chen, Hao and He, Tong},
+  journal={arXiv preprint arXiv:1904.01355},
+  year={2019}
+}
+```
diff --git a/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py b/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py
new file mode 100755
index 0000000..2699bdb
--- /dev/null
+++ b/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py
@@ -0,0 +1,54 @@
+_base_ = 'fcos_r50_caffe_fpn_gn-head_1x_coco.py'
+
+model = dict(
+    backbone=dict(
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    bbox_head=dict(
+        norm_on_bbox=True,
+        centerness_on_reg=True,
+        dcn_on_last_conv=False,
+        center_sampling=True,
+        conv_bias=True,
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.0)),
+    # testing settings
+    test_cfg=dict(nms=dict(type='nms', iou_threshold=0.6)))
+
+# dataset settings
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+optimizer_config = dict(_delete_=True, grad_clip=None)
+
+lr_config = dict(warmup='linear')
diff --git a/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco.py b/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco.py
new file mode 100755
index 0000000..cf93c91
--- /dev/null
+++ b/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco.py
@@ -0,0 +1,56 @@
+_base_ = 'fcos_r50_caffe_fpn_gn-head_1x_coco.py'
+
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    bbox_head=dict(
+        norm_on_bbox=True,
+        centerness_on_reg=True,
+        dcn_on_last_conv=True,
+        center_sampling=True,
+        conv_bias=True,
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.0)),
+    # testing settings
+    test_cfg=dict(nms=dict(type='nms', iou_threshold=0.6)))
+
+# dataset settings
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+optimizer_config = dict(_delete_=True, grad_clip=None)
+
+lr_config = dict(warmup='linear')
diff --git a/configs/fcos/fcos_center_r50_caffe_fpn_gn-head_1x_coco.py b/configs/fcos/fcos_center_r50_caffe_fpn_gn-head_1x_coco.py
new file mode 100755
index 0000000..9f502e7
--- /dev/null
+++ b/configs/fcos/fcos_center_r50_caffe_fpn_gn-head_1x_coco.py
@@ -0,0 +1,2 @@
+_base_ = './fcos_r50_caffe_fpn_gn-head_1x_coco.py'
+model = dict(bbox_head=dict(center_sampling=True, center_sample_radius=1.5))
diff --git a/configs/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco.py b/configs/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco.py
new file mode 100755
index 0000000..45bea48
--- /dev/null
+++ b/configs/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './fcos_r50_caffe_fpn_gn-head_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron/resnet101_caffe')))
diff --git a/configs/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py b/configs/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py
new file mode 100755
index 0000000..f4d36f1
--- /dev/null
+++ b/configs/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py
@@ -0,0 +1,47 @@
+_base_ = './fcos_r50_caffe_fpn_gn-head_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron/resnet101_caffe')))
+img_norm_cfg = dict(
+    mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# learning policy
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco.py b/configs/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco.py
new file mode 100755
index 0000000..955787b
--- /dev/null
+++ b/configs/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco.py
@@ -0,0 +1,106 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    type='FCOS',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron/resnet50_caffe')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',  # use P5
+        num_outs=5,
+        relu_before_extra_convs=True),
+    bbox_head=dict(
+        type='FCOSHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='IoULoss', loss_weight=1.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.5),
+        max_per_img=100))
+img_norm_cfg = dict(
+    mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    lr=0.01, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='constant',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/configs/fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py b/configs/fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py
new file mode 100755
index 0000000..2816b16
--- /dev/null
+++ b/configs/fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py
@@ -0,0 +1,4 @@
+# TODO: Remove this config after benchmarking all related configs
+_base_ = 'fcos_r50_caffe_fpn_gn-head_1x_coco.py'
+
+data = dict(samples_per_gpu=4, workers_per_gpu=4)
diff --git a/configs/fcos/fcos_r50_caffe_fpn_gn-head_fp16_1x_bs8x8_coco.py b/configs/fcos/fcos_r50_caffe_fpn_gn-head_fp16_1x_bs8x8_coco.py
new file mode 100755
index 0000000..f7c973c
--- /dev/null
+++ b/configs/fcos/fcos_r50_caffe_fpn_gn-head_fp16_1x_bs8x8_coco.py
@@ -0,0 +1,13 @@
+_base_ = ['./fcos_r50_caffe_fpn_gn-head_1x_coco.py']
+
+data = dict(samples_per_gpu=8, workers_per_gpu=8)
+
+# optimizer
+optimizer = dict(lr=0.04)
+fp16 = dict(loss_scale='dynamic')
+
+# learning policy
+# To avoid non-convergence in the early stage of mixed-precision training,
+# this config switches the warmup to linear, raises warmup_iters to 1000
+# and lowers warmup_ratio to 1/10.
+lr_config = dict(warmup='linear', warmup_iters=1000, warmup_ratio=1.0 / 10)
diff --git a/configs/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py b/configs/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py
new file mode 100755
index 0000000..497d03f
--- /dev/null
+++ b/configs/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py
@@ -0,0 +1,39 @@
+_base_ = './fcos_r50_caffe_fpn_gn-head_1x_coco.py'
+img_norm_cfg = dict(
+    mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# learning policy
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco.py b/configs/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco.py
new file mode 100755
index 0000000..e70e465
--- /dev/null
+++ b/configs/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco.py
@@ -0,0 +1,60 @@
+_base_ = './fcos_r50_caffe_fpn_gn-head_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    lr=0.01, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/fcos/metafile.yml b/configs/fcos/metafile.yml
new file mode 100755
index 0000000..ae922eb
--- /dev/null
+++ b/configs/fcos/metafile.yml
@@ -0,0 +1,146 @@
+Collections:
+  - Name: FCOS
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - Group Normalization
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1904.01355
+      Title: 'FCOS: Fully Convolutional One-Stage Object Detection'
+    README: configs/fcos/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/fcos.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: fcos_r50_caffe_fpn_gn-head_1x_coco
+    In Collection: FCOS
+    Config: configs/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.6
+      inference time (ms/im):
+        - value: 44.05
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco/fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth
+
+  - Name: fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco
+    In Collection: FCOS
+    Config: configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.7
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco-0a0d75a8.pth
+
+  - Name: fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco
+    In Collection: FCOS
+    Config: configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.8
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco-ae4d8b3d.pth
+
+  - Name: fcos_r101_caffe_fpn_gn-head_1x_coco
+    In Collection: FCOS
+    Config: configs/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.5
+      inference time (ms/im):
+        - value: 57.8
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco/fcos_r101_caffe_fpn_gn-head_1x_coco-0e37b982.pth
+
+  - Name: fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco
+    In Collection: FCOS
+    Config: configs/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py
+    Metadata:
+      Training Memory (GB): 2.6
+      inference time (ms/im):
+        - value: 43.67
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco-d92ceeea.pth
+
+  - Name: fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco
+    In Collection: FCOS
+    Config: configs/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco.py
+    Metadata:
+      Training Memory (GB): 5.5
+      inference time (ms/im):
+        - value: 57.8
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco-511424d6.pth
+
+  - Name: fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco
+    In Collection: FCOS
+    Config: configs/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.0
+      inference time (ms/im):
+        - value: 103.09
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco-ede514a8.pth
diff --git a/configs/foveabox/README.md b/configs/foveabox/README.md
new file mode 100755
index 0000000..7fcd094
--- /dev/null
+++ b/configs/foveabox/README.md
@@ -0,0 +1,53 @@
+# FoveaBox
+
+> [FoveaBox: Beyond Anchor-based Object Detector](https://arxiv.org/abs/1904.03797)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We present FoveaBox, an accurate, flexible, and completely anchor-free framework for object detection. While almost all state-of-the-art object detectors utilize predefined anchors to enumerate possible locations, scales and aspect ratios for the search of the objects, their performance and generalization ability are also limited to the design of anchors. Instead, FoveaBox directly learns the object existing possibility and the bounding box coordinates without anchor reference. This is achieved by: (a) predicting category-sensitive semantic maps for the object existing possibility, and (b) producing category-agnostic bounding box for each position that potentially contains an object. The scales of target boxes are naturally associated with feature pyramid representations. In FoveaBox, an instance is assigned to adjacent feature levels to make the model more accurate. We demonstrate its effectiveness on standard benchmarks and report extensive experimental analysis. Without bells and whistles, FoveaBox achieves state-of-the-art single model performance on the standard COCO and Pascal VOC object detection benchmark. More importantly, FoveaBox avoids all computation and hyper-parameters related to anchor boxes, which are often sensitive to the final detection performance. We believe the simple and effective approach will serve as a solid baseline and help ease future research for object detection.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143885497-332d38a7-b492-4f51-b9d2-ef9d4ad4412a.png"/>
+</div>
+
+## Introduction
+
+FoveaBox is an accurate, flexible and completely anchor-free object detection framework, as presented in our paper [https://arxiv.org/abs/1904.03797](https://arxiv.org/abs/1904.03797).
+Different from previous anchor-based methods, FoveaBox directly learns the object existing possibility and the bounding box coordinates without anchor reference. This is achieved by: (a) predicting category-sensitive semantic maps for the object existing possibility, and (b) producing category-agnostic bounding box for each position that potentially contains an object.
+
+## Results and Models
+
+### Results on R50/101-FPN
+
+| Backbone |  Style  | align | ms-train | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                                    Config                                                                    |                                                                                                                                                                                                        Download                                                                                                                                                                                                         |
+| :------: | :-----: | :---: | :------: | :-----: | :------: | :------------: | :----: | :------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   | pytorch |   N   |    N     |   1x    |   5.6    |      24.1      |  36.5  |                [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_r50_fpn_4x4_1x_coco.py)                 |                                                               [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_1x_coco/fovea_r50_fpn_4x4_1x_coco_20200219-ee4d5303.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_1x_coco/fovea_r50_fpn_4x4_1x_coco_20200219_223025.log.json)                                                               |
+|   R-50   | pytorch |   N   |    N     |   2x    |   5.6    |       -        |  37.2  |                [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_r50_fpn_4x4_2x_coco.py)                 |                                                               [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_2x_coco/fovea_r50_fpn_4x4_2x_coco_20200203-2df792b1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_2x_coco/fovea_r50_fpn_4x4_2x_coco_20200203_112043.log.json)                                                               |
+|   R-50   | pytorch |   Y   |    N     |   2x    |   8.1    |      19.4      |  37.9  |         [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco.py)          |                                   [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco/fovea_align_r50_fpn_gn-head_4x4_2x_coco_20200203-8987880d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco/fovea_align_r50_fpn_gn-head_4x4_2x_coco_20200203_134252.log.json)                                   |
+|   R-50   | pytorch |   Y   |    Y     |   2x    |   8.1    |      18.3      |  40.4  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200205-85ce26cb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200205_112557.log.json)   |
+|  R-101   | pytorch |   N   |    N     |   1x    |   9.2    |      17.4      |  38.6  |                [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_r101_fpn_4x4_1x_coco.py)                |                                                             [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_1x_coco/fovea_r101_fpn_4x4_1x_coco_20200219-05e38f1c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_1x_coco/fovea_r101_fpn_4x4_1x_coco_20200219_011740.log.json)                                                             |
+|  R-101   | pytorch |   N   |    N     |   2x    |   11.7   |       -        |  40.0  |                [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_r101_fpn_4x4_2x_coco.py)                |                                                             [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_2x_coco/fovea_r101_fpn_4x4_2x_coco_20200208-02320ea4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_2x_coco/fovea_r101_fpn_4x4_2x_coco_20200208_202059.log.json)                                                             |
+|  R-101   | pytorch |   Y   |    N     |   2x    |   11.7   |      14.7      |  40.0  |         [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco.py)         |                                 [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco/fovea_align_r101_fpn_gn-head_4x4_2x_coco_20200208-c39a027a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco/fovea_align_r101_fpn_gn-head_4x4_2x_coco_20200208_203337.log.json)                                 |
+|  R-101   | pytorch |   Y   |    Y     |   2x    |   11.7   |      14.7      |  42.0  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200208-649c5eb6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200208_202124.log.json) |
+
+\[1\] *1x and 2x mean the model is trained for 12 and 24 epochs, respectively.* \
+\[2\] *Align means utilizing deformable convolution to align the cls branch.* \
+\[3\] *All results are obtained with a single model and without any test time data augmentation.*\
+\[4\] *We use 4 GPUs for training.*
+
+Any pull requests or issues are welcome.
+
+## Citation
+
+Please consider citing our paper in your publications if the project helps your research. BibTeX reference is as follows.
+
+```latex
+@article{kong2019foveabox,
+  title={FoveaBox: Beyond Anchor-based Object Detector},
+  author={Kong, Tao and Sun, Fuchun and Liu, Huaping and Jiang, Yuning and Shi, Jianbo},
+  journal={arXiv preprint arXiv:1904.03797},
+  year={2019}
+}
+```
diff --git a/configs/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco.py b/configs/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco.py
new file mode 100755
index 0000000..c5d1784
--- /dev/null
+++ b/configs/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco.py
@@ -0,0 +1,12 @@
+_base_ = './fovea_r50_fpn_4x4_1x_coco.py'  # start from the R-50 1x config
+model = dict(
+    backbone=dict(
+        depth=101,  # the base config uses depth=50
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    bbox_head=dict(
+        with_deform=True,  # "align" variant; the base config sets False
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)))
+# learning policy for the 2x (24-epoch) schedule
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py b/configs/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py
new file mode 100755
index 0000000..cc5affe
--- /dev/null
+++ b/configs/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py
@@ -0,0 +1,29 @@
+_base_ = './fovea_r50_fpn_4x4_1x_coco.py'  # start from the R-50 1x config
+model = dict(
+    backbone=dict(
+        depth=101,  # the base config uses depth=50
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    bbox_head=dict(
+        with_deform=True,  # "align" variant; the base config sets False
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)))
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],  # the "mstrain_640-800" scales
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+data = dict(train=dict(pipeline=train_pipeline))  # train pipeline only
+# learning policy for the 2x (24-epoch) schedule
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco.py b/configs/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco.py
new file mode 100755
index 0000000..e7265bc
--- /dev/null
+++ b/configs/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco.py
@@ -0,0 +1,10 @@
+_base_ = './fovea_r50_fpn_4x4_1x_coco.py'  # start from the R-50 1x config
+model = dict(
+    bbox_head=dict(
+        with_deform=True,  # "align" variant; the base config sets False
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)))
+# learning policy for the 2x (24-epoch) schedule
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+optimizer_config = dict(  # NOTE(review): no sibling config sets grad_clip
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py b/configs/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py
new file mode 100755
index 0000000..8fc39be
--- /dev/null
+++ b/configs/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py
@@ -0,0 +1,25 @@
+_base_ = './fovea_r50_fpn_4x4_1x_coco.py'  # start from the R-50 1x config
+model = dict(
+    bbox_head=dict(
+        with_deform=True,  # "align" variant; the base config sets False
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)))
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],  # the "mstrain_640-800" scales
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+data = dict(train=dict(pipeline=train_pipeline))  # train pipeline only
+# learning policy for the 2x (24-epoch) schedule
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/foveabox/fovea_r101_fpn_4x4_1x_coco.py b/configs/foveabox/fovea_r101_fpn_4x4_1x_coco.py
new file mode 100755
index 0000000..9201af1
--- /dev/null
+++ b/configs/foveabox/fovea_r101_fpn_4x4_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './fovea_r50_fpn_4x4_1x_coco.py'  # start from the R-50 1x config
+model = dict(
+    backbone=dict(
+        depth=101,  # the base config uses depth=50
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/foveabox/fovea_r101_fpn_4x4_2x_coco.py b/configs/foveabox/fovea_r101_fpn_4x4_2x_coco.py
new file mode 100755
index 0000000..1ef5243
--- /dev/null
+++ b/configs/foveabox/fovea_r101_fpn_4x4_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './fovea_r50_fpn_4x4_2x_coco.py'  # start from the R-50 2x config
+model = dict(
+    backbone=dict(
+        depth=101,  # the R-50 configs use depth=50
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/foveabox/fovea_r50_fpn_4x4_1x_coco.py b/configs/foveabox/fovea_r50_fpn_4x4_1x_coco.py
new file mode 100755
index 0000000..7e986eb
--- /dev/null
+++ b/configs/foveabox/fovea_r50_fpn_4x4_1x_coco.py
@@ -0,0 +1,52 @@
+_base_ = [  # shared COCO dataset, 1x schedule and runtime settings
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings: FOVEA detector = ResNet-50 backbone + FPN + FoveaHead
+model = dict(
+    type='FOVEA',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        num_outs=5,  # same count as the five strides in bbox_head
+        add_extra_convs='on_input'),
+    bbox_head=dict(
+        type='FoveaHead',
+        num_classes=80,
+        in_channels=256,  # equals the neck's out_channels
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        base_edge_list=[16, 32, 64, 128, 256],
+        scale_ranges=((1, 64), (32, 128), (64, 256), (128, 512), (256, 2048)),
+        sigma=0.4,
+        with_deform=False,  # the fovea_align_* configs switch this to True
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=1.50,
+            alpha=0.4,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(
+        nms_pre=1000,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.5),
+        max_per_img=100))
+data = dict(samples_per_gpu=4, workers_per_gpu=4)  # README: 4 GPUs used
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/foveabox/fovea_r50_fpn_4x4_2x_coco.py b/configs/foveabox/fovea_r50_fpn_4x4_2x_coco.py
new file mode 100755
index 0000000..68ce4d2
--- /dev/null
+++ b/configs/foveabox/fovea_r50_fpn_4x4_2x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './fovea_r50_fpn_4x4_1x_coco.py'  # start from the R-50 1x config
+# learning policy for the 2x (24-epoch) schedule
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/foveabox/metafile.yml b/configs/foveabox/metafile.yml
new file mode 100755
index 0000000..fe9a283
--- /dev/null
+++ b/configs/foveabox/metafile.yml
@@ -0,0 +1,172 @@
+Collections:
+  - Name: FoveaBox
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 4x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1904.03797
+      Title: 'FoveaBox: Beyond Anchor-based Object Detector'
+    README: configs/foveabox/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/fovea.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: fovea_r50_fpn_4x4_1x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_r50_fpn_4x4_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.6
+      inference time (ms/im):
+        - value: 41.49
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_1x_coco/fovea_r50_fpn_4x4_1x_coco_20200219-ee4d5303.pth
+
+  - Name: fovea_r50_fpn_4x4_2x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_r50_fpn_4x4_2x_coco.py
+    Metadata:
+      Training Memory (GB): 5.6
+      inference time (ms/im):
+        - value: 41.49
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_2x_coco/fovea_r50_fpn_4x4_2x_coco_20200203-2df792b1.pth
+
+  - Name: fovea_align_r50_fpn_gn-head_4x4_2x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco.py
+    Metadata:
+      Training Memory (GB): 8.1
+      inference time (ms/im):
+        - value: 51.55
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco/fovea_align_r50_fpn_gn-head_4x4_2x_coco_20200203-8987880d.pth
+
+  - Name: fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py
+    Metadata:
+      Training Memory (GB): 8.1
+      inference time (ms/im):
+        - value: 54.64
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200205-85ce26cb.pth
+
+  - Name: fovea_r101_fpn_4x4_1x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_r101_fpn_4x4_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.2
+      inference time (ms/im):
+        - value: 57.47
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_1x_coco/fovea_r101_fpn_4x4_1x_coco_20200219-05e38f1c.pth
+
+  - Name: fovea_r101_fpn_4x4_2x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_r101_fpn_4x4_2x_coco.py
+    Metadata:
+      Training Memory (GB): 11.7
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_2x_coco/fovea_r101_fpn_4x4_2x_coco_20200208-02320ea4.pth
+
+  - Name: fovea_align_r101_fpn_gn-head_4x4_2x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco.py
+    Metadata:
+      Training Memory (GB): 11.7
+      inference time (ms/im):
+        - value: 68.03
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco/fovea_align_r101_fpn_gn-head_4x4_2x_coco_20200208-c39a027a.pth
+
+  - Name: fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py
+    Metadata:
+      Training Memory (GB): 11.7
+      inference time (ms/im):
+        - value: 68.03
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200208-649c5eb6.pth
diff --git a/configs/fpg/README.md b/configs/fpg/README.md
new file mode 100755
index 0000000..0ffd2e7
--- /dev/null
+++ b/configs/fpg/README.md
@@ -0,0 +1,43 @@
+# FPG
+
+> [Feature Pyramid Grids](https://arxiv.org/abs/2004.03580)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Feature pyramid networks have been widely adopted in the object detection literature to improve feature representations for better handling of variations in scale. In this paper, we present Feature Pyramid Grids (FPG), a deep multi-pathway feature pyramid, that represents the feature scale-space as a regular grid of parallel bottom-up pathways which are fused by multi-directional lateral connections. FPG can improve single-pathway feature pyramid networks by significantly increasing its performance at similar computation cost, highlighting importance of deep pyramid representations. In addition to its general and uniform structure, over complicated structures that have been found with neural architecture search, it also compares favorably against such approaches without relying on search. We hope that FPG with its uniform and effective nature can serve as a strong component for future work in object recognition.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143885611-85902399-2885-4a85-9126-9b9b7464ad08.png"/>
+</div>
+
+## Results and Models
+
+We benchmark the new training schedule (crop training, large batch, unfrozen BN, 50 epochs) introduced in NAS-FPN.
+All backbones are ResNet-50 in pytorch style.
+
+|    Method    |    Neck    | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                           Config                                                           |                                                                                                                                                                             Download                                                                                                                                                                             |
+| :----------: | :--------: | :-----: | :------: | :------------: | :----: | :-----: | :------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| Faster R-CNN |    FPG     |   50e   |   20.0   |       -        |  42.3  |    -    |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/faster_rcnn_r50_fpg_crop640_50e_coco.py)     |        [model](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/faster_rcnn_r50_fpg_crop640_50e_coco_20220311_011856-74109f42.pth) \|               [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/faster_rcnn_r50_fpg_crop640_50e_coco_20220311_011856.log.json)        |
+| Faster R-CNN | FPG-chn128 |   50e   |   11.9   |       -        |  41.2  |    -    | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/faster_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011857-9376aa9d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/faster_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011857.log.json) |
+| Faster R-CNN |    FPN     |   50e   |   20.0   |       -        |  38.9  |    -    |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/faster_rcnn_r50_fpn_crop640_50e_coco.py)     |        [model](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpn_crop640_50e_coco/faster_rcnn_r50_fpn_crop640_50e_coco_20220311_011857-be7c9f42.pth) \|               [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpn_crop640_50e_coco/faster_rcnn_r50_fpn_crop640_50e_coco_20220311_011857.log.json)        |
+|  Mask R-CNN  |    FPG     |   50e   |   23.2   |       -        |  43.0  |  38.1   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/mask_rcnn_r50_fpg_crop640_50e_coco.py)      |          [model](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco_20220311_011857-233b8334.pth) \|                   [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco_20220311_011857.log.json)          |
+|  Mask R-CNN  | FPG-chn128 |   50e   |   15.3   |       -        |  41.7  |  37.1   |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/mask_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011859-043c9b4e.pth) \|     [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/mask_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011859.log.json)   |
+|  Mask R-CNN  |    FPN     |   50e   |   23.2   |       -        |  39.6  |  35.6   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/mask_rcnn_r50_fpn_crop640_50e_coco.py)      |          [model](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpn_crop640_50e_coco/mask_rcnn_r50_fpn_crop640_50e_coco_20220311_011855-a756664a.pth) \|                   [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpn_crop640_50e_coco/mask_rcnn_r50_fpn_crop640_50e_coco_20220311_011855.log.json)          |
+|  RetinaNet   |    FPG     |   50e   |   20.8   |       -        |  40.5  |    -    |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py)      |          [model](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/retinanet_r50_fpg_crop640_50e_coco_20220311_110809-b0bcf5f4.pth) \|                   [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/retinanet_r50_fpg_crop640_50e_coco_20220311_110809.log.json)          |
+|  RetinaNet   | FPG-chn128 |   50e   |   19.9   |       -        |  39.9  |    -    |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/retinanet_r50_fpg-chn128_crop640_50e_coco_20220313_104829-ee99a686.pth) \|     [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/retinanet_r50_fpg-chn128_crop640_50e_coco_20220313_104829.log.json)   |
+
+**Note**: Chn128 means that the number of channels of features and convs is decreased from 256 (default) to 128 in
+the Neck and BBox Head, which greatly decreases memory consumption without sacrificing much precision.
+
+## Citation
+
+```latex
+@article{chen2020feature,
+  title={Feature pyramid grids},
+  author={Chen, Kai and Cao, Yuhang and Loy, Chen Change and Lin, Dahua and Feichtenhofer, Christoph},
+  journal={arXiv preprint arXiv:2004.03580},
+  year={2020}
+}
+```
diff --git a/configs/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco.py b/configs/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco.py
new file mode 100755
index 0000000..4535034
--- /dev/null
+++ b/configs/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco.py
@@ -0,0 +1,9 @@
+_base_ = 'faster_rcnn_r50_fpg_crop640_50e_coco.py'  # the 256-channel FPG
+
+norm_cfg = dict(type='BN', requires_grad=True)  # NOTE(review): unused here
+model = dict(
+    neck=dict(out_channels=128, inter_channels=128),  # base config: 256
+    rpn_head=dict(in_channels=128),
+    roi_head=dict(
+        bbox_roi_extractor=dict(out_channels=128),
+        bbox_head=dict(in_channels=128)))
diff --git a/configs/fpg/faster_rcnn_r50_fpg_crop640_50e_coco.py b/configs/fpg/faster_rcnn_r50_fpg_crop640_50e_coco.py
new file mode 100755
index 0000000..3ab2a2c
--- /dev/null
+++ b/configs/fpg/faster_rcnn_r50_fpg_crop640_50e_coco.py
@@ -0,0 +1,48 @@
+_base_ = 'faster_rcnn_r50_fpn_crop640_50e_coco.py'
+
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    neck=dict(
+        type='FPG',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        inter_channels=256,
+        num_outs=5,
+        stack_times=9,
+        paths=['bu'] * 9,
+        same_down_trans=None,
+        same_up_trans=dict(
+            type='conv',
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_lateral_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_down_trans=dict(
+            type='interpolation_conv',
+            mode='nearest',
+            kernel_size=3,
+            norm_cfg=norm_cfg,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        across_up_trans=None,
+        across_skip_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        output_trans=dict(
+            type='last_conv',
+            kernel_size=3,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        norm_cfg=norm_cfg,
+        skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()]))
diff --git a/configs/fpg/faster_rcnn_r50_fpn_crop640_50e_coco.py b/configs/fpg/faster_rcnn_r50_fpn_crop640_50e_coco.py
new file mode 100755
index 0000000..e4ec940
--- /dev/null
+++ b/configs/fpg/faster_rcnn_r50_fpn_crop640_50e_coco.py
@@ -0,0 +1,73 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    backbone=dict(norm_cfg=norm_cfg, norm_eval=False),
+    neck=dict(norm_cfg=norm_cfg),
+    roi_head=dict(bbox_head=dict(norm_cfg=norm_cfg)))
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),  # masks are never collected
+    dict(
+        type='Resize',
+        img_scale=(640, 640),
+        ratio_range=(0.8, 1.2),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(640, 640)),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=(640, 640)),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(640, 640),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=64),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='SGD',
+    lr=0.08,
+    momentum=0.9,
+    weight_decay=0.0001,
+    paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True))
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.1,
+    step=[30, 40])
+# runtime settings
+runner = dict(max_epochs=50)
+evaluation = dict(interval=2)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco.py b/configs/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco.py
new file mode 100755
index 0000000..baa4a5a
--- /dev/null
+++ b/configs/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco.py
@@ -0,0 +1,10 @@
+_base_ = 'mask_rcnn_r50_fpg_crop640_50e_coco.py'
+
+model = dict(
+    neck=dict(out_channels=128, inter_channels=128),
+    rpn_head=dict(in_channels=128),
+    roi_head=dict(
+        bbox_roi_extractor=dict(out_channels=128),
+        bbox_head=dict(in_channels=128),
+        mask_roi_extractor=dict(out_channels=128),
+        mask_head=dict(in_channels=128)))
diff --git a/configs/fpg/mask_rcnn_r50_fpg_crop640_50e_coco.py b/configs/fpg/mask_rcnn_r50_fpg_crop640_50e_coco.py
new file mode 100755
index 0000000..3c9ea27
--- /dev/null
+++ b/configs/fpg/mask_rcnn_r50_fpg_crop640_50e_coco.py
@@ -0,0 +1,48 @@
+_base_ = 'mask_rcnn_r50_fpn_crop640_50e_coco.py'
+
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    neck=dict(
+        type='FPG',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        inter_channels=256,
+        num_outs=5,
+        stack_times=9,
+        paths=['bu'] * 9,
+        same_down_trans=None,
+        same_up_trans=dict(
+            type='conv',
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_lateral_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_down_trans=dict(
+            type='interpolation_conv',
+            mode='nearest',
+            kernel_size=3,
+            norm_cfg=norm_cfg,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        across_up_trans=None,
+        across_skip_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        output_trans=dict(
+            type='last_conv',
+            kernel_size=3,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        norm_cfg=norm_cfg,
+        skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()]))
diff --git a/configs/fpg/mask_rcnn_r50_fpn_crop640_50e_coco.py b/configs/fpg/mask_rcnn_r50_fpn_crop640_50e_coco.py
new file mode 100755
index 0000000..c6bcc24
--- /dev/null
+++ b/configs/fpg/mask_rcnn_r50_fpn_crop640_50e_coco.py
@@ -0,0 +1,79 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    backbone=dict(norm_cfg=norm_cfg, norm_eval=False),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        norm_cfg=norm_cfg,
+        num_outs=5),
+    roi_head=dict(
+        bbox_head=dict(norm_cfg=norm_cfg), mask_head=dict(norm_cfg=norm_cfg)))
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=(640, 640),
+        ratio_range=(0.8, 1.2),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(640, 640)),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=(640, 640)),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(640, 640),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=64),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='SGD',
+    lr=0.08,
+    momentum=0.9,
+    weight_decay=0.0001,
+    paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True))
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.1,
+    step=[30, 40])
+# runtime settings
+runner = dict(max_epochs=50)
+evaluation = dict(interval=2)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/fpg/metafile.yml b/configs/fpg/metafile.yml
new file mode 100755
index 0000000..6b0a6a7
--- /dev/null
+++ b/configs/fpg/metafile.yml
@@ -0,0 +1,104 @@
+Collections:
+  - Name: Feature Pyramid Grids
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Feature Pyramid Grids
+    Paper:
+      URL: https://arxiv.org/abs/2004.03580
+      Title: 'Feature Pyramid Grids'
+    README: configs/fpg/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.10.0/mmdet/models/necks/fpg.py#L101
+      Version: v2.10.0
+
+Models:
+  - Name: faster_rcnn_r50_fpg_crop640_50e_coco
+    In Collection: Feature Pyramid Grids
+    Config: configs/fpg/faster_rcnn_r50_fpg_crop640_50e_coco.py
+    Metadata:
+      Training Memory (GB): 20.0
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/faster_rcnn_r50_fpg_crop640_50e_coco_20220311_011856-74109f42.pth
+
+  - Name: faster_rcnn_r50_fpg-chn128_crop640_50e_coco
+    In Collection: Feature Pyramid Grids
+    Config: configs/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco.py
+    Metadata:
+      Training Memory (GB): 11.9
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/faster_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011857-9376aa9d.pth
+
+  - Name: mask_rcnn_r50_fpg_crop640_50e_coco
+    In Collection: Feature Pyramid Grids
+    Config: configs/fpg/mask_rcnn_r50_fpg_crop640_50e_coco.py
+    Metadata:
+      Training Memory (GB): 23.2
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco_20220311_011857-233b8334.pth
+
+  - Name: mask_rcnn_r50_fpg-chn128_crop640_50e_coco
+    In Collection: Feature Pyramid Grids
+    Config: configs/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco.py
+    Metadata:
+      Training Memory (GB): 15.3
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/mask_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011859-043c9b4e.pth
+
+  - Name: retinanet_r50_fpg_crop640_50e_coco
+    In Collection: Feature Pyramid Grids
+    Config: configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py
+    Metadata:
+      Training Memory (GB): 20.8
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/retinanet_r50_fpg_crop640_50e_coco_20220311_110809-b0bcf5f4.pth
+
+  - Name: retinanet_r50_fpg-chn128_crop640_50e_coco
+    In Collection: Feature Pyramid Grids
+    Config: configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py
+    Metadata:
+      Training Memory (GB): 19.9
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/retinanet_r50_fpg-chn128_crop640_50e_coco_20220313_104829-ee99a686.pth
diff --git a/configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py b/configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py
new file mode 100755
index 0000000..9a6cf7e
--- /dev/null
+++ b/configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py
@@ -0,0 +1,5 @@
+_base_ = 'retinanet_r50_fpg_crop640_50e_coco.py'
+
+model = dict(
+    neck=dict(out_channels=128, inter_channels=128),
+    bbox_head=dict(in_channels=128))
diff --git a/configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py b/configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py
new file mode 100755
index 0000000..504ed5e
--- /dev/null
+++ b/configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py
@@ -0,0 +1,53 @@
+_base_ = '../nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco.py'
+
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    neck=dict(
+        _delete_=True,
+        type='FPG',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        inter_channels=256,
+        num_outs=5,
+        add_extra_convs=True,
+        start_level=1,
+        stack_times=9,
+        paths=['bu'] * 9,
+        same_down_trans=None,
+        same_up_trans=dict(
+            type='conv',
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_lateral_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_down_trans=dict(
+            type='interpolation_conv',
+            mode='nearest',
+            kernel_size=3,
+            norm_cfg=norm_cfg,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        across_up_trans=None,
+        across_skip_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        output_trans=dict(
+            type='last_conv',
+            kernel_size=3,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        norm_cfg=norm_cfg,
+        skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()]))
+
+evaluation = dict(interval=2)
diff --git a/configs/free_anchor/README.md b/configs/free_anchor/README.md
new file mode 100755
index 0000000..d24c340
--- /dev/null
+++ b/configs/free_anchor/README.md
@@ -0,0 +1,37 @@
+# FreeAnchor
+
+> [FreeAnchor: Learning to Match Anchors for Visual Object Detection](https://arxiv.org/abs/1909.02466)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Modern CNN-based object detectors assign anchors for ground-truth objects under the restriction of object-anchor Intersection-over-Union (IoU). In this study, we propose a learning-to-match approach to break IoU restriction, allowing objects to match anchors in a flexible manner. Our approach, referred to as FreeAnchor, updates hand-crafted anchor assignment to "free" anchor matching by formulating detector training as a maximum likelihood estimation (MLE) procedure. FreeAnchor targets at learning features which best explain a class of objects in terms of both classification and localization. FreeAnchor is implemented by optimizing detection customized likelihood and can be fused with CNN-based detectors in a plug-and-play manner. Experiments on COCO demonstrate that FreeAnchor consistently outperforms their counterparts with significant margins.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143886006-8374bf9b-28af-442e-9abf-eb16562bb7d3.png"/>
+</div>
+
+## Results and Models
+
+|  Backbone   |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                               Config                                                                |                                                                                                                                                                                   Download                                                                                                                                                                                    |
+| :---------: | :-----: | :-----: | :------: | :------------: | :----: | :---------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50     | pytorch |   1x    |   4.9    |      18.4      |  38.7  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco/retinanet_free_anchor_r50_fpn_1x_coco_20200130-0f67375f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco/retinanet_free_anchor_r50_fpn_1x_coco_20200130_095625.log.json)               |
+|    R-101    | pytorch |   1x    |   6.8    |      14.9      |  40.3  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco/retinanet_free_anchor_r101_fpn_1x_coco_20200130-358324e6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco/retinanet_free_anchor_r101_fpn_1x_coco_20200130_100723.log.json)             |
+| X-101-32x4d | pytorch |   1x    |   8.1    |      11.1      |  41.9  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco/retinanet_free_anchor_x101_32x4d_fpn_1x_coco_20200130-d4846968.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco/retinanet_free_anchor_x101_32x4d_fpn_1x_coco_20200130_095627.log.json) |
+
+**Notes:**
+
+- We use 8 GPUs with 2 images/GPU.
+- For more settings and models, please refer to the [official repo](https://github.com/zhangxiaosong18/FreeAnchor).
+
+## Citation
+
+```latex
+@inproceedings{zhang2019freeanchor,
+  title   =  {{FreeAnchor}: Learning to Match Anchors for Visual Object Detection},
+  author  =  {Zhang, Xiaosong and Wan, Fang and Liu, Chang and Ji, Rongrong and Ye, Qixiang},
+  booktitle =  {Neural Information Processing Systems},
+  year    =  {2019}
+}
+```
diff --git a/configs/free_anchor/metafile.yml b/configs/free_anchor/metafile.yml
new file mode 100755
index 0000000..170fb5c
--- /dev/null
+++ b/configs/free_anchor/metafile.yml
@@ -0,0 +1,79 @@
+Collections:
+  - Name: FreeAnchor
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FreeAnchor
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1909.02466
+      Title: 'FreeAnchor: Learning to Match Anchors for Visual Object Detection'
+    README: configs/free_anchor/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/dense_heads/free_anchor_retina_head.py#L10
+      Version: v2.0.0
+
+Models:
+  - Name: retinanet_free_anchor_r50_fpn_1x_coco
+    In Collection: FreeAnchor
+    Config: configs/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.9
+      inference time (ms/im):
+        - value: 54.35
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco/retinanet_free_anchor_r50_fpn_1x_coco_20200130-0f67375f.pth
+
+  - Name: retinanet_free_anchor_r101_fpn_1x_coco
+    In Collection: FreeAnchor
+    Config: configs/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.8
+      inference time (ms/im):
+        - value: 67.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco/retinanet_free_anchor_r101_fpn_1x_coco_20200130-358324e6.pth
+
+  - Name: retinanet_free_anchor_x101_32x4d_fpn_1x_coco
+    In Collection: FreeAnchor
+    Config: configs/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.1
+      inference time (ms/im):
+        - value: 90.09
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco/retinanet_free_anchor_x101_32x4d_fpn_1x_coco_20200130-d4846968.pth
diff --git a/configs/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco.py b/configs/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..f4aea53
--- /dev/null
+++ b/configs/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './retinanet_free_anchor_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco.py b/configs/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..28f983c
--- /dev/null
+++ b/configs/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco.py
@@ -0,0 +1,22 @@
+_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    bbox_head=dict(
+        _delete_=True,
+        type='FreeAnchorRetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.75)))
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco.py b/configs/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..65f8a9e
--- /dev/null
+++ b/configs/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco.py
@@ -0,0 +1,13 @@
+_base_ = './retinanet_free_anchor_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/fsaf/README.md b/configs/fsaf/README.md
new file mode 100755
index 0000000..4392a6e
--- /dev/null
+++ b/configs/fsaf/README.md
@@ -0,0 +1,57 @@
+# FSAF
+
+> [Feature Selective Anchor-Free Module for Single-Shot Object Detection](https://arxiv.org/abs/1903.00621)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We motivate and present feature selective anchor-free (FSAF) module, a simple and effective building block for single-shot object detectors. It can be plugged into single-shot detectors with feature pyramid structure. The FSAF module addresses two limitations brought up by the conventional anchor-based detection: 1) heuristic-guided feature selection; 2) overlap-based anchor sampling. The general concept of the FSAF module is online feature selection applied to the training of multi-level anchor-free branches. Specifically, an anchor-free branch is attached to each level of the feature pyramid, allowing box encoding and decoding in the anchor-free manner at an arbitrary level. During training, we dynamically assign each instance to the most suitable feature level. At the time of inference, the FSAF module can work jointly with anchor-based branches by outputting predictions in parallel. We instantiate this concept with simple implementations of anchor-free branches and online feature selection strategy. Experimental results on the COCO detection track show that our FSAF module performs better than anchor-based counterparts while being faster. When working jointly with anchor-based branches, the FSAF module robustly improves the baseline RetinaNet by a large margin under various settings, while introducing nearly free inference overhead. And the resulting best model can achieve a state-of-the-art 44.6% mAP, outperforming all existing single-shot detectors on COCO.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143887228-c790b542-c0f7-4113-8597-12250c50fe8f.png"/>
+</div>
+
+## Introduction
+
+FSAF is an anchor-free method published in CVPR 2019 ([https://arxiv.org/pdf/1903.00621.pdf](https://arxiv.org/pdf/1903.00621.pdf)).
+In practice, it is equivalent to an anchor-based method with only one anchor at each feature map position in each FPN level,
+and this is how we implemented it.
+Only the anchor-free branch is released, for its better compatibility with the current framework and lower computational budget.
+
+In the original paper, feature maps within the central 0.2-0.5 area of a gt box are tagged as ignored. However,
+it is empirically found that a hard threshold (0.2-0.2) gives a further performance gain (see the table below).
+
+## Results and Models
+
+### Results on R50/R101/X101-FPN
+
+| Backbone | ignore range | ms-train | Lr schd | Train Mem (GB) | Train time (s/iter) | Inf time (fps) |   box AP    |                                                   Config                                                    |                                                                                                                                                                    Download                                                                                                                                                                     |
+| :------: | :----------: | :------: | :-----: | :------------: | :-----------------: | :------------: | :---------: | :---------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   |   0.2-0.5    |    N     |   1x    |      3.15      |        0.43         |      12.3      | 36.0 (35.9) |                                                                                                             | [model](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_pscale0.2_nscale0.5_r50_fpn_1x_coco/fsaf_pscale0.2_nscale0.5_r50_fpn_1x_coco_20200715-b555b0e0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_pscale0.2_nscale0.5_r50_fpn_1x_coco/fsaf_pscale0.2_nscale0.5_r50_fpn_1x_coco_20200715_094657.log.json) |
+|   R-50   |   0.2-0.2    |    N     |   1x    |      3.15      |        0.43         |      13.0      |    37.4     |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fsaf/fsaf_r50_fpn_1x_coco.py)     |                                             [model](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r50_fpn_1x_coco/fsaf_r50_fpn_1x_coco-94ccc51f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r50_fpn_1x_coco/fsaf_r50_fpn_1x_coco_20200428_072327.log.json)                                              |
+|  R-101   |   0.2-0.2    |    N     |   1x    |      5.08      |        0.58         |      10.8      | 39.3 (37.9) |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fsaf/fsaf_r101_fpn_1x_coco.py)    |                                           [model](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r101_fpn_1x_coco/fsaf_r101_fpn_1x_coco-9e71098f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r101_fpn_1x_coco/fsaf_r101_fpn_1x_coco_20200428_160348.log.json)                                            |
+|  X-101   |   0.2-0.2    |    N     |   1x    |      9.38      |        1.23         |      5.6       | 42.4 (41.0) | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/fsaf/fsaf_x101_64x4d_fpn_1x_coco.py) |                               [model](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_x101_64x4d_fpn_1x_coco/fsaf_x101_64x4d_fpn_1x_coco-e3f6e6fd.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_x101_64x4d_fpn_1x_coco/fsaf_x101_64x4d_fpn_1x_coco_20200428_160424.log.json)                                |
+
+**Notes:**
+
+- *1x means the model is trained for 12 epochs.*
+- *AP values in the brackets represent those reported in the original paper.*
+- *All results are obtained with a single model and single-scale test.*
+- *X-101 backbone represents ResNeXt-101-64x4d.*
+- *All pretrained backbones use pytorch style.*
+- *All models are trained on 8 Titan-XP GPUs and tested on a single GPU.*
+
+## Citation
+
+BibTeX reference is as follows.
+
+```latex
+@inproceedings{zhu2019feature,
+  title={Feature Selective Anchor-Free Module for Single-Shot Object Detection},
+  author={Zhu, Chenchen and He, Yihui and Savvides, Marios},
+  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+  pages={840--849},
+  year={2019}
+}
+```
diff --git a/configs/fsaf/fsaf_r101_fpn_1x_coco.py b/configs/fsaf/fsaf_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..12b49fe
--- /dev/null
+++ b/configs/fsaf/fsaf_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './fsaf_r50_fpn_1x_coco.py'  # inherit the R-50 FSAF config
+model = dict(
+    backbone=dict(
+        depth=101,  # ResNet-101, matching the checkpoint below
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/fsaf/fsaf_r50_fpn_1x_coco.py b/configs/fsaf/fsaf_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..67f3ec1
--- /dev/null
+++ b/configs/fsaf/fsaf_r50_fpn_1x_coco.py
@@ -0,0 +1,48 @@
+_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py'
+# model settings: FSAF detector/head on top of the RetinaNet R-50 base
+model = dict(
+    type='FSAF',
+    bbox_head=dict(
+        type='FSAFHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        reg_decoded_bbox=True,
+        # Only the anchor-free branch is implemented. The anchor generator
+        #  produces a single anchor per feature point, which serves as a
+        #  substitute for the grid of features.
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=1,
+            scales_per_octave=1,
+            ratios=[1.0],  # with one scale per octave: one anchor per point
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(_delete_=True, type='TBLRBBoxCoder', normalizer=4.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0,
+            reduction='none'),
+        loss_bbox=dict(
+            _delete_=True,  # replace the inherited dict instead of merging
+            type='IoULoss',
+            eps=1e-6,
+            loss_weight=1.0,
+            reduction='none')),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            _delete_=True,  # replace the inherited assigner settings
+            type='CenterRegionAssigner',
+            pos_scale=0.2,
+            neg_scale=0.2,
+            min_pos_iof=0.01),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False))
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(  # gradient clipping: L2 norm capped at 10
+    _delete_=True, grad_clip=dict(max_norm=10, norm_type=2))
diff --git a/configs/fsaf/fsaf_x101_64x4d_fpn_1x_coco.py b/configs/fsaf/fsaf_x101_64x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..89c0c63
--- /dev/null
+++ b/configs/fsaf/fsaf_x101_64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './fsaf_r50_fpn_1x_coco.py'  # inherit the R-50 FSAF config
+model = dict(
+    backbone=dict(
+        type='ResNeXt',  # ResNeXt-101 64x4d (see groups/base_width)
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/fsaf/metafile.yml b/configs/fsaf/metafile.yml
new file mode 100755
index 0000000..5434e9a
--- /dev/null
+++ b/configs/fsaf/metafile.yml
@@ -0,0 +1,80 @@
+Collections:
+  - Name: FSAF
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x Titan-XP GPUs
+      Architecture:
+        - FPN
+        - FSAF
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1903.00621
+      Title: 'Feature Selective Anchor-Free Module for Single-Shot Object Detection'
+    README: configs/fsaf/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/detectors/fsaf.py#L6
+      Version: v2.1.0
+
+Models:
+  - Name: fsaf_r50_fpn_1x_coco
+    In Collection: FSAF
+    Config: configs/fsaf/fsaf_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.15
+      inference time (ms/im):
+        - value: 76.92
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r50_fpn_1x_coco/fsaf_r50_fpn_1x_coco-94ccc51f.pth
+
+  - Name: fsaf_r101_fpn_1x_coco
+    In Collection: FSAF
+    Config: configs/fsaf/fsaf_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.08
+      inference time (ms/im):
+        - value: 92.59
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.3 (37.9)
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r101_fpn_1x_coco/fsaf_r101_fpn_1x_coco-9e71098f.pth
+
+  - Name: fsaf_x101_64x4d_fpn_1x_coco
+    In Collection: FSAF
+    Config: configs/fsaf/fsaf_x101_64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.38
+      inference time (ms/im):
+        - value: 178.57
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.4 (41.0)
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_x101_64x4d_fpn_1x_coco/fsaf_x101_64x4d_fpn_1x_coco-e3f6e6fd.pth
diff --git a/configs/gcnet/README.md b/configs/gcnet/README.md
new file mode 100755
index 0000000..403e086
--- /dev/null
+++ b/configs/gcnet/README.md
@@ -0,0 +1,69 @@
+# GCNet
+
+> [GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond](https://arxiv.org/abs/1904.11492)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+The Non-Local Network (NLNet) presents a pioneering approach for capturing long-range dependencies, via aggregating query-specific global context to each query position. However, through a rigorous empirical analysis, we have found that the global contexts modeled by non-local network are almost the same for different query positions within an image. In this paper, we take advantage of this finding to create a simplified network based on a query-independent formulation, which maintains the accuracy of NLNet but with significantly less computation. We further observe that this simplified design shares similar structure with Squeeze-Excitation Network (SENet). Hence we unify them into a three-step general framework for global context modeling. Within the general framework, we design a better instantiation, called the global context (GC) block, which is lightweight and can effectively model the global context. The lightweight property allows us to apply it for multiple layers in a backbone network to construct a global context network (GCNet), which generally outperforms both simplified NLNet and SENet on major benchmarks for various recognition tasks.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143887418-c5d2c906-d6c9-415d-bce7-c5355c20a10a.png" height="300"/>
+</div>
+
+## Introduction
+
+By [Yue Cao](http://yue-cao.me), [Jiarui Xu](http://jerryxu.net), [Stephen Lin](https://scholar.google.com/citations?user=c3PYmxUAAAAJ&hl=en), Fangyun Wei, [Han Hu](https://sites.google.com/site/hanhushomepage/).
+
+We provide config files to reproduce the results in the paper for
+["GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond"](https://arxiv.org/abs/1904.11492) on COCO object detection.
+
+**GCNet** was initially described on [arXiv](https://arxiv.org/abs/1904.11492). By absorbing the advantages of Non-Local Networks (NLNet) and Squeeze-Excitation Networks (SENet), GCNet provides a simple, fast and effective approach for global context modeling, which generally outperforms both NLNet and SENet on major benchmarks for various recognition tasks.
+
+## Results and Models
+
+The results on COCO 2017val are shown in the tables below.
+
+| Backbone  | Model |    Context     | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                          Config                                                           |                                                                                                                                                                       Download                                                                                                                                                                       |
+| :-------: | :---: | :------------: | :-----: | :------: | :------------: | :----: | :-----: | :-----------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R-50-FPN  | Mask  | GC(c3-c5, r16) |   1x    |   5.0    |                |  39.7  |  35.9   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco_20200515_211915-187da160.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco_20200515_211915.log.json) |
+| R-50-FPN  | Mask  | GC(c3-c5, r4)  |   1x    |   5.1    |      15.0      |  39.9  |  36.0   |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco.py)  |      [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco_20200204-17235656.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco_20200204_024626.log.json)       |
+| R-101-FPN | Mask  | GC(c3-c5, r16) |   1x    |   7.6    |      11.4      |  41.3  |  37.2   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco.py) |  [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco_20200205-e58ae947.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco_20200205_192835.log.json)   |
+| R-101-FPN | Mask  | GC(c3-c5, r4)  |   1x    |   7.8    |      11.6      |  42.2  |  37.8   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco.py)  |    [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco_20200206-af22dc9d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco_20200206_112128.log.json)     |
+
+| Backbone  |      Model       |    Context     | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                                               Config                                                                                |                                                                                                                                                                                                                                                             Download                                                                                                                                                                                                                                                             |
+| :-------: | :--------------: | :------------: | :-----: | :------: | :------------: | :----: | :-----: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R-50-FPN  |       Mask       |       -        |   1x    |   4.4    |      16.6      |  38.4  |  34.6   |                     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco.py)                      |                                                                                      [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco_20200202-bb3eb55c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco_20200202_214122.log.json)                                                                                       |
+| R-50-FPN  |       Mask       | GC(c3-c5, r16) |   1x    |   5.0    |      15.5      |  40.4  |  36.2   |              [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py)               |                                                          [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200202-587b99aa.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200202_174907.log.json)                                                           |
+| R-50-FPN  |       Mask       | GC(c3-c5, r4)  |   1x    |   5.1    |      15.1      |  40.7  |  36.5   |               [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py)               |                                                            [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202-50b90e5c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202_085547.log.json)                                                             |
+| R-101-FPN |       Mask       |       -        |   1x    |   6.4    |      13.3      |  40.5  |  36.3   |                     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco.py)                     |                                                                                    [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco_20200210-81658c8a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco_20200210_220422.log.json)                                                                                     |
+| R-101-FPN |       Mask       | GC(c3-c5, r16) |   1x    |   7.6    |      12.0      |  42.2  |  37.8   |              [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py)              |                                                        [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200207-945e77ca.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200207_015330.log.json)                                                         |
+| R-101-FPN |       Mask       | GC(c3-c5, r4)  |   1x    |   7.8    |      11.8      |  42.2  |  37.8   |              [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py)               |                                                          [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206-8407a3f0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206_142508.log.json)                                                           |
+| X-101-FPN |       Mask       |       -        |   1x    |   7.6    |      11.3      |  42.4  |  37.7   |                  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py)                  |                                                                        [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200211-7584841c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200211_054326.log.json)                                                                         |
+| X-101-FPN |       Mask       | GC(c3-c5, r16) |   1x    |   8.8    |      9.8       |  43.5  |  38.6   |           [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py)           |                                            [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211-cbed3d2c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211_164715.log.json)                                             |
+| X-101-FPN |       Mask       | GC(c3-c5, r4)  |   1x    |   9.0    |      9.7       |  43.9  |  39.0   |           [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py)            |                                              [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200212-68164964.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200212_070942.log.json)                                               |
+| X-101-FPN |   Cascade Mask   |       -        |   1x    |   9.2    |      8.4       |  44.7  |  38.6   |              [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py)              |                                                        [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200310-d5ad2a5e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200310_115217.log.json)                                                         |
+| X-101-FPN |   Cascade Mask   | GC(c3-c5, r16) |   1x    |   10.3   |      7.7       |  46.2  |  39.7   |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py)       |                            [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211-10bf2463.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211_184154.log.json)                             |
+| X-101-FPN |   Cascade Mask   | GC(c3-c5, r4)  |   1x    |   10.6   |                |  46.4  |  40.1   |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py)        |                           [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200703_180653-ed035291.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200703_180653.log.json)                           |
+| X-101-FPN | DCN Cascade Mask |       -        |   1x    |          |                |  47.5  |  40.9   |        [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco.py)        |                             [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco_20210615_211019-abbc39ea.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco_20210615_211019.log.json)                             |
+| X-101-FPN | DCN Cascade Mask | GC(c3-c5, r16) |   1x    |          |                |  48.0  |  41.3   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco_20210615_215648-44aa598a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco_20210615_215648.log.json) |
+| X-101-FPN | DCN Cascade Mask | GC(c3-c5, r4)  |   1x    |          |                |  47.9  |  41.1   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco_20210615_161851-720338ec.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco_20210615_161851.log.json)   |
+
+**Notes:**
+
+- The `SyncBN` is added in the backbone for all models in **Table 2**.
+- `GC` denotes that a Global Context (GC) block is inserted after the 1x1 conv of the backbone.
+- `DCN` denotes replacing the 3x3 conv with a 3x3 Deformable Convolution in the `c3-c5` stages of the backbone.
+- `r4` and `r16` denote ratio 4 and ratio 16 in GC block respectively.
+
+## Citation
+
+```latex
+@article{cao2019GCNet,
+  title={GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond},
+  author={Cao, Yue and Xu, Jiarui and Lin, Stephen and Wei, Fangyun and Hu, Han},
+  journal={arXiv preprint arXiv:1904.11492},
+  year={2019}
+}
+```
diff --git a/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py b/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py
new file mode 100755
index 0000000..5118895
--- /dev/null
+++ b/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py
@@ -0,0 +1,4 @@
+_base_ = '../cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(  # only override: SyncBN norm layers in the backbone
+        norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False))
diff --git a/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco.py b/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..413499d
--- /dev/null
+++ b/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco.py
@@ -0,0 +1,4 @@
+_base_ = '../dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py'
+model = dict(
+    backbone=dict(  # only override: SyncBN norm layers in the backbone
+        norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False))
diff --git a/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco.py b/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco.py
new file mode 100755
index 0000000..50689aa
--- /dev/null
+++ b/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),  # GC block, r16
+                stages=(False, True, True, True),  # skip first stage
+                position='after_conv3')
+        ]))
diff --git a/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco.py b/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco.py
new file mode 100755
index 0000000..1367231
--- /dev/null
+++ b/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 4),  # GC block, r4
+                stages=(False, True, True, True),  # skip first stage
+                position='after_conv3')
+        ]))
diff --git a/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py b/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py
new file mode 100755
index 0000000..50883ff
--- /dev/null
+++ b/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),  # GC block, r16
+                stages=(False, True, True, True),  # skip first stage
+                position='after_conv3')
+        ]))
diff --git a/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py b/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py
new file mode 100755
index 0000000..31fdd07
--- /dev/null
+++ b/configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 4),  # GC block, r4
+                stages=(False, True, True, True),  # skip first stage
+                position='after_conv3')
+        ]))
diff --git a/configs/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco.py b/configs/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco.py
new file mode 100755
index 0000000..ad6ad47
--- /dev/null
+++ b/configs/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = '../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(plugins=[
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16),  # GC block, r16
+            stages=(False, True, True, True),  # skip first stage
+            position='after_conv3')
+    ]))
diff --git a/configs/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco.py b/configs/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco.py
new file mode 100755
index 0000000..29f9167
--- /dev/null
+++ b/configs/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = '../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(plugins=[
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 4),  # GC block, r4
+            stages=(False, True, True, True),  # skip first stage
+            position='after_conv3')
+    ]))
diff --git a/configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco.py b/configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco.py
new file mode 100755
index 0000000..6e1c5d0
--- /dev/null
+++ b/configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco.py
@@ -0,0 +1,4 @@
+_base_ = '../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(  # only override: SyncBN norm layers in the backbone
+        norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False))
diff --git a/configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py b/configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py
new file mode 100755
index 0000000..781dba7
--- /dev/null
+++ b/configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),  # GC block, r16
+                stages=(False, True, True, True),  # skip first stage
+                position='after_conv3')
+        ]))
diff --git a/configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py b/configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py
new file mode 100755
index 0000000..32972de
--- /dev/null
+++ b/configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 4),
+                stages=(False, True, True, True),
+                position='after_conv3')
+        ]))
diff --git a/configs/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco.py b/configs/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco.py
new file mode 100755
index 0000000..d299b69
--- /dev/null
+++ b/configs/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(plugins=[
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16),
+            stages=(False, True, True, True),
+            position='after_conv3')
+    ]))
diff --git a/configs/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco.py b/configs/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco.py
new file mode 100755
index 0000000..5ac908e
--- /dev/null
+++ b/configs/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(plugins=[
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 4),
+            stages=(False, True, True, True),
+            position='after_conv3')
+    ]))
diff --git a/configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco.py b/configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco.py
new file mode 100755
index 0000000..0308a56
--- /dev/null
+++ b/configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco.py
@@ -0,0 +1,4 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False))
diff --git a/configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py b/configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py
new file mode 100755
index 0000000..e04780c
--- /dev/null
+++ b/configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                stages=(False, True, True, True),
+                position='after_conv3')
+        ]))
diff --git a/configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py b/configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py
new file mode 100755
index 0000000..980f819
--- /dev/null
+++ b/configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 4),
+                stages=(False, True, True, True),
+                position='after_conv3')
+        ]))
diff --git a/configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py b/configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py
new file mode 100755
index 0000000..f0c96e5
--- /dev/null
+++ b/configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py
@@ -0,0 +1,4 @@
+_base_ = '../mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False))
diff --git a/configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py b/configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py
new file mode 100755
index 0000000..7fb8e82
--- /dev/null
+++ b/configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                stages=(False, True, True, True),
+                position='after_conv3')
+        ]))
diff --git a/configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py b/configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py
new file mode 100755
index 0000000..b1ddbee
--- /dev/null
+++ b/configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 4),
+                stages=(False, True, True, True),
+                position='after_conv3')
+        ]))
diff --git a/configs/gcnet/metafile.yml b/configs/gcnet/metafile.yml
new file mode 100755
index 0000000..1281122
--- /dev/null
+++ b/configs/gcnet/metafile.yml
@@ -0,0 +1,440 @@
+Collections:
+  - Name: GCNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Global Context Block
+        - FPN
+        - RPN
+        - ResNet
+        - ResNeXt
+    Paper:
+      URL: https://arxiv.org/abs/1904.11492
+      Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond'
+    README: configs/gcnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/context_block.py#L13
+      Version: v2.0.0
+
+Models:
+  - Name: mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  35.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco_20200515_211915-187da160.pth
+
+  - Name: mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.1
+      inference time (ms/im):
+        - value: 66.67
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco_20200204-17235656.pth
+
+  - Name: mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      inference time (ms/im):
+        - value: 87.72
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco_20200205-e58ae947.pth
+
+  - Name: mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      inference time (ms/im):
+        - value: 86.21
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco_20200206-af22dc9d.pth
+
+  - Name: mask_rcnn_r50_fpn_syncbn-backbone_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.4
+      inference time (ms/im):
+        - value: 60.24
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  34.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco_20200202-bb3eb55c.pth
+
+  - Name: mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      inference time (ms/im):
+        - value: 64.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200202-587b99aa.pth
+
+  - Name: mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.1
+      inference time (ms/im):
+        - value: 66.23
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202-50b90e5c.pth
+
+  - Name: mask_rcnn_r101_fpn_syncbn-backbone_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      inference time (ms/im):
+        - value: 75.19
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco_20200210-81658c8a.pth
+
+  - Name: mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      inference time (ms/im):
+        - value: 83.33
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200207-945e77ca.pth
+
+  - Name: mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      inference time (ms/im):
+        - value: 84.75
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206-8407a3f0.pth
+
+  - Name: mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      inference time (ms/im):
+        - value: 88.5
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200211-7584841c.pth
+
+  - Name: mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.8
+      inference time (ms/im):
+        - value: 102.04
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  38.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211-cbed3d2c.pth
+
+  - Name: mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.0
+      inference time (ms/im):
+        - value: 103.09
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200212-68164964.pth
+
+  - Name: cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.2
+      inference time (ms/im):
+        - value: 119.05
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  38.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200310-d5ad2a5e.pth
+
+  - Name: cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.3
+      inference time (ms/im):
+        - value: 129.87
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  39.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211-10bf2463.pth
+
+  - Name: cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.6
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:    40.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200703_180653-ed035291.pth
+
+  - Name: cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  40.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco_20210615_211019-abbc39ea.pth
+
+  - Name: cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 48.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco_20210615_215648-44aa598a.pth
+
+  - Name: cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:   41.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco_20210615_161851-720338ec.pth
diff --git a/configs/gfl/README.md b/configs/gfl/README.md
new file mode 100755
index 0000000..703936b
--- /dev/null
+++ b/configs/gfl/README.md
@@ -0,0 +1,42 @@
+# GFL
+
+> [Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection](https://arxiv.org/abs/2006.04388)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+One-stage detector basically formulates object detection as dense classification and localization. The classification is usually optimized by Focal Loss and the box location is commonly learned under Dirac delta distribution. A recent trend for one-stage detectors is to introduce an individual prediction branch to estimate the quality of localization, where the predicted quality facilitates the classification to improve detection performance. This paper delves into the representations of the above three fundamental elements: quality estimation, classification and localization. Two problems are discovered in existing practices, including (1) the inconsistent usage of the quality estimation and classification between training and inference and (2) the inflexible Dirac delta distribution for localization when there is ambiguity and uncertainty in complex scenes. To address the problems, we design new representations for these elements. Specifically, we merge the quality estimation into the class prediction vector to form a joint representation of localization quality and classification, and use a vector to represent arbitrary distribution of box locations. The improved representations eliminate the inconsistency risk and accurately depict the flexible distribution in real data, but contain continuous labels, which is beyond the scope of Focal Loss. We then propose Generalized Focal Loss (GFL) that generalizes Focal Loss from its discrete form to the continuous version for successful optimization. On COCO test-dev, GFL achieves 45.0% AP using ResNet-101 backbone, surpassing state-of-the-art SAPD (43.5%) and ATSS (43.6%) with higher or comparable inference speed, under the same backbone and training settings. Notably, our best model can achieve a single-model single-scale AP of 48.2%, at 10 FPS on a single 2080Ti GPU.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143887865-44dc384d-ba0d-44e8-b3d7-d5fa837838cf.png"/>
+</div>
+
+## Results and Models
+
+|     Backbone      |  Style  | Lr schd | Multi-scale Training | Inf time (fps) | box AP |                                                            Config                                                             |                                                                                                                                                                                   Download                                                                                                                                                                                   |
+| :---------------: | :-----: | :-----: | :------------------: | :------------: | :----: | :---------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|       R-50        | pytorch |   1x    |          No          |      19.5      |  40.2  |              [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gfl/gfl_r50_fpn_1x_coco.py)               |                                                       [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_1x_coco/gfl_r50_fpn_1x_coco_20200629_121244-25944287.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_1x_coco/gfl_r50_fpn_1x_coco_20200629_121244.log.json)                                                       |
+|       R-50        | pytorch |   2x    |         Yes          |      19.5      |  42.9  |          [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gfl/gfl_r50_fpn_mstrain_2x_coco.py)           |                                       [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_mstrain_2x_coco/gfl_r50_fpn_mstrain_2x_coco_20200629_213802-37bb1edc.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_mstrain_2x_coco/gfl_r50_fpn_mstrain_2x_coco_20200629_213802.log.json)                                       |
+|       R-101       | pytorch |   2x    |         Yes          |      14.7      |  44.7  |          [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gfl/gfl_r101_fpn_mstrain_2x_coco.py)          |                                     [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_mstrain_2x_coco/gfl_r101_fpn_mstrain_2x_coco_20200629_200126-dd12f847.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_mstrain_2x_coco/gfl_r101_fpn_mstrain_2x_coco_20200629_200126.log.json)                                     |
+|    R-101-dcnv2    | pytorch |   2x    |         Yes          |      12.9      |  47.1  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20200630_102002-134b07df.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20200630_102002.log.json)             |
+|    X-101-32x4d    | pytorch |   2x    |         Yes          |      12.1      |  45.9  |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco.py)       |                         [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco/gfl_x101_32x4d_fpn_mstrain_2x_coco_20200630_102002-50c1ffdb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco/gfl_x101_32x4d_fpn_mstrain_2x_coco_20200630_102002.log.json)                         |
+| X-101-32x4d-dcnv2 | pytorch |   2x    |         Yes          |      10.7      |  48.1  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco_20200630_102002-14a2bf25.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco_20200630_102002.log.json) |
+
+\[1\] *1x and 2x mean the model is trained for 90K and 180K iterations, respectively.* \
+\[2\] *All results are obtained with a single model and without any test-time data augmentation such as multi-scale testing or flipping.* \
+\[3\] *`dcnv2` denotes deformable convolutional networks v2.* \
+\[4\] *FPS is tested with a single GeForce RTX 2080Ti GPU, using a batch size of 1.*
+
+## Citation
+
+We provide config files to reproduce the object detection results in the paper [Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection](https://arxiv.org/abs/2006.04388).
+
+```latex
+@article{li2020generalized,
+  title={Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection},
+  author={Li, Xiang and Wang, Wenhai and Wu, Lijun and Chen, Shuo and Hu, Xiaolin and Li, Jun and Tang, Jinhui and Yang, Jian},
+  journal={arXiv preprint arXiv:2006.04388},
+  year={2020}
+}
+```
diff --git a/configs/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py b/configs/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py
new file mode 100755
index 0000000..b72c2b6
--- /dev/null
+++ b/configs/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py
@@ -0,0 +1,15 @@
+_base_ = './gfl_r50_fpn_mstrain_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/gfl/gfl_r101_fpn_mstrain_2x_coco.py b/configs/gfl/gfl_r101_fpn_mstrain_2x_coco.py
new file mode 100755
index 0000000..e33b5c0
--- /dev/null
+++ b/configs/gfl/gfl_r101_fpn_mstrain_2x_coco.py
@@ -0,0 +1,13 @@
+_base_ = './gfl_r50_fpn_mstrain_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/gfl/gfl_r50_fpn_1x_coco.py b/configs/gfl/gfl_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..cfd4b02
--- /dev/null
+++ b/configs/gfl/gfl_r50_fpn_1x_coco.py
@@ -0,0 +1,57 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='GFL',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='GFLHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        loss_cls=dict(
+            type='QualityFocalLoss',
+            use_sigmoid=True,
+            beta=2.0,
+            loss_weight=1.0),
+        loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25),
+        reg_max=16,
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/gfl/gfl_r50_fpn_mstrain_2x_coco.py b/configs/gfl/gfl_r50_fpn_mstrain_2x_coco.py
new file mode 100755
index 0000000..b8be601
--- /dev/null
+++ b/configs/gfl/gfl_r50_fpn_mstrain_2x_coco.py
@@ -0,0 +1,22 @@
+_base_ = './gfl_r50_fpn_1x_coco.py'
+# learning policy
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+# multi-scale training
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 480), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+data = dict(train=dict(pipeline=train_pipeline))
diff --git a/configs/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py b/configs/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py
new file mode 100755
index 0000000..2539807
--- /dev/null
+++ b/configs/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './gfl_r50_fpn_mstrain_2x_coco.py'
+model = dict(
+    type='GFL',
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, False, True, True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco.py b/configs/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco.py
new file mode 100755
index 0000000..effda19
--- /dev/null
+++ b/configs/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './gfl_r50_fpn_mstrain_2x_coco.py'
+model = dict(
+    type='GFL',
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/gfl/metafile.yml b/configs/gfl/metafile.yml
new file mode 100755
index 0000000..8f049c6
--- /dev/null
+++ b/configs/gfl/metafile.yml
@@ -0,0 +1,134 @@
+Collections:
+  - Name: Generalized Focal Loss
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Generalized Focal Loss
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/2006.04388
+      Title: 'Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection'
+    README: configs/gfl/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/detectors/gfl.py#L6
+      Version: v2.2.0
+
+Models:
+  - Name: gfl_r50_fpn_1x_coco
+    In Collection: Generalized Focal Loss
+    Config: configs/gfl/gfl_r50_fpn_1x_coco.py
+    Metadata:
+      inference time (ms/im):
+        - value: 51.28
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_1x_coco/gfl_r50_fpn_1x_coco_20200629_121244-25944287.pth
+
+  - Name: gfl_r50_fpn_mstrain_2x_coco
+    In Collection: Generalized Focal Loss
+    Config: configs/gfl/gfl_r50_fpn_mstrain_2x_coco.py
+    Metadata:
+      inference time (ms/im):
+        - value: 51.28
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_mstrain_2x_coco/gfl_r50_fpn_mstrain_2x_coco_20200629_213802-37bb1edc.pth
+
+  - Name: gfl_r101_fpn_mstrain_2x_coco
+    In Collection: Generalized Focal Loss
+    Config: configs/gfl/gfl_r101_fpn_mstrain_2x_coco.py
+    Metadata:
+      inference time (ms/im):
+        - value: 68.03
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_mstrain_2x_coco/gfl_r101_fpn_mstrain_2x_coco_20200629_200126-dd12f847.pth
+
+  - Name: gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco
+    In Collection: Generalized Focal Loss
+    Config: configs/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py
+    Metadata:
+      inference time (ms/im):
+        - value: 77.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20200630_102002-134b07df.pth
+
+  - Name: gfl_x101_32x4d_fpn_mstrain_2x_coco
+    In Collection: Generalized Focal Loss
+    Config: configs/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco.py
+    Metadata:
+      inference time (ms/im):
+        - value: 82.64
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco/gfl_x101_32x4d_fpn_mstrain_2x_coco_20200630_102002-50c1ffdb.pth
+
+  - Name: gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco
+    In Collection: Generalized Focal Loss
+    Config: configs/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py
+    Metadata:
+      inference time (ms/im):
+        - value: 93.46
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 48.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco_20200630_102002-14a2bf25.pth
diff --git a/configs/ghm/README.md b/configs/ghm/README.md
new file mode 100755
index 0000000..cf9fb73
--- /dev/null
+++ b/configs/ghm/README.md
@@ -0,0 +1,33 @@
+# GHM
+
+> [Gradient Harmonized Single-stage Detector](https://arxiv.org/abs/1811.05181)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Despite the great success of two-stage detectors, single-stage detector is still a more elegant and efficient way, yet suffers from the two well-known disharmonies during training, i.e. the huge difference in quantity between positive and negative examples as well as between easy and hard examples. In this work, we first point out that the essential effect of the two disharmonies can be summarized in term of the gradient. Further, we propose a novel gradient harmonizing mechanism (GHM) to be a hedging for the disharmonies. The philosophy behind GHM can be easily embedded into both classification loss function like cross-entropy (CE) and regression loss function like smooth-L1 (SL1) loss. To this end, two novel loss functions called GHM-C and GHM-R are designed to balancing the gradient flow for anchor classification and bounding box refinement, respectively. Ablation study on MS COCO demonstrates that without laborious hyper-parameter tuning, both GHM-C and GHM-R can bring substantial improvement for single-stage detector. Without any whistles and bells, our model achieves 41.6 mAP on COCO test-dev set which surpasses the state-of-the-art method, Focal Loss (FL) + SL1, by 0.8.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143889057-0341f32b-1291-4b9a-8444-52ad266ae709.png"/>
+</div>
+
+## Results and Models
+
+|    Backbone     |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                       Config                                                        |                                                                                                                                                           Download                                                                                                                                                            |
+| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50-FPN     | pytorch |   1x    |   4.0    |      3.3       |  37.0  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ghm/retinanet_ghm_r50_fpn_1x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r50_fpn_1x_coco/retinanet_ghm_r50_fpn_1x_coco_20200130-a437fda3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r50_fpn_1x_coco/retinanet_ghm_r50_fpn_1x_coco_20200130_004213.log.json)               |
+|    R-101-FPN    | pytorch |   1x    |   6.0    |      4.4       |  39.1  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ghm/retinanet_ghm_r101_fpn_1x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r101_fpn_1x_coco/retinanet_ghm_r101_fpn_1x_coco_20200130-c148ee8f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r101_fpn_1x_coco/retinanet_ghm_r101_fpn_1x_coco_20200130_145259.log.json)             |
+| X-101-32x4d-FPN | pytorch |   1x    |   7.2    |      5.1       |  40.7  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco/retinanet_ghm_x101_32x4d_fpn_1x_coco_20200131-e4333bd0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco/retinanet_ghm_x101_32x4d_fpn_1x_coco_20200131_113653.log.json) |
+| X-101-64x4d-FPN | pytorch |   1x    |   10.3   |      5.2       |  41.4  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco/retinanet_ghm_x101_64x4d_fpn_1x_coco_20200131-dd381cef.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco/retinanet_ghm_x101_64x4d_fpn_1x_coco_20200131_113723.log.json) |
+
+## Citation
+
+```latex
+@inproceedings{li2019gradient,
+  title={Gradient Harmonized Single-stage Detector},
+  author={Li, Buyu and Liu, Yu and Wang, Xiaogang},
+  booktitle={AAAI Conference on Artificial Intelligence},
+  year={2019}
+}
+```
diff --git a/configs/ghm/metafile.yml b/configs/ghm/metafile.yml
new file mode 100755
index 0000000..b4f488c
--- /dev/null
+++ b/configs/ghm/metafile.yml
@@ -0,0 +1,101 @@
+Collections:
+  - Name: GHM
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - GHM-C
+        - GHM-R
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1811.05181
+      Title: 'Gradient Harmonized Single-stage Detector'
+    README: configs/ghm/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/losses/ghm_loss.py#L21
+      Version: v2.0.0
+
+Models:
+  - Name: retinanet_ghm_r50_fpn_1x_coco
+    In Collection: GHM
+    Config: configs/ghm/retinanet_ghm_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.0
+      inference time (ms/im):
+        - value: 303.03
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r50_fpn_1x_coco/retinanet_ghm_r50_fpn_1x_coco_20200130-a437fda3.pth
+
+  - Name: retinanet_ghm_r101_fpn_1x_coco
+    In Collection: GHM
+    Config: configs/ghm/retinanet_ghm_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 227.27
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r101_fpn_1x_coco/retinanet_ghm_r101_fpn_1x_coco_20200130-c148ee8f.pth
+
+  - Name: retinanet_ghm_x101_32x4d_fpn_1x_coco
+    In Collection: GHM
+    Config: configs/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.2
+      inference time (ms/im):
+        - value: 196.08
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco/retinanet_ghm_x101_32x4d_fpn_1x_coco_20200131-e4333bd0.pth
+
+  - Name: retinanet_ghm_x101_64x4d_fpn_1x_coco
+    In Collection: GHM
+    Config: configs/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.3
+      inference time (ms/im):
+        - value: 192.31
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco/retinanet_ghm_x101_64x4d_fpn_1x_coco_20200131-dd381cef.pth
diff --git a/configs/ghm/retinanet_ghm_r101_fpn_1x_coco.py b/configs/ghm/retinanet_ghm_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..aaf6fc2
--- /dev/null
+++ b/configs/ghm/retinanet_ghm_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './retinanet_ghm_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/ghm/retinanet_ghm_r50_fpn_1x_coco.py b/configs/ghm/retinanet_ghm_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..61b9751
--- /dev/null
+++ b/configs/ghm/retinanet_ghm_r50_fpn_1x_coco.py
@@ -0,0 +1,19 @@
+_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    bbox_head=dict(
+        loss_cls=dict(
+            _delete_=True,
+            type='GHMC',
+            bins=30,
+            momentum=0.75,
+            use_sigmoid=True,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            _delete_=True,
+            type='GHMR',
+            mu=0.02,
+            bins=10,
+            momentum=0.7,
+            loss_weight=10.0)))
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco.py b/configs/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..cd2e4cc
--- /dev/null
+++ b/configs/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './retinanet_ghm_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco.py b/configs/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..b6107d8
--- /dev/null
+++ b/configs/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './retinanet_ghm_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/gn+ws/README.md b/configs/gn+ws/README.md
new file mode 100755
index 0000000..184bed3
--- /dev/null
+++ b/configs/gn+ws/README.md
@@ -0,0 +1,54 @@
+# GN + WS
+
+> [Weight Standardization](https://arxiv.org/abs/1903.10520)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Batch Normalization (BN) has become an out-of-box technique to improve deep network training. However, its effectiveness is limited for micro-batch training, i.e., each GPU typically has only 1-2 images for training, which is inevitable for many computer vision tasks, e.g., object detection and semantic segmentation, constrained by memory consumption. To address this issue, we propose Weight Standardization (WS) and Batch-Channel Normalization (BCN) to bring two success factors of BN into micro-batch training: 1) the smoothing effects on the loss landscape and 2) the ability to avoid harmful elimination singularities along the training trajectory. WS standardizes the weights in convolutional layers to smooth the loss landscape by reducing the Lipschitz constants of the loss and the gradients; BCN combines batch and channel normalizations and leverages estimated statistics of the activations in convolutional layers to keep networks away from elimination singularities. We validate WS and BCN on comprehensive computer vision tasks, including image classification, object detection, instance segmentation, video recognition and semantic segmentation. All experimental results consistently show that WS and BCN improve micro-batch training significantly. Moreover, using WS and BCN with micro-batch training is even able to match or outperform the performances of BN with large-batch training.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143889309-b1f3487d-abfe-4615-befc-79384bc8b65c.png" height="200"/>
+</div>
+
+## Results and Models
+
+Faster R-CNN
+
+|    Backbone     |  Style  | Normalization | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                             Config                                                              |                                                                                                                                                                               Download                                                                                                                                                                                |
+| :-------------: | :-----: | :-----------: | :-----: | :------: | :------------: | :----: | :-----: | :-----------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50-FPN     | pytorch |     GN+WS     |   1x    |   5.9    |      11.7      |  39.7  |    -    |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco/faster_rcnn_r50_fpn_gn_ws-all_1x_coco_20200130-613d9fe2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco/faster_rcnn_r50_fpn_gn_ws-all_1x_coco_20200130_210936.log.json)               |
+|    R-101-FPN    | pytorch |     GN+WS     |   1x    |   8.9    |      9.0       |  41.7  |    -    |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco/faster_rcnn_r101_fpn_gn_ws-all_1x_coco_20200205-a93b0d75.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco/faster_rcnn_r101_fpn_gn_ws-all_1x_coco_20200205_232146.log.json)             |
+| X-50-32x4d-FPN  | pytorch |     GN+WS     |   1x    |   7.0    |      10.3      |  40.7  |    -    | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco_20200203-839c5d9d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco_20200203_220113.log.json)   |
+| X-101-32x4d-FPN | pytorch |     GN+WS     |   1x    |   10.8   |      7.6       |  42.1  |    -    | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco_20200212-27da1bc2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco_20200212_195302.log.json) |
+
+Mask R-CNN
+
+|    Backbone     |  Style  | Normalization |  Lr schd  | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                                Config                                                                |                                                                                                                                                                                         Download                                                                                                                                                                                          |
+| :-------------: | :-----: | :-----------: | :-------: | :------: | :------------: | :----: | :-----: | :----------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50-FPN     | pytorch |     GN+WS     |    2x     |   7.3    |      10.5      |  40.6  |  36.6   |        [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py)        |                             [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco/mask_rcnn_r50_fpn_gn_ws-all_2x_coco_20200226-16acb762.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco/mask_rcnn_r50_fpn_gn_ws-all_2x_coco_20200226_062128.log.json)                             |
+|    R-101-FPN    | pytorch |     GN+WS     |    2x     |   10.3   |      8.6       |  42.0  |  37.7   |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco.py)        |                           [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco/mask_rcnn_r101_fpn_gn_ws-all_2x_coco_20200212-ea357cd9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco/mask_rcnn_r101_fpn_gn_ws-all_2x_coco_20200212_213627.log.json)                           |
+| X-50-32x4d-FPN  | pytorch |     GN+WS     |    2x     |   8.4    |      9.3       |  41.1  |  37.0   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco.py)     |                 [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco_20200216-649fdb6f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco_20200216_201500.log.json)                 |
+| X-101-32x4d-FPN | pytorch |     GN+WS     |    2x     |   12.2   |      7.1       |  42.1  |  37.9   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco_20200319-33fb95b5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco_20200319_104101.log.json)               |
+|    R-50-FPN     | pytorch |     GN+WS     | 20-23-24e |   7.3    |       -        |  41.1  |  37.1   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco_20200213-487d1283.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco_20200213_035123.log.json)               |
+|    R-101-FPN    | pytorch |     GN+WS     | 20-23-24e |   10.3   |       -        |  43.1  |  38.6   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco_20200213-57b5a50f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco_20200213_130142.log.json)             |
+| X-50-32x4d-FPN  | pytorch |     GN+WS     | 20-23-24e |   8.4    |       -        |  42.1  |  38.0   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200226-969bcb2c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200226_093732.log.json)   |
+| X-101-32x4d-FPN | pytorch |     GN+WS     | 20-23-24e |   12.2   |       -        |  42.7  |  38.5   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200316-e6cd35ef.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200316_013741.log.json) |
+
+Note:
+
+- GN+WS requires about 5% more memory than GN, and it is only 5% slower than GN.
+- In the paper, a 20-23-24e lr schedule is used instead of 2x.
+- The X-50-GN and X-101-GN pretrained models are also shared by the authors.
+
+## Citation
+
+```latex
+@article{weightstandardization,
+  author    = {Siyuan Qiao and Huiyu Wang and Chenxi Liu and Wei Shen and Alan Yuille},
+  title     = {Weight Standardization},
+  journal   = {arXiv preprint arXiv:1903.10520},
+  year      = {2019},
+}
+```
diff --git a/configs/gn+ws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco.py b/configs/gn+ws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco.py
new file mode 100755
index 0000000..cd2cb2b
--- /dev/null
+++ b/configs/gn+ws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://jhu/resnet101_gn_ws')))
diff --git a/configs/gn+ws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py b/configs/gn+ws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py
new file mode 100755
index 0000000..1b326b8
--- /dev/null
+++ b/configs/gn+ws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+conv_cfg = dict(type='ConvWS')
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        conv_cfg=conv_cfg,
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://jhu/resnet50_gn_ws')),
+    neck=dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg)))
diff --git a/configs/gn+ws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco.py b/configs/gn+ws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco.py
new file mode 100755
index 0000000..f64ae89
--- /dev/null
+++ b/configs/gn+ws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py'
+conv_cfg = dict(type='ConvWS')
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch',
+        conv_cfg=conv_cfg,
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://jhu/resnext101_32x4d_gn_ws')))
diff --git a/configs/gn+ws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco.py b/configs/gn+ws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco.py
new file mode 100755
index 0000000..246851b
--- /dev/null
+++ b/configs/gn+ws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py'
+conv_cfg = dict(type='ConvWS')
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=50,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch',
+        conv_cfg=conv_cfg,
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://jhu/resnext50_32x4d_gn_ws')))
diff --git a/configs/gn+ws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco.py b/configs/gn+ws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco.py
new file mode 100755
index 0000000..a790d93
--- /dev/null
+++ b/configs/gn+ws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco.py
@@ -0,0 +1,4 @@
+_base_ = './mask_rcnn_r101_fpn_gn_ws-all_2x_coco.py'
+# learning policy: 20-23-24e schedule (LR steps at epochs 20 and 23)
+lr_config = dict(step=[20, 23])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/gn+ws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco.py b/configs/gn+ws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco.py
new file mode 100755
index 0000000..a9fa6a2
--- /dev/null
+++ b/configs/gn+ws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://jhu/resnet101_gn_ws')))
diff --git a/configs/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco.py b/configs/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco.py
new file mode 100755
index 0000000..5516808
--- /dev/null
+++ b/configs/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco.py
@@ -0,0 +1,4 @@
+_base_ = './mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py'
+# learning policy: 20-23-24e schedule (LR steps at epochs 20 and 23)
+lr_config = dict(step=[20, 23])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py b/configs/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py
new file mode 100755
index 0000000..63be60f
--- /dev/null
+++ b/configs/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py
@@ -0,0 +1,20 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+conv_cfg = dict(type='ConvWS')
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        conv_cfg=conv_cfg,
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://jhu/resnet50_gn_ws')),
+    neck=dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg),
+        mask_head=dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg)))
+# learning policy
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/gn+ws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco.py b/configs/gn+ws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco.py
new file mode 100755
index 0000000..cfa14c9
--- /dev/null
+++ b/configs/gn+ws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco.py
@@ -0,0 +1,4 @@
+_base_ = './mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco.py'
+# learning policy: 20-23-24e schedule (LR steps at epochs 20 and 23)
+lr_config = dict(step=[20, 23])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/gn+ws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco.py b/configs/gn+ws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco.py
new file mode 100755
index 0000000..6498b03
--- /dev/null
+++ b/configs/gn+ws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco.py
@@ -0,0 +1,19 @@
+_base_ = './mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py'
+# model settings
+conv_cfg = dict(type='ConvWS')
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch',
+        conv_cfg=conv_cfg,
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://jhu/resnext101_32x4d_gn_ws')))
diff --git a/configs/gn+ws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco.py b/configs/gn+ws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco.py
new file mode 100755
index 0000000..79ce0ad
--- /dev/null
+++ b/configs/gn+ws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco.py
@@ -0,0 +1,4 @@
+_base_ = './mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco.py'
+# learning policy: 20-23-24e schedule (LR steps at epochs 20 and 23)
+lr_config = dict(step=[20, 23])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/gn+ws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco.py b/configs/gn+ws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco.py
new file mode 100755
index 0000000..7fac317
--- /dev/null
+++ b/configs/gn+ws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco.py
@@ -0,0 +1,19 @@
+_base_ = './mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py'
+# model settings
+conv_cfg = dict(type='ConvWS')
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=50,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch',
+        conv_cfg=conv_cfg,
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://jhu/resnext50_32x4d_gn_ws')))
diff --git a/configs/gn+ws/metafile.yml b/configs/gn+ws/metafile.yml
new file mode 100755
index 0000000..6cfcb07
--- /dev/null
+++ b/configs/gn+ws/metafile.yml
@@ -0,0 +1,263 @@
+Collections:
+  - Name: Weight Standardization
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Group Normalization
+        - Weight Standardization
+    Paper:
+      URL: https://arxiv.org/abs/1903.10520
+      Title: 'Weight Standardization'
+    README: configs/gn+ws/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/configs/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py
+      Version: v2.0.0
+
+Models:
+  - Name: faster_rcnn_r50_fpn_gn_ws-all_1x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn+ws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.9
+      inference time (ms/im):
+        - value: 85.47
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco/faster_rcnn_r50_fpn_gn_ws-all_1x_coco_20200130-613d9fe2.pth
+
+  - Name: faster_rcnn_r101_fpn_gn_ws-all_1x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn+ws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.9
+      inference time (ms/im):
+        - value: 111.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco/faster_rcnn_r101_fpn_gn_ws-all_1x_coco_20200205-a93b0d75.pth
+
+  - Name: faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn+ws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 97.09
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco_20200203-839c5d9d.pth
+
+  - Name: faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn+ws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.8
+      inference time (ms/im):
+        - value: 131.58
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco_20200212-27da1bc2.pth
+
+  - Name: mask_rcnn_r50_fpn_gn_ws-all_2x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.3
+      inference time (ms/im):
+        - value: 95.24
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco/mask_rcnn_r50_fpn_gn_ws-all_2x_coco_20200226-16acb762.pth
+
+  - Name: mask_rcnn_r101_fpn_gn_ws-all_2x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn+ws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.3
+      inference time (ms/im):
+        - value: 116.28
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco/mask_rcnn_r101_fpn_gn_ws-all_2x_coco_20200212-ea357cd9.pth
+
+  - Name: mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn+ws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco.py
+    Metadata:
+      Training Memory (GB): 8.4
+      inference time (ms/im):
+        - value: 107.53
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco_20200216-649fdb6f.pth
+
+  - Name: mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn+ws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco.py
+    Metadata:
+      Training Memory (GB): 12.2
+      inference time (ms/im):
+        - value: 140.85
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco_20200319-33fb95b5.pth
+
+  - Name: mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco
+    In Collection: Weight Standardization
+    Config: configs/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco.py
+    Metadata:
+      Training Memory (GB): 7.3
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco_20200213-487d1283.pth
+
+  - Name: mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco
+    In Collection: Weight Standardization
+    Config: configs/gn+ws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco.py
+    Metadata:
+      Training Memory (GB): 10.3
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  38.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco_20200213-57b5a50f.pth
+
+  - Name: mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco
+    In Collection: Weight Standardization
+    Config: configs/gn+ws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco.py
+    Metadata:
+      Training Memory (GB): 8.4
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  38.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200226-969bcb2c.pth
+
+  - Name: mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco
+    In Collection: Weight Standardization
+    Config: configs/gn+ws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco.py
+    Metadata:
+      Training Memory (GB): 12.2
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200316-e6cd35ef.pth
diff --git a/configs/gn/README.md b/configs/gn/README.md
new file mode 100755
index 0000000..9bb2888
--- /dev/null
+++ b/configs/gn/README.md
@@ -0,0 +1,41 @@
+# GN
+
+> [Group Normalization](https://arxiv.org/abs/1803.08494)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Batch Normalization (BN) is a milestone technique in the development of deep learning, enabling various networks to train. However, normalizing along the batch dimension introduces problems --- BN's error increases rapidly when the batch size becomes smaller, caused by inaccurate batch statistics estimation. This limits BN's usage for training larger models and transferring features to computer vision tasks including detection, segmentation, and video, which require small batches constrained by memory consumption. In this paper, we present Group Normalization (GN) as a simple alternative to BN. GN divides the channels into groups and computes within each group the mean and variance for normalization. GN's computation is independent of batch sizes, and its accuracy is stable in a wide range of batch sizes. On ResNet-50 trained in ImageNet, GN has 10.6% lower error than its BN counterpart when using a batch size of 2; when using typical batch sizes, GN is comparably good with BN and outperforms other normalization variants. Moreover, GN can be naturally transferred from pre-training to fine-tuning. GN can outperform its BN-based counterparts for object detection and segmentation in COCO, and for video classification in Kinetics, showing that GN can effectively replace the powerful BN in a variety of tasks. GN can be easily implemented by a few lines of code in modern libraries.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143889171-d4660307-3a4f-4ac2-a0ad-7d17c2f045b1.png" height="200"/>
+</div>
+
+## Results and Models
+
+|   Backbone    |   model    | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                         Config                                                         |                                                                                                                                                                  Download                                                                                                                                                                   |
+| :-----------: | :--------: | :-----: | :------: | :------------: | :----: | :-----: | :--------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R-50-FPN (d)  | Mask R-CNN |   2x    |   7.1    |      11.0      |  40.2  |  36.4   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn/mask_rcnn_r50_fpn_gn-all_2x_coco.py)     |                 [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_2x_coco/mask_rcnn_r50_fpn_gn-all_2x_coco_20200206-8eee02a6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_2x_coco/mask_rcnn_r50_fpn_gn-all_2x_coco_20200206_050355.log.json)                 |
+| R-50-FPN (d)  | Mask R-CNN |   3x    |   7.1    |       -        |  40.5  |  36.7   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn/mask_rcnn_r50_fpn_gn-all_3x_coco.py)     |                 [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_3x_coco/mask_rcnn_r50_fpn_gn-all_3x_coco_20200214-8b23b1e5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_3x_coco/mask_rcnn_r50_fpn_gn-all_3x_coco_20200214_063512.log.json)                 |
+| R-101-FPN (d) | Mask R-CNN |   2x    |   9.9    |      9.0       |  41.9  |  37.6   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn/mask_rcnn_r101_fpn_gn-all_2x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_2x_coco/mask_rcnn_r101_fpn_gn-all_2x_coco_20200205-d96b1b50.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_2x_coco/mask_rcnn_r101_fpn_gn-all_2x_coco_20200205_234402.log.json)               |
+| R-101-FPN (d) | Mask R-CNN |   3x    |   9.9    |       -        |  42.1  |  38.0   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn/mask_rcnn_r101_fpn_gn-all_3x_coco.py)     |           [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_3x_coco/mask_rcnn_r101_fpn_gn-all_3x_coco_20200513_181609-0df864f4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_3x_coco/mask_rcnn_r101_fpn_gn-all_3x_coco_20200513_181609.log.json)            |
+| R-50-FPN (c)  | Mask R-CNN |   2x    |   7.1    |      10.9      |  40.0  |  36.1   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco_20200207-20d3e849.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco_20200207_225832.log.json) |
+| R-50-FPN (c)  | Mask R-CNN |   3x    |   7.1    |       -        |  40.1  |  36.2   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco_20200225-542aefbc.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco_20200225_235135.log.json) |
+
+**Notes:**
+
+- (d) means pretrained model converted from Detectron, and (c) means the contributed model pretrained by [@thangvubk](https://github.com/thangvubk).
+- The `3x` schedule is epoch \[28, 34, 36\].
+- **Memory and Train/Inf time are outdated.**
+
+## Citation
+
+```latex
+@inproceedings{wu2018group,
+  title={Group Normalization},
+  author={Wu, Yuxin and He, Kaiming},
+  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+  year={2018}
+}
+```
diff --git a/configs/gn/mask_rcnn_r101_fpn_gn-all_2x_coco.py b/configs/gn/mask_rcnn_r101_fpn_gn-all_2x_coco.py
new file mode 100755
index 0000000..a505ba0
--- /dev/null
+++ b/configs/gn/mask_rcnn_r101_fpn_gn-all_2x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './mask_rcnn_r50_fpn_gn-all_2x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron/resnet101_gn')))
diff --git a/configs/gn/mask_rcnn_r101_fpn_gn-all_3x_coco.py b/configs/gn/mask_rcnn_r101_fpn_gn-all_3x_coco.py
new file mode 100755
index 0000000..12a9d17
--- /dev/null
+++ b/configs/gn/mask_rcnn_r101_fpn_gn-all_3x_coco.py
@@ -0,0 +1,5 @@
+_base_ = './mask_rcnn_r101_fpn_gn-all_2x_coco.py'
+
+# learning policy
+lr_config = dict(step=[28, 34])
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/configs/gn/mask_rcnn_r50_fpn_gn-all_2x_coco.py b/configs/gn/mask_rcnn_r50_fpn_gn-all_2x_coco.py
new file mode 100755
index 0000000..1de7d98
--- /dev/null
+++ b/configs/gn/mask_rcnn_r50_fpn_gn-all_2x_coco.py
@@ -0,0 +1,49 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron/resnet50_gn')),
+    neck=dict(norm_cfg=norm_cfg),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=norm_cfg),
+        mask_head=dict(norm_cfg=norm_cfg)))
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# learning policy
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/gn/mask_rcnn_r50_fpn_gn-all_3x_coco.py b/configs/gn/mask_rcnn_r50_fpn_gn-all_3x_coco.py
new file mode 100755
index 0000000..f917719
--- /dev/null
+++ b/configs/gn/mask_rcnn_r50_fpn_gn-all_3x_coco.py
@@ -0,0 +1,5 @@
+_base_ = './mask_rcnn_r50_fpn_gn-all_2x_coco.py'
+
+# learning policy
+lr_config = dict(step=[28, 34])
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/configs/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco.py b/configs/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco.py
new file mode 100755
index 0000000..2f430fd
--- /dev/null
+++ b/configs/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco.py
@@ -0,0 +1,17 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://contrib/resnet50_gn')),
+    neck=dict(norm_cfg=norm_cfg),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=norm_cfg),
+        mask_head=dict(norm_cfg=norm_cfg)))
+# learning policy
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco.py b/configs/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco.py
new file mode 100755
index 0000000..66834f0
--- /dev/null
+++ b/configs/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco.py
@@ -0,0 +1,5 @@
+_base_ = './mask_rcnn_r50_fpn_gn-all_contrib_2x_coco.py'
+
+# learning policy
+lr_config = dict(step=[28, 34])
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/configs/gn/metafile.yml b/configs/gn/metafile.yml
new file mode 100755
index 0000000..4a1ecae
--- /dev/null
+++ b/configs/gn/metafile.yml
@@ -0,0 +1,162 @@
+Collections:
+  - Name: Group Normalization
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Group Normalization
+    Paper:
+      URL: https://arxiv.org/abs/1803.08494
+      Title: 'Group Normalization'
+    README: configs/gn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/configs/gn/mask_rcnn_r50_fpn_gn-all_2x_coco.py
+      Version: v2.0.0
+
+Models:
+  - Name: mask_rcnn_r50_fpn_gn-all_2x_coco
+    In Collection: Group Normalization
+    Config: configs/gn/mask_rcnn_r50_fpn_gn-all_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      inference time (ms/im):
+        - value: 90.91
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_2x_coco/mask_rcnn_r50_fpn_gn-all_2x_coco_20200206-8eee02a6.pth
+
+  - Name: mask_rcnn_r50_fpn_gn-all_3x_coco
+    In Collection: Group Normalization
+    Config: configs/gn/mask_rcnn_r50_fpn_gn-all_3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      inference time (ms/im):
+        - value: 90.91
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_3x_coco/mask_rcnn_r50_fpn_gn-all_3x_coco_20200214-8b23b1e5.pth
+
+  - Name: mask_rcnn_r101_fpn_gn-all_2x_coco
+    In Collection: Group Normalization
+    Config: configs/gn/mask_rcnn_r101_fpn_gn-all_2x_coco.py
+    Metadata:
+      Training Memory (GB): 9.9
+      inference time (ms/im):
+        - value: 111.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_2x_coco/mask_rcnn_r101_fpn_gn-all_2x_coco_20200205-d96b1b50.pth
+
+  - Name: mask_rcnn_r101_fpn_gn-all_3x_coco
+    In Collection: Group Normalization
+    Config: configs/gn/mask_rcnn_r101_fpn_gn-all_3x_coco.py
+    Metadata:
+      Training Memory (GB): 9.9
+      inference time (ms/im):
+        - value: 111.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  38.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_3x_coco/mask_rcnn_r101_fpn_gn-all_3x_coco_20200513_181609-0df864f4.pth
+
+  - Name: mask_rcnn_r50_fpn_gn-all_contrib_2x_coco
+    In Collection: Group Normalization
+    Config: configs/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      inference time (ms/im):
+        - value: 91.74
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco_20200207-20d3e849.pth
+
+  - Name: mask_rcnn_r50_fpn_gn-all_contrib_3x_coco
+    In Collection: Group Normalization
+    Config: configs/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      inference time (ms/im):
+        - value: 91.74
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco_20200225-542aefbc.pth
diff --git a/configs/grid_rcnn/README.md b/configs/grid_rcnn/README.md
new file mode 100755
index 0000000..e844021
--- /dev/null
+++ b/configs/grid_rcnn/README.md
@@ -0,0 +1,47 @@
+# Grid R-CNN
+
+> [Grid R-CNN](https://arxiv.org/abs/1811.12030)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+This paper proposes a novel object detection framework named Grid R-CNN, which adopts a grid guided localization mechanism for accurate object detection. Different from the traditional regression based methods, the Grid R-CNN captures the spatial information explicitly and enjoys the position sensitive property of fully convolutional architecture. Instead of using only two independent points, we design a multi-point supervision formulation to encode more clues in order to reduce the impact of inaccurate prediction of specific points. To take the full advantage of the correlation of points in a grid, we propose a two-stage information fusion strategy to fuse feature maps of neighbor grid points. The grid guided localization approach is easy to be extended to different state-of-the-art detection frameworks. Grid R-CNN leads to high quality object localization, and experiments demonstrate that it achieves a 4.1% AP gain at IoU=0.8 and a 10.0% AP gain at IoU=0.9 on COCO benchmark compared to Faster R-CNN with Res50 backbone and FPN architecture.
+
+Grid R-CNN is a well-performing object detection framework. It transforms the traditional box offset regression problem into a grid point estimation problem. With the guidance of the grid points, it can obtain high-quality localization results. However, the speed of Grid R-CNN is not so satisfactory. In this technical report we present Grid R-CNN Plus, a better and faster version of Grid R-CNN. We have made several updates that significantly speed up the framework and simultaneously improve the accuracy. On COCO dataset, the Res50-FPN based Grid R-CNN Plus detector achieves an mAP of 40.4%, outperforming the baseline on the same model by 3.0 points with similar inference time.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143890379-5d9e6233-0533-48b4-88b9-bc33abbd9f82.png"/>
+</div>
+
+## Results and Models
+
+|  Backbone   | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                            Config                                                             |                                                                                                                                                                         Download                                                                                                                                                                          |
+| :---------: | :-----: | :------: | :------------: | :----: | :---------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50     |   2x    |   5.1    |      15.0      |  40.4  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco/grid_rcnn_r50_fpn_gn-head_2x_coco_20200130-6cca8223.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco/grid_rcnn_r50_fpn_gn-head_2x_coco_20200130_221140.log.json)               |
+|    R-101    |   2x    |   7.0    |      12.6      |  41.5  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco/grid_rcnn_r101_fpn_gn-head_2x_coco_20200309-d6eca030.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco/grid_rcnn_r101_fpn_gn-head_2x_coco_20200309_164224.log.json)             |
+| X-101-32x4d |   2x    |   8.3    |      10.8      |  42.9  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco_20200130-d8f0e3ff.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco_20200130_215413.log.json) |
+| X-101-64x4d |   2x    |   11.3   |      7.7       |  43.0  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco_20200204-ec76a754.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco_20200204_080641.log.json) |
+
+**Notes:**
+
+- All models are trained with 8 GPUs instead of 32 GPUs in the original paper.
+- The warming up lasts for 1 epoch and `2x` here indicates 25 epochs.
+
+## Citation
+
+```latex
+@inproceedings{lu2019grid,
+  title={Grid r-cnn},
+  author={Lu, Xin and Li, Buyu and Yue, Yuxin and Li, Quanquan and Yan, Junjie},
+  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+  year={2019}
+}
+
+@article{lu2019gridplus,
+  title={Grid R-CNN Plus: Faster and Better},
+  author={Lu, Xin and Li, Buyu and Yue, Yuxin and Li, Quanquan and Yan, Junjie},
+  journal={arXiv preprint arXiv:1906.05688},
+  year={2019}
+}
+```
diff --git a/configs/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco.py b/configs/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco.py
new file mode 100755
index 0000000..1bb5889
--- /dev/null
+++ b/configs/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './grid_rcnn_r50_fpn_gn-head_2x_coco.py'  # R-50 2x config as base
+
+model = dict(
+    backbone=dict(
+        depth=101,  # overrides depth=50 from the base config
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/grid_rcnn/grid_rcnn_r50_fpn_gn-head_1x_coco.py b/configs/grid_rcnn/grid_rcnn_r50_fpn_gn-head_1x_coco.py
new file mode 100755
index 0000000..4aa00ec
--- /dev/null
+++ b/configs/grid_rcnn/grid_rcnn_r50_fpn_gn-head_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = ['grid_rcnn_r50_fpn_gn-head_2x_coco.py']  # start from the 2x config
+# learning policy: overrides the schedule values set in the 2x base config
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,  # 2x base config: 3665
+    warmup_ratio=0.001,  # 2x base config: 1.0 / 80
+    step=[8, 11])  # 2x base config: [17, 23]
+checkpoint_config = dict(interval=1)
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=12)  # 2x base config: 25
diff --git a/configs/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py b/configs/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py
new file mode 100755
index 0000000..df63cd5
--- /dev/null
+++ b/configs/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py
@@ -0,0 +1,131 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+# model settings: GridRCNN detector, ResNet-50 backbone, FPN neck
+model = dict(
+    type='GridRCNN',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        type='GridRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            with_reg=False,  # NOTE(review): presumably grid_head localizes
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False),
+        grid_roi_extractor=dict(  # output_size=14 here vs 7 for bbox above
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        grid_head=dict(
+            type='GridHead',
+            grid_points=9,
+            num_convs=8,
+            in_channels=256,
+            point_feat_channels=64,
+            norm_cfg=dict(type='GN', num_groups=36),  # GN in the grid head
+            loss_grid=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=15))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_radius=1,
+            pos_weight=-1,
+            max_num_grid=192,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.03,
+            nms=dict(type='nms', iou_threshold=0.3),
+            max_per_img=100)))
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=3665,  # README notes: warm-up lasts for 1 epoch
+    warmup_ratio=1.0 / 80,
+    step=[17, 23])
+runner = dict(type='EpochBasedRunner', max_epochs=25)  # README: 2x = 25 epochs
diff --git a/configs/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco.py b/configs/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco.py
new file mode 100755
index 0000000..3bc8516
--- /dev/null
+++ b/configs/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco.py
@@ -0,0 +1,24 @@
+_base_ = './grid_rcnn_r50_fpn_gn-head_2x_coco.py'  # R-50 2x config as base
+model = dict(
+    backbone=dict(
+        type='ResNeXt',  # base config uses type='ResNet'
+        depth=101,  # base config uses depth=50
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
+# optimizer (same values as in the base config)
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy (same values as in the base config)
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=3665,
+    warmup_ratio=1.0 / 80,
+    step=[17, 23])
+runner = dict(type='EpochBasedRunner', max_epochs=25)
diff --git a/configs/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco.py b/configs/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco.py
new file mode 100755
index 0000000..c78f8f6
--- /dev/null
+++ b/configs/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco.py
@@ -0,0 +1,13 @@
+_base_ = './grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco.py'  # 32x4d parent
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,  # parent config uses groups=32
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/grid_rcnn/metafile.yml b/configs/grid_rcnn/metafile.yml
new file mode 100755
index 0000000..d1aa851
--- /dev/null
+++ b/configs/grid_rcnn/metafile.yml
@@ -0,0 +1,101 @@
+Collections:
+  - Name: Grid R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RPN
+        - Dilated Convolution
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/abs/1906.05688
+      Title: 'Grid R-CNN Plus: Faster and Better'
+    README: configs/grid_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/grid_rcnn.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: grid_rcnn_r50_fpn_gn-head_2x_coco
+    In Collection: Grid R-CNN
+    Config: configs/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py
+    Metadata:
+      Training Memory (GB): 5.1
+      inference time (ms/im):
+        - value: 66.67
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 25
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco/grid_rcnn_r50_fpn_gn-head_2x_coco_20200130-6cca8223.pth
+
+  - Name: grid_rcnn_r101_fpn_gn-head_2x_coco
+    In Collection: Grid R-CNN
+    Config: configs/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 79.37
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 25
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco/grid_rcnn_r101_fpn_gn-head_2x_coco_20200309-d6eca030.pth
+
+  - Name: grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco
+    In Collection: Grid R-CNN
+    Config: configs/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco.py
+    Metadata:
+      Training Memory (GB): 8.3
+      inference time (ms/im):
+        - value: 92.59
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 25
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco_20200130-d8f0e3ff.pth
+
+  - Name: grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco
+    In Collection: Grid R-CNN
+    Config: configs/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco.py
+    Metadata:
+      Training Memory (GB): 11.3
+      inference time (ms/im):
+        - value: 129.87
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 25
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco_20200204-ec76a754.pth
diff --git a/configs/groie/README.md b/configs/groie/README.md
new file mode 100755
index 0000000..126773f
--- /dev/null
+++ b/configs/groie/README.md
@@ -0,0 +1,72 @@
+# GRoIE
+
+> [A novel Region of Interest Extraction Layer for Instance Segmentation](https://arxiv.org/abs/2004.13665)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Given the wide diffusion of deep neural network architectures for computer vision tasks, several new applications are nowadays more and more feasible. Among them, a particular attention has been recently given to instance segmentation, by exploiting the results achievable by two-stage networks (such as Mask R-CNN or Faster R-CNN), derived from R-CNN. In these complex architectures, a crucial role is played by the Region of Interest (RoI) extraction layer, devoted to extracting a coherent subset of features from a single Feature Pyramid Network (FPN) layer attached on top of a backbone.
+This paper is motivated by the need to overcome the limitations of existing RoI extractors which select only one (the best) layer from FPN. Our intuition is that all the layers of FPN retain useful information. Therefore, the proposed layer (called Generic RoI Extractor - GRoIE) introduces non-local building blocks and attention mechanisms to boost the performance.
+A comprehensive ablation study at component level is conducted to find the best set of algorithms and parameters for the GRoIE layer. Moreover, GRoIE can be integrated seamlessly with every two-stage architecture for both object detection and instance segmentation tasks. Therefore, the improvements brought about by the use of GRoIE in different state-of-the-art architectures are also evaluated. The proposed layer leads up to gain a 1.1% AP improvement on bounding box detection and 1.7% AP improvement on instance segmentation.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143891453-afdcdaf4-a868-4a28-ad20-dc710a517a76.png"/>
+</div>
+
+## Introduction
+
+By Leonardo Rossi, Akbar Karimi and Andrea Prati from
+[IMPLab](http://implab.ce.unipr.it/).
+
+We provide configs to reproduce the results in the paper for
+"*A novel Region of Interest Extraction Layer for Instance Segmentation*"
+on COCO object detection.
+
+This paper is motivated by the need to overcome the limitations of existing
+RoI extractors which select only one (the best) layer from FPN.
+
+Our intuition is that all the layers of FPN retain useful information.
+
+Therefore, the proposed layer (called Generic RoI Extractor - **GRoIE**)
+introduces non-local building blocks and attention mechanisms to boost the
+performance.
+
+## Results and Models
+
+The results on COCO 2017 minival (5k images) are shown in the table below.
+
+### Application of GRoIE to different architectures
+
+| Backbone  |     Method      | Lr schd | box AP | mask AP |                                    Config                                     |                                                                                                                                                                                                                   Download                                                                                                                                                                                                                   |
+| :-------: | :-------------: | :-----: | :----: | :-----: | :---------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R-50-FPN  | Faster Original |   1x    |  37.4  |         |            [config](../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py)            |                                                                  [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130_204655.log.json)                                                                   |
+| R-50-FPN  |     + GRoIE     |   1x    |  38.3  |         |               [config](./faster_rcnn_r50_fpn_groie_1x_coco.py)                |                                                         [model](https://download.openmmlab.com/mmdetection/v2.0/groie/faster_rcnn_r50_fpn_groie_1x_coco/faster_rcnn_r50_fpn_groie_1x_coco_20200604_211715-66ee9516.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/groie/faster_rcnn_r50_fpn_groie_1x_coco/faster_rcnn_r50_fpn_groie_1x_coco_20200604_211715.log.json)                                                         |
+| R-50-FPN  |   Grid R-CNN    |   1x    |  39.1  |         |          [config](../grid_rcnn/grid_rcnn_r50_fpn_gn-head_1x_coco.py)          |                                             [model](https://download.openmmlab.com/mmdetection/v2.0/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco/grid_rcnn_r50_fpn_gn-head_groie_1x_coco_20200605_202059-4b75d86f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco/grid_rcnn_r50_fpn_gn-head_groie_1x_coco_20200605_202059.log.json)                                             |
+| R-50-FPN  |     + GRoIE     |   1x    |        |         |            [config](./grid_rcnn_r50_fpn_gn-head_groie_1x_coco.py)             |                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+| R-50-FPN  |   Mask R-CNN    |   1x    |  38.2  |  34.7   |              [config](../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py)              |                                                                        [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205_050542.log.json)                                                                         |
+| R-50-FPN  |     + GRoIE     |   1x    |  39.0  |  36.0   |                [config](./mask_rcnn_r50_fpn_groie_1x_coco.py)                 |                                                             [model](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_groie_1x_coco/mask_rcnn_r50_fpn_groie_1x_coco_20200604_211715-50d90c74.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_groie_1x_coco/mask_rcnn_r50_fpn_groie_1x_coco_20200604_211715.log.json)                                                             |
+| R-50-FPN  |     GC-Net      |   1x    |  40.7  |  36.5   | [config](../gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py)  |                  [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202-50b90e5c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202_085547.log.json)                   |
+| R-50-FPN  |     + GRoIE     |   1x    |  41.0  |  37.8   |  [config](./mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200604_211715-42eb79e1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200604_211715.log.json)   |
+| R-101-FPN |     GC-Net      |   1x    |  42.2  |  37.8   | [config](../gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py) |                [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206-8407a3f0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206_142508.log.json)                 |
+| R-101-FPN |     + GRoIE     |   1x    |  42.6  |  38.7   | [config](./mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200607_224507-8daae01c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200607_224507.log.json) |
+
+## Citation
+
+If you use this work or benchmark in your research, please cite this project.
+
+```latex
+@inproceedings{rossi2021novel,
+  title={A novel region of interest extraction layer for instance segmentation},
+  author={Rossi, Leonardo and Karimi, Akbar and Prati, Andrea},
+  booktitle={2020 25th International Conference on Pattern Recognition (ICPR)},
+  pages={2203--2209},
+  year={2021},
+  organization={IEEE}
+}
+```
+
+## Contact
+
+The implementation of GRoIE is currently maintained by
+[Leonardo Rossi](https://github.com/hachreak/).
diff --git a/configs/groie/faster_rcnn_r50_fpn_groie_1x_coco.py b/configs/groie/faster_rcnn_r50_fpn_groie_1x_coco.py
new file mode 100755
index 0000000..0fc528b
--- /dev/null
+++ b/configs/groie/faster_rcnn_r50_fpn_groie_1x_coco.py
@@ -0,0 +1,25 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+# model settings: use GenericRoIExtractor (GRoIE) as the bbox RoI extractor
+model = dict(
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            aggregation='sum',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(  # NOTE(review): presumably runs before aggregation
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(  # NOTE(review): presumably runs after aggregation
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2))))
diff --git a/configs/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco.py b/configs/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco.py
new file mode 100755
index 0000000..8e4b4ab
--- /dev/null
+++ b/configs/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco.py
@@ -0,0 +1,45 @@
+_base_ = '../grid_rcnn/grid_rcnn_r50_fpn_gn-head_1x_coco.py'
+# model settings: use GenericRoIExtractor for the bbox and grid RoI extractors
+model = dict(
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            aggregation='sum',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(  # NOTE(review): presumably runs before aggregation
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(  # NOTE(review): presumably runs after aggregation
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2)),
+        grid_roi_extractor=dict(  # same pre/post cfg; output_size=14 below
+            type='GenericRoIExtractor',  # no explicit aggregation set here
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2))))
diff --git a/configs/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py b/configs/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py
new file mode 100755
index 0000000..8b83722
--- /dev/null
+++ b/configs/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py
@@ -0,0 +1,45 @@
+_base_ = '../gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py'
+# model settings: use GenericRoIExtractor for the bbox and mask RoI extractors
+model = dict(
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            aggregation='sum',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(  # NOTE(review): presumably runs before aggregation
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(  # NOTE(review): presumably runs after aggregation
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2)),
+        mask_roi_extractor=dict(  # same pre/post cfg; output_size=14 below
+            type='GenericRoIExtractor',  # no explicit aggregation set here
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2))))
diff --git a/configs/groie/mask_rcnn_r50_fpn_groie_1x_coco.py b/configs/groie/mask_rcnn_r50_fpn_groie_1x_coco.py
new file mode 100755
index 0000000..81dfb48
--- /dev/null
+++ b/configs/groie/mask_rcnn_r50_fpn_groie_1x_coco.py
@@ -0,0 +1,45 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+# model settings
+model = dict(
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            aggregation='sum',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2)),
+        mask_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2))))
diff --git a/configs/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py b/configs/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py
new file mode 100755
index 0000000..852c5ca
--- /dev/null
+++ b/configs/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py
@@ -0,0 +1,45 @@
+_base_ = '../gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco.py'
+# model settings
+model = dict(
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            aggregation='sum',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2)),
+        mask_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2))))
diff --git a/configs/groie/metafile.yml b/configs/groie/metafile.yml
new file mode 100755
index 0000000..badf53a
--- /dev/null
+++ b/configs/groie/metafile.yml
@@ -0,0 +1,94 @@
+Collections:
+  - Name: GRoIE
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Generic RoI Extractor
+        - FPN
+        - RPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/abs/2004.13665
+      Title: 'A novel Region of Interest Extraction Layer for Instance Segmentation'
+    README: configs/groie/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/roi_heads/roi_extractors/groie.py#L15
+      Version: v2.1.0
+
+Models:
+  - Name: faster_rcnn_r50_fpn_groie_1x_coco
+    In Collection: GRoIE
+    Config: configs/groie/faster_rcnn_r50_fpn_groie_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/faster_rcnn_r50_fpn_groie_1x_coco/faster_rcnn_r50_fpn_groie_1x_coco_20200604_211715-66ee9516.pth
+
+  - Name: grid_rcnn_r50_fpn_gn-head_groie_1x_coco
+    In Collection: GRoIE
+    Config: configs/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco/grid_rcnn_r50_fpn_gn-head_groie_1x_coco_20200605_202059-4b75d86f.pth
+
+  - Name: mask_rcnn_r50_fpn_groie_1x_coco
+    In Collection: GRoIE
+    Config: configs/groie/mask_rcnn_r50_fpn_groie_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_groie_1x_coco/mask_rcnn_r50_fpn_groie_1x_coco_20200604_211715-50d90c74.pth
+
+  - Name: mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco
+    In Collection: GRoIE
+    Config: configs/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:   37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200604_211715-42eb79e1.pth
+
+  - Name: mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco
+    In Collection: GRoIE
+    Config: configs/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200607_224507-8daae01c.pth
diff --git a/configs/guided_anchoring/README.md b/configs/guided_anchoring/README.md
new file mode 100755
index 0000000..563e43f
--- /dev/null
+++ b/configs/guided_anchoring/README.md
@@ -0,0 +1,59 @@
+# Guided Anchoring
+
+> [Region Proposal by Guided Anchoring](https://arxiv.org/abs/1901.03278)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Region anchors are the cornerstone of modern object detection techniques. State-of-the-art detectors mostly rely on a dense anchoring scheme, where anchors are sampled uniformly over the spatial domain with a predefined set of scales and aspect ratios. In this paper, we revisit this foundational stage. Our study shows that it can be done much more effectively and efficiently. Specifically, we present an alternative scheme, named Guided Anchoring, which leverages semantic features to guide the anchoring. The proposed method jointly predicts the locations where the center of objects of interest are likely to exist as well as the scales and aspect ratios at different locations. On top of predicted anchor shapes, we mitigate the feature inconsistency with a feature adaption module. We also study the use of high-quality proposals to improve detection performance. The anchoring scheme can be seamlessly integrated into proposal methods and detectors. With Guided Anchoring, we achieve 9.1% higher recall on MS COCO with 90% fewer anchors than the RPN baseline. We also adopt Guided Anchoring in Fast R-CNN, Faster R-CNN and RetinaNet, respectively improving the detection mAP by 2.2%, 2.7% and 1.2%.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143891529-4c178948-c3fd-4543-ae6e-bb2aa3c8147e.png"/>
+</div>
+
+## Results and Models
+
+The results on COCO 2017 val are shown in the table below. (Results on test-dev are usually slightly higher than on val.)
+
+| Method |    Backbone     |  Style  | Lr schd | Mem (GB) | Inf time (fps) | AR 1000 |                                                          Config                                                           |                                                                                                                                                          Download                                                                                                                                                           |
+| :----: | :-------------: | :-----: | :-----: | :------: | :------------: | :-----: | :-----------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| GA-RPN |    R-50-FPN     |  caffe  |   1x    |   5.3    |      15.8      |  68.4   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco/ga_rpn_r50_caffe_fpn_1x_coco_20200531-899008a6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco/ga_rpn_r50_caffe_fpn_1x_coco_20200531_011819.log.json)   |
+| GA-RPN |    R-101-FPN    |  caffe  |   1x    |   7.3    |      13.0      |  69.5   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco/ga_rpn_r101_caffe_fpn_1x_coco_20200531-ca9ba8fb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco/ga_rpn_r101_caffe_fpn_1x_coco_20200531_011812.log.json) |
+| GA-RPN | X-101-32x4d-FPN | pytorch |   1x    |   8.5    |      10.0      |  70.6   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco/ga_rpn_x101_32x4d_fpn_1x_coco_20200220-c28d1b18.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco/ga_rpn_x101_32x4d_fpn_1x_coco_20200220_221326.log.json) |
+| GA-RPN | X-101-64x4d-FPN | pytorch |   1x    |   7.1    |      7.5       |  71.2   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco/ga_rpn_x101_64x4d_fpn_1x_coco_20200225-3c6e1aa2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco/ga_rpn_x101_64x4d_fpn_1x_coco_20200225_152704.log.json) |
+
+|     Method     |    Backbone     |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                             Config                                                              |                                                                                                                                                                           Download                                                                                                                                                                            |
+| :------------: | :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| GA-Faster RCNN |    R-50-FPN     |  caffe  |   1x    |   5.5    |                |  39.6  |   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco.py)   |          [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco/ga_faster_r50_caffe_fpn_1x_coco_20200702_000718-a11ccfe6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco/ga_faster_r50_caffe_fpn_1x_coco_20200702_000718.log.json)           |
+| GA-Faster RCNN |    R-101-FPN    |  caffe  |   1x    |   7.5    |                |  41.5  |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco.py)   | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco/ga_faster_r101_caffe_fpn_1x_coco_bbox_mAP-0.415_20200505_115528-fb82e499.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco/ga_faster_r101_caffe_fpn_1x_coco_20200505_115528.log.json) |
+| GA-Faster RCNN | X-101-32x4d-FPN | pytorch |   1x    |   8.7    |      9.7       |  43.0  |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco.py)   |            [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco/ga_faster_x101_32x4d_fpn_1x_coco_20200215-1ded9da3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco/ga_faster_x101_32x4d_fpn_1x_coco_20200215_184547.log.json)            |
+| GA-Faster RCNN | X-101-64x4d-FPN | pytorch |   1x    |   11.8   |      7.3       |  43.9  |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco.py)   |            [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco/ga_faster_x101_64x4d_fpn_1x_coco_20200215-0fa7bde7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco/ga_faster_x101_64x4d_fpn_1x_coco_20200215_104455.log.json)            |
+|  GA-RetinaNet  |    R-50-FPN     |  caffe  |   1x    |   3.5    |      16.8      |  36.9  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco.py)  |        [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco/ga_retinanet_r50_caffe_fpn_1x_coco_20201020-39581c6f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco/ga_retinanet_r50_caffe_fpn_1x_coco_20201020_225450.log.json)        |
+|  GA-RetinaNet  |    R-101-FPN    |  caffe  |   1x    |   5.5    |      12.9      |  39.0  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco.py) |      [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco/ga_retinanet_r101_caffe_fpn_1x_coco_20200531-6266453c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco/ga_retinanet_r101_caffe_fpn_1x_coco_20200531_012847.log.json)      |
+|  GA-RetinaNet  | X-101-32x4d-FPN | pytorch |   1x    |   6.9    |      10.6      |  40.5  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco.py) |      [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco/ga_retinanet_x101_32x4d_fpn_1x_coco_20200219-40c56caa.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco/ga_retinanet_x101_32x4d_fpn_1x_coco_20200219_223025.log.json)      |
+|  GA-RetinaNet  | X-101-64x4d-FPN | pytorch |   1x    |   9.9    |      7.7       |  41.3  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco.py) |      [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco/ga_retinanet_x101_64x4d_fpn_1x_coco_20200226-ef9f7f1f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco/ga_retinanet_x101_64x4d_fpn_1x_coco_20200226_221123.log.json)      |
+
+- In the Guided Anchoring paper, `score_thr` is set to 0.001 in Fast/Faster RCNN and 0.05 in RetinaNet for both baselines and Guided Anchoring.
+
+- Performance on the COCO test-dev benchmark is shown as follows.
+
+|     Method     | Backbone  | Style | Lr schd | Aug Train | Score thr | AP  | AP_50 | AP_75 | AP_small | AP_medium | AP_large | Download |
+| :------------: | :-------: | :---: | :-----: | :-------: | :-------: | :-: | :---: | :---: | :------: | :-------: | :------: | :------: |
+| GA-Faster RCNN | R-101-FPN | caffe |   1x    |     F     |   0.05    |     |       |       |          |           |          |          |
+| GA-Faster RCNN | R-101-FPN | caffe |   1x    |     F     |   0.001   |     |       |       |          |           |          |          |
+|  GA-RetinaNet  | R-101-FPN | caffe |   1x    |     F     |   0.05    |     |       |       |          |           |          |          |
+|  GA-RetinaNet  | R-101-FPN | caffe |   2x    |     T     |   0.05    |     |       |       |          |           |          |          |
+
+## Citation
+
+We provide config files to reproduce the results in the CVPR 2019 paper for [Region Proposal by Guided Anchoring](https://arxiv.org/abs/1901.03278).
+
+```latex
+@inproceedings{wang2019region,
+    title={Region Proposal by Guided Anchoring},
+    author={Jiaqi Wang and Kai Chen and Shuo Yang and Chen Change Loy and Dahua Lin},
+    booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
+    year={2019}
+}
+```
diff --git a/configs/guided_anchoring/ga_fast_r50_caffe_fpn_1x_coco.py b/configs/guided_anchoring/ga_fast_r50_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..8fc203c
--- /dev/null
+++ b/configs/guided_anchoring/ga_fast_r50_caffe_fpn_1x_coco.py
@@ -0,0 +1,65 @@
+_base_ = '../fast_rcnn/fast_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    roi_head=dict(
+        bbox_head=dict(bbox_coder=dict(target_stds=[0.05, 0.05, 0.1, 0.1]))),
+    # model training and testing settings
+    train_cfg=dict(
+        rcnn=dict(
+            assigner=dict(pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6),
+            sampler=dict(num=256))),
+    test_cfg=dict(rcnn=dict(score_thr=1e-3)))
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadProposals', num_max_proposals=300),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadProposals', num_max_proposals=None),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img', 'proposals']),
+        ])
+]
+data = dict(
+    train=dict(
+        proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_train2017.pkl',
+        pipeline=train_pipeline),
+    val=dict(
+        proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_val2017.pkl',
+        pipeline=test_pipeline),
+    test=dict(
+        proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_val2017.pkl',
+        pipeline=test_pipeline))
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco.py b/configs/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..a40e7c6
--- /dev/null
+++ b/configs/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './ga_faster_r50_caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco.py b/configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..b0add92
--- /dev/null
+++ b/configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco.py
@@ -0,0 +1,65 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco.py'
+model = dict(
+    rpn_head=dict(
+        _delete_=True,
+        type='GARPNHead',
+        in_channels=256,
+        feat_channels=256,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=8,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[8],
+            strides=[4, 8, 16, 32, 64]),
+        anchor_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.14, 0.14]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.11, 0.11]),
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+    roi_head=dict(
+        bbox_head=dict(bbox_coder=dict(target_stds=[0.05, 0.05, 0.1, 0.1]))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            ga_assigner=dict(
+                type='ApproxMaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            ga_sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            center_ratio=0.2,
+            ignore_ratio=0.5),
+        rpn_proposal=dict(nms_post=1000, max_per_img=300),
+        rcnn=dict(
+            assigner=dict(pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6),
+            sampler=dict(type='RandomSampler', num=256))),
+    test_cfg=dict(
+        rpn=dict(nms_post=1000, max_per_img=300), rcnn=dict(score_thr=1e-3)))
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/guided_anchoring/ga_faster_r50_fpn_1x_coco.py b/configs/guided_anchoring/ga_faster_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..e3d8238
--- /dev/null
+++ b/configs/guided_anchoring/ga_faster_r50_fpn_1x_coco.py
@@ -0,0 +1,65 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    rpn_head=dict(
+        _delete_=True,
+        type='GARPNHead',
+        in_channels=256,
+        feat_channels=256,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=8,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[8],
+            strides=[4, 8, 16, 32, 64]),
+        anchor_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.14, 0.14]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.11, 0.11]),
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+    roi_head=dict(
+        bbox_head=dict(bbox_coder=dict(target_stds=[0.05, 0.05, 0.1, 0.1]))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            ga_assigner=dict(
+                type='ApproxMaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            ga_sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            center_ratio=0.2,
+            ignore_ratio=0.5),
+        rpn_proposal=dict(nms_post=1000, max_per_img=300),
+        rcnn=dict(
+            assigner=dict(pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6),
+            sampler=dict(type='RandomSampler', num=256))),
+    test_cfg=dict(
+        rpn=dict(nms_post=1000, max_per_img=300), rcnn=dict(score_thr=1e-3)))
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco.py b/configs/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..f1dda94
--- /dev/null
+++ b/configs/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ga_faster_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco.py b/configs/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..fb9e2af
--- /dev/null
+++ b/configs/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ga_faster_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco.py b/configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..1b1cccd
--- /dev/null
+++ b/configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './ga_retinanet_r50_caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_mstrain_2x.py b/configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_mstrain_2x.py
new file mode 100755
index 0000000..260895b
--- /dev/null
+++ b/configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_mstrain_2x.py
@@ -0,0 +1,169 @@
+_base_ = '../_base_/default_runtime.py'
+
+# model settings
+model = dict(
+    type='RetinaNet',
+    backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs=True,
+        num_outs=5),
+    bbox_head=dict(
+        type='GARetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        anchor_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.04, loss_weight=1.0)))
+# training and testing settings
+train_cfg = dict(
+    ga_assigner=dict(
+        type='ApproxMaxIoUAssigner',
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.4,
+        min_pos_iou=0.4,
+        ignore_iof_thr=-1),
+    ga_sampler=dict(
+        type='RandomSampler',
+        num=256,
+        pos_fraction=0.5,
+        neg_pos_ub=-1,
+        add_gt_as_proposals=False),
+    assigner=dict(
+        type='MaxIoUAssigner',
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        min_pos_iou=0.0,
+        ignore_iof_thr=-1),
+    allowed_border=-1,
+    pos_weight=-1,
+    center_ratio=0.2,
+    ignore_ratio=0.5,
+    debug=False)
+test_cfg = dict(
+    nms_pre=1000,
+    min_bbox_size=0,
+    score_thr=0.05,
+    nms=dict(type='nms', iou_threshold=0.5),
+    max_per_img=100)
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 480), (1333, 960)],
+        keep_ratio=True,
+        multiscale_mode='range'),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='bbox')
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[16, 22])
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco.py b/configs/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..3351201
--- /dev/null
+++ b/configs/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco.py
@@ -0,0 +1,62 @@
+_base_ = '../retinanet/retinanet_r50_caffe_fpn_1x_coco.py'
+model = dict(
+    bbox_head=dict(
+        _delete_=True,  # mmcv: replace the inherited bbox_head, do not merge
+        type='GARetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(  # 3 scales x 3 ratios per stride
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(  # one ratio, one scale; same strides
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        anchor_coder=dict(  # same means/stds as bbox_coder below
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loc_filter_thr=0.01,
+        loss_loc=dict(  # same FocalLoss settings as loss_cls below
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.04, loss_weight=1.0)),
+    # training settings only; this file sets no test_cfg of its own
+    train_cfg=dict(
+        ga_assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.4,
+            ignore_iof_thr=-1),
+        ga_sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False),
+        assigner=dict(neg_iou_thr=0.5, min_pos_iou=0.0),  # partial override
+        center_ratio=0.2,
+        ignore_ratio=0.5))
+optimizer_config = dict(  # replaces the base dict; only grad_clip is set
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/guided_anchoring/ga_retinanet_r50_fpn_1x_coco.py b/configs/guided_anchoring/ga_retinanet_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..7694723
--- /dev/null
+++ b/configs/guided_anchoring/ga_retinanet_r50_fpn_1x_coco.py
@@ -0,0 +1,62 @@
+_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    bbox_head=dict(
+        _delete_=True,  # mmcv: replace the inherited bbox_head, do not merge
+        type='GARetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(  # 3 scales x 3 ratios per stride
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(  # one ratio, one scale; same strides
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        anchor_coder=dict(  # same means/stds as bbox_coder below
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loc_filter_thr=0.01,
+        loss_loc=dict(  # same FocalLoss settings as loss_cls below
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.04, loss_weight=1.0)),
+    # training settings only; this file sets no test_cfg of its own
+    train_cfg=dict(
+        ga_assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.4,
+            ignore_iof_thr=-1),
+        ga_sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False),
+        assigner=dict(neg_iou_thr=0.5, min_pos_iou=0.0),  # partial override
+        center_ratio=0.2,
+        ignore_ratio=0.5))
+optimizer_config = dict(  # replaces the base dict; only grad_clip is set
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco.py b/configs/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..c5eb34f
--- /dev/null
+++ b/configs/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ga_retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(  # only the backbone is set here; the rest is inherited
+        type='ResNeXt',
+        depth=101,
+        groups=32,  # groups=32 with base_width=4: the "32x4d" variant
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(  # initialize from the resnext101_32x4d checkpoint
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco.py b/configs/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..5c69a6f
--- /dev/null
+++ b/configs/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ga_retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(  # only the backbone is set here; the rest is inherited
+        type='ResNeXt',
+        depth=101,
+        groups=64,  # groups=64 with base_width=4: the "64x4d" variant
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(  # initialize from the resnext101_64x4d checkpoint
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco.py b/configs/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..039703e
--- /dev/null
+++ b/configs/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = './ga_rpn_r50_caffe_fpn_1x_coco.py'
+# model settings: only the backbone depth and its init_cfg are set here
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(  # pretrained weights named for the 101-depth net
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco.py b/configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..7830894
--- /dev/null
+++ b/configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco.py
@@ -0,0 +1,58 @@
+_base_ = '../rpn/rpn_r50_caffe_fpn_1x_coco.py'
+model = dict(
+    rpn_head=dict(
+        _delete_=True,  # mmcv: replace the inherited rpn_head, do not merge
+        type='GARPNHead',
+        in_channels=256,
+        feat_channels=256,
+        approx_anchor_generator=dict(  # 3 scales x 3 ratios per stride
+            type='AnchorGenerator',
+            octave_base_scale=8,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        square_anchor_generator=dict(  # one ratio, one scale; same strides
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[8],
+            strides=[4, 8, 16, 32, 64]),
+        anchor_coder=dict(  # last two stds differ from bbox_coder
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.14, 0.14]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.11, 0.11]),
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+    # model training and testing settings (merged into the base `rpn` dicts)
+    train_cfg=dict(
+        rpn=dict(
+            ga_assigner=dict(
+                type='ApproxMaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            ga_sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            center_ratio=0.2,
+            ignore_ratio=0.5)),
+    test_cfg=dict(rpn=dict(nms_post=1000)))  # only nms_post is set here
+optimizer_config = dict(  # replaces the base dict; only grad_clip is set
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/guided_anchoring/ga_rpn_r50_fpn_1x_coco.py b/configs/guided_anchoring/ga_rpn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..27ab3e7
--- /dev/null
+++ b/configs/guided_anchoring/ga_rpn_r50_fpn_1x_coco.py
@@ -0,0 +1,58 @@
+_base_ = '../rpn/rpn_r50_fpn_1x_coco.py'
+model = dict(
+    rpn_head=dict(
+        _delete_=True,  # mmcv: replace the inherited rpn_head, do not merge
+        type='GARPNHead',
+        in_channels=256,
+        feat_channels=256,
+        approx_anchor_generator=dict(  # 3 scales x 3 ratios per stride
+            type='AnchorGenerator',
+            octave_base_scale=8,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        square_anchor_generator=dict(  # one ratio, one scale; same strides
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[8],
+            strides=[4, 8, 16, 32, 64]),
+        anchor_coder=dict(  # last two stds differ from bbox_coder
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.14, 0.14]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.11, 0.11]),
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+    # model training and testing settings (merged into the base `rpn` dicts)
+    train_cfg=dict(
+        rpn=dict(
+            ga_assigner=dict(
+                type='ApproxMaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            ga_sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            center_ratio=0.2,
+            ignore_ratio=0.5)),
+    test_cfg=dict(rpn=dict(nms_post=1000)))  # only nms_post is set here
+optimizer_config = dict(  # replaces the base dict; only grad_clip is set
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco.py b/configs/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..cccc985
--- /dev/null
+++ b/configs/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ga_rpn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(  # only the backbone is set here; the rest is inherited
+        type='ResNeXt',
+        depth=101,
+        groups=32,  # groups=32 with base_width=4: the "32x4d" variant
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(  # initialize from the resnext101_32x4d checkpoint
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco.py b/configs/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..4e134d2
--- /dev/null
+++ b/configs/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ga_rpn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(  # only the backbone is set here; the rest is inherited
+        type='ResNeXt',
+        depth=101,
+        groups=64,  # groups=64 with base_width=4: the "64x4d" variant
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(  # initialize from the resnext101_64x4d checkpoint
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/guided_anchoring/metafile.yml b/configs/guided_anchoring/metafile.yml
new file mode 100755
index 0000000..f39d183
--- /dev/null
+++ b/configs/guided_anchoring/metafile.yml
@@ -0,0 +1,246 @@
+Collections:
+  - Name: Guided Anchoring
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - Guided Anchoring
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1901.03278
+      Title: 'Region Proposal by Guided Anchoring'
+    README: configs/guided_anchoring/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/dense_heads/ga_retina_head.py#L10
+      Version: v2.0.0
+
+Models:
+  - Name: ga_rpn_r50_caffe_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.3
+      inference time (ms/im):
+        - value: 63.29
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Region Proposal
+        Dataset: COCO
+        Metrics:
+          AR@1000: 68.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco/ga_rpn_r50_caffe_fpn_1x_coco_20200531-899008a6.pth
+
+  - Name: ga_rpn_r101_caffe_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.3
+      inference time (ms/im):
+        - value: 76.92
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Region Proposal
+        Dataset: COCO
+        Metrics:
+          AR@1000: 69.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco/ga_rpn_r101_caffe_fpn_1x_coco_20200531-ca9ba8fb.pth
+
+  - Name: ga_rpn_x101_32x4d_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.5
+      inference time (ms/im):
+        - value: 100
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Region Proposal
+        Dataset: COCO
+        Metrics:
+          AR@1000: 70.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco/ga_rpn_x101_32x4d_fpn_1x_coco_20200220-c28d1b18.pth
+
+  - Name: ga_rpn_x101_64x4d_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      inference time (ms/im):
+        - value: 133.33
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Region Proposal
+        Dataset: COCO
+        Metrics:
+          AR@1000: 70.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco/ga_rpn_x101_64x4d_fpn_1x_coco_20200225-3c6e1aa2.pth
+
+  - Name: ga_faster_r50_caffe_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.5
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco/ga_faster_r50_caffe_fpn_1x_coco_20200702_000718-a11ccfe6.pth
+
+  - Name: ga_faster_r101_caffe_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.5
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco/ga_faster_r101_caffe_fpn_1x_coco_bbox_mAP-0.415_20200505_115528-fb82e499.pth
+
+  - Name: ga_faster_x101_32x4d_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.7
+      inference time (ms/im):
+        - value: 103.09
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco/ga_faster_x101_32x4d_fpn_1x_coco_20200215-1ded9da3.pth
+
+  - Name: ga_faster_x101_64x4d_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 11.8
+      inference time (ms/im):
+        - value: 136.99
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco/ga_faster_x101_64x4d_fpn_1x_coco_20200215-0fa7bde7.pth
+
+  - Name: ga_retinanet_r50_caffe_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.5
+      inference time (ms/im):
+        - value: 59.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco/ga_retinanet_r50_caffe_fpn_1x_coco_20201020-39581c6f.pth
+
+  - Name: ga_retinanet_r101_caffe_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.5
+      inference time (ms/im):
+        - value: 77.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco/ga_retinanet_r101_caffe_fpn_1x_coco_20200531-6266453c.pth
+
+  - Name: ga_retinanet_x101_32x4d_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.9
+      inference time (ms/im):
+        - value: 94.34
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco/ga_retinanet_x101_32x4d_fpn_1x_coco_20200219-40c56caa.pth
+
+  - Name: ga_retinanet_x101_64x4d_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.9
+      inference time (ms/im):
+        - value: 129.87
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco/ga_retinanet_x101_64x4d_fpn_1x_coco_20200226-ef9f7f1f.pth
diff --git a/configs/hrnet/README.md b/configs/hrnet/README.md
new file mode 100755
index 0000000..e340c78
--- /dev/null
+++ b/configs/hrnet/README.md
@@ -0,0 +1,101 @@
+# HRNet
+
+> [Deep High-Resolution Representation Learning for Human Pose Estimation](https://arxiv.org/abs/1902.09212)
+
+<!-- [BACKBONE] -->
+
+## Abstract
+
+This is an official PyTorch implementation of Deep High-Resolution Representation Learning for Human Pose Estimation. In this work, we are interested in the human pose estimation problem with a focus on learning reliable high-resolution representations. Most existing methods recover high-resolution representations from low-resolution representations produced by a high-to-low resolution network. Instead, our proposed network maintains high-resolution representations through the whole process. We start from a high-resolution subnetwork as the first stage, gradually add high-to-low resolution subnetworks one by one to form more stages, and connect the multi-resolution subnetworks in parallel. We conduct repeated multi-scale fusions such that each of the high-to-low resolution representations receives information from other parallel representations over and over, leading to rich high-resolution representations. As a result, the predicted keypoint heatmap is potentially more accurate and spatially more precise. We empirically demonstrate the effectiveness of our network through the superior pose estimation results over two benchmark datasets: the COCO keypoint detection dataset and the MPII Human Pose dataset.
+
+High-resolution representation learning plays an essential role in many vision problems, e.g., pose estimation and semantic segmentation. The high-resolution network (HRNet), recently developed for human pose estimation, maintains high-resolution representations through the whole process by connecting high-to-low resolution convolutions in parallel and produces strong high-resolution representations by repeatedly conducting fusions across parallel convolutions.
+In this paper, we conduct a further study on high-resolution representations by introducing a simple yet effective modification and apply it to a wide range of vision tasks. We augment the high-resolution representation by aggregating the (upsampled) representations from all the parallel convolutions rather than only the representation from the high-resolution convolution as done in HRNet. This simple modification leads to stronger representations, evidenced by superior results. We show top results in semantic segmentation on Cityscapes, LIP, and PASCAL Context, and facial landmark detection on AFLW, COFW, 300W, and WFLW. In addition, we build a multi-level representation from the high-resolution representation and apply it to the Faster R-CNN object detection framework and the extended frameworks. The proposed approach achieves superior results to existing single-model networks on COCO object detection.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143892740-a4e9743e-a323-4ace-8025-50e251ef43ff.png"/>
+</div>
+
+## Results and Models
+
+### Faster R-CNN
+
+|   Backbone   |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                      Config                                                       |                                                                                                                                                         Download                                                                                                                                                         |
+| :----------: | :-----: | :-----: | :------: | :------------: | :----: | :---------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| HRNetV2p-W18 | pytorch |   1x    |   6.6    |      13.4      |  36.9  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco.py) |    [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco/faster_rcnn_hrnetv2p_w18_1x_coco_20200130-56651a6d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco/faster_rcnn_hrnetv2p_w18_1x_coco_20200130_211246.log.json)     |
+| HRNetV2p-W18 | pytorch |   2x    |   6.6    |       -        |  38.9  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco/faster_rcnn_hrnetv2p_w18_2x_coco_20200702_085731-a4ec0611.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco/faster_rcnn_hrnetv2p_w18_2x_coco_20200702_085731.log.json) |
+| HRNetV2p-W32 | pytorch |   1x    |   9.0    |      12.4      |  40.2  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco.py) |    [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco/faster_rcnn_hrnetv2p_w32_1x_coco_20200130-6e286425.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco/faster_rcnn_hrnetv2p_w32_1x_coco_20200130_204442.log.json)     |
+| HRNetV2p-W32 | pytorch |   2x    |   9.0    |       -        |  41.4  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco/faster_rcnn_hrnetv2p_w32_2x_coco_20200529_015927-976a9c15.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco/faster_rcnn_hrnetv2p_w32_2x_coco_20200529_015927.log.json) |
+| HRNetV2p-W40 | pytorch |   1x    |   10.4   |      10.5      |  41.2  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco.py) |    [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco/faster_rcnn_hrnetv2p_w40_1x_coco_20200210-95c1f5ce.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco/faster_rcnn_hrnetv2p_w40_1x_coco_20200210_125315.log.json)     |
+| HRNetV2p-W40 | pytorch |   2x    |   10.4   |       -        |  42.1  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco/faster_rcnn_hrnetv2p_w40_2x_coco_20200512_161033-0f236ef4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco/faster_rcnn_hrnetv2p_w40_2x_coco_20200512_161033.log.json) |
+
+### Mask R-CNN
+
+|   Backbone   |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                     Config                                                      |                                                                                                                                                     Download                                                                                                                                                     |
+| :----------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| HRNetV2p-W18 | pytorch |   1x    |   7.0    |      11.7      |  37.7  |  34.2   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco.py) |    [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco/mask_rcnn_hrnetv2p_w18_1x_coco_20200205-1c3d78ed.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco/mask_rcnn_hrnetv2p_w18_1x_coco_20200205_232523.log.json)     |
+| HRNetV2p-W18 | pytorch |   2x    |   7.0    |       -        |  39.8  |  36.0   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco.py) |    [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco/mask_rcnn_hrnetv2p_w18_2x_coco_20200212-b3c825b1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco/mask_rcnn_hrnetv2p_w18_2x_coco_20200212_134222.log.json)     |
+| HRNetV2p-W32 | pytorch |   1x    |   9.4    |      11.3      |  41.2  |  37.1   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco.py) |    [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco/mask_rcnn_hrnetv2p_w32_1x_coco_20200207-b29f616e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco/mask_rcnn_hrnetv2p_w32_1x_coco_20200207_055017.log.json)     |
+| HRNetV2p-W32 | pytorch |   2x    |   9.4    |       -        |  42.5  |  37.8   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco.py) |    [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco/mask_rcnn_hrnetv2p_w32_2x_coco_20200213-45b75b4d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco/mask_rcnn_hrnetv2p_w32_2x_coco_20200213_150518.log.json)     |
+| HRNetV2p-W40 | pytorch |   1x    |   10.9   |                |  42.1  |  37.5   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco/mask_rcnn_hrnetv2p_w40_1x_coco_20200511_015646-66738b35.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco/mask_rcnn_hrnetv2p_w40_1x_coco_20200511_015646.log.json) |
+| HRNetV2p-W40 | pytorch |   2x    |   10.9   |                |  42.8  |  38.2   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco/mask_rcnn_hrnetv2p_w40_2x_coco_20200512_163732-aed5e4ab.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco/mask_rcnn_hrnetv2p_w40_2x_coco_20200512_163732.log.json) |
+
+### Cascade R-CNN
+
+|   Backbone   |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                       Config                                                        |                                                                                                                                                             Download                                                                                                                                                             |
+| :----------: | :-----: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| HRNetV2p-W18 | pytorch |   20e   |   7.0    |      11.0      |  41.2  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco.py) |    [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco/cascade_rcnn_hrnetv2p_w18_20e_coco_20200210-434be9d7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco/cascade_rcnn_hrnetv2p_w18_20e_coco_20200210_105632.log.json)     |
+| HRNetV2p-W32 | pytorch |   20e   |   9.4    |      11.0      |  43.3  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco.py) |    [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco/cascade_rcnn_hrnetv2p_w32_20e_coco_20200208-928455a4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco/cascade_rcnn_hrnetv2p_w32_20e_coco_20200208_160511.log.json)     |
+| HRNetV2p-W40 | pytorch |   20e   |   10.8   |                |  43.8  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco/cascade_rcnn_hrnetv2p_w40_20e_coco_20200512_161112-75e47b04.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco/cascade_rcnn_hrnetv2p_w40_20e_coco_20200512_161112.log.json) |
+
+### Cascade Mask R-CNN
+
+|   Backbone   |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                          Config                                                          |                                                                                                                                                                       Download                                                                                                                                                                       |
+| :----------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :----------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| HRNetV2p-W18 | pytorch |   20e   |   8.5    |      8.5       |  41.6  |  36.4   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py) |    [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/cascade_mask_rcnn_hrnetv2p_w18_20e_coco_20200210-b543cd2b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/cascade_mask_rcnn_hrnetv2p_w18_20e_coco_20200210_093149.log.json)     |
+| HRNetV2p-W32 | pytorch |   20e   |          |      8.3       |  44.3  |  38.6   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco/cascade_mask_rcnn_hrnetv2p_w32_20e_coco_20200512_154043-39d9cf7b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco/cascade_mask_rcnn_hrnetv2p_w32_20e_coco_20200512_154043.log.json) |
+| HRNetV2p-W40 | pytorch |   20e   |   12.5   |                |  45.1  |  39.3   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco/cascade_mask_rcnn_hrnetv2p_w40_20e_coco_20200527_204922-969c4610.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco/cascade_mask_rcnn_hrnetv2p_w40_20e_coco_20200527_204922.log.json) |
+
+### Hybrid Task Cascade (HTC)
+
+|   Backbone   |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                   Config                                                   |                                                                                                                                           Download                                                                                                                                           |
+| :----------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :--------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| HRNetV2p-W18 | pytorch |   20e   |   10.8   |      4.7       |  42.8  |  37.9   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/htc_hrnetv2p_w18_20e_coco.py) |    [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w18_20e_coco/htc_hrnetv2p_w18_20e_coco_20200210-b266988c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w18_20e_coco/htc_hrnetv2p_w18_20e_coco_20200210_182735.log.json)     |
+| HRNetV2p-W32 | pytorch |   20e   |   13.1   |      4.9       |  45.4  |  39.9   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/htc_hrnetv2p_w32_20e_coco.py) |    [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w32_20e_coco/htc_hrnetv2p_w32_20e_coco_20200207-7639fa12.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w32_20e_coco/htc_hrnetv2p_w32_20e_coco_20200207_193153.log.json)     |
+| HRNetV2p-W40 | pytorch |   20e   |   14.6   |                |  46.4  |  40.8   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/htc_hrnetv2p_w40_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w40_20e_coco/htc_hrnetv2p_w40_20e_coco_20200529_183411-417c4d5b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w40_20e_coco/htc_hrnetv2p_w40_20e_coco_20200529_183411.log.json) |
+
+### FCOS
+
+|   Backbone   |  Style  | GN  | MS train | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                                 Config                                                                 |                                                                                                                                                                                                   Download                                                                                                                                                                                                   |
+| :----------: | :-----: | :-: | :------: | :-----: | :------: | :------------: | :----: | :------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| HRNetV2p-W18 | pytorch |  Y  |    N     |   1x    |   13.0   |      12.9      |  35.3  |         [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco.py)         |                                 [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco_20201212_100710-4ad151de.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco_20201212_100710.log.json)                                 |
+| HRNetV2p-W18 | pytorch |  Y  |    N     |   2x    |   13.0   |       -        |  38.2  |         [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco.py)         |                                 [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco_20201212_101110-5c575fa5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco_20201212_101110.log.json)                                 |
+| HRNetV2p-W32 | pytorch |  Y  |    N     |   1x    |   17.5   |      12.9      |  39.5  |         [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py)         |                                 [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco_20201211_134730-cb8055c0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco_20201211_134730.log.json)                                 |
+| HRNetV2p-W32 | pytorch |  Y  |    N     |   2x    |   17.5   |       -        |  40.8  |         [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco.py)         |                                 [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco_20201212_112133-77b6b9bb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco_20201212_112133.log.json)                                 |
+| HRNetV2p-W18 | pytorch |  Y  |    Y     |   2x    |   13.0   |      12.9      |  38.3  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco_20201212_111651-441e9d9f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco_20201212_111651.log.json) |
+| HRNetV2p-W32 | pytorch |  Y  |    Y     |   2x    |   17.5   |      12.4      |  41.9  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco_20201212_090846-b6f2b49f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco_20201212_090846.log.json) |
+| HRNetV2p-W40 | pytorch |  Y  |    Y     |   2x    |   20.3   |      10.8      |  42.7  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco_20201212_124752-f22d2ce5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco_20201212_124752.log.json) |
+
+**Note:**
+
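+- The `28e` schedule in HTC indicates decreasing the lr at 24 and 27 epochs, with a total of 28 epochs. The `20e` schedule used by the Cascade R-CNN and Cascade Mask R-CNN configs here decreases the lr at epochs 16 and 19, with a total of 20 epochs.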
+- HRNetV2 ImageNet pretrained models are in [HRNets for Image Classification](https://github.com/HRNet/HRNet-Image-Classification).
+
+## Citation
+
+```latex
+@inproceedings{SunXLW19,
+  title={Deep High-Resolution Representation Learning for Human Pose Estimation},
+  author={Ke Sun and Bin Xiao and Dong Liu and Jingdong Wang},
+  booktitle={CVPR},
+  year={2019}
+}
+
+@article{SunZJCXLMWLW19,
+  title={High-Resolution Representations for Labeling Pixels and Regions},
+  author={Ke Sun and Yang Zhao and Borui Jiang and Tianheng Cheng and Bin Xiao
+  and Dong Liu and Yadong Mu and Xinggang Wang and Wenyu Liu and Jingdong Wang},
+  journal={CoRR},
+  volume={abs/1904.04514},
+  year={2019}
+}
+```
diff --git a/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py b/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py
new file mode 100755
index 0000000..839cf3e
--- /dev/null
+++ b/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py
@@ -0,0 +1,11 @@
+_base_ = './cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py'
+# model settings
+model = dict(
+    backbone=dict(
+        extra=dict(
+            stage2=dict(num_channels=(18, 36)),
+            stage3=dict(num_channels=(18, 36, 72)),
+            stage4=dict(num_channels=(18, 36, 72, 144))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+    neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256))
diff --git a/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py b/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py
new file mode 100755
index 0000000..9942602
--- /dev/null
+++ b/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py
@@ -0,0 +1,40 @@
+_base_ = '../cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='HRNet',
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')),
+    neck=dict(
+        _delete_=True,
+        type='HRFPN',
+        in_channels=[32, 64, 128, 256],
+        out_channels=256))
+# learning policy
+lr_config = dict(step=[16, 19])
+runner = dict(type='EpochBasedRunner', max_epochs=20)
diff --git a/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco.py b/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco.py
new file mode 100755
index 0000000..10d5e83
--- /dev/null
+++ b/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco.py
@@ -0,0 +1,12 @@
+_base_ = './cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py'
+# model settings
+model = dict(
+    backbone=dict(
+        type='HRNet',
+        extra=dict(
+            stage2=dict(num_channels=(40, 80)),
+            stage3=dict(num_channels=(40, 80, 160)),
+            stage4=dict(num_channels=(40, 80, 160, 320))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')),
+    neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256))
diff --git a/configs/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco.py b/configs/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco.py
new file mode 100755
index 0000000..ebd5e20
--- /dev/null
+++ b/configs/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco.py
@@ -0,0 +1,11 @@
+_base_ = './cascade_rcnn_hrnetv2p_w32_20e_coco.py'
+# model settings
+model = dict(
+    backbone=dict(
+        extra=dict(
+            stage2=dict(num_channels=(18, 36)),
+            stage3=dict(num_channels=(18, 36, 72)),
+            stage4=dict(num_channels=(18, 36, 72, 144))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+    neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256))
diff --git a/configs/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco.py b/configs/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco.py
new file mode 100755
index 0000000..e7f89a9
--- /dev/null
+++ b/configs/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco.py
@@ -0,0 +1,40 @@
+_base_ = '../cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='HRNet',
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')),
+    neck=dict(
+        _delete_=True,
+        type='HRFPN',
+        in_channels=[32, 64, 128, 256],
+        out_channels=256))
+# learning policy
+lr_config = dict(step=[16, 19])
+runner = dict(type='EpochBasedRunner', max_epochs=20)
diff --git a/configs/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco.py b/configs/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco.py
new file mode 100755
index 0000000..265e8d6
--- /dev/null
+++ b/configs/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco.py
@@ -0,0 +1,12 @@
+_base_ = './cascade_rcnn_hrnetv2p_w32_20e_coco.py'
+# model settings
+model = dict(
+    backbone=dict(
+        type='HRNet',
+        extra=dict(
+            stage2=dict(num_channels=(40, 80)),
+            stage3=dict(num_channels=(40, 80, 160)),
+            stage4=dict(num_channels=(40, 80, 160, 320))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')),
+    neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256))
diff --git a/configs/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco.py b/configs/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco.py
new file mode 100755
index 0000000..1df2c3d
--- /dev/null
+++ b/configs/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = './faster_rcnn_hrnetv2p_w32_1x_coco.py'
+# model settings: HRNetV2p-W18 overrides layered on the W32 parent config
+model = dict(
+    backbone=dict(
+        extra=dict(  # one width per branch; stage1 is inherited unchanged
+            stage2=dict(num_channels=(18, 36)),
+            stage3=dict(num_channels=(18, 36, 72)),
+            stage4=dict(num_channels=(18, 36, 72, 144))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+    neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256))
diff --git a/configs/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco.py b/configs/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco.py
new file mode 100755
index 0000000..a4b987a
--- /dev/null
+++ b/configs/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco.py
@@ -0,0 +1,5 @@
+_base_ = './faster_rcnn_hrnetv2p_w18_1x_coco.py'
+
+# learning policy: 2x schedule, 24 epochs with LR step milestones 16 and 22
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco.py b/configs/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco.py
new file mode 100755
index 0000000..be05809
--- /dev/null
+++ b/configs/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco.py
@@ -0,0 +1,37 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(  # replace the parent's backbone and neck with HRNet + HRFPN
+    backbone=dict(
+        _delete_=True,  # drop the inherited backbone keys instead of merging
+        type='HRNet',
+        extra=dict(  # HRNetV2p-W32 layout: branch count and widths per stage
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')),
+    neck=dict(
+        _delete_=True,  # drop the inherited neck keys instead of merging
+        type='HRFPN',
+        in_channels=[32, 64, 128, 256],  # same widths as stage4 above
+        out_channels=256))
diff --git a/configs/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco.py b/configs/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco.py
new file mode 100755
index 0000000..63c8717
--- /dev/null
+++ b/configs/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './faster_rcnn_hrnetv2p_w32_1x_coco.py'
+# learning policy: 2x schedule, 24 epochs with LR step milestones 16 and 22
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco.py b/configs/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco.py
new file mode 100755
index 0000000..886a7c9
--- /dev/null
+++ b/configs/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = './faster_rcnn_hrnetv2p_w32_1x_coco.py'
+model = dict(  # HRNetV2p-W40 overrides layered on the W32 parent config
+    backbone=dict(
+        type='HRNet',
+        extra=dict(  # one width per branch; stage1 is inherited unchanged
+            stage2=dict(num_channels=(40, 80)),
+            stage3=dict(num_channels=(40, 80, 160)),
+            stage4=dict(num_channels=(40, 80, 160, 320))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')),
+    neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256))
diff --git a/configs/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco.py b/configs/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco.py
new file mode 100755
index 0000000..585cc2c
--- /dev/null
+++ b/configs/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './faster_rcnn_hrnetv2p_w40_1x_coco.py'
+# learning policy: 2x schedule, 24 epochs with LR step milestones 16 and 22
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco.py b/configs/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco.py
new file mode 100755
index 0000000..fd662bd
--- /dev/null
+++ b/configs/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco.py
@@ -0,0 +1,10 @@
+_base_ = './fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py'
+model = dict(  # HRNetV2p-W18 overrides layered on the W32 parent config
+    backbone=dict(
+        extra=dict(  # one width per branch; stage1 is inherited unchanged
+            stage2=dict(num_channels=(18, 36)),
+            stage3=dict(num_channels=(18, 36, 72)),
+            stage4=dict(num_channels=(18, 36, 72, 144))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+    neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256))
diff --git a/configs/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco.py b/configs/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco.py
new file mode 100755
index 0000000..3497595
--- /dev/null
+++ b/configs/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './fcos_hrnetv2p_w18_gn-head_4x4_1x_coco.py'
+# learning policy: 2x schedule, 24 epochs with LR step milestones 16 and 22
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco.py b/configs/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco.py
new file mode 100755
index 0000000..37bfdae
--- /dev/null
+++ b/configs/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco.py
@@ -0,0 +1,10 @@
+_base_ = './fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco.py'
+model = dict(  # HRNetV2p-W18 overrides layered on the W32 parent config
+    backbone=dict(
+        extra=dict(  # one width per branch; stage1 is inherited unchanged
+            stage2=dict(num_channels=(18, 36)),
+            stage3=dict(num_channels=(18, 36, 72)),
+            stage4=dict(num_channels=(18, 36, 72, 144))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+    neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256))
diff --git a/configs/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py b/configs/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py
new file mode 100755
index 0000000..10617f2
--- /dev/null
+++ b/configs/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py
@@ -0,0 +1,70 @@
+_base_ = '../fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py'
+model = dict(  # replace the parent's backbone and neck with HRNet + HRFPN
+    backbone=dict(
+        _delete_=True,  # drop the inherited backbone keys instead of merging
+        type='HRNet',
+        extra=dict(  # HRNetV2p-W32 layout: branch count and widths per stage
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')),
+    neck=dict(
+        _delete_=True,  # drop the inherited neck keys instead of merging
+        type='HRFPN',
+        in_channels=[32, 64, 128, 256],  # same widths as stage4 above
+        out_channels=256,
+        stride=2,
+        num_outs=5))
+img_norm_cfg = dict(  # declared here so the pipelines below can expand it
+    mean=[103.53, 116.28, 123.675], std=[57.375, 57.12, 58.395], to_rgb=False)
+train_pipeline = [  # single training scale (1333, 800), random flip p=0.5
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [  # single test scale (1333, 800), flip augmentation disabled
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # point the inherited train/val/test datasets at these pipelines
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco.py b/configs/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco.py
new file mode 100755
index 0000000..7b38130
--- /dev/null
+++ b/configs/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py'
+# learning policy: 2x schedule, 24 epochs with LR step milestones 16 and 22
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco.py b/configs/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco.py
new file mode 100755
index 0000000..482f887
--- /dev/null
+++ b/configs/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco.py
@@ -0,0 +1,39 @@
+_base_ = './fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py'
+img_norm_cfg = dict(  # declared here so the pipelines below can expand it
+    mean=[103.53, 116.28, 123.675], std=[57.375, 57.12, 58.395], to_rgb=False)
+train_pipeline = [  # multi-scale training over the two img_scale entries
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [  # single test scale (1333, 800), flip augmentation disabled
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # point the inherited train/val/test datasets at these pipelines
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# learning policy: 2x schedule, 24 epochs with LR step milestones 16 and 22
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco.py b/configs/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco.py
new file mode 100755
index 0000000..0ae9dbe
--- /dev/null
+++ b/configs/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco.py
@@ -0,0 +1,11 @@
+_base_ = './fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco.py'
+model = dict(  # HRNetV2p-W40 overrides layered on the W32 parent config
+    backbone=dict(
+        type='HRNet',
+        extra=dict(  # one width per branch; stage1 is inherited unchanged
+            stage2=dict(num_channels=(40, 80)),
+            stage3=dict(num_channels=(40, 80, 160)),
+            stage4=dict(num_channels=(40, 80, 160, 320))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')),
+    neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256))
diff --git a/configs/hrnet/htc_hrnetv2p_w18_20e_coco.py b/configs/hrnet/htc_hrnetv2p_w18_20e_coco.py
new file mode 100755
index 0000000..3c2eb1d
--- /dev/null
+++ b/configs/hrnet/htc_hrnetv2p_w18_20e_coco.py
@@ -0,0 +1,10 @@
+_base_ = './htc_hrnetv2p_w32_20e_coco.py'
+model = dict(  # HRNetV2p-W18 overrides layered on the W32 parent config
+    backbone=dict(
+        extra=dict(  # one width per branch; stage1 is inherited unchanged
+            stage2=dict(num_channels=(18, 36)),
+            stage3=dict(num_channels=(18, 36, 72)),
+            stage4=dict(num_channels=(18, 36, 72, 144))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+    neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256))
diff --git a/configs/hrnet/htc_hrnetv2p_w32_20e_coco.py b/configs/hrnet/htc_hrnetv2p_w32_20e_coco.py
new file mode 100755
index 0000000..545cb83
--- /dev/null
+++ b/configs/hrnet/htc_hrnetv2p_w32_20e_coco.py
@@ -0,0 +1,37 @@
+_base_ = '../htc/htc_r50_fpn_20e_coco.py'
+model = dict(  # replace the parent's backbone and neck with HRNet + HRFPN
+    backbone=dict(
+        _delete_=True,  # drop the inherited backbone keys instead of merging
+        type='HRNet',
+        extra=dict(  # HRNetV2p-W32 layout: branch count and widths per stage
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')),
+    neck=dict(
+        _delete_=True,  # drop the inherited neck keys instead of merging
+        type='HRFPN',
+        in_channels=[32, 64, 128, 256],  # same widths as stage4 above
+        out_channels=256))
diff --git a/configs/hrnet/htc_hrnetv2p_w40_20e_coco.py b/configs/hrnet/htc_hrnetv2p_w40_20e_coco.py
new file mode 100755
index 0000000..94bff1b
--- /dev/null
+++ b/configs/hrnet/htc_hrnetv2p_w40_20e_coco.py
@@ -0,0 +1,11 @@
+_base_ = './htc_hrnetv2p_w32_20e_coco.py'
+model = dict(  # HRNetV2p-W40 overrides layered on the W32 parent config
+    backbone=dict(
+        type='HRNet',
+        extra=dict(  # one width per branch; stage1 is inherited unchanged
+            stage2=dict(num_channels=(40, 80)),
+            stage3=dict(num_channels=(40, 80, 160)),
+            stage4=dict(num_channels=(40, 80, 160, 320))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')),
+    neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256))
diff --git a/configs/hrnet/htc_hrnetv2p_w40_28e_coco.py b/configs/hrnet/htc_hrnetv2p_w40_28e_coco.py
new file mode 100755
index 0000000..7067e8b
--- /dev/null
+++ b/configs/hrnet/htc_hrnetv2p_w40_28e_coco.py
@@ -0,0 +1,4 @@
+_base_ = './htc_hrnetv2p_w40_20e_coco.py'
+# learning policy: 28 epochs with LR step milestones 24 and 27
+lr_config = dict(step=[24, 27])
+runner = dict(type='EpochBasedRunner', max_epochs=28)
diff --git a/configs/hrnet/htc_x101_64x4d_fpn_16x1_28e_coco.py b/configs/hrnet/htc_x101_64x4d_fpn_16x1_28e_coco.py
new file mode 100755
index 0000000..815f285
--- /dev/null
+++ b/configs/hrnet/htc_x101_64x4d_fpn_16x1_28e_coco.py
@@ -0,0 +1,4 @@
+_base_ = '../htc/htc_x101_64x4d_fpn_16x1_20e_coco.py'
+# learning policy: 28 epochs with LR step milestones 24 and 27
+lr_config = dict(step=[24, 27])
+runner = dict(type='EpochBasedRunner', max_epochs=28)
diff --git a/configs/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco.py b/configs/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco.py
new file mode 100755
index 0000000..cb12200
--- /dev/null
+++ b/configs/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco.py
@@ -0,0 +1,10 @@
+_base_ = './mask_rcnn_hrnetv2p_w32_1x_coco.py'
+model = dict(  # HRNetV2p-W18 overrides layered on the W32 parent config
+    backbone=dict(
+        extra=dict(  # one width per branch; stage1 is inherited unchanged
+            stage2=dict(num_channels=(18, 36)),
+            stage3=dict(num_channels=(18, 36, 72)),
+            stage4=dict(num_channels=(18, 36, 72, 144))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+    neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256))
diff --git a/configs/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco.py b/configs/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco.py
new file mode 100755
index 0000000..ca62682
--- /dev/null
+++ b/configs/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './mask_rcnn_hrnetv2p_w18_1x_coco.py'
+# learning policy: 2x schedule, 24 epochs with LR step milestones 16 and 22
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco.py b/configs/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco.py
new file mode 100755
index 0000000..d5f0eb5
--- /dev/null
+++ b/configs/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco.py
@@ -0,0 +1,37 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(  # replace the parent's backbone and neck with HRNet + HRFPN
+    backbone=dict(
+        _delete_=True,  # drop the inherited backbone keys instead of merging
+        type='HRNet',
+        extra=dict(  # HRNetV2p-W32 layout: branch count and widths per stage
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')),
+    neck=dict(
+        _delete_=True,  # drop the inherited neck keys instead of merging
+        type='HRFPN',
+        in_channels=[32, 64, 128, 256],  # same widths as stage4 above
+        out_channels=256))
diff --git a/configs/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco.py b/configs/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco.py
new file mode 100755
index 0000000..63d5d13
--- /dev/null
+++ b/configs/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './mask_rcnn_hrnetv2p_w32_1x_coco.py'
+# learning policy: 2x schedule, 24 epochs with LR step milestones 16 and 22
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco.py b/configs/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco.py
new file mode 100755
index 0000000..5a76f4b
--- /dev/null
+++ b/configs/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = './mask_rcnn_hrnetv2p_w32_1x_coco.py'
+model = dict(  # HRNetV2p-W40 overrides layered on the W32 parent config
+    backbone=dict(
+        type='HRNet',
+        extra=dict(  # one width per branch; stage1 is inherited unchanged
+            stage2=dict(num_channels=(40, 80)),
+            stage3=dict(num_channels=(40, 80, 160)),
+            stage4=dict(num_channels=(40, 80, 160, 320))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')),
+    neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256))
diff --git a/configs/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco.py b/configs/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco.py
new file mode 100755
index 0000000..3a2a510
--- /dev/null
+++ b/configs/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './mask_rcnn_hrnetv2p_w40_1x_coco.py'
+# learning policy: 2x schedule, 24 epochs with LR step milestones 16 and 22
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/hrnet/metafile.yml b/configs/hrnet/metafile.yml
new file mode 100755
index 0000000..ac36efa
--- /dev/null
+++ b/configs/hrnet/metafile.yml
@@ -0,0 +1,971 @@
+Models:
+  - Name: faster_rcnn_hrnetv2p_w18_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.6
+      inference time (ms/im):
+        - value: 74.63
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco/faster_rcnn_hrnetv2p_w18_1x_coco_20200130-56651a6d.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: faster_rcnn_hrnetv2p_w18_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco.py
+    Metadata:
+      Training Memory (GB): 6.6
+      inference time (ms/im):
+        - value: 74.63
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco/faster_rcnn_hrnetv2p_w18_2x_coco_20200702_085731-a4ec0611.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: faster_rcnn_hrnetv2p_w32_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.0
+      inference time (ms/im):
+        - value: 80.65
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco/faster_rcnn_hrnetv2p_w32_1x_coco_20200130-6e286425.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: faster_rcnn_hrnetv2p_w32_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco.py
+    Metadata:
+      Training Memory (GB): 9.0
+      inference time (ms/im):
+        - value: 80.65
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco/faster_rcnn_hrnetv2p_w32_2x_coco_20200529_015927-976a9c15.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: faster_rcnn_hrnetv2p_w40_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.4
+      inference time (ms/im):
+        - value: 95.24
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco/faster_rcnn_hrnetv2p_w40_1x_coco_20200210-95c1f5ce.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: faster_rcnn_hrnetv2p_w40_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.4
+      inference time (ms/im):
+        - value: 95.24
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco/faster_rcnn_hrnetv2p_w40_2x_coco_20200512_161033-0f236ef4.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: mask_rcnn_hrnetv2p_w18_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 85.47
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 34.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco/mask_rcnn_hrnetv2p_w18_1x_coco_20200205-1c3d78ed.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: mask_rcnn_hrnetv2p_w18_2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 85.47
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco/mask_rcnn_hrnetv2p_w18_2x_coco_20200212-b3c825b1.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: mask_rcnn_hrnetv2p_w32_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.4
+      inference time (ms/im):
+        - value: 88.5
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco/mask_rcnn_hrnetv2p_w32_1x_coco_20200207-b29f616e.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: mask_rcnn_hrnetv2p_w32_2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco.py
+    Metadata:
+      Training Memory (GB): 9.4
+      inference time (ms/im):
+        - value: 88.5
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco/mask_rcnn_hrnetv2p_w32_2x_coco_20200213-45b75b4d.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: mask_rcnn_hrnetv2p_w40_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.9
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco/mask_rcnn_hrnetv2p_w40_1x_coco_20200511_015646-66738b35.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: mask_rcnn_hrnetv2p_w40_2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.9
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco/mask_rcnn_hrnetv2p_w40_2x_coco_20200512_163732-aed5e4ab.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: cascade_rcnn_hrnetv2p_w18_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 90.91
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco/cascade_rcnn_hrnetv2p_w18_20e_coco_20200210-434be9d7.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: cascade_rcnn_hrnetv2p_w32_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco.py
+    Metadata:
+      Training Memory (GB): 9.4
+      inference time (ms/im):
+        - value: 90.91
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco/cascade_rcnn_hrnetv2p_w32_20e_coco_20200208-928455a4.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: cascade_rcnn_hrnetv2p_w40_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco.py
+    Metadata:
+      Training Memory (GB): 10.8
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco/cascade_rcnn_hrnetv2p_w40_20e_coco_20200512_161112-75e47b04.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: cascade_mask_rcnn_hrnetv2p_w18_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py
+    Metadata:
+      Training Memory (GB): 8.5
+      inference time (ms/im):
+        - value: 117.65
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/cascade_mask_rcnn_hrnetv2p_w18_20e_coco_20200210-b543cd2b.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: cascade_mask_rcnn_hrnetv2p_w32_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py
+    Metadata:
+      inference time (ms/im):
+        - value: 120.48
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco/cascade_mask_rcnn_hrnetv2p_w32_20e_coco_20200512_154043-39d9cf7b.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: cascade_mask_rcnn_hrnetv2p_w40_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco.py
+    Metadata:
+      Training Memory (GB): 12.5
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco/cascade_mask_rcnn_hrnetv2p_w40_20e_coco_20200527_204922-969c4610.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: htc_hrnetv2p_w18_20e_coco
+    In Collection: HTC
+    Config: configs/hrnet/htc_hrnetv2p_w18_20e_coco.py
+    Metadata:
+      Training Memory (GB): 10.8
+      inference time (ms/im):
+        - value: 212.77
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w18_20e_coco/htc_hrnetv2p_w18_20e_coco_20200210-b266988c.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: htc_hrnetv2p_w32_20e_coco
+    In Collection: HTC
+    Config: configs/hrnet/htc_hrnetv2p_w32_20e_coco.py
+    Metadata:
+      Training Memory (GB): 13.1
+      inference time (ms/im):
+        - value: 204.08
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w32_20e_coco/htc_hrnetv2p_w32_20e_coco_20200207-7639fa12.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: htc_hrnetv2p_w40_20e_coco
+    In Collection: HTC
+    Config: configs/hrnet/htc_hrnetv2p_w40_20e_coco.py
+    Metadata:
+      Training Memory (GB): 14.6
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w40_20e_coco/htc_hrnetv2p_w40_20e_coco_20200529_183411-417c4d5b.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: fcos_hrnetv2p_w18_gn-head_4x4_1x_coco
+    In Collection: FCOS
+    Config: configs/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco.py
+    Metadata:
+      Training Resources: 4x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 13.0
+      inference time (ms/im):
+        - value: 77.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 35.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco_20201212_100710-4ad151de.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: fcos_hrnetv2p_w18_gn-head_4x4_2x_coco
+    In Collection: FCOS
+    Config: configs/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco.py
+    Metadata:
+      Training Resources: 4x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 13.0
+      inference time (ms/im):
+        - value: 77.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco_20201212_101110-5c575fa5.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: fcos_hrnetv2p_w32_gn-head_4x4_1x_coco
+    In Collection: FCOS
+    Config: configs/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco.py
+    Metadata:
+      Training Resources: 4x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 17.5
+      inference time (ms/im):
+        - value: 77.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco_20201211_134730-cb8055c0.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: fcos_hrnetv2p_w32_gn-head_4x4_2x_coco
+    In Collection: FCOS
+    Config: configs/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco.py
+    Metadata:
+      Training Resources: 4x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 17.5
+      inference time (ms/im):
+        - value: 77.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco_20201212_112133-77b6b9bb.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco
+    In Collection: FCOS
+    Config: configs/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco.py
+    Metadata:
+      Training Resources: 4x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 13.0
+      inference time (ms/im):
+        - value: 77.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco_20201212_111651-441e9d9f.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco
+    In Collection: FCOS
+    Config: configs/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco.py
+    Metadata:
+      Training Resources: 4x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 17.5
+      inference time (ms/im):
+        - value: 80.65
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco_20201212_090846-b6f2b49f.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco
+    In Collection: FCOS
+    Config: configs/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco.py
+    Metadata:
+      Training Resources: 4x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 20.3
+      inference time (ms/im):
+        - value: 92.59
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco_20201212_124752-f22d2ce5.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
diff --git a/configs/htc/README.md b/configs/htc/README.md
new file mode 100755
index 0000000..747f8f6
--- /dev/null
+++ b/configs/htc/README.md
@@ -0,0 +1,67 @@
+# HTC
+
+> [Hybrid Task Cascade for Instance Segmentation](https://arxiv.org/abs/1901.07518)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Cascade is a classic yet powerful architecture that has boosted performance on various tasks. However, how to introduce cascade to instance segmentation remains an open question. A simple combination of Cascade R-CNN and Mask R-CNN only brings limited gain. In exploring a more effective approach, we find that the key to a successful instance segmentation cascade is to fully leverage the reciprocal relationship between detection and segmentation. In this work, we propose a new framework, Hybrid Task Cascade (HTC), which differs in two important aspects: (1) instead of performing cascaded refinement on these two tasks separately, it interweaves them for a joint multi-stage processing; (2) it adopts a fully convolutional branch to provide spatial context, which can help distinguishing hard foreground from cluttered background. Overall, this framework can learn more discriminative features progressively while integrating complementary features together in each stage. Without bells and whistles, a single HTC obtains 38.4% and 1.5% improvement over a strong Cascade Mask R-CNN baseline on MSCOCO dataset. Moreover, our overall system achieves 48.6 mask AP on the test-challenge split, ranking 1st in the COCO 2018 Challenge Object Detection Task.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143893906-e05acaa6-d46f-4c11-84e8-bb9940a95b44.png"/>
+</div>
+
+## Introduction
+
+HTC requires the COCO and [COCO-stuff](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) datasets for training. You need to download the COCO-stuff archive and extract it in the COCO dataset path.
+The directory should be like this.
+
+```none
+mmdetection
+├── mmdet
+├── tools
+├── configs
+├── data
+│   ├── coco
+│   │   ├── annotations
+│   │   ├── train2017
+│   │   ├── val2017
+│   │   ├── test2017
+│   │   ├── stuffthingmaps
+```
+
+## Results and Models
+
+The results on COCO 2017val are shown in the table below. (Results on test-dev are usually slightly higher than on val.)
+
+|    Backbone     |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                     Config                                                      |                                                                                                                                                   Download                                                                                                                                                    |
+| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50-FPN     | pytorch |   1x    |   8.2    |      5.8       |  42.3  |  37.4   |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/htc/htc_r50_fpn_1x_coco.py)        |                           [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_1x_coco/htc_r50_fpn_1x_coco_20200317-7332cf16.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_1x_coco/htc_r50_fpn_1x_coco_20200317_070435.log.json)                           |
+|    R-50-FPN     | pytorch |   20e   |   8.2    |       -        |  43.3  |  38.3   |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/htc/htc_r50_fpn_20e_coco.py)       |                         [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_20e_coco/htc_r50_fpn_20e_coco_20200319-fe28c577.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_20e_coco/htc_r50_fpn_20e_coco_20200319_070313.log.json)                         |
+|    R-101-FPN    | pytorch |   20e   |   10.2   |      5.5       |  44.8  |  39.6   |      [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/htc/htc_r101_fpn_20e_coco.py)       |                       [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r101_fpn_20e_coco/htc_r101_fpn_20e_coco_20200317-9b41b48f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r101_fpn_20e_coco/htc_r101_fpn_20e_coco_20200317_153107.log.json)                       |
+| X-101-32x4d-FPN | pytorch |   20e   |   11.4   |      5.0       |  46.1  |  40.5   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/htc/htc_x101_32x4d_fpn_16x1_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_32x4d_fpn_16x1_20e_coco/htc_x101_32x4d_fpn_16x1_20e_coco_20200318-de97ae01.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_32x4d_fpn_16x1_20e_coco/htc_x101_32x4d_fpn_16x1_20e_coco_20200318_034519.log.json) |
+| X-101-64x4d-FPN | pytorch |   20e   |   14.5   |      4.4       |  47.0  |  41.4   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/htc/htc_x101_64x4d_fpn_16x1_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_16x1_20e_coco/htc_x101_64x4d_fpn_16x1_20e_coco_20200318-b181fd7a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_16x1_20e_coco/htc_x101_64x4d_fpn_16x1_20e_coco_20200318_081711.log.json) |
+
+- In the HTC paper and COCO 2018 Challenge, `score_thr` is set to 0.001 for both baselines and HTC.
+- We use 8 GPUs with 2 images/GPU for R-50 and R-101 models, and 16 GPUs with 1 image/GPU for X-101 models.
+  If you would like to train X-101 HTC with 8 GPUs, you need to change the lr from 0.02 to 0.01.
+
+We also provide a powerful HTC with DCN and multi-scale training model. No testing augmentation is used.
+
+|    Backbone     |  Style  |  DCN  | training scales | Lr schd | box AP | mask AP |                                                                    Config                                                                    |                                                                                                                                                                                                             Download                                                                                                                                                                                                              |
+| :-------------: | :-----: | :---: | :-------------: | :-----: | :----: | :-----: | :------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| X-101-64x4d-FPN | pytorch | c3-c5 |    400~1400     |   20e   |  50.4  |  43.8   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco_20200312-946fd751.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco_20200312_203410.log.json) |
+
+## Citation
+
+We provide config files to reproduce the results in the CVPR 2019 paper for [Hybrid Task Cascade](https://arxiv.org/abs/1901.07518).
+
+```latex
+@inproceedings{chen2019hybrid,
+  title={Hybrid task cascade for instance segmentation},
+  author={Chen, Kai and Pang, Jiangmiao and Wang, Jiaqi and Xiong, Yu and Li, Xiaoxiao and Sun, Shuyang and Feng, Wansen and Liu, Ziwei and Shi, Jianping and Ouyang, Wanli and Loy, Chen Change and Lin, Dahua},
+  booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
+  year={2019}
+}
+```
diff --git a/configs/htc/htc_r101_fpn_20e_coco.py b/configs/htc/htc_r101_fpn_20e_coco.py
new file mode 100755
index 0000000..b42297b
--- /dev/null
+++ b/configs/htc/htc_r101_fpn_20e_coco.py
@@ -0,0 +1,9 @@
+_base_ = './htc_r50_fpn_1x_coco.py'  # HTC R-101 variant of the R-50 config
+model = dict(
+    backbone=dict(
+        depth=101,  # replaces the depth=50 backbone of the base config chain
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
+# learning policy: "20e" schedule, 20 epochs with lr steps set to [16, 19]
+lr_config = dict(step=[16, 19])
+runner = dict(type='EpochBasedRunner', max_epochs=20)
diff --git a/configs/htc/htc_r50_fpn_1x_coco.py b/configs/htc/htc_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..1e8e18a
--- /dev/null
+++ b/configs/htc/htc_r50_fpn_1x_coco.py
@@ -0,0 +1,56 @@
+_base_ = './htc_without_semantic_r50_fpn_1x_coco.py'  # adds semantic branch
+model = dict(
+    roi_head=dict(
+        semantic_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[8]),
+        semantic_head=dict(
+            type='FusedSemanticHead',
+            num_ins=5,
+            fusion_level=1,
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=183,  # presumably COCO-stuff label count; TODO confirm
+            loss_seg=dict(
+                type='CrossEntropyLoss', ignore_index=255, loss_weight=0.2))))
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(  # with_seg=True: the 'gt_semantic_seg' key is collected below
+        type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    # NOTE(review): 1/8 presumably pairs with featmap_strides=[8]; confirm
+    dict(type='SegRescale', scale_factor=1 / 8),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='Collect',
+        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip', flip_ratio=0.5),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(
+        seg_prefix=data_root + 'stuffthingmaps/train2017/',  # COCO-stuff maps
+        pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/htc/htc_r50_fpn_20e_coco.py b/configs/htc/htc_r50_fpn_20e_coco.py
new file mode 100755
index 0000000..7d2e011
--- /dev/null
+++ b/configs/htc/htc_r50_fpn_20e_coco.py
@@ -0,0 +1,4 @@
+_base_ = './htc_r50_fpn_1x_coco.py'
+# learning policy: step milestones [16, 19] with a 20-epoch runner
+lr_config = dict(step=[16, 19])
+runner = dict(type='EpochBasedRunner', max_epochs=20)
diff --git a/configs/htc/htc_without_semantic_r50_fpn_1x_coco.py b/configs/htc/htc_without_semantic_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..565104f
--- /dev/null
+++ b/configs/htc/htc_without_semantic_r50_fpn_1x_coco.py
@@ -0,0 +1,236 @@
+_base_ = [
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings: HybridTaskCascade, ResNet-50 + FPN, three RoI stages
+model = dict(
+    type='HybridTaskCascade',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(  # no semantic_head / semantic RoI extractor in this file
+        type='HybridTaskCascadeRoIHead',
+        interleaved=True,
+        mask_info_flow=True,
+        num_stages=3,
+        stage_loss_weights=[1, 0.5, 0.25],  # three entries, num_stages=3
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=[  # three heads; only target_stds differs between them
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        mask_head=[  # three heads; only the first sets with_conv_res=False
+            dict(
+                type='HTCMaskHead',
+                with_conv_res=False,
+                num_convs=4,
+                in_channels=256,
+                conv_out_channels=256,
+                num_classes=80,
+                loss_mask=dict(
+                    type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)),
+            dict(
+                type='HTCMaskHead',
+                num_convs=4,
+                in_channels=256,
+                conv_out_channels=256,
+                num_classes=80,
+                loss_mask=dict(
+                    type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)),
+            dict(
+                type='HTCMaskHead',
+                num_convs=4,
+                in_channels=256,
+                conv_out_channels=256,
+                num_classes=80,
+                loss_mask=dict(
+                    type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))
+        ]),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=[  # three entries; IoU thresholds go 0.5 -> 0.6 -> 0.7
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.5,
+                    min_pos_iou=0.5,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.6,
+                    min_pos_iou=0.6,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.7,
+                    neg_iou_thr=0.7,
+                    min_pos_iou=0.7,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False)
+        ]),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.001,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100,
+            mask_thr_binary=0.5)))
+img_norm_cfg = dict(  # unpacked into the 'Normalize' step below
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,  # NOTE(review): confirm RandomFlip is inert here
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip', flip_ratio=0.5),  # see flip=False above
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # only the val and test pipelines are set in this file
+    val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline))
diff --git a/configs/htc/htc_x101_32x4d_fpn_16x1_20e_coco.py b/configs/htc/htc_x101_32x4d_fpn_16x1_20e_coco.py
new file mode 100755
index 0000000..0c834f2
--- /dev/null
+++ b/configs/htc/htc_x101_32x4d_fpn_16x1_20e_coco.py
@@ -0,0 +1,19 @@
+_base_ = './htc_r50_fpn_1x_coco.py'
+model = dict(  # backbone override: ResNeXt-101, groups=32, base_width=4
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
+data = dict(samples_per_gpu=1, workers_per_gpu=1)  # 16 GPUs per metafile.yml
+# learning policy: step milestones [16, 19] with a 20-epoch runner
+lr_config = dict(step=[16, 19])
+runner = dict(type='EpochBasedRunner', max_epochs=20)
diff --git a/configs/htc/htc_x101_64x4d_fpn_16x1_20e_coco.py b/configs/htc/htc_x101_64x4d_fpn_16x1_20e_coco.py
new file mode 100755
index 0000000..8b0d962
--- /dev/null
+++ b/configs/htc/htc_x101_64x4d_fpn_16x1_20e_coco.py
@@ -0,0 +1,19 @@
+_base_ = './htc_r50_fpn_1x_coco.py'
+model = dict(  # backbone override: ResNeXt-101, groups=64, base_width=4
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
+data = dict(samples_per_gpu=1, workers_per_gpu=1)  # 16 GPUs per metafile.yml
+# learning policy: step milestones [16, 19] with a 20-epoch runner
+lr_config = dict(step=[16, 19])
+runner = dict(type='EpochBasedRunner', max_epochs=20)
diff --git a/configs/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco.py b/configs/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco.py
new file mode 100755
index 0000000..c8d8703
--- /dev/null
+++ b/configs/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco.py
@@ -0,0 +1,43 @@
+_base_ = './htc_r50_fpn_1x_coco.py'
+model = dict(  # backbone override: ResNeXt-101 64x4d with DCN
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),  # 'c3-c5' in the file name
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
+# dataset settings: only the training pipeline is redefined here
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(  # with_seg=True pairs with 'gt_semantic_seg' in Collect below
+        type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True),
+    dict(
+        type='Resize',
+        img_scale=[(1600, 400), (1600, 1400)],  # 'mstrain_400_1400' in name
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='SegRescale', scale_factor=1 / 8),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='Collect',
+        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
+]
+data = dict(
+    samples_per_gpu=1, workers_per_gpu=1, train=dict(pipeline=train_pipeline))
+# learning policy: step milestones [16, 19] with a 20-epoch runner
+lr_config = dict(step=[16, 19])
+runner = dict(type='EpochBasedRunner', max_epochs=20)
diff --git a/configs/htc/metafile.yml b/configs/htc/metafile.yml
new file mode 100755
index 0000000..acd038c
--- /dev/null
+++ b/configs/htc/metafile.yml
@@ -0,0 +1,165 @@
+Collections:
+  - Name: HTC
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - HTC
+        - RPN
+        - ResNet
+        - ResNeXt
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/abs/1901.07518
+      Title: 'Hybrid Task Cascade for Instance Segmentation'
+    README: configs/htc/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/htc.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: htc_r50_fpn_1x_coco
+    In Collection: HTC
+    Config: configs/htc/htc_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.2
+      inference time (ms/im):
+        - value: 172.41
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_1x_coco/htc_r50_fpn_1x_coco_20200317-7332cf16.pth
+
+  - Name: htc_r50_fpn_20e_coco
+    In Collection: HTC
+    Config: configs/htc/htc_r50_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 8.2
+      inference time (ms/im):
+        - value: 172.41
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_20e_coco/htc_r50_fpn_20e_coco_20200319-fe28c577.pth
+
+  - Name: htc_r101_fpn_20e_coco
+    In Collection: HTC
+    Config: configs/htc/htc_r101_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 10.2
+      inference time (ms/im):
+        - value: 181.82
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r101_fpn_20e_coco/htc_r101_fpn_20e_coco_20200317-9b41b48f.pth
+
+  - Name: htc_x101_32x4d_fpn_16x1_20e_coco
+    In Collection: HTC
+    Config: configs/htc/htc_x101_32x4d_fpn_16x1_20e_coco.py
+    Metadata:
+      Training Resources: 16x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 11.4
+      inference time (ms/im):
+        - value: 200
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_32x4d_fpn_16x1_20e_coco/htc_x101_32x4d_fpn_16x1_20e_coco_20200318-de97ae01.pth
+
+  - Name: htc_x101_64x4d_fpn_16x1_20e_coco
+    In Collection: HTC
+    Config: configs/htc/htc_x101_64x4d_fpn_16x1_20e_coco.py
+    Metadata:
+      Training Resources: 16x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 14.5
+      inference time (ms/im):
+        - value: 227.27
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_16x1_20e_coco/htc_x101_64x4d_fpn_16x1_20e_coco_20200318-b181fd7a.pth
+
+  - Name: htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco
+    In Collection: HTC
+    Config: configs/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco.py
+    Metadata:
+      Training Resources: 16x V100 GPUs
+      Batch Size: 16
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 43.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco_20200312-946fd751.pth
diff --git a/configs/instaboost/README.md b/configs/instaboost/README.md
new file mode 100755
index 0000000..82ed334
--- /dev/null
+++ b/configs/instaboost/README.md
@@ -0,0 +1,58 @@
+# Instaboost
+
+> [Instaboost: Boosting instance segmentation via probability map guided copy-pasting](https://arxiv.org/abs/1908.07801)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Instance segmentation requires a large number of training samples to achieve satisfactory performance and benefits from proper data augmentation. To enlarge the training set and increase the diversity, previous methods have investigated using data annotation from other domain (e.g. bbox, point) in a weakly supervised mechanism. In this paper, we present a simple, efficient and effective method to augment the training set using the existing instance mask annotations. Exploiting the pixel redundancy of the background, we are able to improve the performance of Mask R-CNN for 1.7 mAP on COCO dataset and 3.3 mAP on Pascal VOC dataset by simply introducing random jittering to objects. Furthermore, we propose a location probability map based approach to explore the feasible locations that objects can be placed based on local appearance similarity. With the guidance of such map, we boost the performance of R101-Mask R-CNN on instance segmentation from 35.7 mAP to 37.9 mAP without modifying the backbone or network structure. Our method is simple to implement and does not increase the computational complexity. It can be integrated into the training pipeline of any instance segmentation model without affecting the training and inference efficiency.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143894053-ecfe8406-1a05-461b-953c-efeef22d7a60.png" height="300"/>
+</div>
+
+## Introduction
+
+Configs in this directory are the implementation of the ICCV 2019 paper "InstaBoost: Boosting Instance Segmentation Via Probability Map Guided Copy-Pasting" and are provided by the authors of the paper. InstaBoost is a data augmentation method for object detection and instance segmentation. The paper has been released on [`arXiv`](https://arxiv.org/abs/1908.07801).
+
+## Usage
+
+### Requirements
+
+You need to install `instaboostfast` before using it.
+
+```shell
+pip install instaboostfast
+```
+
+The code and more details can be found [here](https://github.com/GothicAi/Instaboost).
+
+### Integration with MMDetection
+
+InstaBoost has already been integrated into the data pipeline, so all you need to do is add or change the **InstaBoost** configuration after **LoadImageFromFile**. We have provided examples like [this](mask_rcnn_r50_fpn_instaboost_4x_coco.py). You can refer to [`InstaBoostConfig`](https://github.com/GothicAi/InstaBoost-pypi#instaboostconfig) for more details.
+
+## Results and Models
+
+- All models were trained on `coco_2017_train` and tested on `coco_2017_val` for convenience of evaluation and comparison. In the paper, the results are obtained from `test-dev`.
+- To balance accuracy and training time when using InstaBoost, models released in this page are all trained for 48 Epochs. Other training and testing configs strictly follow the original framework.
+- For results and models in MMDetection V1.x, please refer to [Instaboost](https://github.com/GothicAi/Instaboost).
+
+|    Network    |    Backbone     | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                               Config                                                               |                                                                                                                                                                                    Download                                                                                                                                                                                    |
+| :-----------: | :-------------: | :-----: | :------: | :------------: | :----: | :-----: | :--------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  Mask R-CNN   |    R-50-FPN     |   4x    |   4.4    |      17.5      |  40.6  |  36.6   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco.py)     |                  [model](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco/mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-d025f83a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco/mask_rcnn_r50_fpn_instaboost_4x_coco_20200307_223635.log.json)                   |
+|  Mask R-CNN   |    R-101-FPN    |   4x    |   6.4    |                |  42.5  |  38.0   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco.py)     |             [model](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco/mask_rcnn_r101_fpn_instaboost_4x_coco_20200703_235738-f23f3a5f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco/mask_rcnn_r101_fpn_instaboost_4x_coco_20200703_235738.log.json)             |
+|  Mask R-CNN   | X-101-64x4d-FPN |   4x    |   10.7   |                |  44.7  |  39.7   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco_20200515_080947-8ed58c1b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco_20200515_080947.log.json) |
+| Cascade R-CNN |    R-50-FPN     |   4x    |   6.0    |      12.0      |  43.7  |  38.0   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco.py) |  [model](https://download.openmmlab.com/mmdetection/v2.0/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-c19d98d9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco_20200307_223646.log.json)   |
+
+## Citation
+
+```latex
+@inproceedings{fang2019instaboost,
+  title={Instaboost: Boosting instance segmentation via probability map guided copy-pasting},
+  author={Fang, Hao-Shu and Sun, Jianhua and Wang, Runzhong and Gou, Minghao and Li, Yong-Lu and Lu, Cewu},
+  booktitle={Proceedings of the IEEE International Conference on Computer Vision},
+  pages={682--691},
+  year={2019}
+}
+```
diff --git a/configs/instaboost/cascade_mask_rcnn_r101_fpn_instaboost_4x_coco.py b/configs/instaboost/cascade_mask_rcnn_r101_fpn_instaboost_4x_coco.py
new file mode 100755
index 0000000..9d0515d
--- /dev/null
+++ b/configs/instaboost/cascade_mask_rcnn_r101_fpn_instaboost_4x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './cascade_mask_rcnn_r50_fpn_instaboost_4x_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,  # paired with the resnet101 checkpoint below
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco.py b/configs/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco.py
new file mode 100755
index 0000000..a89a81f
--- /dev/null
+++ b/configs/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco.py
@@ -0,0 +1,28 @@
+_base_ = '../cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py'
+img_norm_cfg = dict(  # unpacked into the 'Normalize' step below
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(  # InstaBoost sits right after LoadImageFromFile (see README.md)
+        type='InstaBoost',
+        action_candidate=('normal', 'horizontal', 'skip'),
+        action_prob=(1, 0, 0),  # presumably one per action_candidate; confirm
+        scale=(0.8, 1.2),
+        dx=15,
+        dy=15,
+        theta=(-1, 1),
+        color_prob=0.5,
+        hflag=False,
+        aug_ratio=0.5),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+data = dict(train=dict(pipeline=train_pipeline))
+# learning policy: step milestones [32, 44] with a 48-epoch runner
+lr_config = dict(step=[32, 44])
+runner = dict(type='EpochBasedRunner', max_epochs=48)
diff --git a/configs/instaboost/cascade_mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py b/configs/instaboost/cascade_mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py
new file mode 100755
index 0000000..d67b799
--- /dev/null
+++ b/configs/instaboost/cascade_mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade_mask_rcnn_r50_fpn_instaboost_4x_coco.py'
+model = dict(  # backbone override: ResNeXt-101, groups=64, base_width=4
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',  # NOTE(review): no norm_eval key here; intended?
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco.py b/configs/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco.py
new file mode 100755
index 0000000..ebbb43e
--- /dev/null
+++ b/configs/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './mask_rcnn_r50_fpn_instaboost_4x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,  # paired with the resnet101 checkpoint below
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco.py b/configs/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco.py
new file mode 100755
index 0000000..55ca62b
--- /dev/null
+++ b/configs/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco.py
@@ -0,0 +1,28 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+img_norm_cfg = dict(  # unpacked into the 'Normalize' step below
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(  # InstaBoost sits right after LoadImageFromFile (see README.md)
+        type='InstaBoost',
+        action_candidate=('normal', 'horizontal', 'skip'),
+        action_prob=(1, 0, 0),  # presumably one per action_candidate; confirm
+        scale=(0.8, 1.2),
+        dx=15,
+        dy=15,
+        theta=(-1, 1),
+        color_prob=0.5,
+        hflag=False,
+        aug_ratio=0.5),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+data = dict(train=dict(pipeline=train_pipeline))
+# learning policy: step milestones [32, 44] with a 48-epoch runner
+lr_config = dict(step=[32, 44])
+runner = dict(type='EpochBasedRunner', max_epochs=48)
diff --git a/configs/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py b/configs/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py
new file mode 100755
index 0000000..2010f44
--- /dev/null
+++ b/configs/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './mask_rcnn_r50_fpn_instaboost_4x_coco.py'
+model = dict(  # backbone override: ResNeXt-101, groups=64, base_width=4
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',  # NOTE(review): no norm_eval key here; intended?
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/instaboost/metafile.yml b/configs/instaboost/metafile.yml
new file mode 100755
index 0000000..325283d
--- /dev/null
+++ b/configs/instaboost/metafile.yml
@@ -0,0 +1,99 @@
+Collections:
+  - Name: InstaBoost
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - InstaBoost
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+    Paper:
+      URL: https://arxiv.org/abs/1908.07801
+      Title: 'InstaBoost: Boosting instance segmentation via probability map guided copy-pasting'
+    README: configs/instaboost/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/datasets/pipelines/instaboost.py#L7
+      Version: v2.0.0
+
+Models:
+  - Name: mask_rcnn_r50_fpn_instaboost_4x_coco
+    In Collection: InstaBoost
+    Config: configs/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco.py
+    Metadata:
+      Training Memory (GB): 4.4
+      inference time (ms/im):
+        - value: 57.14
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 48
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco/mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-d025f83a.pth
+
+  - Name: mask_rcnn_r101_fpn_instaboost_4x_coco
+    In Collection: InstaBoost
+    Config: configs/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      Epochs: 48
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco/mask_rcnn_r101_fpn_instaboost_4x_coco_20200703_235738-f23f3a5f.pth
+
+  - Name: mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco
+    In Collection: InstaBoost
+    Config: configs/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco.py
+    Metadata:
+      Training Memory (GB): 10.7
+      Epochs: 48
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco_20200515_080947-8ed58c1b.pth
+
+  - Name: cascade_mask_rcnn_r50_fpn_instaboost_4x_coco
+    In Collection: InstaBoost
+    Config: configs/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 83.33
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 48
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-c19d98d9.pth
diff --git a/configs/lad/README.md b/configs/lad/README.md
new file mode 100755
index 0000000..f2b7c20
--- /dev/null
+++ b/configs/lad/README.md
@@ -0,0 +1,44 @@
+# LAD
+
+> [Improving Object Detection by Label Assignment Distillation](https://arxiv.org/abs/2108.10520)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Label assignment in object detection aims to assign targets, foreground or background, to sampled regions in an image. Unlike labeling for image classification, this problem is not well defined due to the object's bounding box. In this paper, we investigate the problem from a perspective of distillation, hence we call Label Assignment Distillation (LAD). Our initial motivation is very simple, we use a teacher network to generate labels for the student. This can be achieved in two ways: either using the teacher's prediction as the direct targets (soft label), or through the hard labels dynamically assigned by the teacher (LAD). Our experiments reveal that: (i) LAD is more effective than soft-label, but they are complementary. (ii) Using LAD, a smaller teacher can also improve a larger student significantly, while soft-label can't. We then introduce Co-learning LAD, in which two networks simultaneously learn from scratch and the role of teacher and student are dynamically interchanged. Using PAA-ResNet50 as a teacher, our LAD techniques can improve detectors PAA-ResNet101 and PAA-ResNeXt101 to 46AP and 47.5AP on the COCO test-dev set. With a stronger teacher PAA-SwinB, we improve the students PAA-ResNet50 to 43.7AP by only 1x schedule training and standard setting, and PAA-ResNet101 to 47.9AP, significantly surpassing the current methods.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143894499-c2a3a243-988f-4604-915b-17918732bf03.png"/>
+</div>
+
+## Results and Models
+
+We provide config files to reproduce the object detection results in the
+WACV 2022 paper "Improving Object Detection by Label Assignment
+Distillation".
+
+### PAA with LAD
+
+| Teacher | Student | Training schedule | AP (val) |                                                   Config                                                    |                                                                                                                                               Download                                                                                                                                               |
+| :-----: | :-----: | :---------------: | :------: | :---------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   --    |  R-50   |        1x         |   40.4   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r50_fpn_1x_coco.py)      |                     [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.log.json)                      |
+|   --    |  R-101  |        1x         |   42.6   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r101_fpn_1x_coco.py)     |                   [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.log.json)                    |
+|  R-101  |  R-50   |        1x         |   41.4   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lad/lad_r50_paa_r101_fpn_coco_1x.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lad/lad_r50_paa_r101_fpn_coco_1x/lad_r50_paa_r101_fpn_coco_1x_20220708_124246-74c76ff0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lad/lad_r50_paa_r101_fpn_coco_1x/lad_r50_paa_r101_fpn_coco_1x_20220708_124246.log.json) |
+|  R-50   |  R-101  |        1x         |   43.2   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lad/lad_r101_paa_r50_fpn_coco_1x.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lad/lad_r101_paa_r50_fpn_coco_1x/lad_r101_paa_r50_fpn_coco_1x_20220708_124357-9407ac54.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lad/lad_r101_paa_r50_fpn_coco_1x/lad_r101_paa_r50_fpn_coco_1x_20220708_124357.log.json) |
+
+## Note
+
+- Meaning of Config name: lad_r50(student model)\_paa(based on paa)\_r101(teacher model)\_fpn(neck)\_coco(dataset)\_1x(12 epochs).py
+- Results may fluctuate by about 0.2 mAP.
+
+## Citation
+
+```latex
+@inproceedings{nguyen2021improving,
+  title={Improving Object Detection by Label Assignment Distillation},
+  author={Chuong H. Nguyen and Thuy C. Nguyen and Tuan N. Tang and Nam L. H. Phan},
+  booktitle = {WACV},
+  year={2022}
+}
+```
diff --git a/configs/lad/lad_r101_paa_r50_fpn_coco_1x.py b/configs/lad/lad_r101_paa_r50_fpn_coco_1x.py
new file mode 100755
index 0000000..4877d95
--- /dev/null
+++ b/configs/lad/lad_r101_paa_r50_fpn_coco_1x.py
@@ -0,0 +1,126 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+teacher_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.pth'  # noqa
+model = dict(
+    type='LAD',
+    # student
+    backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='LADHead',
+        reg_decoded_bbox=True,
+        score_voting=True,
+        topk=9,
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)),
+    # teacher
+    teacher_ckpt=teacher_ckpt,
+    teacher_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch'),
+    teacher_neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    teacher_bbox_head=dict(
+        type='LADHead',
+        reg_decoded_bbox=True,
+        score_voting=True,
+        topk=9,
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.1,
+            neg_iou_thr=0.1,
+            min_pos_iou=0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        score_voting=True,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+data = dict(samples_per_gpu=8, workers_per_gpu=4)
+optimizer = dict(lr=0.01)
+fp16 = dict(loss_scale=512.)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/lad/lad_r50_paa_r101_fpn_coco_1x.py b/configs/lad/lad_r50_paa_r101_fpn_coco_1x.py
new file mode 100755
index 0000000..29bbe69
--- /dev/null
+++ b/configs/lad/lad_r50_paa_r101_fpn_coco_1x.py
@@ -0,0 +1,125 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+teacher_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.pth'  # noqa
+model = dict(
+    type='LAD',
+    # student
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='LADHead',
+        reg_decoded_bbox=True,
+        score_voting=True,
+        topk=9,
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)),
+    # teacher
+    teacher_ckpt=teacher_ckpt,
+    teacher_backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch'),
+    teacher_neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    teacher_bbox_head=dict(
+        type='LADHead',
+        reg_decoded_bbox=True,
+        score_voting=True,
+        topk=9,
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.1,
+            neg_iou_thr=0.1,
+            min_pos_iou=0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        score_voting=True,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+data = dict(samples_per_gpu=8, workers_per_gpu=4)
+optimizer = dict(lr=0.01)
+fp16 = dict(loss_scale=512.)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/lad/metafile.yml b/configs/lad/metafile.yml
new file mode 100755
index 0000000..11a9fa9
--- /dev/null
+++ b/configs/lad/metafile.yml
@@ -0,0 +1,45 @@
+Collections:
+  - Name: Label Assignment Distillation
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - Label Assignment Distillation
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 2x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/2108.10520
+      Title: 'Improving Object Detection by Label Assignment Distillation'
+    README: configs/lad/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.19.0/mmdet/models/detectors/lad.py#L10
+      Version: v2.19.0
+
+Models:
+  - Name: lad_r101_paa_r50_fpn_coco_1x
+    In Collection: Label Assignment Distillation
+    Config: configs/lad/lad_r101_paa_r50_fpn_coco_1x.py
+    Metadata:
+      Training Memory (GB): 12.4
+      Epochs: 12
+    Results:
+    - Task: Object Detection
+      Dataset: COCO
+      Metrics:
+        box AP: 43.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/lad/lad_r101_paa_r50_fpn_coco_1x/lad_r101_paa_r50_fpn_coco_1x_20220708_124357-9407ac54.pth
+  - Name: lad_r50_paa_r101_fpn_coco_1x
+    In Collection: Label Assignment Distillation
+    Config: configs/lad/lad_r50_paa_r101_fpn_coco_1x.py
+    Metadata:
+      Training Memory (GB): 8.9
+      Epochs: 12
+    Results:
+    - Task: Object Detection
+      Dataset: COCO
+      Metrics:
+        box AP: 41.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/lad/lad_r50_paa_r101_fpn_coco_1x/lad_r50_paa_r101_fpn_coco_1x_20220708_124246-74c76ff0.pth
diff --git a/configs/ld/README.md b/configs/ld/README.md
new file mode 100755
index 0000000..0109729
--- /dev/null
+++ b/configs/ld/README.md
@@ -0,0 +1,43 @@
+# LD
+
+> [Localization Distillation for Dense Object Detection](https://arxiv.org/abs/2102.12252)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Knowledge distillation (KD) has witnessed its powerful capability in learning compact models in object detection. Previous KD methods for object detection mostly focus on imitating deep features within the imitation regions instead of mimicking classification logits due to its inefficiency in distilling localization information. In this paper, by reformulating the knowledge distillation process on localization, we present a novel localization distillation (LD) method which can efficiently transfer the localization knowledge from the teacher to the student. Moreover, we also heuristically introduce the concept of valuable localization region that can aid to selectively distill the semantic and localization knowledge for a certain region. Combining these two new components, for the first time, we show that logit mimicking can outperform feature imitation and localization knowledge distillation is more important and efficient than semantic knowledge for distilling object detectors. Our distillation scheme is simple as well as effective and can be easily applied to different dense object detectors. Experiments show that our LD can boost the AP score of GFocal-ResNet-50 with a single-scale 1× training schedule from 40.1 to 42.1 on the COCO benchmark without any sacrifice on the inference speed.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143966265-48a03668-8585-4525-8a86-afa2209d1602.png"/>
+</div>
+
+## Results and Models
+
+### GFocalV1 with LD
+
+|  Teacher  | Student | Training schedule | Mini-batch size | AP (val) |                                                     Config                                                      |                                                                                                                                                        Download                                                                                                                                                        |
+| :-------: | :-----: | :---------------: | :-------------: | :------: | :-------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    --     |  R-18   |        1x         |        6        |   35.8   |                                                                                                                 |                                                                                                                                                                                                                                                                                                                        |
+|   R-101   |  R-18   |        1x         |        6        |   36.5   |   [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ld/ld_r18_gflv1_r101_fpn_coco_1x.py)   |         [model](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r18_gflv1_r101_fpn_coco_1x/ld_r18_gflv1_r101_fpn_coco_1x_20220702_062206-330e6332.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r18_gflv1_r101_fpn_coco_1x/ld_r18_gflv1_r101_fpn_coco_1x_20220702_062206.log.json)         |
+|    --     |  R-34   |        1x         |        6        |   38.9   |                                                                                                                 |                                                                                                                                                                                                                                                                                                                        |
+|   R-101   |  R-34   |        1x         |        6        |   39.9   |   [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ld/ld_r34_gflv1_r101_fpn_coco_1x.py)   |         [model](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r34_gflv1_r101_fpn_coco_1x/ld_r34_gflv1_r101_fpn_coco_1x_20220630_134007-9bc69413.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r34_gflv1_r101_fpn_coco_1x/ld_r34_gflv1_r101_fpn_coco_1x_20220630_134007.log.json)         |
+|    --     |  R-50   |        1x         |        6        |   40.1   |                                                                                                                 |                                                                                                                                                                                                                                                                                                                        |
+|   R-101   |  R-50   |        1x         |        6        |   41.0   |   [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ld/ld_r50_gflv1_r101_fpn_coco_1x.py)   |         [model](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r50_gflv1_r101_fpn_coco_1x/ld_r50_gflv1_r101_fpn_coco_1x_20220629_145355-8dc5bad8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r50_gflv1_r101_fpn_coco_1x/ld_r50_gflv1_r101_fpn_coco_1x_20220629_145355.log.json)         |
+|    --     |  R-101  |        2x         |        6        |   44.6   |                                                                                                                 |                                                                                                                                                                                                                                                                                                                        |
+| R-101-DCN |  R-101  |        2x         |        6        |   45.5   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x/ld_r101_gflv1_r101dcn_fpn_coco_2x_20220629_185920-9e658426.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x/ld_r101_gflv1_r101dcn_fpn_coco_2x_20220629_185920.log.json) |
+
+## Note
+
+- Meaning of Config name: ld_r18(student model)\_gflv1(based on gflv1)\_r101(teacher model)\_fpn(neck)\_coco(dataset)\_1x(12 epochs).py
+
+## Citation
+
+```latex
+@Inproceedings{zheng2022LD,
+  title={Localization Distillation for Dense Object Detection},
+  author= {Zheng, Zhaohui and Ye, Rongguang and Wang, Ping and Ren, Dongwei and Zuo, Wangmeng and Hou, Qibin and Cheng, Mingming},
+  booktitle={CVPR},
+  year={2022}
+}
+```
diff --git a/configs/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x.py b/configs/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x.py
new file mode 100755
index 0000000..1cbdb4c
--- /dev/null
+++ b/configs/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x.py
@@ -0,0 +1,44 @@
+_base_ = ['./ld_r18_gflv1_r101_fpn_coco_1x.py']
+teacher_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20200630_102002-134b07df.pth'  # noqa
+model = dict(
+    teacher_config='configs/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py',
+    teacher_ckpt=teacher_ckpt,
+    backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5))
+
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+# multi-scale training
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 480), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+data = dict(train=dict(pipeline=train_pipeline))
diff --git a/configs/ld/ld_r18_gflv1_r101_fpn_coco_1x.py b/configs/ld/ld_r18_gflv1_r101_fpn_coco_1x.py
new file mode 100755
index 0000000..18dce81
--- /dev/null
+++ b/configs/ld/ld_r18_gflv1_r101_fpn_coco_1x.py
@@ -0,0 +1,62 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+teacher_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_mstrain_2x_coco/gfl_r101_fpn_mstrain_2x_coco_20200629_200126-dd12f847.pth'  # noqa
+model = dict(
+    type='KnowledgeDistillationSingleStageDetector',
+    teacher_config='configs/gfl/gfl_r101_fpn_mstrain_2x_coco.py',
+    teacher_ckpt=teacher_ckpt,
+    backbone=dict(
+        type='ResNet',
+        depth=18,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(
+        type='FPN',
+        in_channels=[64, 128, 256, 512],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='LDHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        loss_cls=dict(
+            type='QualityFocalLoss',
+            use_sigmoid=True,
+            beta=2.0,
+            loss_weight=1.0),
+        loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25),
+        loss_ld=dict(
+            type='KnowledgeDistillationKLDivLoss', loss_weight=0.25, T=10),
+        reg_max=16,
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/ld/ld_r34_gflv1_r101_fpn_coco_1x.py b/configs/ld/ld_r34_gflv1_r101_fpn_coco_1x.py
new file mode 100755
index 0000000..3b6996d
--- /dev/null
+++ b/configs/ld/ld_r34_gflv1_r101_fpn_coco_1x.py
@@ -0,0 +1,19 @@
+_base_ = ['./ld_r18_gflv1_r101_fpn_coco_1x.py']
+model = dict(
+    backbone=dict(
+        type='ResNet',
+        depth=34,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet34')),
+    neck=dict(
+        type='FPN',
+        in_channels=[64, 128, 256, 512],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5))
diff --git a/configs/ld/ld_r50_gflv1_r101_fpn_coco_1x.py b/configs/ld/ld_r50_gflv1_r101_fpn_coco_1x.py
new file mode 100755
index 0000000..2b18785
--- /dev/null
+++ b/configs/ld/ld_r50_gflv1_r101_fpn_coco_1x.py
@@ -0,0 +1,19 @@
+_base_ = ['./ld_r18_gflv1_r101_fpn_coco_1x.py']
+model = dict(
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5))
diff --git a/configs/ld/metafile.yml b/configs/ld/metafile.yml
new file mode 100755
index 0000000..2055e32
--- /dev/null
+++ b/configs/ld/metafile.yml
@@ -0,0 +1,69 @@
+Collections:
+  - Name: Localization Distillation
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - Localization Distillation
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/2102.12252
+      Title: 'Localization Distillation for Dense Object Detection'
+    README: configs/ld/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.11.0/mmdet/models/dense_heads/ld_head.py#L11
+      Version: v2.11.0
+
+Models:
+  - Name: ld_r18_gflv1_r101_fpn_coco_1x
+    In Collection: Localization Distillation
+    Config: configs/ld/ld_r18_gflv1_r101_fpn_coco_1x.py
+    Metadata:
+      Training Memory (GB): 1.8
+      Epochs: 12
+    Results:
+    - Task: Object Detection
+      Dataset: COCO
+      Metrics:
+        box AP: 36.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r18_gflv1_r101_fpn_coco_1x/ld_r18_gflv1_r101_fpn_coco_1x_20220702_062206-330e6332.pth
+  - Name: ld_r34_gflv1_r101_fpn_coco_1x
+    In Collection: Localization Distillation
+    Config: configs/ld/ld_r34_gflv1_r101_fpn_coco_1x.py
+    Metadata:
+      Training Memory (GB): 2.2
+      Epochs: 12
+    Results:
+    - Task: Object Detection
+      Dataset: COCO
+      Metrics:
+        box AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r34_gflv1_r101_fpn_coco_1x/ld_r34_gflv1_r101_fpn_coco_1x_20220630_134007-9bc69413.pth
+  - Name: ld_r50_gflv1_r101_fpn_coco_1x
+    In Collection: Localization Distillation
+    Config: configs/ld/ld_r50_gflv1_r101_fpn_coco_1x.py
+    Metadata:
+      Training Memory (GB): 3.6
+      Epochs: 12
+    Results:
+    - Task: Object Detection
+      Dataset: COCO
+      Metrics:
+        box AP: 41.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r50_gflv1_r101_fpn_coco_1x/ld_r50_gflv1_r101_fpn_coco_1x_20220629_145355-8dc5bad8.pth
+  - Name: ld_r101_gflv1_r101dcn_fpn_coco_2x
+    In Collection: Localization Distillation
+    Config: configs/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x.py
+    Metadata:
+      Training Memory (GB): 5.5
+      Epochs: 24
+    Results:
+    - Task: Object Detection
+      Dataset: COCO
+      Metrics:
+        box AP: 45.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x/ld_r101_gflv1_r101dcn_fpn_coco_2x_20220629_185920-9e658426.pth
diff --git a/configs/legacy_1.x/README.md b/configs/legacy_1.x/README.md
new file mode 100755
index 0000000..c48477f
--- /dev/null
+++ b/configs/legacy_1.x/README.md
@@ -0,0 +1,54 @@
+# Legacy Configs in MMDetection V1.x
+
+<!-- [OTHERS] -->
+
+Configs in this directory implement the legacy configs used by MMDetection V1.x and its model zoos.
+
+To help users convert their models from V1.x to MMDetection V2.0, we provide v1.x configs to run inference with the converted v1.x models.
+Due to the BC-breaking changes in MMDetection V2.0 from MMDetection V1.x, running inference with the same model weights in these two versions will produce different results. The difference is within 1% absolute AP, as can be found in the following table.
+
+## Usage
+
+To upgrade the model version, the users need to do the following steps.
+
+### 1. Convert model weights
+
+There are three main differences in the model weights between V1.x and V2.0 codebases.
+
+1. Since the class order in every detector's classification branch is reordered, all the legacy model weights need to go through the conversion process.
+2. The regression and segmentation heads no longer contain the background channel. Weights in these background channels should be removed to fit in the current codebase.
+3. For two-stage detectors, their weights need to be upgraded since MMDetection V2.0 refactors all the two-stage detectors with `RoIHead`.
+
+The users can do the same modification as mentioned above for the self-implemented
+detectors. We provide a script `tools/model_converters/upgrade_model_version.py` to convert the model weights in the V1.x model zoo.
+
+```bash
+python tools/model_converters/upgrade_model_version.py ${OLD_MODEL_PATH} ${NEW_MODEL_PATH} --num-classes ${NUM_CLASSES}
+
+```
+
+- OLD_MODEL_PATH: the path to load the model weights in 1.x version.
+- NEW_MODEL_PATH: the path to save the converted model weights in 2.0 version.
+- NUM_CLASSES: number of classes of the original model weights. Usually it is 81 for COCO dataset, 21 for VOC dataset.
+  The number of classes in V2.0 models should be equal to that in V1.x models - 1.
+
+### 2. Use configs with legacy settings
+
+After converting the model weights, check out the v1.2 release to find the corresponding config file that uses the legacy settings.
+The V1.x models usually need these three legacy modules: `LegacyAnchorGenerator`, `LegacyDeltaXYWHBBoxCoder`, and `RoIAlign(aligned=False)`.
+For models using ResNet Caffe backbones, they also need to change the pretrain name and the corresponding `img_norm_cfg`.
+An example is in [`retinanet_r50_caffe_fpn_1x_coco_v1.py`](retinanet_r50_caffe_fpn_1x_coco_v1.py).
+Then use the config to test the model weights. For most models, the obtained results should be close to those in V1.x.
+We provide configs of some common structures in this directory.
+
+## Performance
+
+The performance changes after converting the models in this directory are listed below.
+
+|           Method            |  Style  | Lr schd | V1.x box AP | V1.x mask AP | V2.0 box AP | V2.0 mask AP |                                                           Config                                                           |                                                             Download                                                              |
+| :-------------------------: | :-----: | :-----: | :---------: | :----------: | :---------: | :----------: | :------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------: |
+|     Mask R-CNN R-50-FPN     | pytorch |   1x    |    37.3     |     34.2     |    36.8     |     33.9     |     [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/legacy_1.x/mask_rcnn_r50_fpn_1x_coco_v1.py)     |     [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_1x_20181010-069fa190.pth)     |
+|     RetinaNet R-50-FPN      |  caffe  |   1x    |    35.8     |      -       |    35.4     |      -       |  [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/legacy_1.x/retinanet_r50_caffe_fpn_1x_coco_v1.py)  |                                                                                                                                   |
+|     RetinaNet R-50-FPN      | pytorch |   1x    |    35.6     |      -       |    35.2     |      -       |     [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/legacy_1.x/retinanet_r50_fpn_1x_coco_v1.py)     |     [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r50_fpn_1x_20181125-7b0c2548.pth)     |
+| Cascade Mask R-CNN R-50-FPN | pytorch |   1x    |    41.2     |     35.7     |    40.8     |     35.6     | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/legacy_1.x/cascade_mask_rcnn_r50_fpn_1x_coco_v1.py) | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r50_fpn_1x_20181123-88b170c9.pth) |
+|        SSD300-VGG16         |  caffe  |  120e   |    25.7     |      -       |    25.4     |      -       |            [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/legacy_1.x/ssd300_coco_v1.py)            | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ssd300_coco_vgg16_caffe_120e_20181221-84d7110b.pth) |
diff --git a/configs/legacy_1.x/cascade_mask_rcnn_r50_fpn_1x_coco_v1.py b/configs/legacy_1.x/cascade_mask_rcnn_r50_fpn_1x_coco_v1.py
new file mode 100755
index 0000000..fc9d004
--- /dev/null
+++ b/configs/legacy_1.x/cascade_mask_rcnn_r50_fpn_1x_coco_v1.py
@@ -0,0 +1,79 @@
+_base_ = [
+    '../_base_/models/cascade_mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='CascadeRCNN',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        anchor_generator=dict(type='LegacyAnchorGenerator', center_offset=0.5),
+        bbox_coder=dict(
+            type='LegacyDeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0])),
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(
+                type='RoIAlign',
+                output_size=7,
+                sampling_ratio=2,
+                aligned=False)),
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                reg_class_agnostic=True,
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='LegacyDeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2])),
+            dict(
+                type='Shared2FCBBoxHead',
+                reg_class_agnostic=True,
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='LegacyDeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1])),
+            dict(
+                type='Shared2FCBBoxHead',
+                reg_class_agnostic=True,
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='LegacyDeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067])),
+        ],
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(
+                type='RoIAlign',
+                output_size=14,
+                sampling_ratio=2,
+                aligned=False))))
+dist_params = dict(backend='nccl', port=29515)
diff --git a/configs/legacy_1.x/faster_rcnn_r50_fpn_1x_coco_v1.py b/configs/legacy_1.x/faster_rcnn_r50_fpn_1x_coco_v1.py
new file mode 100755
index 0000000..8c573be
--- /dev/null
+++ b/configs/legacy_1.x/faster_rcnn_r50_fpn_1x_coco_v1.py
@@ -0,0 +1,38 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    type='FasterRCNN',
+    backbone=dict(
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    rpn_head=dict(
+        type='RPNHead',
+        anchor_generator=dict(
+            type='LegacyAnchorGenerator',
+            center_offset=0.5,
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(
+                type='RoIAlign',
+                output_size=7,
+                sampling_ratio=2,
+                aligned=False),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn_proposal=dict(max_per_img=2000),
+        rcnn=dict(assigner=dict(match_low_quality=True))))
diff --git a/configs/legacy_1.x/mask_rcnn_r50_fpn_1x_coco_v1.py b/configs/legacy_1.x/mask_rcnn_r50_fpn_1x_coco_v1.py
new file mode 100755
index 0000000..04581bb
--- /dev/null
+++ b/configs/legacy_1.x/mask_rcnn_r50_fpn_1x_coco_v1.py
@@ -0,0 +1,34 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    rpn_head=dict(
+        anchor_generator=dict(type='LegacyAnchorGenerator', center_offset=0.5),
+        bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(
+                type='RoIAlign',
+                output_size=7,
+                sampling_ratio=2,
+                aligned=False)),
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(
+                type='RoIAlign',
+                output_size=14,
+                sampling_ratio=2,
+                aligned=False)),
+        bbox_head=dict(
+            bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+
+    # model training and testing settings
+    train_cfg=dict(
+        rpn_proposal=dict(max_per_img=2000),
+        rcnn=dict(assigner=dict(match_low_quality=True))))
diff --git a/configs/legacy_1.x/retinanet_r50_caffe_fpn_1x_coco_v1.py b/configs/legacy_1.x/retinanet_r50_caffe_fpn_1x_coco_v1.py
new file mode 100755
index 0000000..a63d248
--- /dev/null
+++ b/configs/legacy_1.x/retinanet_r50_caffe_fpn_1x_coco_v1.py
@@ -0,0 +1,41 @@
+_base_ = './retinanet_r50_fpn_1x_coco_v1.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron/resnet50_caffe')))
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/legacy_1.x/retinanet_r50_fpn_1x_coco_v1.py b/configs/legacy_1.x/retinanet_r50_fpn_1x_coco_v1.py
new file mode 100755
index 0000000..6198b97
--- /dev/null
+++ b/configs/legacy_1.x/retinanet_r50_fpn_1x_coco_v1.py
@@ -0,0 +1,17 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    bbox_head=dict(
+        type='RetinaHead',
+        anchor_generator=dict(
+            type='LegacyAnchorGenerator',
+            center_offset=0.5,
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'),
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0)))
diff --git a/configs/legacy_1.x/ssd300_coco_v1.py b/configs/legacy_1.x/ssd300_coco_v1.py
new file mode 100755
index 0000000..65ccc1e
--- /dev/null
+++ b/configs/legacy_1.x/ssd300_coco_v1.py
@@ -0,0 +1,84 @@
+_base_ = [
+    '../_base_/models/ssd300.py', '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+# model settings
+input_size = 300
+model = dict(
+    bbox_head=dict(
+        type='SSDHead',
+        anchor_generator=dict(
+            type='LegacySSDAnchorGenerator',
+            scale_major=False,
+            input_size=input_size,
+            basesize_ratio_range=(0.15, 0.9),
+            strides=[8, 16, 32, 64, 100, 300],
+            ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]),
+        bbox_coder=dict(
+            type='LegacyDeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2])))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', img_scale=(300, 300), keep_ratio=False),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(300, 300),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=False),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=3,
+    train=dict(
+        _delete_=True,
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root + 'annotations/instances_train2017.json',
+            img_prefix=data_root + 'train2017/',
+            pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4)
+optimizer_config = dict(_delete_=True)
+dist_params = dict(backend='nccl', port=29555)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/libra_rcnn/README.md b/configs/libra_rcnn/README.md
new file mode 100755
index 0000000..87a128a
--- /dev/null
+++ b/configs/libra_rcnn/README.md
@@ -0,0 +1,53 @@
+# Libra R-CNN
+
+> [Libra R-CNN: Towards Balanced Learning for Object Detection](https://arxiv.org/abs/1904.02701)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Compared with model architectures, the training process, which is also crucial to the success of detectors, has received relatively less attention in object detection. In this work, we carefully revisit the standard training practice of detectors, and find that the detection performance is often limited by the imbalance during the training process, which generally consists in three levels - sample level, feature level, and objective level. To mitigate the adverse effects caused thereby, we propose Libra R-CNN, a simple but effective framework towards balanced learning for object detection. It integrates three novel components: IoU-balanced sampling, balanced feature pyramid, and balanced L1 loss, respectively for reducing the imbalance at sample, feature, and objective level. Benefitted from the overall balanced design, Libra R-CNN significantly improves the detection performance. Without bells and whistles, it achieves 2.5 points and 2.0 points higher Average Precision (AP) than FPN Faster R-CNN and RetinaNet respectively on MSCOCO.
+
+Instance recognition is rapidly advanced along with the developments of various deep convolutional neural networks. Compared to the architectures of networks, the training process, which is also crucial to the success of detectors, has received relatively less attention. In this work, we carefully revisit the standard training practice of detectors, and find that the detection performance is often limited by the imbalance during the training process, which generally consists in three levels - sample level, feature level, and objective level. To mitigate the adverse effects caused thereby, we propose Libra R-CNN, a simple yet effective framework towards balanced learning for instance recognition. It integrates IoU-balanced sampling, balanced feature pyramid, and objective re-weighting, respectively for reducing the imbalance at sample, feature, and objective level. Extensive experiments conducted on MS COCO, LVIS and Pascal VOC datasets prove the effectiveness of the overall balanced design.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143966392-2633684c-a67a-4269-b71b-afe945c67bcd.png"/>
+</div>
+
+## Results and Models
+
+The results on COCO 2017val are shown in the table below. (results on test-dev are usually slightly higher than val)
+
+| Architecture |    Backbone     |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                             Config                                                             |                                                                                                                                                                          Download                                                                                                                                                                           |
+| :----------: | :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :----------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| Faster R-CNN |    R-50-FPN     | pytorch |   1x    |   4.6    |      19.0      |  38.3  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco/libra_faster_rcnn_r50_fpn_1x_coco_20200130-3afee3a9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco/libra_faster_rcnn_r50_fpn_1x_coco_20200130_204655.log.json)               |
+|  Fast R-CNN  |    R-50-FPN     | pytorch |   1x    |          |                |        |                                                                                                                                |                                                                                                                                                                                                                                                                                                                                                             |
+| Faster R-CNN |    R-101-FPN    | pytorch |   1x    |   6.5    |      14.4      |  40.1  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco/libra_faster_rcnn_r101_fpn_1x_coco_20200203-8dba6a5a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco/libra_faster_rcnn_r101_fpn_1x_coco_20200203_001405.log.json)             |
+| Faster R-CNN | X-101-64x4d-FPN | pytorch |   1x    |   10.8   |      8.5       |  42.7  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco/libra_faster_rcnn_x101_64x4d_fpn_1x_coco_20200315-3a7d0488.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco/libra_faster_rcnn_x101_64x4d_fpn_1x_coco_20200315_231625.log.json) |
+|  RetinaNet   |    R-50-FPN     | pytorch |   1x    |   4.2    |      17.7      |  37.6  |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/libra_rcnn/libra_retinanet_r50_fpn_1x_coco.py)      |                   [model](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_retinanet_r50_fpn_1x_coco/libra_retinanet_r50_fpn_1x_coco_20200205-804d94ce.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_retinanet_r50_fpn_1x_coco/libra_retinanet_r50_fpn_1x_coco_20200205_112757.log.json)                   |
+
+## Citation
+
+We provide config files to reproduce the results in the CVPR 2019 paper [Libra R-CNN](https://arxiv.org/pdf/1904.02701.pdf).
+
+The extended version of [Libra R-CNN](https://arxiv.org/pdf/2108.10175.pdf) is accepted by IJCV.
+
+```latex
+@inproceedings{pang2019libra,
+  title={Libra R-CNN: Towards Balanced Learning for Object Detection},
+  author={Pang, Jiangmiao and Chen, Kai and Shi, Jianping and Feng, Huajun and Ouyang, Wanli and Dahua Lin},
+  booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
+  year={2019}
+}
+
+@article{pang2021towards,
+  title={Towards Balanced Learning for Instance Recognition},
+  author={Pang, Jiangmiao and Chen, Kai and Li, Qi and Xu, Zhihai and Feng, Huajun and Shi, Jianping and Ouyang, Wanli and Lin, Dahua},
+  journal={International Journal of Computer Vision},
+  volume={129},
+  number={5},
+  pages={1376--1393},
+  year={2021},
+  publisher={Springer}
+}
+```
diff --git a/configs/libra_rcnn/libra_fast_rcnn_r50_fpn_1x_coco.py b/configs/libra_rcnn/libra_fast_rcnn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..efbedc8
--- /dev/null
+++ b/configs/libra_rcnn/libra_fast_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,50 @@
+_base_ = '../fast_rcnn/fast_rcnn_r50_fpn_1x_coco.py'
+# model settings
+model = dict(
+    neck=[
+        dict(
+            type='FPN',
+            in_channels=[256, 512, 1024, 2048],
+            out_channels=256,
+            num_outs=5),
+        dict(
+            type='BFP',
+            in_channels=256,
+            num_levels=5,
+            refine_level=2,
+            refine_type='non_local')
+    ],
+    roi_head=dict(
+        bbox_head=dict(
+            loss_bbox=dict(
+                _delete_=True,
+                type='BalancedL1Loss',
+                alpha=0.5,
+                gamma=1.5,
+                beta=1.0,
+                loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rcnn=dict(
+            sampler=dict(
+                _delete_=True,
+                type='CombinedSampler',
+                num=512,
+                pos_fraction=0.25,
+                add_gt_as_proposals=True,
+                pos_sampler=dict(type='InstanceBalancedPosSampler'),
+                neg_sampler=dict(
+                    type='IoUBalancedNegSampler',
+                    floor_thr=-1,
+                    floor_fraction=0,
+                    num_bins=3)))))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+data = dict(
+    train=dict(proposal_file=data_root +
+               'libra_proposals/rpn_r50_fpn_1x_train2017.pkl'),
+    val=dict(proposal_file=data_root +
+             'libra_proposals/rpn_r50_fpn_1x_val2017.pkl'),
+    test=dict(proposal_file=data_root +
+              'libra_proposals/rpn_r50_fpn_1x_val2017.pkl'))
diff --git a/configs/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco.py b/configs/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..e899706
--- /dev/null
+++ b/configs/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './libra_faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco.py b/configs/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..89a0d7b
--- /dev/null
+++ b/configs/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,41 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+# model settings
+model = dict(
+    neck=[
+        dict(
+            type='FPN',
+            in_channels=[256, 512, 1024, 2048],
+            out_channels=256,
+            num_outs=5),
+        dict(
+            type='BFP',
+            in_channels=256,
+            num_levels=5,
+            refine_level=2,
+            refine_type='non_local')
+    ],
+    roi_head=dict(
+        bbox_head=dict(
+            loss_bbox=dict(
+                _delete_=True,
+                type='BalancedL1Loss',
+                alpha=0.5,
+                gamma=1.5,
+                beta=1.0,
+                loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(sampler=dict(neg_pos_ub=5), allowed_border=-1),
+        rcnn=dict(
+            sampler=dict(
+                _delete_=True,
+                type='CombinedSampler',
+                num=512,
+                pos_fraction=0.25,
+                add_gt_as_proposals=True,
+                pos_sampler=dict(type='InstanceBalancedPosSampler'),
+                neg_sampler=dict(
+                    type='IoUBalancedNegSampler',
+                    floor_thr=-1,
+                    floor_fraction=0,
+                    num_bins=3)))))
diff --git a/configs/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco.py b/configs/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..06740a7
--- /dev/null
+++ b/configs/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './libra_faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/libra_rcnn/libra_retinanet_r50_fpn_1x_coco.py b/configs/libra_rcnn/libra_retinanet_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..be27420
--- /dev/null
+++ b/configs/libra_rcnn/libra_retinanet_r50_fpn_1x_coco.py
@@ -0,0 +1,26 @@
+_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py'
+# model settings
+model = dict(
+    neck=[
+        dict(
+            type='FPN',
+            in_channels=[256, 512, 1024, 2048],
+            out_channels=256,
+            start_level=1,
+            add_extra_convs='on_input',
+            num_outs=5),
+        dict(
+            type='BFP',
+            in_channels=256,
+            num_levels=5,
+            refine_level=1,
+            refine_type='non_local')
+    ],
+    bbox_head=dict(
+        loss_bbox=dict(
+            _delete_=True,
+            type='BalancedL1Loss',
+            alpha=0.5,
+            gamma=1.5,
+            beta=0.11,
+            loss_weight=1.0)))
diff --git a/configs/libra_rcnn/metafile.yml b/configs/libra_rcnn/metafile.yml
new file mode 100755
index 0000000..8c32795
--- /dev/null
+++ b/configs/libra_rcnn/metafile.yml
@@ -0,0 +1,99 @@
+Collections:
+  - Name: Libra R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - IoU-Balanced Sampling
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Balanced Feature Pyramid
+    Paper:
+      URL: https://arxiv.org/abs/1904.02701
+      Title: 'Libra R-CNN: Towards Balanced Learning for Object Detection'
+    README: configs/libra_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/necks/bfp.py#L10
+      Version: v2.0.0
+
+Models:
+  - Name: libra_faster_rcnn_r50_fpn_1x_coco
+    In Collection: Libra R-CNN
+    Config: configs/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.6
+      inference time (ms/im):
+        - value: 52.63
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco/libra_faster_rcnn_r50_fpn_1x_coco_20200130-3afee3a9.pth
+
+  - Name: libra_faster_rcnn_r101_fpn_1x_coco
+    In Collection: Libra R-CNN
+    Config: configs/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.5
+      inference time (ms/im):
+        - value: 69.44
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco/libra_faster_rcnn_r101_fpn_1x_coco_20200203-8dba6a5a.pth
+
+  - Name: libra_faster_rcnn_x101_64x4d_fpn_1x_coco
+    In Collection: Libra R-CNN
+    Config: configs/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.8
+      inference time (ms/im):
+        - value: 117.65
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco/libra_faster_rcnn_x101_64x4d_fpn_1x_coco_20200315-3a7d0488.pth
+
+  - Name: libra_retinanet_r50_fpn_1x_coco
+    In Collection: Libra R-CNN
+    Config: configs/libra_rcnn/libra_retinanet_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.2
+      inference time (ms/im):
+        - value: 56.5
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_retinanet_r50_fpn_1x_coco/libra_retinanet_r50_fpn_1x_coco_20200205-804d94ce.pth
diff --git a/configs/lvis/README.md b/configs/lvis/README.md
new file mode 100755
index 0000000..0c2760e
--- /dev/null
+++ b/configs/lvis/README.md
@@ -0,0 +1,56 @@
+# LVIS
+
+> [LVIS: A Dataset for Large Vocabulary Instance Segmentation](https://arxiv.org/abs/1908.03195)
+
+<!-- [DATASET] -->
+
+## Abstract
+
+Progress on object detection is enabled by datasets that focus the research community's attention on open challenges. This process led us from simple images to complex scenes and from bounding boxes to segmentation masks. In this work, we introduce LVIS (pronounced \`el-vis'): a new dataset for Large Vocabulary Instance Segmentation. We plan to collect ~2 million high-quality instance segmentation masks for over 1000 entry-level object categories in 164k images. Due to the Zipfian distribution of categories in natural images, LVIS naturally has a long tail of categories with few training samples. Given that state-of-the-art deep learning methods for object detection perform poorly in the low-sample regime, we believe that our dataset poses an important and exciting new scientific challenge.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143967423-85b9b705-05ea-4bbc-9a41-eccc14240c7a.png" height="300"/>
+</div>
+
+## Common Setting
+
+- Please follow the [install guide](../../docs/get_started.md#install-mmdetection) to install the open-mmlab fork of cocoapi first.
+
+- Run the following command to install lvis-api.
+
+  ```shell
+  pip install git+https://github.com/lvis-dataset/lvis-api.git
+  ```
+
+- All experiments use the oversampling strategy described [here](../../docs/tutorials/customize_dataset.md#class-balanced-dataset) with an oversample threshold of `1e-3`.
+
+- LVIS v0.5 is about half the size of COCO, so the `2x` schedule on LVIS runs for roughly the same number of iterations as the `1x` schedule on COCO.
+
+## Results and models of LVIS v0.5
+
+|    Backbone     |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                                  Config                                                                  |                                                                                                                                                                                      Download                                                                                                                                                                                      |
+| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :--------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50-FPN     | pytorch |   2x    |    -     |       -        |  26.1  |  25.9   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis-dbd06831.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_20200531_160435.log.json)               |
+|    R-101-FPN    | pytorch |   2x    |    -     |       -        |  27.1  |  27.0   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis-54582ee2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis_20200601_134748.log.json)             |
+| X-101-32x4d-FPN | pytorch |   2x    |    -     |       -        |  26.7  |  26.9   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis-3cf55ea2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis_20200531_221749.log.json) |
+| X-101-64x4d-FPN | pytorch |   2x    |    -     |       -        |  26.4  |  26.0   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis-1c99a5ad.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis_20200601_194651.log.json) |
+
+## Results and models of LVIS v1
+
+|    Backbone     |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                                 Config                                                                 |                                                                                                                                                                                            Download                                                                                                                                                                                            |
+| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50-FPN     | pytorch |   1x    |   9.1    |       -        |  22.5  |  21.7   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1-aa78ac3d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1-20200829_061305.log.json)               |
+|    R-101-FPN    | pytorch |   1x    |   10.8   |       -        |  24.6  |  23.6   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1-ec55ce32.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1-20200829_070959.log.json)             |
+| X-101-32x4d-FPN | pytorch |   1x    |   11.8   |       -        |  26.7  |  25.5   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-ebbc5c81.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-20200829_071317.log.json) |
+| X-101-64x4d-FPN | pytorch |   1x    |   14.6   |       -        |  27.2  |  25.8   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-43d9edfe.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-20200830_060206.log.json) |
+
+## Citation
+
+```latex
+@inproceedings{gupta2019lvis,
+  title={{LVIS}: A Dataset for Large Vocabulary Instance Segmentation},
+  author={Gupta, Agrim and Doll{\'a}r, Piotr and Girshick, Ross},
+  booktitle={Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
+  year={2019}
+}
+```
diff --git a/configs/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1.py b/configs/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1.py
new file mode 100755
index 0000000..0f017f5
--- /dev/null
+++ b/configs/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1.py
@@ -0,0 +1,6 @@
+_base_ = './mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py b/configs/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py
new file mode 100755
index 0000000..637f4a6
--- /dev/null
+++ b/configs/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py
@@ -0,0 +1,6 @@
+_base_ = './mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1.py b/configs/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1.py
new file mode 100755
index 0000000..92ddb52
--- /dev/null
+++ b/configs/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1.py
@@ -0,0 +1,31 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/lvis_v1_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(num_classes=1203), mask_head=dict(num_classes=1203)),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.0001,
+            # LVIS allows up to 300
+            max_per_img=300)))
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+data = dict(train=dict(dataset=dict(pipeline=train_pipeline)))
diff --git a/configs/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py b/configs/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py
new file mode 100755
index 0000000..d53c5dc
--- /dev/null
+++ b/configs/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py
@@ -0,0 +1,31 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/lvis_v0.5_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(num_classes=1230), mask_head=dict(num_classes=1230)),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.0001,
+            # LVIS allows up to 300
+            max_per_img=300)))
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+data = dict(train=dict(dataset=dict(pipeline=train_pipeline)))
diff --git a/configs/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py b/configs/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py
new file mode 100755
index 0000000..a6115c1
--- /dev/null
+++ b/configs/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py
@@ -0,0 +1,14 @@
+_base_ = './mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py b/configs/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py
new file mode 100755
index 0000000..96b6252
--- /dev/null
+++ b/configs/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py
@@ -0,0 +1,14 @@
+_base_ = './mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py b/configs/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py
new file mode 100755
index 0000000..0f95a73
--- /dev/null
+++ b/configs/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1.py
@@ -0,0 +1,14 @@
+_base_ = './mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py b/configs/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py
new file mode 100755
index 0000000..986acda
--- /dev/null
+++ b/configs/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py
@@ -0,0 +1,14 @@
+_base_ = './mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_v0.5.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/mask2former/README.md b/configs/mask2former/README.md
new file mode 100755
index 0000000..ebce50d
--- /dev/null
+++ b/configs/mask2former/README.md
@@ -0,0 +1,73 @@
+# Mask2Former
+
+> [Masked-attention Mask Transformer for Universal Image Segmentation](http://arxiv.org/abs/2112.01527)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Image segmentation is about grouping pixels with different semantics, e.g., category or instance membership, where each choice of semantics defines a task. While only the semantics of each task differ, current research focuses on designing specialized architectures for each task. We present Masked-attention Mask Transformer (Mask2Former), a new architecture capable of addressing any image segmentation task (panoptic, instance or semantic). Its key components include masked attention, which extracts localized features by constraining cross-attention within predicted mask regions. In addition to reducing the research effort by at least three times, it outperforms the best specialized architectures by a significant margin on four popular datasets. Most notably, Mask2Former sets a new state-of-the-art for panoptic segmentation (57.8 PQ on COCO), instance segmentation (50.1 AP on COCO) and semantic segmentation (57.7 mIoU on ADE20K).
+
+<div align=center>
+<img src="https://camo.githubusercontent.com/455d3116845b1d580b1f8a8542334b9752fdf39364deee2951cdd231524c7725/68747470733a2f2f626f77656e63303232312e6769746875622e696f2f696d616765732f6d61736b666f726d657276325f7465617365722e706e67" height="300"/>
+</div>
+
+## Introduction
+
+Mask2Former requires the COCO and [COCO-panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) datasets for training and evaluation. You need to download and extract them into the COCO dataset path.
+The directory structure should look like this:
+
+```none
+mmdetection
+├── mmdet
+├── tools
+├── configs
+├── data
+│   ├── coco
+│   │   ├── annotations
+│   │   │   ├── instances_train2017.json
+│   │   │   ├── instances_val2017.json
+│   │   │   ├── panoptic_train2017.json
+│   │   │   ├── panoptic_train2017
+│   │   │   ├── panoptic_val2017.json
+│   │   │   ├── panoptic_val2017
+│   │   ├── train2017
+│   │   ├── val2017
+│   │   ├── test2017
+```
+
+## Results and Models
+
+### Panoptic segmentation
+
+| Backbone |  style  |   Pretrain   | Lr schd | Mem (GB) | Inf time (fps) |  PQ  | box mAP | mask mAP |                                                                         Config                                                                         |                                                                                                                                                                                                                             Download                                                                                                                                                                                                                             |
+| :------: | :-----: | :----------: | :-----: | :------: | :------------: | :--: | :-----: | :------: | :----------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   | pytorch | ImageNet-1K  |   50e   |   13.9   |       -        | 51.9 |  44.8   |   41.9   |            [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic.py)            |                                             [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic/mask2former_r50_lsj_8x2_50e_coco-panoptic_20220326_224516-11a44721.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic/mask2former_r50_lsj_8x2_50e_coco-panoptic_20220326_224516.log.json)                                             |
+|  R-101   | pytorch | ImageNet-1K  |   50e   |   16.1   |       -        | 52.4 |  45.3   |   42.4   |           [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic.py)            |                                           [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic/mask2former_r101_lsj_8x2_50e_coco-panoptic_20220329_225104-c54e64c9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic/mask2former_r101_lsj_8x2_50e_coco-panoptic_20220329_225104.log.json)                                           |
+|  Swin-T  |    -    | ImageNet-1K  |   50e   |   15.9   |       -        | 53.4 |  46.3   |   43.4   |     [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py)      |                   [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic_20220326_224553-fc567107.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic_20220326_224553.log.json)                   |
+|  Swin-S  |    -    | ImageNet-1K  |   50e   |   19.1   |       -        | 54.5 |  47.8   |   44.5   |     [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic.py)      |                   [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic_20220329_225200-c7b94355.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic_20220329_225200.log.json)                   |
+|  Swin-B  |    -    | ImageNet-1K  |   50e   |   26.0   |       -        | 55.1 |  48.2   |   44.9   |     [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py)     |                 [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic_20220331_002244-c149a9e9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic_20220331_002244.log.json)                 |
+|  Swin-B  |    -    | ImageNet-21K |   50e   |   25.8   |       -        | 56.3 |  50.0   |   46.3   |  [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic.py)  |     [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic_20220329_230021-3bb8b482.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic_20220329_230021.log.json)     |
+|  Swin-L  |    -    | ImageNet-21K |  100e   |   21.1   |       -        | 57.6 |  52.2   |   48.5   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic_20220407_104949-d4919c44.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic_20220407_104949.log.json) |
+
+### Instance segmentation
+
+| Backbone | style   | Pretrain    | Lr schd | Mem (GB) | Inf time (fps) | box mAP | mask mAP | Config                                                                                                                               | Download                                                                                                                                                                                                                                                                                                                                                                                 |
+| -------- | ------- | ----------- | ------- | -------- | -------------- | ------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| R-50     | pytorch | ImageNet-1K | 50e     | 13.7     | -              | 45.7    | 42.9     | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py)              | [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r50_lsj_8x2_50e_coco/mask2former_r50_lsj_8x2_50e_coco_20220506_191028-8e96e88b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r50_lsj_8x2_50e_coco/mask2former_r50_lsj_8x2_50e_coco_20220506_191028.log.json)                                                     |
+| R-101    | pytorch | ImageNet-1K | 50e     | 15.5     | -              | 46.7    | 44.0     | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py)             | [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r101_lsj_8x2_50e_coco/mask2former_r101_lsj_8x2_50e_coco_20220426_100250-c50b6fa6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r101_lsj_8x2_50e_coco/mask2former_r101_lsj_8x2_50e_coco_20220426_100250.log.json)                                                 |
+| Swin-T   | -       | ImageNet-1K | 50e     | 15.3     | -              | 47.7    | 44.7     | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_20220508_091649-4a943037.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_20220508_091649.log.json) |
+| Swin-S   | -       | ImageNet-1K | 50e     | 18.8     | -              | 49.3    | 46.1     | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_20220504_001756-743b7d99.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_20220504_001756.log.json) |
+
+Note: We have trained the instance segmentation models many times (see more details in [PR 7571](https://github.com/open-mmlab/mmdetection/pull/7571)). The results of the trained models are relatively stable (±0.2 AP), but there is a gap of about 0.2 AP compared with the results in the [paper](http://arxiv.org/abs/2112.01527). However, the performance of the model trained with the official code is unstable and may also be slightly lower than the reported results, as mentioned in this [issue](https://github.com/facebookresearch/Mask2Former/issues/46).
+
+## Citation
+
+```latex
+@article{cheng2021mask2former,
+  title={Masked-attention Mask Transformer for Universal Image Segmentation},
+  author={Bowen Cheng and Ishan Misra and Alexander G. Schwing and Alexander Kirillov and Rohit Girdhar},
+  journal={arXiv},
+  year={2021}
+}
+```
diff --git a/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic.py b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic.py
new file mode 100755
index 0000000..33fdde6
--- /dev/null
+++ b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic.py
@@ -0,0 +1,7 @@
+_base_ = './mask2former_r50_lsj_8x2_50e_coco-panoptic.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py
new file mode 100755
index 0000000..5543fb0
--- /dev/null
+++ b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py
@@ -0,0 +1,7 @@
+_base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py']
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic.py
new file mode 100755
index 0000000..2c23625
--- /dev/null
+++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic.py
@@ -0,0 +1,253 @@
+_base_ = [
+    '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py'
+]
+num_things_classes = 80
+num_stuff_classes = 53
+num_classes = num_things_classes + num_stuff_classes
+model = dict(
+    type='Mask2Former',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    panoptic_head=dict(
+        type='Mask2FormerHead',
+        in_channels=[256, 512, 1024, 2048],  # pass to pixel_decoder inside
+        strides=[4, 8, 16, 32],
+        feat_channels=256,
+        out_channels=256,
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
+        num_queries=100,
+        num_transformer_feat_level=3,
+        pixel_decoder=dict(
+            type='MSDeformAttnPixelDecoder',
+            num_outs=3,
+            norm_cfg=dict(type='GN', num_groups=32),
+            act_cfg=dict(type='ReLU'),
+            encoder=dict(
+                type='DetrTransformerEncoder',
+                num_layers=6,
+                transformerlayers=dict(
+                    type='BaseTransformerLayer',
+                    attn_cfgs=dict(
+                        type='MultiScaleDeformableAttention',
+                        embed_dims=256,
+                        num_heads=8,
+                        num_levels=3,
+                        num_points=4,
+                        im2col_step=64,
+                        dropout=0.0,
+                        batch_first=False,
+                        norm_cfg=None,
+                        init_cfg=None),
+                    ffn_cfgs=dict(
+                        type='FFN',
+                        embed_dims=256,
+                        feedforward_channels=1024,
+                        num_fcs=2,
+                        ffn_drop=0.0,
+                        act_cfg=dict(type='ReLU', inplace=True)),
+                    operation_order=('self_attn', 'norm', 'ffn', 'norm')),
+                init_cfg=None),
+            positional_encoding=dict(
+                type='SinePositionalEncoding', num_feats=128, normalize=True),
+            init_cfg=None),
+        enforce_decoder_input_project=False,
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True),
+        transformer_decoder=dict(
+            type='DetrTransformerDecoder',
+            return_intermediate=True,
+            num_layers=9,
+            transformerlayers=dict(
+                type='DetrTransformerDecoderLayer',
+                attn_cfgs=dict(
+                    type='MultiheadAttention',
+                    embed_dims=256,
+                    num_heads=8,
+                    attn_drop=0.0,
+                    proj_drop=0.0,
+                    dropout_layer=None,
+                    batch_first=False),
+                ffn_cfgs=dict(
+                    embed_dims=256,
+                    feedforward_channels=2048,
+                    num_fcs=2,
+                    act_cfg=dict(type='ReLU', inplace=True),
+                    ffn_drop=0.0,
+                    dropout_layer=None,
+                    add_identity=True),
+                feedforward_channels=2048,
+                operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
+                                 'ffn', 'norm')),
+            init_cfg=None),
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            loss_weight=2.0,
+            reduction='mean',
+            class_weight=[1.0] * num_classes + [0.1]),
+        loss_mask=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            reduction='mean',
+            loss_weight=5.0),
+        loss_dice=dict(
+            type='DiceLoss',
+            use_sigmoid=True,
+            activate=True,
+            reduction='mean',
+            naive_dice=True,
+            eps=1.0,
+            loss_weight=5.0)),
+    panoptic_fusion_head=dict(
+        type='MaskFormerFusionHead',
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
+        loss_panoptic=None,
+        init_cfg=None),
+    train_cfg=dict(
+        num_points=12544,
+        oversample_ratio=3.0,
+        importance_sample_ratio=0.75,
+        assigner=dict(
+            type='MaskHungarianAssigner',
+            cls_cost=dict(type='ClassificationCost', weight=2.0),
+            mask_cost=dict(
+                type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True),
+            dice_cost=dict(
+                type='DiceCost', weight=5.0, pred_act=True, eps=1.0)),
+        sampler=dict(type='MaskPseudoSampler')),
+    test_cfg=dict(
+        panoptic_on=True,
+        # For now, the dataset does not support
+        # evaluating semantic segmentation metric.
+        semantic_on=False,
+        instance_on=True,
+        # max_per_image is for instance segmentation.
+        max_per_image=100,
+        iou_thr=0.8,
+        # In Mask2Former's panoptic postprocessing,
+        # mask areas whose score is less than 0.5 are filtered out.
+        filter_low_score=True),
+    init_cfg=None)
+
+# dataset settings
+image_size = (1024, 1024)
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(
+        type='LoadPanopticAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        with_seg=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    # large scale jittering
+    dict(
+        type='Resize',
+        img_scale=image_size,
+        ratio_range=(0.1, 2.0),
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_size=image_size,
+        crop_type='absolute',
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=image_size),
+    dict(type='DefaultFormatBundle', img_to_float=True),
+    dict(
+        type='Collect',
+        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data_root = 'data/coco/'
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(pipeline=train_pipeline),
+    val=dict(
+        pipeline=test_pipeline,
+        ins_ann_file=data_root + 'annotations/instances_val2017.json',
+    ),
+    test=dict(
+        pipeline=test_pipeline,
+        ins_ann_file=data_root + 'annotations/instances_val2017.json',
+    ))
+
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+# optimizer
+optimizer = dict(
+    type='AdamW',
+    lr=0.0001,
+    weight_decay=0.05,
+    eps=1e-8,
+    betas=(0.9, 0.999),
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+            'query_embed': embed_multi,
+            'query_feat': embed_multi,
+            'level_embed': embed_multi,
+        },
+        norm_decay_mult=0.0))
+optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2))
+
+# learning policy
+lr_config = dict(
+    policy='step',
+    gamma=0.1,
+    by_epoch=False,
+    step=[327778, 355092],
+    warmup='linear',
+    warmup_by_epoch=False,
+    warmup_ratio=1.0,  # no warmup
+    warmup_iters=10)
+
+max_iters = 368750
+runner = dict(type='IterBasedRunner', max_iters=max_iters)
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook', by_epoch=False),
+        dict(type='TensorboardLoggerHook', by_epoch=False)
+    ])
+interval = 5000
+workflow = [('train', interval)]
+checkpoint_config = dict(
+    by_epoch=False, interval=interval, save_last=True, max_keep_ckpts=3)
+
+# Before the 365001st iteration, we evaluate every 5000 iterations.
+# From the 365001st iteration on, the evaluation interval is 368750 iterations,
+# which means that we evaluate only at the end of training.
+dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)]
+evaluation = dict(
+    interval=interval,
+    dynamic_intervals=dynamic_intervals,
+    metric=['PQ', 'bbox', 'segm'])
diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py
new file mode 100755
index 0000000..eca6135
--- /dev/null
+++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py
@@ -0,0 +1,79 @@
+_base_ = ['./mask2former_r50_lsj_8x2_50e_coco-panoptic.py']
+num_things_classes = 80
+num_stuff_classes = 0
+num_classes = num_things_classes + num_stuff_classes
+model = dict(
+    panoptic_head=dict(
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
+        loss_cls=dict(class_weight=[1.0] * num_classes + [0.1])),
+    panoptic_fusion_head=dict(
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes),
+    test_cfg=dict(panoptic_on=False))
+
+# dataset settings
+image_size = (1024, 1024)
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+pad_cfg = dict(img=(128, 128, 128), masks=0, seg=255)
+train_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    # large scale jittering
+    dict(
+        type='Resize',
+        img_scale=image_size,
+        ratio_range=(0.1, 2.0),
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_size=image_size,
+        crop_type='absolute',
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-5, 1e-5), by_mask=True),
+    dict(type='Pad', size=image_size, pad_val=pad_cfg),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='DefaultFormatBundle', img_to_float=True),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Pad', size_divisor=32, pad_val=pad_cfg),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+data = dict(
+    _delete_=True,
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(metric=['bbox', 'segm'])
diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic.py b/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic.py
new file mode 100755
index 0000000..f13f5e1
--- /dev/null
+++ b/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic.py
@@ -0,0 +1,5 @@
+_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth'  # noqa
+
+model = dict(
+    backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=pretrained)))
diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py b/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py
new file mode 100755
index 0000000..33a805c
--- /dev/null
+++ b/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py
@@ -0,0 +1,42 @@
+_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384.pth'  # noqa
+
+depths = [2, 2, 18, 2]
+model = dict(
+    backbone=dict(
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=depths,
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    panoptic_head=dict(in_channels=[128, 256, 512, 1024]))
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position_embedding,
+# query_embedding, level_embedding to decay_mult=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optimizer = dict(
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
diff --git a/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic.py b/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic.py
new file mode 100755
index 0000000..91a180d
--- /dev/null
+++ b/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic.py
@@ -0,0 +1,26 @@
+_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'  # noqa
+
+model = dict(
+    backbone=dict(
+        embed_dims=192,
+        num_heads=[6, 12, 24, 48],
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    panoptic_head=dict(num_queries=200, in_channels=[192, 384, 768, 1536]))
+
+data = dict(samples_per_gpu=1, workers_per_gpu=1)
+
+lr_config = dict(step=[655556, 710184])
+
+max_iters = 737500
+runner = dict(type='IterBasedRunner', max_iters=max_iters)
+
+# Before the 735001st iteration, we evaluate every 5000 iterations.
+# From the 735001st iteration on, the evaluation interval is 737500 iterations,
+# which means that we evaluate only at the end of training.
+interval = 5000
+dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)]
+evaluation = dict(
+    interval=interval,
+    dynamic_intervals=dynamic_intervals,
+    metric=['PQ', 'bbox', 'segm'])
diff --git a/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic.py b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic.py
new file mode 100755
index 0000000..b2b621c
--- /dev/null
+++ b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic.py
@@ -0,0 +1,37 @@
+_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth'  # noqa
+
+depths = [2, 2, 18, 2]
+model = dict(
+    backbone=dict(
+        depths=depths, init_cfg=dict(type='Pretrained',
+                                     checkpoint=pretrained)))
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position_embedding,
+# query_embedding, level_embedding to decay_mult=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optimizer = dict(
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
diff --git a/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py
new file mode 100755
index 0000000..7b1b05a
--- /dev/null
+++ b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py
@@ -0,0 +1,37 @@
+_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth'  # noqa
+
+depths = [2, 2, 18, 2]
+model = dict(
+    backbone=dict(
+        depths=depths, init_cfg=dict(type='Pretrained',
+                                     checkpoint=pretrained)))
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position_embedding,
+# query_embedding, level_embedding to decay_mult=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optimizer = dict(
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
diff --git a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py
new file mode 100755
index 0000000..04b2f10
--- /dev/null
+++ b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py
@@ -0,0 +1,62 @@
+_base_ = ['./mask2former_r50_lsj_8x2_50e_coco-panoptic.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'  # noqa
+
+depths = [2, 2, 6, 2]
+model = dict(
+    type='Mask2Former',
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=depths,
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.3,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        frozen_stages=-1,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    panoptic_head=dict(
+        type='Mask2FormerHead', in_channels=[96, 192, 384, 768]),
+    init_cfg=None)
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position_embedding,
+# query_embedding, level_embedding to decay_mult=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optimizer = dict(
+    type='AdamW',
+    lr=0.0001,
+    weight_decay=0.05,
+    eps=1e-8,
+    betas=(0.9, 0.999),
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
diff --git a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py
new file mode 100755
index 0000000..0ccbe91
--- /dev/null
+++ b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py
@@ -0,0 +1,61 @@
+_base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'  # noqa
+depths = [2, 2, 6, 2]
+model = dict(
+    type='Mask2Former',
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=depths,
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.3,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        frozen_stages=-1,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    panoptic_head=dict(
+        type='Mask2FormerHead', in_channels=[96, 192, 384, 768]),
+    init_cfg=None)
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position_embedding,
+# query_embedding, level_embedding to decay_mult=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optimizer = dict(
+    type='AdamW',
+    lr=0.0001,
+    weight_decay=0.05,
+    eps=1e-8,
+    betas=(0.9, 0.999),
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
diff --git a/configs/mask2former/metafile.yml b/configs/mask2former/metafile.yml
new file mode 100755
index 0000000..d9f4692
--- /dev/null
+++ b/configs/mask2former/metafile.yml
@@ -0,0 +1,223 @@
+Collections:
+  - Name: Mask2Former
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Weight Decay
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - Mask2Former
+    Paper:
+      URL: https://arxiv.org/pdf/2112.01527
+      Title: 'Masked-attention Mask Transformer for Universal Image Segmentation'
+    README: configs/mask2former/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.23.0/mmdet/models/detectors/mask2former.py#L7
+      Version: v2.23.0
+
+Models:
+- Name: mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic.py
+  Metadata:
+    Training Memory (GB): 19.1
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 47.8
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 44.5
+  - Task: Panoptic Segmentation
+    Dataset: COCO
+    Metrics:
+      PQ: 54.5
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic_20220329_225200-c7b94355.pth
+- Name: mask2former_r101_lsj_8x2_50e_coco
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py
+  Metadata:
+    Training Memory (GB): 15.5
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 46.7
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 44.0
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r101_lsj_8x2_50e_coco/mask2former_r101_lsj_8x2_50e_coco_20220426_100250-c50b6fa6.pth
+- Name: mask2former_r101_lsj_8x2_50e_coco-panoptic
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic.py
+  Metadata:
+    Training Memory (GB): 16.1
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 45.3
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 42.4
+  - Task: Panoptic Segmentation
+    Dataset: COCO
+    Metrics:
+      PQ: 52.4
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic/mask2former_r101_lsj_8x2_50e_coco-panoptic_20220329_225104-c54e64c9.pth
+- Name: mask2former_r50_lsj_8x2_50e_coco-panoptic
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic.py
+  Metadata:
+    Training Memory (GB): 13.9
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 44.8
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 41.9
+  - Task: Panoptic Segmentation
+    Dataset: COCO
+    Metrics:
+      PQ: 51.9
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic/mask2former_r50_lsj_8x2_50e_coco-panoptic_20220326_224516-11a44721.pth
+- Name: mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py
+  Metadata:
+    Training Memory (GB): 15.9
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 46.3
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 43.4
+  - Task: Panoptic Segmentation
+    Dataset: COCO
+    Metrics:
+      PQ: 53.4
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic_20220326_224553-fc567107.pth
+- Name: mask2former_r50_lsj_8x2_50e_coco
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py
+  Metadata:
+    Training Memory (GB): 13.7
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 45.7
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 42.9
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r50_lsj_8x2_50e_coco/mask2former_r50_lsj_8x2_50e_coco_20220506_191028-8e96e88b.pth
+- Name: mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic.py
+  Metadata:
+    Training Memory (GB): 21.1
+    Iterations: 737500
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 52.2
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 48.5
+  - Task: Panoptic Segmentation
+    Dataset: COCO
+    Metrics:
+      PQ: 57.6
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic_20220407_104949-d4919c44.pth
+- Name: mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic.py
+  Metadata:
+    Training Memory (GB): 25.8
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 50.0
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 46.3
+  - Task: Panoptic Segmentation
+    Dataset: COCO
+    Metrics:
+      PQ: 56.3
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic_20220329_230021-3bb8b482.pth
+- Name: mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py
+  Metadata:
+    Training Memory (GB): 26.0
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 48.2
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 44.9
+  - Task: Panoptic Segmentation
+    Dataset: COCO
+    Metrics:
+      PQ: 55.1
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic_20220331_002244-c149a9e9.pth
+- Name: mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py
+  Metadata:
+    Training Memory (GB): 15.3
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 47.7
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 44.7
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_20220508_091649-4a943037.pth
+- Name: mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py
+  Metadata:
+    Training Memory (GB): 18.8
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 49.3
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 46.1
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_20220504_001756-743b7d99.pth
diff --git a/configs/mask_rcnn/README.md b/configs/mask_rcnn/README.md
new file mode 100755
index 0000000..11a39b0
--- /dev/null
+++ b/configs/mask_rcnn/README.md
@@ -0,0 +1,59 @@
+# Mask R-CNN
+
+> [Mask R-CNN](https://arxiv.org/abs/1703.06870)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We present a conceptually simple, flexible, and general framework for object instance segmentation. Our approach efficiently detects objects in an image while simultaneously generating a high-quality segmentation mask for each instance. The method, called Mask R-CNN, extends Faster R-CNN by adding a branch for predicting an object mask in parallel with the existing branch for bounding box recognition. Mask R-CNN is simple to train and adds only a small overhead to Faster R-CNN, running at 5 fps. Moreover, Mask R-CNN is easy to generalize to other tasks, e.g., allowing us to estimate human poses in the same framework. We show top results in all three tracks of the COCO suite of challenges, including instance segmentation, bounding-box object detection, and person keypoint detection. Without bells and whistles, Mask R-CNN outperforms all existing, single-model entries on every task, including the COCO 2016 challenge winners. We hope our simple and effective approach will serve as a solid baseline and help ease future research in instance-level recognition.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143967081-c2552bed-9af2-46c4-ae44-5b3b74e5679f.png"/>
+</div>
+
+## Results and Models
+
+|    Backbone     |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                        Config                                                         |                                                                                                                                                                            Download                                                                                                                                                                             |
+| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50-FPN     |  caffe  |   1x    |   4.3    |                |  38.0  |  34.4   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco/mask_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.38__segm_mAP-0.344_20200504_231812-0ebd1859.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco/mask_rcnn_r50_caffe_fpn_1x_coco_20200504_231812.log.json)    |
+|    R-50-FPN     | pytorch |   1x    |   4.4    |      16.1      |  38.2  |  34.7   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py)     |                                  [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205_050542.log.json)                                  |
+| R-50-FPN (FP16) | pytorch |   1x    |   3.6    |      24.1      |  38.1  |  34.7   |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_fpn_fp16_1x_coco.py)  |                             [model](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_1x_coco/mask_rcnn_r50_fpn_fp16_1x_coco_20200205-59faf7e4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_1x_coco/mask_rcnn_r50_fpn_fp16_1x_coco_20200205_130539.log.json)                             |
+|    R-50-FPN     | pytorch |   2x    |    -     |       -        |  39.2  |  35.4   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392__segm_mAP-0.354_20200505_003907-3e542a40.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/mask_rcnn_r50_fpn_2x_coco_20200505_003907.log.json)               |
+|    R-101-FPN    |  caffe  |   1x    |          |                |  40.4  |  36.4   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py) |                [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco/mask_rcnn_r101_caffe_fpn_1x_coco_20200601_095758-805e06c1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco/mask_rcnn_r101_caffe_fpn_1x_coco_20200601_095758.log.json)                 |
+|    R-101-FPN    | pytorch |   1x    |   6.4    |      13.5      |  40.0  |  36.1   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py)    |                                [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204-1efe0ed5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204_144809.log.json)                                |
+|    R-101-FPN    | pytorch |   2x    |    -     |       -        |  40.8  |  36.6   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_2x_coco/mask_rcnn_r101_fpn_2x_coco_bbox_mAP-0.408__segm_mAP-0.366_20200505_071027-14b391c7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_2x_coco/mask_rcnn_r101_fpn_2x_coco_20200505_071027.log.json)             |
+| X-101-32x4d-FPN | pytorch |   1x    |   7.6    |      11.3      |  41.9  |  37.5   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py) |                    [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205-478d0b67.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205_034906.log.json)                    |
+| X-101-32x4d-FPN | pytorch |   2x    |    -     |       -        |  42.2  |  37.8   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco/mask_rcnn_x101_32x4d_fpn_2x_coco_bbox_mAP-0.422__segm_mAP-0.378_20200506_004702-faef898c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco/mask_rcnn_x101_32x4d_fpn_2x_coco_20200506_004702.log.json) |
+| X-101-64x4d-FPN | pytorch |   1x    |   10.7   |      8.0       |  42.8  |  38.4   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco.py) |                    [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco/mask_rcnn_x101_64x4d_fpn_1x_coco_20200201-9352eb0d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco/mask_rcnn_x101_64x4d_fpn_1x_coco_20200201_124310.log.json)                    |
+| X-101-64x4d-FPN | pytorch |   2x    |    -     |       -        |  42.7  |  38.1   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py) |                [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco/mask_rcnn_x101_64x4d_fpn_2x_coco_20200509_224208-39d6f70c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco/mask_rcnn_x101_64x4d_fpn_2x_coco_20200509_224208.log.json)                 |
+| X-101-32x8d-FPN | pytorch |   1x    |   10.6   |       -        |  42.8  |  38.3   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py) |                [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco/mask_rcnn_x101_32x8d_fpn_1x_coco_20220630_173841-0aaf329e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco/mask_rcnn_x101_32x8d_fpn_1x_coco_20220630_173841.log.json)                 |
+
+## Pre-trained Models
+
+We also provide models trained with longer schedules and multi-scale training. Users can fine-tune them for downstream tasks.
+
+|                               Backbone                                |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                               Config                                                               |                                                                                                                                                                                                    Download                                                                                                                                                                                                     |
+| :-------------------------------------------------------------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :--------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|     [R-50-FPN](./mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco.py)     |  caffe  |   2x    |   4.3    |                |  40.3  |  36.5   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco_bbox_mAP-0.403__segm_mAP-0.365_20200504_231822-a75c98ce.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco_20200504_231822.log.json) |
+|     [R-50-FPN](./mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco.py)     |  caffe  |   3x    |   4.3    |                |  40.8  |  37.0   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_20200504_163245.log.json)  |
+|        [R-50-FPN](./mask_rcnn_r50_fpn_mstrain-poly_3x_coco.py)        | pytorch |   3x    |   4.1    |                |  40.9  |  37.1   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco.py)     |                            [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_fpn_mstrain-poly_3x_coco_20210524_201154-21b550bb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_fpn_mstrain-poly_3x_coco_20210524_201154.log.json)                             |
+|    [R-101-FPN](./mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco.py)    |  caffe  |   3x    |   5.9    |                |  42.9  |  38.5   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco.py) |              [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco_20210526_132339-3c33ce02.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco_20210526_132339.log.json)               |
+|       [R-101-FPN](./mask_rcnn_r101_fpn_mstrain-poly_3x_coco.py)       | pytorch |   3x    |   6.1    |                |  42.7  |  38.5   |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco.py)    |                          [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_fpn_mstrain-poly_3x_coco_20210524_200244-5675c317.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_fpn_mstrain-poly_3x_coco_20210524_200244.log.json)                           |
+| [X-101-32x4d-FPN](./mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco.py) | pytorch |   3x    |   7.3    |                |  43.6  |  39.0   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco.py) |              [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco_20210524_201410-abcd7859.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco_20210524_201410.log.json)               |
+| [X-101-32x8d-FPN](./mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco.py) | pytorch |   1x    |   10.4   |                |  43.4  |  39.0   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco.py) |              [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco_20220630_170346-b4637974.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco_20220630_170346.log.json)               |
+| [X-101-32x8d-FPN](./mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco.py) | pytorch |   3x    |   10.3   |                |  44.3  |  39.5   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco.py) |              [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco_20210607_161042-8bd2c639.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco_20210607_161042.log.json)               |
+| [X-101-64x4d-FPN](./mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py) | pytorch |   3x    |   10.4   |                |  44.5  |  39.7   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py) |              [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco_20210526_120447-c376f129.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco_20210526_120447.log.json)               |
+
+## Citation
+
+```latex
+@article{He_2017,
+   title={Mask R-CNN},
+   journal={2017 IEEE International Conference on Computer Vision (ICCV)},
+   publisher={IEEE},
+   author={He, Kaiming and Gkioxari, Georgia and Dollar, Piotr and Girshick, Ross},
+   year={2017},
+   month={Oct}
+}
+```
diff --git a/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py b/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..95b324f
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './mask_rcnn_r50_caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco.py b/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco.py
new file mode 100755
index 0000000..e39781d
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco.py
@@ -0,0 +1,55 @@
+_base_ = [
+    '../common/mstrain-poly_3x_coco_instance.py',
+    '../_base_/models/mask_rcnn_r50_fpn.py'
+]
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+data = dict(
+    train=dict(dataset=dict(pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py b/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..b7986e8
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py b/configs/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py
new file mode 100755
index 0000000..c9059d5
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './mask_rcnn_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco.py b/configs/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco.py
new file mode 100755
index 0000000..0696cbe
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco.py
@@ -0,0 +1,10 @@
+_base_ = [
+    '../common/mstrain-poly_3x_coco_instance.py',
+    '../_base_/models/mask_rcnn_r50_fpn.py'
+]
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/mask_rcnn/mask_rcnn_r50_caffe_c4_1x_coco.py b/configs/mask_rcnn/mask_rcnn_r50_caffe_c4_1x_coco.py
new file mode 100755
index 0000000..a44c018
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r50_caffe_c4_1x_coco.py
@@ -0,0 +1,39 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_caffe_c4.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py b/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..5a23f8c
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py
@@ -0,0 +1,40 @@
+_base_ = './mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py b/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py
new file mode 100755
index 0000000..6308e40
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py
@@ -0,0 +1,49 @@
+_base_ = './mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco.py b/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco.py
new file mode 100755
index 0000000..4f7150c
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py'
+# learning policy
+lr_config = dict(step=[16, 23])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco.py b/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco.py
new file mode 100755
index 0000000..1b48a21
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py'
+# learning policy
+lr_config = dict(step=[28, 34])
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain_1x_coco.py b/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain_1x_coco.py
new file mode 100755
index 0000000..bebbaaa
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain_1x_coco.py
@@ -0,0 +1,45 @@
+_base_ = './mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py b/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py
new file mode 100755
index 0000000..3f8079d
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py
@@ -0,0 +1,61 @@
+_base_ = './mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    rpn_head=dict(
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            roi_layer=dict(
+                type='RoIAlign',
+                output_size=7,
+                sampling_ratio=2,
+                aligned=False)),
+        bbox_head=dict(
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+        mask_roi_extractor=dict(
+            roi_layer=dict(
+                type='RoIAlign',
+                output_size=14,
+                sampling_ratio=2,
+                aligned=False))))
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py b/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..6a6c924
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
diff --git a/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_wandb_coco.py b/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_wandb_coco.py
new file mode 100755
index 0000000..88c8576
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_wandb_coco.py
@@ -0,0 +1,26 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# Set evaluation interval
+evaluation = dict(interval=2)
+# Set checkpoint interval
+checkpoint_config = dict(interval=4)
+
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='MMDetWandbHook',
+             init_kwargs={
+                'project': 'mmdetection',
+                'group': 'maskrcnn-r50-fpn-1x-coco'
+             },
+             interval=50,
+             log_checkpoint=True,
+             log_checkpoint_metadata=True,
+             num_eval_images=100)
+        ])
diff --git a/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py b/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py
new file mode 100755
index 0000000..932b1f9
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
diff --git a/configs/mask_rcnn/mask_rcnn_r50_fpn_fp16_1x_coco.py b/configs/mask_rcnn/mask_rcnn_r50_fpn_fp16_1x_coco.py
new file mode 100755
index 0000000..fb8289b
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r50_fpn_fp16_1x_coco.py
@@ -0,0 +1,3 @@
+_base_ = './mask_rcnn_r50_fpn_1x_coco.py'
+# fp16 settings
+fp16 = dict(loss_scale=512.)
diff --git a/configs/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco.py b/configs/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco.py
new file mode 100755
index 0000000..b3d9242
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco.py
@@ -0,0 +1,4 @@
+_base_ = [
+    '../common/mstrain-poly_3x_coco_instance.py',
+    '../_base_/models/mask_rcnn_r50_fpn.py'
+]
diff --git a/configs/mask_rcnn/mask_rcnn_r50_fpn_poly_1x_coco.py b/configs/mask_rcnn/mask_rcnn_r50_fpn_poly_1x_coco.py
new file mode 100755
index 0000000..9eb6d57
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_r50_fpn_poly_1x_coco.py
@@ -0,0 +1,23 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+data = dict(train=dict(pipeline=train_pipeline))
diff --git a/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py b/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..a8b3799
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './mask_rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py b/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py
new file mode 100755
index 0000000..2cd3cee
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './mask_rcnn_r101_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco.py b/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco.py
new file mode 100755
index 0000000..b698a7d
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco.py
@@ -0,0 +1,18 @@
+_base_ = [
+    '../common/mstrain-poly_3x_coco_instance.py',
+    '../_base_/models/mask_rcnn_r50_fpn.py'
+]
+
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py b/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py
new file mode 100755
index 0000000..108ea4e
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py
@@ -0,0 +1,65 @@
+_base_ = './mask_rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=8,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnext101_32x8d')))
+
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675],
+    std=[57.375, 57.120, 58.395],
+    to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
diff --git a/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco.py b/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco.py
new file mode 100755
index 0000000..6b912f6
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco.py
@@ -0,0 +1,60 @@
+_base_ = './mask_rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=8,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnext101_32x8d')))
+
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675],
+    std=[57.375, 57.120, 58.395],
+    to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco.py b/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco.py
new file mode 100755
index 0000000..8ba0e9c
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco.py
@@ -0,0 +1,85 @@
+_base_ = [
+    '../common/mstrain-poly_3x_coco_instance.py',
+    '../_base_/models/mask_rcnn_r50_fpn.py'
+]
+
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=8,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnext101_32x8d')))
+
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675],
+    std=[57.375, 57.120, 58.395],
+    to_rgb=False)
+
+# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)],
+# multiscale_mode='range'
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+# Use RepeatDataset to speed up training
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root + 'annotations/instances_train2017.json',
+            img_prefix=data_root + 'train2017/',
+            pipeline=train_pipeline)),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
diff --git a/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco.py b/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..2333b03
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './mask_rcnn_x101_32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py b/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py
new file mode 100755
index 0000000..6074cca
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './mask_rcnn_x101_32x4d_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py b/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py
new file mode 100755
index 0000000..9f9cb1c
--- /dev/null
+++ b/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py
@@ -0,0 +1,18 @@
+_base_ = [
+    '../common/mstrain-poly_3x_coco_instance.py',
+    '../_base_/models/mask_rcnn_r50_fpn.py'
+]
+
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/mask_rcnn/metafile.yml b/configs/mask_rcnn/metafile.yml
new file mode 100755
index 0000000..30938ea
--- /dev/null
+++ b/configs/mask_rcnn/metafile.yml
@@ -0,0 +1,443 @@
+Collections:
+  - Name: Mask R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Softmax
+        - RPN
+        - Convolution
+        - Dense Connections
+        - FPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/abs/1703.06870v3
+      Title: "Mask R-CNN"
+    README: configs/mask_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/mask_rcnn.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: mask_rcnn_r50_caffe_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.3
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 34.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco/mask_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.38__segm_mAP-0.344_20200504_231812-0ebd1859.pth
+
+  - Name: mask_rcnn_r50_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.4
+      inference time (ms/im):
+        - value: 62.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 34.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth
+
+  - Name: mask_rcnn_r50_fpn_fp16_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_r50_fpn_fp16_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.6
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+        - Mixed Precision Training
+      inference time (ms/im):
+        - value: 41.49
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP16
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 34.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_1x_coco/mask_rcnn_r50_fpn_fp16_1x_coco_20200205-59faf7e4.pth
+
+  - Name: mask_rcnn_r50_fpn_2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 4.4
+      inference time (ms/im):
+        - value: 62.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 35.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392__segm_mAP-0.354_20200505_003907-3e542a40.pth
+
+  - Name: mask_rcnn_r101_caffe_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco/mask_rcnn_r101_caffe_fpn_1x_coco_20200601_095758-805e06c1.pth
+
+  - Name: mask_rcnn_r101_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      inference time (ms/im):
+        - value: 74.07
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204-1efe0ed5.pth
+
+  - Name: mask_rcnn_r101_fpn_2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      inference time (ms/im):
+        - value: 74.07
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_2x_coco/mask_rcnn_r101_fpn_2x_coco_bbox_mAP-0.408__segm_mAP-0.366_20200505_071027-14b391c7.pth
+
+  - Name: mask_rcnn_x101_32x4d_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      inference time (ms/im):
+        - value: 88.5
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205-478d0b67.pth
+
+  - Name: mask_rcnn_x101_32x4d_fpn_2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      inference time (ms/im):
+        - value: 88.5
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco/mask_rcnn_x101_32x4d_fpn_2x_coco_bbox_mAP-0.422__segm_mAP-0.378_20200506_004702-faef898c.pth
+
+  - Name: mask_rcnn_x101_64x4d_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.7
+      inference time (ms/im):
+        - value: 125
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco/mask_rcnn_x101_64x4d_fpn_1x_coco_20200201-9352eb0d.pth
+
+  - Name: mask_rcnn_x101_64x4d_fpn_2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.7
+      inference time (ms/im):
+        - value: 125
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco/mask_rcnn_x101_64x4d_fpn_2x_coco_20200509_224208-39d6f70c.pth
+
+  - Name: mask_rcnn_x101_32x8d_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.6
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco/mask_rcnn_x101_32x8d_fpn_1x_coco_20220630_173841-0aaf329e.pth
+
+  - Name: mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco.py
+    Metadata:
+      Training Memory (GB): 4.3
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco_bbox_mAP-0.403__segm_mAP-0.365_20200504_231822-a75c98ce.pth
+
+  - Name: mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.3
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth
+
+  - Name: mask_rcnn_r50_fpn_mstrain-poly_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.1
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_fpn_mstrain-poly_3x_coco_20210524_201154-21b550bb.pth
+
+  - Name: mask_rcnn_r101_fpn_mstrain-poly_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco.py
+    Metadata:
+      Training Memory (GB): 6.1
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_fpn_mstrain-poly_3x_coco_20210524_200244-5675c317.pth
+
+  - Name: mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.9
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco_20210526_132339-3c33ce02.pth
+
+  - Name: mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.3
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco_20210524_201410-abcd7859.pth
+
+  - Name: mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.4
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco_20220630_170346-b4637974.pth
+
+  - Name: mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco.py
+    Metadata:
+      Training Memory (GB): 10.3
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco_20210607_161042-8bd2c639.pth
+
+  - Name: mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py
+    Metadata:
+      Epochs: 36
+      Training Memory (GB): 10.4
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco_20210526_120447-c376f129.pth
diff --git a/configs/maskformer/README.md b/configs/maskformer/README.md
new file mode 100755
index 0000000..5d8daa2
--- /dev/null
+++ b/configs/maskformer/README.md
@@ -0,0 +1,53 @@
+# MaskFormer
+
+> [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Modern approaches typically formulate semantic segmentation as a per-pixel classification task, while instance-level segmentation is handled with an alternative mask classification. Our key insight: mask classification is sufficiently general to solve both semantic- and instance-level segmentation tasks in a unified manner using the exact same model, loss, and training procedure. Following this observation, we propose MaskFormer, a simple mask classification model which predicts a set of binary masks, each associated with a single global class label prediction. Overall, the proposed mask classification-based method simplifies the landscape of effective approaches to semantic and panoptic segmentation tasks and shows excellent empirical results. In particular, we observe that MaskFormer outperforms per-pixel classification baselines when the number of classes is large. Our mask classification-based method outperforms both current state-of-the-art semantic (55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models.
+
+<div align=center>
+<img src="https://camo.githubusercontent.com/29fb22298d506ce176caad3006a7b05ef2603ca12cece6c788b7e73c046e8bc9/68747470733a2f2f626f77656e63303232312e6769746875622e696f2f696d616765732f6d61736b666f726d65722e706e67" height="300"/>
+</div>
+
+## Introduction
+
+MaskFormer requires the COCO and [COCO-panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) datasets for training and evaluation. You need to download and extract them into the COCO dataset path.
+The directory should look like this.
+
+```none
+mmdetection
+├── mmdet
+├── tools
+├── configs
+├── data
+│   ├── coco
+│   │   ├── annotations
+│   │   │   ├── panoptic_train2017.json
+│   │   │   ├── panoptic_train2017
+│   │   │   ├── panoptic_val2017.json
+│   │   │   ├── panoptic_val2017
+│   │   ├── train2017
+│   │   ├── val2017
+│   │   ├── test2017
+```
+
+## Results and Models
+
+| Backbone |  style  | Lr schd | Mem (GB) | Inf time (fps) |   PQ   |   SQ   |   RQ   | PQ_th  | SQ_th  | RQ_th  | PQ_st  | SQ_st  | RQ_st  |                                                                Config                                                                 |                                                                                                                                                                                            Download                                                                                                                                                                                            |                                                                         detail                                                                          |
+| :------: | :-----: | :-----: | :------: | :------------: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :-----------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   | pytorch |   75e   |   16.2   |       -        | 46.854 | 80.617 | 57.085 | 51.089 | 81.511 | 61.853 | 40.463 | 79.269 | 49.888 |      [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py)       |                       [model](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956.log.json)                       | This version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) |
+|  Swin-L  | pytorch |  300e   |   27.2   |       -        | 53.249 | 81.704 | 64.231 | 58.798 | 82.923 | 70.282 | 44.874 | 79.863 | 55.097 | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco_20220326_221612-061b4eb8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco_20220326_221612.log.json) |                                                                            -                                                                            |
+
+## Citation
+
+```latex
+@inproceedings{cheng2021maskformer,
+  title={Per-Pixel Classification is Not All You Need for Semantic Segmentation},
+  author={Bowen Cheng and Alexander G. Schwing and Alexander Kirillov},
+  booktitle={NeurIPS},
+  year={2021}
+}
+```
diff --git a/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py b/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py
new file mode 100755
index 0000000..46b3c13
--- /dev/null
+++ b/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py
@@ -0,0 +1,238 @@
+_base_ = [
+    '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py'
+]
+num_things_classes = 80
+num_stuff_classes = 53
+num_classes = num_things_classes + num_stuff_classes
+model = dict(
+    type='MaskFormer',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    panoptic_head=dict(
+        type='MaskFormerHead',
+        in_channels=[256, 512, 1024, 2048],  # pass to pixel_decoder inside
+        feat_channels=256,
+        out_channels=256,
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
+        num_queries=100,
+        pixel_decoder=dict(
+            type='TransformerEncoderPixelDecoder',
+            norm_cfg=dict(type='GN', num_groups=32),
+            act_cfg=dict(type='ReLU'),
+            encoder=dict(
+                type='DetrTransformerEncoder',
+                num_layers=6,
+                transformerlayers=dict(
+                    type='BaseTransformerLayer',
+                    attn_cfgs=dict(
+                        type='MultiheadAttention',
+                        embed_dims=256,
+                        num_heads=8,
+                        attn_drop=0.1,
+                        proj_drop=0.1,
+                        dropout_layer=None,
+                        batch_first=False),
+                    ffn_cfgs=dict(
+                        embed_dims=256,
+                        feedforward_channels=2048,
+                        num_fcs=2,
+                        act_cfg=dict(type='ReLU', inplace=True),
+                        ffn_drop=0.1,
+                        dropout_layer=None,
+                        add_identity=True),
+                    operation_order=('self_attn', 'norm', 'ffn', 'norm'),
+                    norm_cfg=dict(type='LN'),
+                    init_cfg=None,
+                    batch_first=False),
+                init_cfg=None),
+            positional_encoding=dict(
+                type='SinePositionalEncoding', num_feats=128, normalize=True)),
+        enforce_decoder_input_project=False,
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True),
+        transformer_decoder=dict(
+            type='DetrTransformerDecoder',
+            return_intermediate=True,
+            num_layers=6,
+            transformerlayers=dict(
+                type='DetrTransformerDecoderLayer',
+                attn_cfgs=dict(
+                    type='MultiheadAttention',
+                    embed_dims=256,
+                    num_heads=8,
+                    attn_drop=0.1,
+                    proj_drop=0.1,
+                    dropout_layer=None,
+                    batch_first=False),
+                ffn_cfgs=dict(
+                    embed_dims=256,
+                    feedforward_channels=2048,
+                    num_fcs=2,
+                    act_cfg=dict(type='ReLU', inplace=True),
+                    ffn_drop=0.1,
+                    dropout_layer=None,
+                    add_identity=True),
+                # the following parameter is not used;
+                # it is only kept to satisfy the current API
+                feedforward_channels=2048,
+                operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                 'ffn', 'norm')),
+            init_cfg=None),
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            loss_weight=1.0,
+            reduction='mean',
+            class_weight=[1.0] * num_classes + [0.1]),
+        loss_mask=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            reduction='mean',
+            loss_weight=20.0),
+        loss_dice=dict(
+            type='DiceLoss',
+            use_sigmoid=True,
+            activate=True,
+            reduction='mean',
+            naive_dice=True,
+            eps=1.0,
+            loss_weight=1.0)),
+    panoptic_fusion_head=dict(
+        type='MaskFormerFusionHead',
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
+        loss_panoptic=None,
+        init_cfg=None),
+    train_cfg=dict(
+        assigner=dict(
+            type='MaskHungarianAssigner',
+            cls_cost=dict(type='ClassificationCost', weight=1.0),
+            mask_cost=dict(
+                type='FocalLossCost', weight=20.0, binary_input=True),
+            dice_cost=dict(
+                type='DiceCost', weight=1.0, pred_act=True, eps=1.0)),
+        sampler=dict(type='MaskPseudoSampler')),
+    test_cfg=dict(
+        panoptic_on=True,
+        # For now, the dataset does not support
+        # evaluating semantic segmentation metric.
+        semantic_on=False,
+        instance_on=False,
+        # max_per_image is for instance segmentation.
+        max_per_image=100,
+        object_mask_thr=0.8,
+        iou_thr=0.8,
+        # In MaskFormer's panoptic postprocessing, masks whose
+        # score is smaller than 0.5 are not filtered out.
+        filter_low_score=False),
+    init_cfg=None)
+
+# dataset settings
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadPanopticAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        with_seg=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='AutoAugment',
+        policies=[[
+            dict(
+                type='Resize',
+                img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                           (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                           (736, 1333), (768, 1333), (800, 1333)],
+                multiscale_mode='value',
+                keep_ratio=True)
+        ],
+                  [
+                      dict(
+                          type='Resize',
+                          img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+                          multiscale_mode='value',
+                          keep_ratio=True),
+                      dict(
+                          type='RandomCrop',
+                          crop_type='absolute_range',
+                          crop_size=(384, 600),
+                          allow_negative_crop=True),
+                      dict(
+                          type='Resize',
+                          img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                     (576, 1333), (608, 1333), (640, 1333),
+                                     (672, 1333), (704, 1333), (736, 1333),
+                                     (768, 1333), (800, 1333)],
+                          multiscale_mode='value',
+                          override=True,
+                          keep_ratio=True)
+                  ]]),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=1),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='Collect',
+        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=1),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=1,
+    workers_per_gpu=1,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+
+# optimizer
+optimizer = dict(
+    type='AdamW',
+    lr=0.0001,
+    weight_decay=0.0001,
+    eps=1e-8,
+    betas=(0.9, 0.999),
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+            'query_embed': dict(lr_mult=1.0, decay_mult=0.0)
+        },
+        norm_decay_mult=0.0))
+optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2))
+
+# learning policy
+lr_config = dict(
+    policy='step',
+    gamma=0.1,
+    by_epoch=True,
+    step=[50],
+    warmup='linear',
+    warmup_by_epoch=False,
+    warmup_ratio=1.0,  # no warmup
+    warmup_iters=10)
+runner = dict(type='EpochBasedRunner', max_epochs=75)
diff --git a/configs/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco.py b/configs/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco.py
new file mode 100755
index 0000000..bc23c54
--- /dev/null
+++ b/configs/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco.py
@@ -0,0 +1,67 @@
+_base_ = './maskformer_r50_mstrain_16x1_75e_coco.py'
+
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'  # noqa
+depths = [2, 2, 18, 2]
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        pretrain_img_size=384,
+        embed_dims=192,
+        patch_size=4,
+        window_size=12,
+        mlp_ratio=4,
+        depths=depths,
+        num_heads=[6, 12, 24, 48],
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.3,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    panoptic_head=dict(
+        in_channels=[192, 384, 768, 1536],  # pass to pixel_decoder inside
+        pixel_decoder=dict(
+            _delete_=True,
+            type='PixelDecoder',
+            norm_cfg=dict(type='GN', num_groups=32),
+            act_cfg=dict(type='ReLU')),
+        enforce_decoder_input_project=True))
+
+# weight_decay = 0.01
+# norm_weight_decay = 0.0
+# embed_weight_decay = 0.0
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+norm_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'norm': norm_multi,
+    'absolute_pos_embed': embed_multi,
+    'relative_position_bias_table': embed_multi,
+    'query_embed': embed_multi
+}
+
+# optimizer
+optimizer = dict(
+    type='AdamW',
+    lr=6e-5,
+    weight_decay=0.01,
+    eps=1e-8,
+    betas=(0.9, 0.999),
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
+optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2))
+
+# learning policy
+lr_config = dict(
+    policy='step',
+    gamma=0.1,
+    by_epoch=True,
+    step=[250],
+    warmup='linear',
+    warmup_by_epoch=False,
+    warmup_ratio=1e-6,
+    warmup_iters=1500)
+runner = dict(type='EpochBasedRunner', max_epochs=300)
diff --git a/configs/maskformer/metafile.yml b/configs/maskformer/metafile.yml
new file mode 100755
index 0000000..6530fa1
--- /dev/null
+++ b/configs/maskformer/metafile.yml
@@ -0,0 +1,43 @@
+Collections:
+  - Name: MaskFormer
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Weight Decay
+      Training Resources: 16x V100 GPUs
+      Architecture:
+        - MaskFormer
+    Paper:
+      URL: https://arxiv.org/pdf/2107.06278
+      Title: 'Per-Pixel Classification is Not All You Need for Semantic Segmentation'
+    README: configs/maskformer/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/mmdet/models/detectors/maskformer.py#L7
+      Version: v2.22.0
+
+Models:
+  - Name: maskformer_r50_mstrain_16x1_75e_coco
+    In Collection: MaskFormer
+    Config: configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py
+    Metadata:
+      Training Memory (GB): 16.2
+      Epochs: 75
+    Results:
+    - Task: Panoptic Segmentation
+      Dataset: COCO
+      Metrics:
+        PQ: 46.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth
+  - Name: maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco
+    In Collection: MaskFormer
+    Config: configs/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco.py
+    Metadata:
+      Training Memory (GB): 27.2
+      Epochs: 300
+    Results:
+    - Task: Panoptic Segmentation
+      Dataset: COCO
+      Metrics:
+        PQ: 53.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco_20220326_221612-061b4eb8.pth
diff --git a/configs/ms_rcnn/README.md b/configs/ms_rcnn/README.md
new file mode 100755
index 0000000..97bca05
--- /dev/null
+++ b/configs/ms_rcnn/README.md
@@ -0,0 +1,36 @@
+# MS R-CNN
+
+> [Mask Scoring R-CNN](https://arxiv.org/abs/1903.00241)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Letting a deep network be aware of the quality of its own predictions is an interesting yet important problem. In the task of instance segmentation, the confidence of instance classification is used as mask quality score in most instance segmentation frameworks. However, the mask quality, quantified as the IoU between the instance mask and its ground truth, is usually not well correlated with classification score. In this paper, we study this problem and propose Mask Scoring R-CNN which contains a network block to learn the quality of the predicted instance masks. The proposed network block takes the instance feature and the corresponding predicted mask together to regress the mask IoU. The mask scoring strategy calibrates the misalignment between mask quality and mask score, and improves instance segmentation performance by prioritizing more accurate mask predictions during COCO AP evaluation. By extensive evaluations on the COCO dataset, Mask Scoring R-CNN brings consistent and noticeable gain with different models, and outperforms the state-of-the-art Mask R-CNN. We hope our simple and effective approach will provide a new direction for improving instance segmentation.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143967239-3a95ae92-6443-4181-9cbc-dfe16e81b969.png"/>
+</div>
+
+## Results and Models
+
+|   Backbone   |  style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                      Config                                                       |                                                                                                                                                                      Download                                                                                                                                                                       |
+| :----------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :---------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50-FPN   |  caffe  |   1x    |   4.5    |       -        |  38.2  |  36.0   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco.py)  |                  [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco/ms_rcnn_r50_caffe_fpn_1x_coco_20200702_180848-61c9355e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco/ms_rcnn_r50_caffe_fpn_1x_coco_20200702_180848.log.json)                   |
+|   R-50-FPN   |  caffe  |   2x    |    -     |       -        |  38.8  |  36.3   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco/ms_rcnn_r50_caffe_fpn_2x_coco_bbox_mAP-0.388__segm_mAP-0.363_20200506_004738-ee87b137.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco/ms_rcnn_r50_caffe_fpn_2x_coco_20200506_004738.log.json)   |
+|  R-101-FPN   |  caffe  |   1x    |   6.5    |       -        |  40.4  |  37.6   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco/ms_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.404__segm_mAP-0.376_20200506_004755-b9b12a37.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco/ms_rcnn_r101_caffe_fpn_1x_coco_20200506_004755.log.json) |
+|  R-101-FPN   |  caffe  |   2x    |    -     |       -        |  41.1  |  38.1   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco/ms_rcnn_r101_caffe_fpn_2x_coco_bbox_mAP-0.411__segm_mAP-0.381_20200506_011134-5f3cc74f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco/ms_rcnn_r101_caffe_fpn_2x_coco_20200506_011134.log.json) |
+| R-X101-32x4d | pytorch |   1x    |   7.9    |      11.0      |  41.8  |  38.7   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco.py) |                    [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco/ms_rcnn_x101_32x4d_fpn_1x_coco_20200206-81fd1740.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco/ms_rcnn_x101_32x4d_fpn_1x_coco_20200206_100113.log.json)                    |
+| R-X101-64x4d | pytorch |   1x    |   11.0   |      8.0       |  43.0  |  39.5   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco.py) |                    [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco/ms_rcnn_x101_64x4d_fpn_1x_coco_20200206-86ba88d2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco/ms_rcnn_x101_64x4d_fpn_1x_coco_20200206_091744.log.json)                    |
+| R-X101-64x4d | pytorch |   2x    |   11.0   |      8.0       |  42.6  |  39.5   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco.py) |                    [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco/ms_rcnn_x101_64x4d_fpn_2x_coco_20200308-02a445e2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco/ms_rcnn_x101_64x4d_fpn_2x_coco_20200308_012247.log.json)                    |
+
+## Citation
+
+```latex
+@inproceedings{huang2019msrcnn,
+    title={Mask Scoring R-CNN},
+    author={Zhaojin Huang and Lichao Huang and Yongchao Gong and Chang Huang and Xinggang Wang},
+    booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
+    year={2019},
+}
+```
diff --git a/configs/ms_rcnn/metafile.yml b/configs/ms_rcnn/metafile.yml
new file mode 100755
index 0000000..a6c7dc5
--- /dev/null
+++ b/configs/ms_rcnn/metafile.yml
@@ -0,0 +1,159 @@
+Collections:
+  - Name: Mask Scoring R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RPN
+        - FPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/abs/1903.00241
+      Title: 'Mask Scoring R-CNN'
+    README: configs/ms_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/mask_scoring_rcnn.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: ms_rcnn_r50_caffe_fpn_1x_coco
+    In Collection: Mask Scoring R-CNN
+    Config: configs/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.5
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco/ms_rcnn_r50_caffe_fpn_1x_coco_20200702_180848-61c9355e.pth
+
+  - Name: ms_rcnn_r50_caffe_fpn_2x_coco
+    In Collection: Mask Scoring R-CNN
+    Config: configs/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco/ms_rcnn_r50_caffe_fpn_2x_coco_bbox_mAP-0.388__segm_mAP-0.363_20200506_004738-ee87b137.pth
+
+  - Name: ms_rcnn_r101_caffe_fpn_1x_coco
+    In Collection: Mask Scoring R-CNN
+    Config: configs/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.5
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco/ms_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.404__segm_mAP-0.376_20200506_004755-b9b12a37.pth
+
+  - Name: ms_rcnn_r101_caffe_fpn_2x_coco
+    In Collection: Mask Scoring R-CNN
+    Config: configs/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco/ms_rcnn_r101_caffe_fpn_2x_coco_bbox_mAP-0.411__segm_mAP-0.381_20200506_011134-5f3cc74f.pth
+
+  - Name: ms_rcnn_x101_32x4d_fpn_1x_coco
+    In Collection: Mask Scoring R-CNN
+    Config: configs/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.9
+      inference time (ms/im):
+        - value: 90.91
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco/ms_rcnn_x101_32x4d_fpn_1x_coco_20200206-81fd1740.pth
+
+  - Name: ms_rcnn_x101_64x4d_fpn_1x_coco
+    In Collection: Mask Scoring R-CNN
+    Config: configs/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 11.0
+      inference time (ms/im):
+        - value: 125
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco/ms_rcnn_x101_64x4d_fpn_1x_coco_20200206-86ba88d2.pth
+
+  - Name: ms_rcnn_x101_64x4d_fpn_2x_coco
+    In Collection: Mask Scoring R-CNN
+    Config: configs/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 11.0
+      inference time (ms/im):
+        - value: 125
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco/ms_rcnn_x101_64x4d_fpn_2x_coco_20200308-02a445e2.pth
diff --git a/configs/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco.py b/configs/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..9b7dcbb
--- /dev/null
+++ b/configs/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './ms_rcnn_r50_caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/configs/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco.py b/configs/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco.py
new file mode 100755
index 0000000..202bcce
--- /dev/null
+++ b/configs/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './ms_rcnn_r101_caffe_fpn_1x_coco.py'
+# learning policy: 2x schedule -- step milestones 16 and 22, 24 epochs total
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco.py b/configs/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..5845125
--- /dev/null
+++ b/configs/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py'
+model = dict(
+    type='MaskScoringRCNN',
+    roi_head=dict(
+        type='MaskScoringRoIHead',
+        mask_iou_head=dict(
+            type='MaskIoUHead',
+            num_convs=4,
+            num_fcs=2,
+            roi_feat_size=14,
+            in_channels=256,
+            conv_out_channels=256,
+            fc_out_channels=1024,
+            num_classes=80)),
+    # training settings (only the rcnn mask_thr_binary is set in this file)
+    train_cfg=dict(rcnn=dict(mask_thr_binary=0.5)))
diff --git a/configs/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco.py b/configs/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco.py
new file mode 100755
index 0000000..008a70a
--- /dev/null
+++ b/configs/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './ms_rcnn_r50_caffe_fpn_1x_coco.py'
+# learning policy: 2x schedule -- step milestones 16 and 22, 24 epochs total
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/ms_rcnn/ms_rcnn_r50_fpn_1x_coco.py b/configs/ms_rcnn/ms_rcnn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..0a163ce
--- /dev/null
+++ b/configs/ms_rcnn/ms_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    type='MaskScoringRCNN',
+    roi_head=dict(
+        type='MaskScoringRoIHead',
+        mask_iou_head=dict(
+            type='MaskIoUHead',
+            num_convs=4,
+            num_fcs=2,
+            roi_feat_size=14,
+            in_channels=256,
+            conv_out_channels=256,
+            fc_out_channels=1024,
+            num_classes=80)),
+    # training settings (only the rcnn mask_thr_binary is set in this file)
+    train_cfg=dict(rcnn=dict(mask_thr_binary=0.5)))
diff --git a/configs/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco.py b/configs/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..20479bb
--- /dev/null
+++ b/configs/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ms_rcnn_r50_fpn_1x_coco.py'  # R-50 FPN parent config
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,  # the "32x4d" variant: groups=32, base_width=4
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco.py b/configs/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..ee5b734
--- /dev/null
+++ b/configs/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ms_rcnn_r50_fpn_1x_coco.py'  # R-50 FPN parent config
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,  # the "64x4d" variant: groups=64, base_width=4
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco.py b/configs/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco.py
new file mode 100755
index 0000000..54c605b
--- /dev/null
+++ b/configs/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './ms_rcnn_x101_64x4d_fpn_1x_coco.py'
+# learning policy: 2x schedule -- step milestones 16 and 22, 24 epochs total
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/nas_fcos/README.md b/configs/nas_fcos/README.md
new file mode 100755
index 0000000..def8831
--- /dev/null
+++ b/configs/nas_fcos/README.md
@@ -0,0 +1,35 @@
+# NAS-FCOS
+
+> [NAS-FCOS: Fast Neural Architecture Search for Object Detection](https://arxiv.org/abs/1906.04423)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+The success of deep neural networks relies on significant architecture engineering. Recently neural architecture search (NAS) has emerged as a promise to greatly reduce manual effort in network design by automatically searching for optimal architectures, although typically such algorithms need an excessive amount of computational resources, e.g., a few thousand GPU-days. To date, on challenging vision tasks such as object detection, NAS, especially fast versions of NAS, is less studied. Here we propose to search for the decoder structure of object detectors with search efficiency being taken into consideration. To be more specific, we aim to efficiently search for the feature pyramid network (FPN) as well as the prediction head of a simple anchor-free object detector, namely FCOS, using a tailored reinforcement learning paradigm. With carefully designed search space, search algorithms and strategies for evaluating network quality, we are able to efficiently search a top-performing detection architecture within 4 days using 8 V100 GPUs. The discovered architecture surpasses state-of-the-art object detection models (such as Faster R-CNN, RetinaNet and FCOS) by 1.5 to 3.5 points in AP on the COCO dataset, with comparable computation complexity and memory footprint, demonstrating the efficacy of the proposed NAS for object detection.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143967900-1c8a65b9-c58d-4b03-8900-96af8f9768e8.png"/>
+</div>
+
+## Results and Models
+
+|     Head     | Backbone | Style | GN-head | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                                 Config                                                                  |                                                                                                                                                                                           Download                                                                                                                                                                                           |
+| :----------: | :------: | :---: | :-----: | :-----: | :------: | :------------: | :----: | :-------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| NAS-FCOSHead |   R-50   | caffe |    Y    |   1x    |          |                |  39.4  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200520-1bdba3ce.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200520.log.json)   |
+|   FCOSHead   |   R-50   | caffe |    Y    |   1x    |          |                |  38.5  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200521-7fdcbce0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200521.log.json) |
+
+**Notes:**
+
+- To be consistent with the authors' implementation, we use 4 GPUs with 4 images/GPU.
+
+## Citation
+
+```latex
+@article{wang2019fcos,
+  title={Nas-fcos: Fast neural architecture search for object detection},
+  author={Wang, Ning and Gao, Yang and Chen, Hao and Wang, Peng and Tian, Zhi and Shen, Chunhua},
+  journal={arXiv preprint arXiv:1906.04423},
+  year={2019}
+}
+```
diff --git a/configs/nas_fcos/metafile.yml b/configs/nas_fcos/metafile.yml
new file mode 100755
index 0000000..1ea28cf
--- /dev/null
+++ b/configs/nas_fcos/metafile.yml
@@ -0,0 +1,44 @@
+Collections:
+  - Name: NAS-FCOS
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 4x V100 GPUs
+      Architecture:
+        - FPN
+        - NAS-FCOS
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1906.04423
+      Title: 'NAS-FCOS: Fast Neural Architecture Search for Object Detection'
+    README: configs/nas_fcos/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/detectors/nasfcos.py#L6
+      Version: v2.1.0
+
+Models:
+  - Name: nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco
+    In Collection: NAS-FCOS
+    Config: configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200520-1bdba3ce.pth
+
+  - Name: nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco
+    In Collection: NAS-FCOS
+    Config: configs/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200521-7fdcbce0.pth
diff --git a/configs/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py b/configs/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py
new file mode 100755
index 0000000..a455c92
--- /dev/null
+++ b/configs/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py
@@ -0,0 +1,100 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    type='NASFCOS',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False, eps=0),
+        style='caffe',  # paired with the caffe-style checkpoint below
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    neck=dict(
+        type='NASFCOS_FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs=True,
+        num_outs=5,
+        norm_cfg=dict(type='BN'),
+        conv_cfg=dict(type='DCNv2', deform_groups=2)),
+    bbox_head=dict(
+        type='FCOSHead',  # plain FCOS head on top of the NASFCOS_FPN neck
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        norm_cfg=dict(type='GN', num_groups=32),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='IoULoss', loss_weight=1.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+img_norm_cfg = dict(  # mean subtraction only: std is 1.0, to_rgb is off
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+data = dict(
+    samples_per_gpu=4,  # 4 images/GPU; the README notes 4 GPUs are used
+    workers_per_gpu=2,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+
+optimizer = dict(
+    lr=0.01, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
diff --git a/configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py b/configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py
new file mode 100755
index 0000000..b779492
--- /dev/null
+++ b/configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py
@@ -0,0 +1,99 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    type='NASFCOS',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False, eps=0),
+        style='caffe',  # paired with the caffe-style checkpoint below
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    neck=dict(
+        type='NASFCOS_FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs=True,
+        num_outs=5,
+        norm_cfg=dict(type='BN'),
+        conv_cfg=dict(type='DCNv2', deform_groups=2)),
+    bbox_head=dict(
+        type='NASFCOSHead',  # NAS-FCOS head variant (see the README table)
+        num_classes=80,
+        in_channels=256,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        norm_cfg=dict(type='GN', num_groups=32),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='IoULoss', loss_weight=1.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+img_norm_cfg = dict(  # mean subtraction only: std is 1.0, to_rgb is off
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+data = dict(
+    samples_per_gpu=4,  # 4 images/GPU; the README notes 4 GPUs are used
+    workers_per_gpu=2,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+
+optimizer = dict(
+    lr=0.01, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
diff --git a/configs/nas_fpn/README.md b/configs/nas_fpn/README.md
new file mode 100755
index 0000000..c5acf40
--- /dev/null
+++ b/configs/nas_fpn/README.md
@@ -0,0 +1,36 @@
+# NAS-FPN
+
+> [NAS-FPN: Learning Scalable Feature Pyramid Architecture for Object Detection](https://arxiv.org/abs/1904.07392)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Current state-of-the-art convolutional architectures for object detection are manually designed. Here we aim to learn a better architecture of feature pyramid network for object detection. We adopt Neural Architecture Search and discover a new feature pyramid architecture in a novel scalable search space covering all cross-scale connections. The discovered architecture, named NAS-FPN, consists of a combination of top-down and bottom-up connections to fuse features across scales. NAS-FPN, combined with various backbone models in the RetinaNet framework, achieves better accuracy and latency tradeoff compared to state-of-the-art object detection models. NAS-FPN improves mobile detection accuracy by 2 AP compared to state-of-the-art SSDLite with MobileNetV2 model in \[32\] and achieves 48.3 AP which surpasses Mask R-CNN \[10\] detection accuracy with less computation time.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143968037-cedd76e9-1ae7-4869-bd34-c9d8611d630c.png"/>
+</div>
+
+## Results and Models
+
+We benchmark the new training schedule (crop training, large batch, unfrozen BN, 50 epochs) introduced in NAS-FPN. RetinaNet is used in the paper.
+
+|  Backbone   | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                          Config                                                          |                                                                                                                                                             Download                                                                                                                                                             |
+| :---------: | :-----: | :------: | :------------: | :----: | :----------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  R-50-FPN   |   50e   |   12.9   |      22.9      |  37.9  |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/nas_fpn/retinanet_r50_fpn_crop640_50e_coco.py)   |       [model](https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_fpn_crop640_50e_coco/retinanet_r50_fpn_crop640_50e_coco-9b953d76.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_fpn_crop640_50e_coco/retinanet_r50_fpn_crop640_50e_coco_20200529_095329.log.json)       |
+| R-50-NASFPN |   50e   |   13.2   |      23.0      |  40.5  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco/retinanet_r50_nasfpn_crop640_50e_coco-0ad1f644.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco/retinanet_r50_nasfpn_crop640_50e_coco_20200528_230008.log.json) |
+
+**Note**: We find that it is unstable to train NAS-FPN and there is a small chance that results can be 3% mAP lower.
+
+## Citation
+
+```latex
+@inproceedings{ghiasi2019fpn,
+  title={Nas-fpn: Learning scalable feature pyramid architecture for object detection},
+  author={Ghiasi, Golnaz and Lin, Tsung-Yi and Le, Quoc V},
+  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+  pages={7036--7045},
+  year={2019}
+}
+```
diff --git a/configs/nas_fpn/metafile.yml b/configs/nas_fpn/metafile.yml
new file mode 100755
index 0000000..ab8d649
--- /dev/null
+++ b/configs/nas_fpn/metafile.yml
@@ -0,0 +1,59 @@
+Collections:
+  - Name: NAS-FPN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - NAS-FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1904.07392
+      Title: 'NAS-FPN: Learning Scalable Feature Pyramid Architecture for Object Detection'
+    README: configs/nas_fpn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/necks/nas_fpn.py#L67
+      Version: v2.0.0
+
+Models:
+  - Name: retinanet_r50_fpn_crop640_50e_coco
+    In Collection: NAS-FPN
+    Config: configs/nas_fpn/retinanet_r50_fpn_crop640_50e_coco.py
+    Metadata:
+      Training Memory (GB): 12.9
+      inference time (ms/im):
+        - value: 43.67
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_fpn_crop640_50e_coco/retinanet_r50_fpn_crop640_50e_coco-9b953d76.pth
+
+  - Name: retinanet_r50_nasfpn_crop640_50e_coco
+    In Collection: NAS-FPN
+    Config: configs/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco.py
+    Metadata:
+      Training Memory (GB): 13.2
+      inference time (ms/im):
+        - value: 43.48
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco/retinanet_r50_nasfpn_crop640_50e_coco-0ad1f644.pth
diff --git a/configs/nas_fpn/retinanet_r50_fpn_crop640_50e_coco.py b/configs/nas_fpn/retinanet_r50_fpn_crop640_50e_coco.py
new file mode 100755
index 0000000..e4408fe
--- /dev/null
+++ b/configs/nas_fpn/retinanet_r50_fpn_crop640_50e_coco.py
@@ -0,0 +1,85 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+cudnn_benchmark = True
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        relu_before_extra_convs=True,
+        no_norm_on_lateral=True,
+        norm_cfg=norm_cfg),
+    bbox_head=dict(type='RetinaSepBNHead', num_ins=5, norm_cfg=norm_cfg),
+    # training settings: only the assigner's neg_iou_thr is set in this file
+    train_cfg=dict(assigner=dict(neg_iou_thr=0.5)))
+# dataset settings: 640x640 random-crop training, 640x640 test scale
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=(640, 640),
+        ratio_range=(0.8, 1.2),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(640, 640)),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=(640, 640)),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(640, 640),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=64),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer: SGD with momentum and weight decay
+optimizer = dict(
+    type='SGD',
+    lr=0.08,
+    momentum=0.9,
+    weight_decay=0.0001,
+    paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True))
+optimizer_config = dict(grad_clip=None)
+# learning policy: step schedule with a 1000-iteration linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.1,
+    step=[30, 40])
+# runtime settings: epoch-based runner, 50 epochs
+runner = dict(type='EpochBasedRunner', max_epochs=50)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco.py b/configs/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco.py
new file mode 100755
index 0000000..1387a10
--- /dev/null
+++ b/configs/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco.py
@@ -0,0 +1,84 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+cudnn_benchmark = True
+# model settings: RetinaNet with a NASFPN neck (stack_times=7)
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    type='RetinaNet',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(type='NASFPN', stack_times=7, norm_cfg=norm_cfg),
+    bbox_head=dict(type='RetinaSepBNHead', num_ins=5, norm_cfg=norm_cfg),
+    # training settings: only the assigner's neg_iou_thr is set in this file
+    train_cfg=dict(assigner=dict(neg_iou_thr=0.5)))
+# dataset settings: 640x640 random-crop training, 640x640 test scale
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=(640, 640),
+        ratio_range=(0.8, 1.2),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(640, 640)),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=(640, 640)),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(640, 640),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=128),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer: SGD with momentum and weight decay
+optimizer = dict(
+    type='SGD',
+    lr=0.08,
+    momentum=0.9,
+    weight_decay=0.0001,
+    paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True))
+optimizer_config = dict(grad_clip=None)
+# learning policy: step schedule with a 1000-iteration linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.1,
+    step=[30, 40])
+# runtime settings: epoch-based runner, 50 epochs
+runner = dict(type='EpochBasedRunner', max_epochs=50)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/objects365/README.md b/configs/objects365/README.md
new file mode 100755
index 0000000..b685f9b
--- /dev/null
+++ b/configs/objects365/README.md
@@ -0,0 +1,102 @@
+# Objects365 Dataset
+
+> [Objects365 Dataset](https://openaccess.thecvf.com/content_ICCV_2019/papers/Shao_Objects365_A_Large-Scale_High-Quality_Dataset_for_Object_Detection_ICCV_2019_paper.pdf)
+
+<!-- [DATASET] -->
+
+## Abstract
+
+<!-- [ABSTRACT] -->
+
+#### Objects365 Dataset V1
+
+[Objects365 Dataset V1](http://www.objects365.org/overview.html) is a brand new dataset,
+designed to spur object detection research with a focus on diverse objects in the Wild.
+It has 365 object categories over 600K training images. More than 10 million, high-quality bounding boxes are manually labeled through a three-step, carefully designed annotation pipeline. It is the largest object detection dataset (with full annotation) so far and establishes a more challenging benchmark for the community. Objects365 can serve as a better feature learning dataset for localization-sensitive tasks like object detection
+and semantic segmentation.
+
+<!-- [IMAGE] -->
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/48282753/208368046-b7573022-06c9-4a99-af17-a6ac7407e3d8.png" height="400"/>
+</div>
+
+#### Objects365 Dataset V2
+
+[Objects365 Dataset V2](http://www.objects365.org/overview.html) is based on the V1 release of the Objects365 dataset.
+Objects 365 annotated 365 object classes on more than 1800k images, with more than 29 million bounding boxes in the training set, surpassing PASCAL VOC, ImageNet, and COCO datasets.
+Objects 365 includes 11 categories of people, clothing, living room, bathroom, kitchen, office/medical, electrical appliances, transportation, food, animals, sports/musical instruments, and each category has dozens of subcategories.
+
+## Citation
+
+```
+@inproceedings{shao2019objects365,
+  title={Objects365: A large-scale, high-quality dataset for object detection},
+  author={Shao, Shuai and Li, Zeming and Zhang, Tianyuan and Peng, Chao and Yu, Gang and Zhang, Xiangyu and Li, Jing and Sun, Jian},
+  booktitle={Proceedings of the IEEE/CVF international conference on computer vision},
+  pages={8430--8439},
+  year={2019}
+}
+```
+
+## Prepare Dataset
+
+1. You need to download and extract the Objects365 dataset. Users can download Objects365 V2 by using `tools/misc/download_dataset.py`.
+
+   **Usage**
+
+   ```shell
+   python tools/misc/download_dataset.py --dataset-name objects365v2 \
+   --save-dir ${SAVING PATH} \
+   --unzip \
+   --delete  # Optional, delete the download zip file
+   ```
+
+   **Note:** There is no download link for Objects365 V1 right now. If you would like to download Objects365-V1, please visit the [official website](http://www.objects365.org/) to contact the authors.
+
+2. The directory should be like this:
+
+   ```none
+   mmdetection
+   ├── mmdet
+   ├── tools
+   ├── configs
+   ├── data
+   │   ├── Objects365
+   │   │   ├── Obj365_v1
+   │   │   │   ├── annotations
+   │   │   │   │   ├── objects365_train.json
+   │   │   │   │   ├── objects365_val.json
+   │   │   │   ├── train        # training images
+   │   │   │   ├── val          # validation images
+   │   │   ├── Obj365_v2
+   │   │   │   ├── annotations
+   │   │   │   │   ├── zhiyuan_objv2_train.json
+   │   │   │   │   ├── zhiyuan_objv2_val.json
+   │   │   │   ├── train        # training images
+   │   │   │   │   ├── patch0
+   │   │   │   │   ├── patch1
+   │   │   │   │   ├── ...
+   │   │   │   ├── val          # validation images
+   │   │   │   │   ├── patch0
+   │   │   │   │   ├── patch1
+   │   │   │   │   ├── ...
+   ```
+
+## Results and Models
+
+### Objects365 V1
+
+| Architecture | Backbone |  Style  | Lr schd | Mem (GB) | box AP |                                                             Config                                                              |                                                                                                                                                                                Download                                                                                                                                                                                |
+| :----------: | :------: | :-----: | :-----: | :------: | :----: | :-----------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| Faster R-CNN |   R-50   | pytorch |   1x    |    -     |  19.6  |   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v1.py)    |           [model](https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v1/faster_rcnn_r50_fpn_16x4_1x_obj365v1_20221219_181226-9ff10f95.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v1/faster_rcnn_r50_fpn_16x4_1x_obj365v1_20221219_181226.log.json)           |
+| Faster R-CNN |   R-50   | pytorch |  1350K  |    -     |  22.3  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/objects365/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1_20220510_142457-337d8965.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1_20220510_142457.log.json) |
+|  Retinanet   |   R-50   | pytorch |   1x    |    -     |  14.8  |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/objects365/retinanet_r50_fpn_1x_obj365v1.py)       |                         [model](https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_1x_obj365v1/retinanet_r50_fpn_1x_obj365v1_20221219_181859-ba3e3dd5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_1x_obj365v1/retinanet_r50_fpn_1x_obj365v1_20221219_181859.log.json)                         |
+|  Retinanet   |   R-50   | pytorch |  1350K  |    -     |  18.0  |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/objects365/retinanet_r50_fpn_syncbn_1350k_obj365v1.py)  |     [model](https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_syncbn_1350k_obj365v1/retinanet_r50_fpn_syncbn_1350k_obj365v1_20220513_111237-7517c576.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_syncbn_1350k_obj365v1/retinanet_r50_fpn_syncbn_1350k_obj365v1_20220513_111237.log.json)     |
+
+### Objects365 V2
+
+| Architecture | Backbone |  Style  | Lr schd | Mem (GB) | box AP |                                                           Config                                                           |                                                                                                                                                                      Download                                                                                                                                                                      |
+| :----------: | :------: | :-----: | :-----: | :------: | :----: | :------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| Faster R-CNN |   R-50   | pytorch |   1x    |    -     |  19.8  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v2.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v2/faster_rcnn_r50_fpn_16x4_1x_obj365v2_20221220_175040-5910b015.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v2/faster_rcnn_r50_fpn_16x4_1x_obj365v2_20221220_175040.log.json) |
+|  Retinanet   |   R-50   | pytorch |   1x    |    -     |  16.7  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/objects365/retinanet_r50_fpn_1x_obj365v2.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_1x_obj365v2/retinanet_r50_fpn_1x_obj365v2_20221223_122105-d9b191f1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_1x_obj365v2/retinanet_r50_fpn_1x_obj365v2_20221223_122105.log.json)               |
diff --git a/configs/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v1.py b/configs/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v1.py
new file mode 100755
index 0000000..36bfa27
--- /dev/null
+++ b/configs/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v1.py
@@ -0,0 +1,25 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/objects365v1_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(roi_head=dict(bbox_head=dict(num_classes=365)))
+
+data = dict(samples_per_gpu=4)
+
+# Using 16 GPUs while training (16 GPUs x 4 samples per GPU = batch size 64)
+optimizer = dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=1.0 / 1000,
+    step=[8, 11])
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (16 GPUs) x (4 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v2.py b/configs/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v2.py
new file mode 100755
index 0000000..13bbeb8
--- /dev/null
+++ b/configs/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v2.py
@@ -0,0 +1,25 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/objects365v2_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(roi_head=dict(bbox_head=dict(num_classes=365)))
+
+data = dict(samples_per_gpu=4)
+
+# Using 16 GPUs while training (16 GPUs x 4 samples per GPU = batch size 64)
+optimizer = dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=1.0 / 1000,
+    step=[8, 11])
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (16 GPUs) x (4 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/objects365/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1.py b/configs/objects365/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1.py
new file mode 100755
index 0000000..4e6f341
--- /dev/null
+++ b/configs/objects365/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1.py
@@ -0,0 +1,31 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/objects365v1_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+    backbone=dict(norm_cfg=norm_cfg),
+    roi_head=dict(bbox_head=dict(num_classes=365)))
+
+# Using 8 GPUs while training (8 GPUs x 2 samples per GPU = batch size 16)
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
+
+runner = dict(
+    _delete_=True, type='IterBasedRunner', max_iters=1350000)  # 36 epochs
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=1.0 / 1000,
+    step=[900000, 1200000])
+
+checkpoint_config = dict(interval=150000)
+evaluation = dict(interval=150000, metric='bbox')
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/configs/objects365/metafile.yml b/configs/objects365/metafile.yml
new file mode 100755
index 0000000..5e71ad7
--- /dev/null
+++ b/configs/objects365/metafile.yml
@@ -0,0 +1,101 @@
+- Name: retinanet_r50_fpn_1x_obj365v1
+  In Collection: RetinaNet
+  Config: configs/objects365/retinanet_r50_fpn_1x_obj365v1.py
+  Metadata:
+    Training Memory (GB): 7.4
+    Epochs: 12
+    Training Data: Objects365 v1
+    Training Techniques:
+      - SGD with Momentum
+      - Weight Decay
+  Results:
+  - Task: Object Detection
+    Dataset: Objects365 v1
+    Metrics:
+      box AP: 14.8
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_1x_obj365v1/retinanet_r50_fpn_1x_obj365v1_20221219_181859-ba3e3dd5.pth
+
+- Name: retinanet_r50_fpn_syncbn_1350k_obj365v1
+  In Collection: RetinaNet
+  Config: configs/objects365/retinanet_r50_fpn_syncbn_1350k_obj365v1.py
+  Metadata:
+    Training Memory (GB): 7.6
+    Iterations: 1350000
+    Training Data: Objects365 v1
+    Training Techniques:
+      - SGD with Momentum
+      - Weight Decay
+  Results:
+  - Task: Object Detection
+    Dataset: Objects365 v1
+    Metrics:
+      box AP: 18.0
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_syncbn_1350k_obj365v1/retinanet_r50_fpn_syncbn_1350k_obj365v1_20220513_111237-7517c576.pth
+
+- Name: retinanet_r50_fpn_1x_obj365v2
+  In Collection: RetinaNet
+  Config: configs/objects365/retinanet_r50_fpn_1x_obj365v2.py
+  Metadata:
+    Training Memory (GB): 7.2
+    Epochs: 12
+    Training Data: Objects365 v2
+    Training Techniques:
+      - SGD with Momentum
+      - Weight Decay
+  Results:
+  - Task: Object Detection
+    Dataset: Objects365 v2
+    Metrics:
+      box AP: 16.7
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_1x_obj365v2/retinanet_r50_fpn_1x_obj365v2_20221223_122105-d9b191f1.pth
+
+- Name: faster_rcnn_r50_fpn_16x4_1x_obj365v1
+  In Collection: Faster R-CNN
+  Config: configs/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v1.py
+  Metadata:
+    Training Memory (GB): 11.4
+    Epochs: 12
+    Training Data: Objects365 v1
+    Training Techniques:
+      - SGD with Momentum
+      - Weight Decay
+  Results:
+  - Task: Object Detection
+    Dataset: Objects365 v1
+    Metrics:
+      box AP: 19.6
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v1/faster_rcnn_r50_fpn_16x4_1x_obj365v1_20221219_181226-9ff10f95.pth
+
+- Name: faster_rcnn_r50_fpn_syncbn_1350k_obj365v1
+  In Collection: Faster R-CNN
+  Config: configs/objects365/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1.py
+  Metadata:
+    Training Memory (GB): 8.6
+    Iterations: 1350000
+    Training Data: Objects365 v1
+    Training Techniques:
+      - SGD with Momentum
+      - Weight Decay
+  Results:
+  - Task: Object Detection
+    Dataset: Objects365 v1
+    Metrics:
+      box AP: 22.3
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1_20220510_142457-337d8965.pth
+
+- Name: faster_rcnn_r50_fpn_16x4_1x_obj365v2
+  In Collection: Faster R-CNN
+  Config: configs/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v2.py
+  Metadata:
+    Training Memory (GB): 10.8
+    Epochs: 12
+    Training Data: Objects365 v2
+    Training Techniques:
+      - SGD with Momentum
+      - Weight Decay
+  Results:
+  - Task: Object Detection
+    Dataset: Objects365 v2
+    Metrics:
+      box AP: 19.8
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v2/faster_rcnn_r50_fpn_16x4_1x_obj365v2_20221220_175040-5910b015.pth
diff --git a/configs/objects365/retinanet_r50_fpn_1x_obj365v1.py b/configs/objects365/retinanet_r50_fpn_1x_obj365v1.py
new file mode 100755
index 0000000..080c02b
--- /dev/null
+++ b/configs/objects365/retinanet_r50_fpn_1x_obj365v1.py
@@ -0,0 +1,23 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/objects365v1_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(bbox_head=dict(num_classes=365))
+
+# Using 8 GPUs while training (8 GPUs x 2 samples per GPU = batch size 16)
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=10000,
+    warmup_ratio=1.0 / 1000,
+    step=[8, 11])
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/configs/objects365/retinanet_r50_fpn_1x_obj365v2.py b/configs/objects365/retinanet_r50_fpn_1x_obj365v2.py
new file mode 100755
index 0000000..9f0db00
--- /dev/null
+++ b/configs/objects365/retinanet_r50_fpn_1x_obj365v2.py
@@ -0,0 +1,23 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/objects365v2_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(bbox_head=dict(num_classes=365))
+
+# Using 8 GPUs while training (8 GPUs x 2 samples per GPU = batch size 16)
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=10000,
+    warmup_ratio=1.0 / 1000,
+    step=[8, 11])
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/configs/objects365/retinanet_r50_fpn_syncbn_1350k_obj365v1.py b/configs/objects365/retinanet_r50_fpn_syncbn_1350k_obj365v1.py
new file mode 100755
index 0000000..6dd9277
--- /dev/null
+++ b/configs/objects365/retinanet_r50_fpn_syncbn_1350k_obj365v1.py
@@ -0,0 +1,29 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/objects365v1_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(backbone=dict(norm_cfg=norm_cfg), bbox_head=dict(num_classes=365))
+
+# Using 8 GPUS while training
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
+
+runner = dict(
+    _delete_=True, type='IterBasedRunner', max_iters=1350000)  # 36 epochs
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=10000,
+    warmup_ratio=1.0 / 1000,
+    step=[900000, 1200000])
+
+checkpoint_config = dict(interval=150000)
+evaluation = dict(interval=150000, metric='bbox')
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/configs/openimages/README.md b/configs/openimages/README.md
new file mode 100755
index 0000000..e5c1c27
--- /dev/null
+++ b/configs/openimages/README.md
@@ -0,0 +1,148 @@
+# Open Images Dataset
+
+> [Open Images Dataset](https://arxiv.org/abs/1811.00982)
+
+<!-- [DATASET] -->
+
+## Abstract
+
+<!-- [ABSTRACT] -->
+
+#### Open Images v6
+
+[Open Images](https://storage.googleapis.com/openimages/web/index.html) is a dataset of ~9M images annotated with image-level labels,
+object bounding boxes, object segmentation masks, visual relationships,
+and localized narratives:
+
+- It contains a total of 16M bounding boxes for 600 object classes on
+  1.9M images, making it the largest existing dataset with object location
+  annotations. The boxes have been largely manually drawn by professional
+  annotators to ensure accuracy and consistency. The images are very diverse
+  and often contain complex scenes with several objects (8.3 per image on
+  average).
+
+- Open Images also offers visual relationship annotations, indicating pairs
+  of objects in particular relations (e.g. "woman playing guitar", "beer on
+  table"), object properties (e.g. "table is wooden"), and human actions (e.g.
+  "woman is jumping"). In total it has 3.3M annotations from 1,466 distinct
+  relationship triplets.
+
+- In V5 we added segmentation masks for 2.8M object instances in 350 classes.
+  Segmentation masks mark the outline of objects, which characterizes their
+  spatial extent to a much higher level of detail.
+
+- In V6 we added 675k localized narratives: multimodal descriptions of images
+  consisting of synchronized voice, text, and mouse traces over the objects being
+  described. (Note we originally launched localized narratives only on train in V6,
+  but since July 2020 we also have validation and test covered.)
+
+- Finally, the dataset is annotated with 59.9M image-level labels spanning 19,957
+  classes.
+
+We believe that having a single dataset with unified annotations for image
+classification, object detection, visual relationship detection, instance
+segmentation, and multimodal image descriptions will enable to study these
+tasks jointly and stimulate progress towards genuine scene understanding.
+
+<!-- [IMAGE] -->
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/48282753/147199750-23e17230-c0cf-49a0-a13c-0d014d49107e.png" height="400"/>
+</div>
+
+#### Open Images Challenge 2019
+
+[Open Images Challenges 2019](https://storage.googleapis.com/openimages/web/challenge2019.html) is based on the V5 release of the Open
+Images dataset. The images of the dataset are very varied and
+often contain complex scenes with several objects (explore the dataset).
+
+## Citation
+
+```
+@article{OpenImages,
+  author = {Alina Kuznetsova and Hassan Rom and Neil Alldrin and Jasper Uijlings and Ivan Krasin and Jordi Pont-Tuset and Shahab Kamali and Stefan Popov and Matteo Malloci and Alexander Kolesnikov and Tom Duerig and Vittorio Ferrari},
+  title = {The Open Images Dataset V4: Unified image classification, object detection, and visual relationship detection at scale},
+  year = {2020},
+  journal = {IJCV}
+}
+```
+
+## Prepare Dataset
+
+1. You need to download and extract Open Images dataset.
+
+2. The Open Images dataset does not have image metas (width and height of the image),
+   which will be used during evaluation. We suggest to get test image metas before
+   training/testing by using `tools/misc/get_image_metas.py`.
+
+   **Usage**
+
+   ```shell
+   python tools/misc/get_image_metas.py ${CONFIG} \
+   --out ${OUTPUT FILE NAME}
+   ```
+
+3. The directory should be like this:
+
+   ```none
+   mmdetection
+   ├── mmdet
+   ├── tools
+   ├── configs
+   ├── data
+   │   ├── OpenImages
+   │   │   ├── annotations
+   │   │   │   ├── bbox_labels_600_hierarchy.json
+   │   │   │   ├── class-descriptions-boxable.csv
+   │   │   │   ├── oidv6-train-annotations-bbox.csv
+   │   │   │   ├── validation-annotations-bbox.csv
+   │   │   │   ├── validation-annotations-human-imagelabels-boxable.csv
+   │   │   │   ├── validation-image-metas.pkl      # get from script
+   │   │   ├── challenge2019
+   │   │   │   ├── challenge-2019-train-detection-bbox.txt
+   │   │   │   ├── challenge-2019-validation-detection-bbox.txt
+   │   │   │   ├── class_label_tree.np
+   │   │   │   ├── class_sample_train.pkl
+   │   │   │   ├── challenge-2019-validation-detection-human-imagelabels.csv       # download from official website
+   │   │   │   ├── challenge-2019-validation-metas.pkl     # get from script
+   │   │   ├── OpenImages
+   │   │   │   ├── train           # training images
+   │   │   │   ├── test            # testing images
+   │   │   │   ├── validation      # validation images
+   ```
+
+**Note**:
+
+1. The training and validation images of Open Images Challenge dataset are based on
+   Open Images v6, but the test images are different.
+2. The Open Images Challenges annotations are obtained from [TSD](https://github.com/Sense-X/TSD).
+   You can also download the annotations from [official website](https://storage.googleapis.com/openimages/web/challenge2019_downloads.html),
+   and set `data.train.type=OpenImagesDataset`, `data.val.type=OpenImagesDataset`, and `data.test.type=OpenImagesDataset` in the config.
+3. If users do not want to use `validation-annotations-human-imagelabels-boxable.csv` and `challenge-2019-validation-detection-human-imagelabels.csv`
+   users can set `data.val.load_image_level_labels=False` and `data.test.load_image_level_labels=False` in the config.
+   Please note that loading image-level labels is the default for the Open Images evaluation metric.
+   For more details, please refer to the [official website](https://storage.googleapis.com/openimages/web/evaluation.html).
+
+## Results and Models
+
+|         Architecture          | Backbone |  Style  | Lr schd |       Sampler       | Mem (GB) | Inf time (fps) | box AP |                                                                   Config                                                                   |                                                                                                                                                                                                      Download                                                                                                                                                                                                      |
+| :---------------------------: | :------: | :-----: | :-----: | :-----------------: | :------: | :------------: | :----: | :----------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|         Faster R-CNN          |   R-50   | pytorch |   1x    |    Group Sampler    |   7.7    |       -        |  51.6  |        [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py)        |                             [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_20211130_231159-e87ab7ce.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_20211130_231159.log.json)                             |
+|         Faster R-CNN          |   R-50   | pytorch |   1x    | Class Aware Sampler |   7.7    |       -        |  60.0  |      [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages.py)      |                     [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_20220306_202424-98c630e5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_20220306_202424.log.json)                     |
+| Faster R-CNN (Challenge 2019) |   R-50   | pytorch |   1x    |    Group Sampler    |   7.7    |       -        |  54.9  |   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py)   |         [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge_20220114_045100-0e79e5df.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge_20220114_045100.log.json)         |
+| Faster R-CNN (Challenge 2019) |   R-50   | pytorch |   1x    | Class Aware Sampler |   7.1    |       -        |  65.0  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge_20220221_192021-34c402d9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge_20220221_192021.log.json) |
+|           Retinanet           |   R-50   | pytorch |   1x    |    Group Sampler    |   6.6    |       -        |  61.5  |         [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/retinanet_r50_fpn_32x2_1x_openimages.py)         |                                 [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/retinanet_r50_fpn_32x2_1x_openimages/retinanet_r50_fpn_32x2_1x_openimages_20211223_071954-d2ae5462.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/retinanet_r50_fpn_32x2_1x_openimages/retinanet_r50_fpn_32x2_1x_openimages_20211223_071954.log.json)                                 |
+|              SSD              |  VGG16   | pytorch |   36e   |    Group Sampler    |   10.8   |       -        |  35.4  |              [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/ssd300_32x8_36e_openimages.py)              |                                                     [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/ssd300_32x8_36e_openimages/ssd300_32x8_36e_openimages_20211224_000232-dce93846.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/ssd300_32x8_36e_openimages/ssd300_32x8_36e_openimages_20211224_000232.log.json)                                                     |
+
+**Notes:**
+
+- 'cas' is short for 'Class Aware Sampler'
+
+### Results of considering image level labels
+
+|           Architecture            |       Sampler       | Consider Image Level Labels | box AP |
+| :-------------------------------: | :-----------------: | :-------------------------: | :----: |
+| Faster R-CNN r50 (Challenge 2019) |    Group Sampler    |             w/o             | 62.19  |
+| Faster R-CNN r50 (Challenge 2019) |    Group Sampler    |             w/              | 54.87  |
+| Faster R-CNN r50 (Challenge 2019) | Class Aware Sampler |             w/o             | 71.77  |
+| Faster R-CNN r50 (Challenge 2019) | Class Aware Sampler |             w/              | 64.98  |
diff --git a/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py b/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py
new file mode 100755
index 0000000..3dfc341
--- /dev/null
+++ b/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py
@@ -0,0 +1,23 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/openimages_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(roi_head=dict(bbox_head=dict(num_classes=601)))
+
+# Using 32 GPUs while training
+optimizer = dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=26000,
+    warmup_ratio=1.0 / 64,
+    step=[8, 11])
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py b/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py
new file mode 100755
index 0000000..c8900ad
--- /dev/null
+++ b/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py
@@ -0,0 +1,47 @@
+_base_ = ['faster_rcnn_r50_fpn_32x2_1x_openimages.py']
+
+model = dict(
+    roi_head=dict(bbox_head=dict(num_classes=500)),
+    test_cfg=dict(rcnn=dict(score_thr=0.01)))
+
+# dataset settings
+dataset_type = 'OpenImagesChallengeDataset'
+data_root = 'data/OpenImages/'
+data = dict(
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root +
+        'challenge2019/challenge-2019-train-detection-bbox.txt',
+        img_prefix=data_root + 'OpenImages/',
+        label_file=data_root + 'challenge2019/cls-label-description.csv',
+        hierarchy_file=data_root + 'challenge2019/class_label_tree.np'),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root +
+        'challenge2019/challenge-2019-validation-detection-bbox.txt',
+        img_prefix=data_root + 'OpenImages/',
+        label_file=data_root + 'challenge2019/cls-label-description.csv',
+        hierarchy_file=data_root + 'challenge2019/class_label_tree.np',
+        meta_file=data_root +
+        'challenge2019/challenge-2019-validation-metas.pkl',
+        image_level_ann_file=data_root +
+        'challenge2019/challenge-2019-validation-detection-'
+        'human-imagelabels.csv'),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root +
+        'challenge2019/challenge-2019-validation-detection-bbox.txt',
+        img_prefix=data_root + 'OpenImages/',
+        label_file=data_root + 'challenge2019/cls-label-description.csv',
+        hierarchy_file=data_root + 'challenge2019/class_label_tree.np',
+        meta_file=data_root +
+        'challenge2019/challenge-2019-validation-metas.pkl',
+        image_level_ann_file=data_root +
+        'challenge2019/challenge-2019-validation-detection-'
+        'human-imagelabels.csv'))
+evaluation = dict(interval=1, metric='mAP')
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages.py b/configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages.py
new file mode 100755
index 0000000..88d029d
--- /dev/null
+++ b/configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages.py
@@ -0,0 +1,5 @@
+_base_ = ['faster_rcnn_r50_fpn_32x2_1x_openimages.py']
+
+# Use ClassAwareSampler
+data = dict(
+    train_dataloader=dict(class_aware_sampler=dict(num_sample_class=1)))
diff --git a/configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge.py b/configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge.py
new file mode 100755
index 0000000..26bd64e
--- /dev/null
+++ b/configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge.py
@@ -0,0 +1,5 @@
+_base_ = ['faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py']
+
+# Use ClassAwareSampler
+data = dict(
+    train_dataloader=dict(class_aware_sampler=dict(num_sample_class=1)))
diff --git a/configs/openimages/metafile.yml b/configs/openimages/metafile.yml
new file mode 100755
index 0000000..d9f924e
--- /dev/null
+++ b/configs/openimages/metafile.yml
@@ -0,0 +1,102 @@
+Models:
+  - Name: faster_rcnn_r50_fpn_32x2_1x_openimages
+    In Collection: Faster R-CNN
+    Config: configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py
+    Metadata:
+      Training Memory (GB): 7.7
+      Epochs: 12
+      Training Data: Open Images v6
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+    Results:
+      - Task: Object Detection
+        Dataset: Open Images v6
+        Metrics:
+          box AP: 51.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_20211130_231159-e87ab7ce.pth
+
+  - Name: retinanet_r50_fpn_32x2_1x_openimages
+    In Collection: RetinaNet
+    Config: configs/openimages/retinanet_r50_fpn_32x2_1x_openimages.py
+    Metadata:
+      Training Memory (GB): 6.6
+      Epochs: 12
+      Training Data: Open Images v6
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+    Results:
+      - Task: Object Detection
+        Dataset: Open Images v6
+        Metrics:
+          box AP: 61.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/retinanet_r50_fpn_32x2_1x_openimages/retinanet_r50_fpn_32x2_1x_openimages_20211223_071954-d2ae5462.pth
+
+  - Name: ssd300_32x8_36e_openimages
+    In Collection: SSD
+    Config: configs/openimages/ssd300_32x8_36e_openimages.py
+    Metadata:
+      Training Memory (GB): 10.8
+      Epochs: 36
+      Training Data: Open Images v6
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+    Results:
+      - Task: Object Detection
+        Dataset: Open Images v6
+        Metrics:
+          box AP: 35.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/ssd300_32x8_36e_openimages/ssd300_32x8_36e_openimages_20211224_000232-dce93846.pth
+
+  - Name: faster_rcnn_r50_fpn_32x2_1x_openimages_challenge
+    In Collection: Faster R-CNN
+    Config: configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py
+    Metadata:
+      Training Memory (GB): 7.7
+      Epochs: 12
+      Training Data: Open Images Challenge 2019
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+    Results:
+      - Task: Object Detection
+        Dataset: Open Images Challenge 2019
+        Metrics:
+          box AP: 54.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge_20220114_045100-0e79e5df.pth
+
+  - Name: faster_rcnn_r50_fpn_32x2_cas_1x_openimages
+    In Collection: Faster R-CNN
+    Config: configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages.py
+    Metadata:
+      Training Memory (GB): 7.7
+      Epochs: 12
+      Training Data: Open Images v6
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+    Results:
+      - Task: Object Detection
+        Dataset: Open Images v6
+        Metrics:
+          box AP: 60.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_20220306_202424-98c630e5.pth
+
+  - Name: faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge
+    In Collection: Faster R-CNN
+    Config: configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge.py
+    Metadata:
+      Training Memory (GB): 7.1
+      Epochs: 12
+      Training Data: Open Images Challenge 2019
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+    Results:
+      - Task: Object Detection
+        Dataset: Open Images Challenge 2019
+        Metrics:
+          box AP: 65.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge_20220221_192021-34c402d9.pth
diff --git a/configs/openimages/retinanet_r50_fpn_32x2_1x_openimages.py b/configs/openimages/retinanet_r50_fpn_32x2_1x_openimages.py
new file mode 100755
index 0000000..0191aa1
--- /dev/null
+++ b/configs/openimages/retinanet_r50_fpn_32x2_1x_openimages.py
@@ -0,0 +1,22 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/openimages_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(bbox_head=dict(num_classes=601))
+
+optimizer = dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=26000,
+    warmup_ratio=1.0 / 64,
+    step=[8, 11])
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/openimages/ssd300_32x8_36e_openimages.py b/configs/openimages/ssd300_32x8_36e_openimages.py
new file mode 100755
index 0000000..e2565b9
--- /dev/null
+++ b/configs/openimages/ssd300_32x8_36e_openimages.py
@@ -0,0 +1,83 @@
+_base_ = [
+    '../_base_/models/ssd300.py', '../_base_/datasets/openimages_detection.py',
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_1x.py'
+]
+model = dict(
+    bbox_head=dict(
+        num_classes=601,
+        anchor_generator=dict(basesize_ratio_range=(0.2, 0.9))))
+# dataset settings
+dataset_type = 'OpenImagesDataset'
+data_root = 'data/OpenImages/'
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(type='LoadAnnotations', with_bbox=True, normed_bbox=True),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', img_scale=(300, 300), keep_ratio=False),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(300, 300),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=False),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=8,  # using 32 GPUs while training.
+    workers_per_gpu=0,  # workers_per_gpu > 0 may cause out-of-memory errors
+    train=dict(
+        _delete_=True,
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root +
+            'annotations/oidv6-train-annotations-bbox.csv',
+            img_prefix=data_root + 'OpenImages/train/',
+            label_file=data_root +
+            'annotations/class-descriptions-boxable.csv',
+            hierarchy_file=data_root +
+            'annotations/bbox_labels_600_hierarchy.json',
+            pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(type='SGD', lr=0.04, momentum=0.9, weight_decay=5e-4)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=20000,
+    warmup_ratio=0.001,
+    step=[8, 11])
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=256)
diff --git a/configs/paa/README.md b/configs/paa/README.md
new file mode 100755
index 0000000..c8861ec
--- /dev/null
+++ b/configs/paa/README.md
@@ -0,0 +1,47 @@
+# PAA
+
+> [Probabilistic Anchor Assignment with IoU Prediction for Object Detection](https://arxiv.org/abs/2007.08103)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+In object detection, determining which anchors to assign as positive or negative samples, known as anchor assignment, has been revealed as a core procedure that can significantly affect a model's performance. In this paper we propose a novel anchor assignment strategy that adaptively separates anchors into positive and negative samples for a ground truth bounding box according to the model's learning status such that it is able to reason about the separation in a probabilistic manner. To do so we first calculate the scores of anchors conditioned on the model and fit a probability distribution to these scores. The model is then trained with anchors separated into positive and negative samples according to their probabilities. Moreover, we investigate the gap between the training and testing objectives and propose to predict the Intersection-over-Unions of detected boxes as a measure of localization quality to reduce the discrepancy. The combined score of classification and localization qualities serving as a box selection metric in non-maximum suppression well aligns with the proposed anchor assignment strategy and leads significant performance improvements. The proposed methods only add a single convolutional layer to RetinaNet baseline and does not require multiple anchors per location, so are efficient. Experimental results verify the effectiveness of the proposed methods. Especially, our models set new records for single-stage detectors on MS COCO test-dev dataset with various backbones.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143968195-519a116a-de29-437e-b4c8-30aef43dcb15.png"/>
+</div>
+
+## Results and Models
+
+We provide config files to reproduce the object detection results in the
+ECCV 2020 paper for Probabilistic Anchor Assignment with IoU
+Prediction for Object Detection.
+
+| Backbone  | Lr schd | Mem (GB) | Score voting | box AP |                                                   Config                                                    |                                                                                                                                               Download                                                                                                                                               |
+| :-------: | :-----: | :------: | :----------: | :----: | :---------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R-50-FPN  |   12e   |   3.7    |     True     |  40.4  |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r50_fpn_1x_coco.py)      |                     [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.log.json)                      |
+| R-50-FPN  |   12e   |   3.7    |    False     |  40.2  |                                                      -                                                      |                                                                                                                                                                                                                                                                                                      |
+| R-50-FPN  |   18e   |   3.7    |     True     |  41.4  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r50_fpn_1.5x_coco.py)     |                 [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1.5x_coco/paa_r50_fpn_1.5x_coco_20200823-805d6078.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1.5x_coco/paa_r50_fpn_1.5x_coco_20200823-805d6078.log.json)                  |
+| R-50-FPN  |   18e   |   3.7    |    False     |  41.2  |                                                      -                                                      |                                                                                                                                                                                                                                                                                                      |
+| R-50-FPN  |   24e   |   3.7    |     True     |  41.6  |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r50_fpn_2x_coco.py)      |                     [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_2x_coco/paa_r50_fpn_2x_coco_20200821-c98bfc4e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_2x_coco/paa_r50_fpn_2x_coco_20200821-c98bfc4e.log.json)                      |
+| R-50-FPN  |   36e   |   3.7    |     True     |  43.3  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r50_fpn_mstrain_3x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_mstrain_3x_coco/paa_r50_fpn_mstrain_3x_coco_20210121_145722-06a6880b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_mstrain_3x_coco/paa_r50_fpn_mstrain_3x_coco_20210121_145722.log.json)   |
+| R-101-FPN |   12e   |   6.2    |     True     |  42.6  |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r101_fpn_1x_coco.py)     |                   [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.log.json)                    |
+| R-101-FPN |   12e   |   6.2    |    False     |  42.4  |                                                      -                                                      |                                                                                                                                                                                                                                                                                                      |
+| R-101-FPN |   24e   |   6.2    |     True     |  43.5  |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r101_fpn_2x_coco.py)     |                   [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_2x_coco/paa_r101_fpn_2x_coco_20200821-6829f96b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_2x_coco/paa_r101_fpn_2x_coco_20200821-6829f96b.log.json)                    |
+| R-101-FPN |   36e   |   6.2    |     True     |  45.1  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/paa/paa_r101_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_mstrain_3x_coco/paa_r101_fpn_mstrain_3x_coco_20210122_084202-83250d22.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_mstrain_3x_coco/paa_r101_fpn_mstrain_3x_coco_20210122_084202.log.json) |
+
+**Note**:
+
+1. We find that the performance is unstable with the 1x setting and may fluctuate by about 0.2 mAP. We report the best results.
+
+## Citation
+
+```latex
+@inproceedings{paa-eccv2020,
+  title={Probabilistic Anchor Assignment with IoU Prediction for Object Detection},
+  author={Kim, Kang and Lee, Hee Seok},
+  booktitle = {ECCV},
+  year={2020}
+}
+```
diff --git a/configs/paa/metafile.yml b/configs/paa/metafile.yml
new file mode 100755
index 0000000..e08b663
--- /dev/null
+++ b/configs/paa/metafile.yml
@@ -0,0 +1,104 @@
+Collections:
+  - Name: PAA
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - Probabilistic Anchor Assignment
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/2007.08103
+      Title: 'Probabilistic Anchor Assignment with IoU Prediction for Object Detection'
+    README: configs/paa/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.4.0/mmdet/models/detectors/paa.py#L6
+      Version: v2.4.0
+
+Models:
+  - Name: paa_r50_fpn_1x_coco
+    In Collection: PAA
+    Config: configs/paa/paa_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.7
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.pth
+
+  - Name: paa_r50_fpn_1.5x_coco
+    In Collection: PAA
+    Config: configs/paa/paa_r50_fpn_1.5x_coco.py
+    Metadata:
+      Training Memory (GB): 3.7
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1.5x_coco/paa_r50_fpn_1.5x_coco_20200823-805d6078.pth
+
+  - Name: paa_r50_fpn_2x_coco
+    In Collection: PAA
+    Config: configs/paa/paa_r50_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 3.7
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_2x_coco/paa_r50_fpn_2x_coco_20200821-c98bfc4e.pth
+
+  - Name: paa_r50_fpn_mstrain_3x_coco
+    In Collection: PAA
+    Config: configs/paa/paa_r50_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 3.7
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_mstrain_3x_coco/paa_r50_fpn_mstrain_3x_coco_20210121_145722-06a6880b.pth
+
+  - Name: paa_r101_fpn_1x_coco
+    In Collection: PAA
+    Config: configs/paa/paa_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.2
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.pth
+
+  - Name: paa_r101_fpn_2x_coco
+    In Collection: PAA
+    Config: configs/paa/paa_r101_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 6.2
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_2x_coco/paa_r101_fpn_2x_coco_20200821-6829f96b.pth
+
+  - Name: paa_r101_fpn_mstrain_3x_coco
+    In Collection: PAA
+    Config: configs/paa/paa_r101_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 6.2
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_mstrain_3x_coco/paa_r101_fpn_mstrain_3x_coco_20210122_084202-83250d22.pth
diff --git a/configs/paa/paa_r101_fpn_1x_coco.py b/configs/paa/paa_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..94f1c27
--- /dev/null
+++ b/configs/paa/paa_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './paa_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/paa/paa_r101_fpn_2x_coco.py b/configs/paa/paa_r101_fpn_2x_coco.py
new file mode 100755
index 0000000..641ef76
--- /dev/null
+++ b/configs/paa/paa_r101_fpn_2x_coco.py
@@ -0,0 +1,3 @@
+_base_ = './paa_r101_fpn_1x_coco.py'
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/paa/paa_r101_fpn_mstrain_3x_coco.py b/configs/paa/paa_r101_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..71858ed
--- /dev/null
+++ b/configs/paa/paa_r101_fpn_mstrain_3x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './paa_r50_fpn_mstrain_3x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/paa/paa_r50_fpn_1.5x_coco.py b/configs/paa/paa_r50_fpn_1.5x_coco.py
new file mode 100755
index 0000000..aabce4a
--- /dev/null
+++ b/configs/paa/paa_r50_fpn_1.5x_coco.py
@@ -0,0 +1,3 @@
+_base_ = './paa_r50_fpn_1x_coco.py'
+lr_config = dict(step=[12, 16])
+runner = dict(type='EpochBasedRunner', max_epochs=18)
diff --git a/configs/paa/paa_r50_fpn_1x_coco.py b/configs/paa/paa_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..4c9c4aa
--- /dev/null
+++ b/configs/paa/paa_r50_fpn_1x_coco.py
@@ -0,0 +1,70 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='PAA',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='PAAHead',
+        reg_decoded_bbox=True,
+        score_voting=True,
+        topk=9,
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.1,
+            neg_iou_thr=0.1,
+            min_pos_iou=0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/paa/paa_r50_fpn_2x_coco.py b/configs/paa/paa_r50_fpn_2x_coco.py
new file mode 100755
index 0000000..663d2c0
--- /dev/null
+++ b/configs/paa/paa_r50_fpn_2x_coco.py
@@ -0,0 +1,3 @@
+_base_ = './paa_r50_fpn_1x_coco.py'
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/paa/paa_r50_fpn_mstrain_3x_coco.py b/configs/paa/paa_r50_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..91fa28c
--- /dev/null
+++ b/configs/paa/paa_r50_fpn_mstrain_3x_coco.py
@@ -0,0 +1,20 @@
+_base_ = './paa_r50_fpn_1x_coco.py'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+data = dict(train=dict(pipeline=train_pipeline))
+lr_config = dict(step=[28, 34])
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/configs/pafpn/README.md b/configs/pafpn/README.md
new file mode 100755
index 0000000..ae1e3a3
--- /dev/null
+++ b/configs/pafpn/README.md
@@ -0,0 +1,34 @@
+# PAFPN
+
+> [Path Aggregation Network for Instance Segmentation](https://arxiv.org/abs/1803.01534)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+The way that information propagates in neural networks is of great importance. In this paper, we propose Path Aggregation Network (PANet) aiming at boosting information flow in proposal-based instance segmentation framework. Specifically, we enhance the entire feature hierarchy with accurate localization signals in lower layers by bottom-up path augmentation, which shortens the information path between lower layers and topmost feature. We present adaptive feature pooling, which links feature grid and all feature levels to make useful information in each feature level propagate directly to following proposal subnetworks. A complementary branch capturing different views for each proposal is created to further improve mask prediction. These improvements are simple to implement, with subtle extra computational overhead. Our PANet reaches the 1st place in the COCO 2017 Challenge Instance Segmentation task and the 2nd place in Object Detection task without large-batch training. It is also state-of-the-art on MVD and Cityscapes.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143968947-5f2d7e8a-a236-4d59-8f2d-7fbb12764845.png"/>
+</div>
+
+## Results and Models
+
+| Backbone |  style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                     Config                                                     |                                                                                                                                                          Download                                                                                                                                                           |
+| :------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R-50-FPN | pytorch |   1x    |   4.0    |      17.2      |  37.5  |         | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pafpn/faster_rcnn_r50_pafpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pafpn/faster_rcnn_r50_pafpn_1x_coco/faster_rcnn_r50_pafpn_1x_coco_bbox_mAP-0.375_20200503_105836-b7b4b9bd.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pafpn/faster_rcnn_r50_pafpn_1x_coco/faster_rcnn_r50_pafpn_1x_coco_20200503_105836.log.json) |
+
+## Citation
+
+```latex
+@inproceedings{liu2018path,
+  author = {Shu Liu and
+            Lu Qi and
+            Haifang Qin and
+            Jianping Shi and
+            Jiaya Jia},
+  title = {Path Aggregation Network for Instance Segmentation},
+  booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year = {2018}
+}
+```
diff --git a/configs/pafpn/faster_rcnn_r50_pafpn_1x_coco.py b/configs/pafpn/faster_rcnn_r50_pafpn_1x_coco.py
new file mode 100755
index 0000000..b2fdef9
--- /dev/null
+++ b/configs/pafpn/faster_rcnn_r50_pafpn_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+
+model = dict(
+    neck=dict(
+        type='PAFPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5))
diff --git a/configs/pafpn/metafile.yml b/configs/pafpn/metafile.yml
new file mode 100755
index 0000000..f9cf97c
--- /dev/null
+++ b/configs/pafpn/metafile.yml
@@ -0,0 +1,38 @@
+Collections:
+  - Name: PAFPN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - PAFPN
+    Paper:
+      URL: https://arxiv.org/abs/1803.01534
+      Title: 'Path Aggregation Network for Instance Segmentation'
+    README: configs/pafpn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/necks/pafpn.py#L11
+      Version: v2.0.0
+
+Models:
+  - Name: faster_rcnn_r50_pafpn_1x_coco
+    In Collection: PAFPN
+    Config: configs/pafpn/faster_rcnn_r50_pafpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.0
+      inference time (ms/im):
+        - value: 58.14
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pafpn/faster_rcnn_r50_pafpn_1x_coco/faster_rcnn_r50_pafpn_1x_coco_bbox_mAP-0.375_20200503_105836-b7b4b9bd.pth
diff --git a/configs/panoptic_fpn/README.md b/configs/panoptic_fpn/README.md
new file mode 100755
index 0000000..b31c9c0
--- /dev/null
+++ b/configs/panoptic_fpn/README.md
@@ -0,0 +1,63 @@
+# Panoptic FPN
+
+> [Panoptic feature pyramid networks](https://arxiv.org/abs/1901.02446)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+The recently introduced panoptic segmentation task has renewed our community's interest in unifying the tasks of instance segmentation (for thing classes) and semantic segmentation (for stuff classes). However, current state-of-the-art methods for this joint task use separate and dissimilar networks for instance and semantic segmentation, without performing any shared computation. In this work, we aim to unify these methods at the architectural level, designing a single network for both tasks. Our approach is to endow Mask R-CNN, a popular instance segmentation method, with a semantic segmentation branch using a shared Feature Pyramid Network (FPN) backbone. Surprisingly, this simple baseline not only remains effective for instance segmentation, but also yields a lightweight, top-performing method for semantic segmentation. In this work, we perform a detailed study of this minimally extended version of Mask R-CNN with FPN, which we refer to as Panoptic FPN, and show it is a robust and accurate baseline for both tasks. Given its effectiveness and conceptual simplicity, we hope our method can serve as a strong baseline and aid future research in panoptic segmentation.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143968979-a1593758-c9d7-44a6-a3b8-d9686ef19ce8.png" height="300"/>
+</div>
+
+## Dataset
+
+PanopticFPN requires the COCO and [COCO-panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) datasets for training and evaluation. You need to download the COCO-panoptic annotations and extract them into the COCO dataset path.
+The directory should look like this:
+
+```none
+mmdetection
+├── mmdet
+├── tools
+├── configs
+├── data
+│   ├── coco
+│   │   ├── annotations
+│   │   │   ├── panoptic_train2017.json
+│   │   │   ├── panoptic_train2017
+│   │   │   ├── panoptic_val2017.json
+│   │   │   ├── panoptic_val2017
+│   │   ├── train2017
+│   │   ├── val2017
+│   │   ├── test2017
+```
+
+## Results and Models
+
+| Backbone  |  style  | Lr schd | Mem (GB) | Inf time (fps) |  PQ  |  SQ  |  RQ  | PQ_th | SQ_th | RQ_th | PQ_st | SQ_st | RQ_st |                                                            Config                                                             |                                                                                                                                                                          Download                                                                                                                                                                          |
+| :-------: | :-----: | :-----: | :------: | :------------: | :--: | :--: | :--: | :---: | :---: | :---: | :---: | :---: | :---: | :---------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R-50-FPN  | pytorch |   1x    |   4.7    |                | 40.2 | 77.8 | 49.3 | 47.8  | 80.9  | 57.5  | 28.9  | 73.1  | 37.0  |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py)      |                   [model](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco/panoptic_fpn_r50_fpn_1x_coco_20210821_101153.log.json)                   |
+| R-50-FPN  | pytorch |   3x    |    -     |       -        | 42.5 | 78.1 | 51.7 | 50.3  | 81.5  | 60.3  | 30.7  | 73.0  | 38.8  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco/panoptic_fpn_r50_fpn_mstrain_3x_coco_20210824_171155-5650f98b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco/panoptic_fpn_r50_fpn_mstrain_3x_coco_20210824_171155.log.json)   |
+| R-101-FPN | pytorch |   1x    |   6.7    |                | 42.2 | 78.3 | 51.4 | 50.1  | 81.4  | 59.9  | 30.3  | 73.6  | 38.5  |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco.py)     |                 [model](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco/panoptic_fpn_r101_fpn_1x_coco_20210820_193950.log.json)                 |
+| R-101-FPN | pytorch |   3x    |    -     |       -        | 44.1 | 78.9 | 53.6 | 52.1  | 81.7  | 62.3  | 32.0  | 74.6  | 40.3  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco/panoptic_fpn_r101_fpn_mstrain_3x_coco_20210823_114712-9c99acc4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco/panoptic_fpn_r101_fpn_mstrain_3x_coco_20210823_114712.log.json) |
+| R2-50-FPN | pytorch |   1x    |    -     |       -        | 42.5 | 78.0 | 51.8 | 50.0  | 81.4  | 60.0  | 31.1  | 72.8  | 39.4  |   [config](https://github.com/open-mmlab/mmdetection/tree/dev/configs/panoptic_fpn/panoptic_fpn_r2_50_fpn_fp16_1x_coco.py)    |                   [model](https://download.openmmlab.com/mmdetection/v2.0/rfnext/panoptic_fpn_r2_50_fpn_fp16_1x_coco/panoptic_fpn_r2_50_fpn_fp16_1x_coco-fa6c51f0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rfnext/panoptic_fpn_r2_50_fpn_fp16_1x_coco/panoptic_fpn_r2_50_fpn_fp16_1x_coco_20221114_224729.log.json)                   |
+
+## Citation
+
+Panoptic FPN is the base method for the panoptic segmentation task.
+
+```latex
+@inproceedings{kirillov2018panopticfpn,
+  author = {
+    Alexander Kirillov and
+    Ross Girshick and
+    Kaiming He and
+    Piotr Dollar
+  },
+  title = {Panoptic Feature Pyramid Networks},
+  booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year = {2019}
+}
+```
diff --git a/configs/panoptic_fpn/metafile.yml b/configs/panoptic_fpn/metafile.yml
new file mode 100755
index 0000000..c258c8e
--- /dev/null
+++ b/configs/panoptic_fpn/metafile.yml
@@ -0,0 +1,83 @@
+Collections:
+  - Name: PanopticFPN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - PanopticFPN
+    Paper:
+      URL: https://arxiv.org/pdf/1901.02446
+      Title: 'Panoptic feature pyramid networks'
+    README: configs/panoptic_fpn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/detectors/panoptic_fpn.py#L7
+      Version: v2.16.0
+
+Models:
+  - Name: panoptic_fpn_r50_fpn_1x_coco
+    In Collection: PanopticFPN
+    Config: configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.6
+      Epochs: 12
+    Results:
+    - Task: Panoptic Segmentation
+      Dataset: COCO
+      Metrics:
+        PQ: 40.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth
+
+  - Name: panoptic_fpn_r50_fpn_mstrain_3x_coco
+    In Collection: PanopticFPN
+    Config: configs/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.6
+      Epochs: 36
+    Results:
+    - Task: Panoptic Segmentation
+      Dataset: COCO
+      Metrics:
+        PQ: 42.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco/panoptic_fpn_r50_fpn_mstrain_3x_coco_20210824_171155-5650f98b.pth
+
+  - Name: panoptic_fpn_r101_fpn_1x_coco
+    In Collection: PanopticFPN
+    Config: configs/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.5
+      Epochs: 12
+    Results:
+    - Task: Panoptic Segmentation
+      Dataset: COCO
+      Metrics:
+        PQ: 42.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth
+
+  - Name: panoptic_fpn_r101_fpn_mstrain_3x_coco
+    In Collection: PanopticFPN
+    Config: configs/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 6.5
+      Epochs: 36
+    Results:
+    - Task: Panoptic Segmentation
+      Dataset: COCO
+      Metrics:
+        PQ: 44.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco/panoptic_fpn_r101_fpn_mstrain_3x_coco_20210823_114712-9c99acc4.pth
+
+  - Name: panoptic_fpn_r2_50_fpn_fp16_1x_coco
+    In Collection: PanopticFPN
+    Config: configs/panoptic_fpn/panoptic_fpn_r2_50_fpn_fp16_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.5
+      Epochs: 12
+    Results:
+    - Task: Panoptic Segmentation
+      Dataset: COCO
+      Metrics:
+        PQ: 42.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rfnext/panoptic_fpn_r2_50_fpn_fp16_1x_coco/panoptic_fpn_r2_50_fpn_fp16_1x_coco-fa6c51f0.pth
diff --git a/configs/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco.py b/configs/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..78b8079
--- /dev/null
+++ b/configs/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco.py b/configs/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..057e481
--- /dev/null
+++ b/configs/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './panoptic_fpn_r50_fpn_mstrain_3x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/panoptic_fpn/panoptic_fpn_r2_50_fpn_fp16_1x_coco.py b/configs/panoptic_fpn/panoptic_fpn_r2_50_fpn_fp16_1x_coco.py
new file mode 100755
index 0000000..6c75f01
--- /dev/null
+++ b/configs/panoptic_fpn/panoptic_fpn_r2_50_fpn_fp16_1x_coco.py
@@ -0,0 +1,12 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='Res2Net',
+        depth=50,
+        scales=4,
+        base_width=26,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='res2net50_v1b_26w_4s-3cf99910.pth')))
+
+fp16 = dict(loss_scale='dynamic')
diff --git a/configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py b/configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..2995524
--- /dev/null
+++ b/configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py
@@ -0,0 +1,33 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_panoptic.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='PanopticFPN',
+    semantic_head=dict(
+        type='PanopticFPNHead',
+        num_things_classes=80,
+        num_stuff_classes=53,
+        in_channels=256,
+        inner_channels=128,
+        start_level=0,
+        end_level=4,
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+        conv_cfg=None,
+        loss_seg=dict(
+            type='CrossEntropyLoss', ignore_index=255, loss_weight=0.5)),
+    panoptic_fusion_head=dict(
+        type='HeuristicFusionHead',
+        num_things_classes=80,
+        num_stuff_classes=53),
+    test_cfg=dict(
+        panoptic=dict(
+            score_thr=0.6,
+            max_per_img=100,
+            mask_thr_binary=0.5,
+            mask_overlap=0.5,
+            nms=dict(type='nms', iou_threshold=0.5, class_agnostic=True),
+            stuff_area_limit=4096)))
+
+custom_hooks = []
diff --git a/configs/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco.py b/configs/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..b510935
--- /dev/null
+++ b/configs/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco.py
@@ -0,0 +1,61 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_coco.py'
+
+# dataset settings
+dataset_type = 'CocoPanopticDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)],
+# multiscale_mode='range'
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadPanopticAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        with_seg=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='SegRescale', scale_factor=1 / 4),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='Collect',
+        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+# Use RepeatDataset to speed up training
+data = dict(
+    train=dict(
+        _delete_=True,
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root + 'annotations/panoptic_train2017.json',
+            img_prefix=data_root + 'train2017/',
+            seg_prefix=data_root + 'annotations/panoptic_train2017/',
+            pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/pascal_voc/README.md b/configs/pascal_voc/README.md
new file mode 100755
index 0000000..3c09813
--- /dev/null
+++ b/configs/pascal_voc/README.md
@@ -0,0 +1,40 @@
+# Pascal VOC
+
+> [The Pascal Visual Object Classes (VOC) Challenge](https://link.springer.com/article/10.1007/s11263-009-0275-4)
+
+<!-- [DATASET] -->
+
+## Abstract
+
+The Pascal Visual Object Classes (VOC) challenge is a benchmark in visual object category recognition and detection, providing the vision and machine learning communities with a standard dataset of images and annotation, and standard evaluation procedures. Organised annually from 2005 to present, the challenge and its associated dataset has become accepted as the benchmark for object detection.
+
+This paper describes the dataset and evaluation procedure. We review the state-of-the-art in evaluated methods for both classification and detection, analyse whether the methods are statistically different, what they are learning from the images (e.g. the object or its context), and what the methods find easy or confuse. The paper concludes with lessons learnt in the three year history of the challenge, and proposes directions for future improvement and extension.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143969235-6bb4d665-0470-4bae-825c-492eb4582127.png" height="600"/>
+</div>
+
+## Results and Models
+
+|  Architecture   | Backbone |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                               Config                                                               |                                                                                                                                                                                                                                                       Download                                                                                                                                                                                                                                                       |
+| :-------------: | :------: | :-----: | :-----: | :------: | :------------: | :----: | :--------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| Faster R-CNN C4 |   R-50   |  caffe  |   18k   |          |       -        |  80.9  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712.py) |                                                                  [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712_20220314_234327-847a14d2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712_20220314_234327.log.json)                                                                  |
+|  Faster R-CNN   |   R-50   | pytorch |   1x    |   2.6    |       -        |  80.4  |        [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712.py)        |                                                                                              [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712/faster_rcnn_r50_fpn_1x_voc0712_20220320_192712-54bef0f3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712/faster_rcnn_r50_fpn_1x_voc0712_20220320_192712.log.json)                                                                                              |
+|    Retinanet    |   R-50   | pytorch |   1x    |   2.1    |       -        |  77.3  |         [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc/retinanet_r50_fpn_1x_voc0712.py)         |                                                                                                     [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/retinanet_r50_fpn_1x_voc0712/retinanet_r50_fpn_1x_voc0712_20200617-47cbdd0e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/retinanet_r50_fpn_1x_voc0712/retinanet_r50_fpn_1x_voc0712_20200616_014642.log.json)                                                                                                      |
+|     SSD300      |  VGG16   |    -    |  120e   |    -     |       -        |  76.5  |                [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc/ssd300_voc0712.py)                |                                                                                                                              [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd300_voc0712/ssd300_voc0712_20220320_194658-17edda1b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd300_voc0712/ssd300_voc0712_20220320_194658.log.json)                                                                                                                              |
+|     SSD512      |  VGG16   |    -    |  120e   |    -     |       -        |  79.5  |                [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc/ssd512_voc0712.py)                |                                                                                                                              [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd512_voc0712/ssd512_voc0712_20220320_194717-03cefefe.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd512_voc0712/ssd512_voc0712_20220320_194717.log.json)                                                                                                                              |
+
+## Citation
+
+```latex
+@Article{Everingham10,
+   author = "Everingham, M. and Van~Gool, L. and Williams, C. K. I. and Winn, J. and Zisserman, A.",
+   title = "The Pascal Visual Object Classes (VOC) Challenge",
+   journal = "International Journal of Computer Vision",
+   volume = "88",
+   year = "2010",
+   number = "2",
+   month = jun,
+   pages = "303--338",
+}
+```
diff --git a/configs/pascal_voc/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712.py b/configs/pascal_voc/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712.py
new file mode 100755
index 0000000..7bb1d73
--- /dev/null
+++ b/configs/pascal_voc/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712.py
@@ -0,0 +1,81 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_caffe_c4.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(roi_head=dict(bbox_head=dict(num_classes=20)))
+
+# dataset settings
+dataset_type = 'VOCDataset'
+data_root = 'data/VOCdevkit/'
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 480), (1333, 512), (1333, 544), (1333, 576),
+                   (1333, 608), (1333, 640), (1333, 672), (1333, 704),
+                   (1333, 736), (1333, 768), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=[
+            data_root + 'VOC2007/ImageSets/Main/trainval.txt',
+            data_root + 'VOC2012/ImageSets/Main/trainval.txt'
+        ],
+        img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'],
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt',
+        img_prefix=data_root + 'VOC2007/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt',
+        img_prefix=data_root + 'VOC2007/',
+        pipeline=test_pipeline))
+
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=100,
+    warmup_ratio=0.001,
+    step=[12000, 16000])
+
+# Runner type
+runner = dict(type='IterBasedRunner', max_iters=18000)
+
+checkpoint_config = dict(interval=3000)
+evaluation = dict(interval=3000, metric='mAP')
diff --git a/configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712.py b/configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712.py
new file mode 100755
index 0000000..7866ace
--- /dev/null
+++ b/configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712.py
@@ -0,0 +1,14 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py', '../_base_/datasets/voc0712.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(roi_head=dict(bbox_head=dict(num_classes=20)))
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+# actual epoch = 3 * 3 = 9
+lr_config = dict(policy='step', step=[3])
+# runtime settings
+runner = dict(
+    type='EpochBasedRunner', max_epochs=4)  # actual epoch = 4 * 3 = 12
diff --git a/configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712_cocofmt.py b/configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712_cocofmt.py
new file mode 100755
index 0000000..12eee2c
--- /dev/null
+++ b/configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712_cocofmt.py
@@ -0,0 +1,75 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py', '../_base_/datasets/voc0712.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(roi_head=dict(bbox_head=dict(num_classes=20)))
+
+CLASSES = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+           'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
+           'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor')
+
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/VOCdevkit/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1000, 600), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1000, 600),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(
+            type=dataset_type,
+            ann_file='data/voc0712_trainval.json',
+            img_prefix='data/VOCdevkit',
+            pipeline=train_pipeline,
+            classes=CLASSES)),
+    val=dict(
+        type=dataset_type,
+        ann_file='data/voc07_test.json',
+        img_prefix='data/VOCdevkit',
+        pipeline=test_pipeline,
+        classes=CLASSES),
+    test=dict(
+        type=dataset_type,
+        ann_file='data/voc07_test.json',
+        img_prefix='data/VOCdevkit',
+        pipeline=test_pipeline,
+        classes=CLASSES))
+evaluation = dict(interval=1, metric='bbox')
+
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+# LR step at epoch 3; with RepeatDataset(times=3) the actual epoch = 3 * 3 = 9
+lr_config = dict(policy='step', step=[3])
+# runtime settings
+runner = dict(
+    type='EpochBasedRunner', max_epochs=4)  # actual epoch = 4 * 3 = 12
diff --git a/configs/pascal_voc/retinanet_r50_fpn_1x_voc0712.py b/configs/pascal_voc/retinanet_r50_fpn_1x_voc0712.py
new file mode 100755
index 0000000..b4b050d
--- /dev/null
+++ b/configs/pascal_voc/retinanet_r50_fpn_1x_voc0712.py
@@ -0,0 +1,14 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py', '../_base_/datasets/voc0712.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(bbox_head=dict(num_classes=20))
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+# actual epoch = 3 * 3 = 9
+lr_config = dict(policy='step', step=[3])
+# runtime settings
+runner = dict(
+    type='EpochBasedRunner', max_epochs=4)  # actual epoch = 4 * 3 = 12
diff --git a/configs/pascal_voc/ssd300_voc0712.py b/configs/pascal_voc/ssd300_voc0712.py
new file mode 100755
index 0000000..e7008ae
--- /dev/null
+++ b/configs/pascal_voc/ssd300_voc0712.py
@@ -0,0 +1,74 @@
+_base_ = [
+    '../_base_/models/ssd300.py', '../_base_/datasets/voc0712.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(
+    bbox_head=dict(
+        num_classes=20, anchor_generator=dict(basesize_ratio_range=(0.2,
+                                                                    0.9))))
+# dataset settings
+dataset_type = 'VOCDataset'
+data_root = 'data/VOCdevkit/'
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', img_scale=(300, 300), keep_ratio=False),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(300, 300),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=False),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=3,
+    train=dict(
+        type='RepeatDataset', times=10, dataset=dict(pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(type='SGD', lr=1e-3, momentum=0.9, weight_decay=5e-4)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[16, 20])
+checkpoint_config = dict(interval=1)
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/pascal_voc/ssd512_voc0712.py b/configs/pascal_voc/ssd512_voc0712.py
new file mode 100755
index 0000000..f4627c2
--- /dev/null
+++ b/configs/pascal_voc/ssd512_voc0712.py
@@ -0,0 +1,57 @@
+_base_ = 'ssd300_voc0712.py'
+input_size = 512
+model = dict(
+    neck=dict(
+        out_channels=(512, 1024, 512, 256, 256, 256, 256),
+        level_strides=(2, 2, 2, 2, 1),
+        level_paddings=(1, 1, 1, 1, 1),
+        last_kernel_size=4),
+    bbox_head=dict(
+        in_channels=(512, 1024, 512, 256, 256, 256, 256),
+        anchor_generator=dict(
+            input_size=input_size,
+            strides=[8, 16, 32, 64, 128, 256, 512],
+            basesize_ratio_range=(0.15, 0.9),
+            ratios=([2], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2]))))
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', img_scale=(512, 512), keep_ratio=False),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(512, 512),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=False),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(dataset=dict(pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/pisa/README.md b/configs/pisa/README.md
new file mode 100755
index 0000000..c847c85
--- /dev/null
+++ b/configs/pisa/README.md
@@ -0,0 +1,50 @@
+# PISA
+
+> [Prime Sample Attention in Object Detection](https://arxiv.org/abs/1904.04821)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+It is a common paradigm in object detection frameworks to treat all samples equally and target at maximizing the performance on average. In this work, we revisit this paradigm through a careful study on how different samples contribute to the overall performance measured in terms of mAP. Our study suggests that the samples in each mini-batch are neither independent nor equally important, and therefore a better classifier on average does not necessarily mean higher mAP. Motivated by this study, we propose the notion of Prime Samples, those that play a key role in driving the detection performance. We further develop a simple yet effective sampling and learning strategy called PrIme Sample Attention (PISA) that directs the focus of the training process towards such samples. Our experiments demonstrate that it is often more effective to focus on prime samples than hard samples when training a detector. Particularly, on the MSCOCO dataset, PISA outperforms the random sampling baseline and hard mining schemes, e.g., OHEM and Focal Loss, consistently by around 2% on both single-stage and two-stage detectors, even with a strong backbone ResNeXt-101.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143970710-5cfd5960-fcf9-4e32-860a-acd46ce5d274.png"/>
+</div>
+
+## Results and Models
+
+| PISA |   Network    |    Backbone    | Lr schd | box AP | mask AP |                                                         Config                                                          |                                                                                                                                                              Download                                                                                                                                                              |
+| :--: | :----------: | :------------: | :-----: | :----: | :-----: | :---------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  ×   | Faster R-CNN |    R-50-FPN    |   1x    |  36.4  |         |                                                            -                                                            |                                                                                                                                                                                                                                                                                                                                    |
+|  √   | Faster R-CNN |    R-50-FPN    |   1x    |  38.4  |         |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pisa/pisa_faster_rcnn_r50_fpn_1x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_r50_fpn_1x_coco/pisa_faster_rcnn_r50_fpn_1x_coco-dea93523.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_r50_fpn_1x_coco/pisa_faster_rcnn_r50_fpn_1x_coco_20200506_185619.log.json)               |
+|  ×   | Faster R-CNN | X101-32x4d-FPN |   1x    |  40.1  |         |                                                            -                                                            |                                                                                                                                                                                                                                                                                                                                    |
+|  √   | Faster R-CNN | X101-32x4d-FPN |   1x    |  41.9  |         | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco-e4accec4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco_20200505_181503.log.json) |
+|  ×   | Mask   R-CNN |    R-50-FPN    |   1x    |  37.3  |  34.2   |                                                            -                                                            |                                                                                                                                                                                                                                                                                                                                    |
+|  √   | Mask   R-CNN |    R-50-FPN    |   1x    |  39.1  |  35.2   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pisa/pisa_mask_rcnn_r50_fpn_1x_coco.py)      |                   [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_mask_rcnn_r50_fpn_1x_coco/pisa_mask_rcnn_r50_fpn_1x_coco-dfcedba6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_mask_rcnn_r50_fpn_1x_coco/pisa_mask_rcnn_r50_fpn_1x_coco_20200508_150500.log.json)                   |
+|  ×   | Mask   R-CNN | X101-32x4d-FPN |   1x    |  41.1  |  37.1   |                                                            -                                                            |                                                                                                                                                                                                                                                                                                                                    |
+|  √   | Mask   R-CNN | X101-32x4d-FPN |   1x    |        |         |                                                                                                                         |                                                                                                                                                                                                                                                                                                                                    |
+|  ×   |  RetinaNet   |    R-50-FPN    |   1x    |  35.6  |         |                                                            -                                                            |                                                                                                                                                                                                                                                                                                                                    |
+|  √   |  RetinaNet   |    R-50-FPN    |   1x    |  36.9  |         |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pisa/pisa_retinanet_r50_fpn_1x_coco.py)      |                   [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_r50_fpn_1x_coco/pisa_retinanet_r50_fpn_1x_coco-76409952.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_r50_fpn_1x_coco/pisa_retinanet_r50_fpn_1x_coco_20200504_014311.log.json)                   |
+|  ×   |  RetinaNet   | X101-32x4d-FPN |   1x    |  39.0  |         |                                                            -                                                            |                                                                                                                                                                                                                                                                                                                                    |
+|  √   |  RetinaNet   | X101-32x4d-FPN |   1x    |  40.7  |         |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco.py)  |     [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco/pisa_retinanet_x101_32x4d_fpn_1x_coco-a0c13c73.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco/pisa_retinanet_x101_32x4d_fpn_1x_coco_20200505_001404.log.json)     |
+|  ×   |    SSD300    |     VGG16      |   1x    |  25.6  |         |                                                            -                                                            |                                                                                                                                                                                                                                                                                                                                    |
+|  √   |    SSD300    |     VGG16      |   1x    |  27.6  |         |            [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pisa/pisa_ssd300_coco.py)             |                                               [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd300_coco/pisa_ssd300_coco-710e3ac9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd300_coco/pisa_ssd300_coco_20200504_144325.log.json)                                               |
+|  ×   |    SSD512    |     VGG16      |   1x    |  29.3  |         |                                                            -                                                            |                                                                                                                                                                                                                                                                                                                                    |
+|  √   |    SSD512    |     VGG16      |   1x    |  31.8  |         |            [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pisa/pisa_ssd512_coco.py)             |                                               [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd512_coco/pisa_ssd512_coco-247addee.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd512_coco/pisa_ssd512_coco_20200508_131030.log.json)                                               |
+
+**Notes:**
+
+- In the original paper, all models were trained and tested on mmdet v1.x, so the results may not be exactly the same as those of this v2.0 release.
+- Note that PISA only modifies the training pipeline, so the inference time remains the same as the baseline.
+
+## Citation
+
+```latex
+@inproceedings{cao2019prime,
+  title={Prime sample attention in object detection},
+  author={Cao, Yuhang and Chen, Kai and Loy, Chen Change and Lin, Dahua},
+  booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
+  year={2020}
+}
+```
diff --git a/configs/pisa/metafile.yml b/configs/pisa/metafile.yml
new file mode 100755
index 0000000..cd43afb
--- /dev/null
+++ b/configs/pisa/metafile.yml
@@ -0,0 +1,110 @@
+Collections:
+  - Name: PISA
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - PISA
+        - RPN
+        - ResNet
+        - RoIPool
+    Paper:
+      URL: https://arxiv.org/abs/1904.04821
+      Title: 'Prime Sample Attention in Object Detection'
+    README: configs/pisa/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/roi_heads/pisa_roi_head.py#L8
+      Version: v2.1.0
+
+Models:
+  - Name: pisa_faster_rcnn_r50_fpn_1x_coco
+    In Collection: PISA
+    Config: configs/pisa/pisa_faster_rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_r50_fpn_1x_coco/pisa_faster_rcnn_r50_fpn_1x_coco-dea93523.pth
+
+  - Name: pisa_faster_rcnn_x101_32x4d_fpn_1x_coco
+    In Collection: PISA
+    Config: configs/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco-e4accec4.pth
+
+  - Name: pisa_mask_rcnn_r50_fpn_1x_coco
+    In Collection: PISA
+    Config: configs/pisa/pisa_mask_rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 35.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_mask_rcnn_r50_fpn_1x_coco/pisa_mask_rcnn_r50_fpn_1x_coco-dfcedba6.pth
+
+  - Name: pisa_retinanet_r50_fpn_1x_coco
+    In Collection: PISA
+    Config: configs/pisa/pisa_retinanet_r50_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_r50_fpn_1x_coco/pisa_retinanet_r50_fpn_1x_coco-76409952.pth
+
+  - Name: pisa_retinanet_x101_32x4d_fpn_1x_coco
+    In Collection: PISA
+    Config: configs/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco/pisa_retinanet_x101_32x4d_fpn_1x_coco-a0c13c73.pth
+
+  - Name: pisa_ssd300_coco
+    In Collection: PISA
+    Config: configs/pisa/pisa_ssd300_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 27.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd300_coco/pisa_ssd300_coco-710e3ac9.pth
+
+  - Name: pisa_ssd512_coco
+    In Collection: PISA
+    Config: configs/pisa/pisa_ssd512_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 31.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd512_coco/pisa_ssd512_coco-247addee.pth
diff --git a/configs/pisa/pisa_faster_rcnn_r50_fpn_1x_coco.py b/configs/pisa/pisa_faster_rcnn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..71e65b0
--- /dev/null
+++ b/configs/pisa/pisa_faster_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,30 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+
+model = dict(
+    roi_head=dict(
+        type='PISARoIHead',
+        bbox_head=dict(
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    train_cfg=dict(
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            sampler=dict(
+                type='ScoreHLRSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True,
+                k=0.5,
+                bias=0.),
+            isr=dict(k=2, bias=0),
+            carl=dict(k=1, bias=0.2))),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0)))
diff --git a/configs/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco.py b/configs/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..16edd99
--- /dev/null
+++ b/configs/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco.py
@@ -0,0 +1,30 @@
+_base_ = '../faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco.py'
+
+model = dict(
+    roi_head=dict(
+        type='PISARoIHead',
+        bbox_head=dict(
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    train_cfg=dict(
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            sampler=dict(
+                type='ScoreHLRSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True,
+                k=0.5,
+                bias=0.),
+            isr=dict(k=2, bias=0),
+            carl=dict(k=1, bias=0.2))),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0)))
diff --git a/configs/pisa/pisa_mask_rcnn_r50_fpn_1x_coco.py b/configs/pisa/pisa_mask_rcnn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..047a293
--- /dev/null
+++ b/configs/pisa/pisa_mask_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,30 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+
+model = dict(
+    roi_head=dict(
+        type='PISARoIHead',
+        bbox_head=dict(
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    train_cfg=dict(
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            sampler=dict(
+                type='ScoreHLRSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True,
+                k=0.5,
+                bias=0.),
+            isr=dict(k=2, bias=0),
+            carl=dict(k=1, bias=0.2))),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0)))
diff --git a/configs/pisa/pisa_mask_rcnn_x101_32x4d_fpn_1x_coco.py b/configs/pisa/pisa_mask_rcnn_x101_32x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..2186a8f
--- /dev/null
+++ b/configs/pisa/pisa_mask_rcnn_x101_32x4d_fpn_1x_coco.py
@@ -0,0 +1,30 @@
+_base_ = '../mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py'
+
+model = dict(
+    roi_head=dict(
+        type='PISARoIHead',
+        bbox_head=dict(
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    train_cfg=dict(
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            sampler=dict(
+                type='ScoreHLRSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True,
+                k=0.5,
+                bias=0.),
+            isr=dict(k=2, bias=0),
+            carl=dict(k=1, bias=0.2))),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0)))
diff --git a/configs/pisa/pisa_retinanet_r50_fpn_1x_coco.py b/configs/pisa/pisa_retinanet_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..70f89e2
--- /dev/null
+++ b/configs/pisa/pisa_retinanet_r50_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py'
+
+model = dict(
+    bbox_head=dict(
+        type='PISARetinaHead',
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0)),
+    train_cfg=dict(isr=dict(k=2., bias=0.), carl=dict(k=1., bias=0.2)))
diff --git a/configs/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco.py b/configs/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..b97b672
--- /dev/null
+++ b/configs/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = '../retinanet/retinanet_x101_32x4d_fpn_1x_coco.py'
+
+model = dict(
+    bbox_head=dict(
+        type='PISARetinaHead',
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0)),
+    train_cfg=dict(isr=dict(k=2., bias=0.), carl=dict(k=1., bias=0.2)))
diff --git a/configs/pisa/pisa_ssd300_coco.py b/configs/pisa/pisa_ssd300_coco.py
new file mode 100755
index 0000000..b5cc006
--- /dev/null
+++ b/configs/pisa/pisa_ssd300_coco.py
@@ -0,0 +1,8 @@
+_base_ = '../ssd/ssd300_coco.py'
+
+model = dict(
+    bbox_head=dict(type='PISASSDHead'),
+    train_cfg=dict(isr=dict(k=2., bias=0.), carl=dict(k=1., bias=0.2)))
+
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/pisa/pisa_ssd512_coco.py b/configs/pisa/pisa_ssd512_coco.py
new file mode 100755
index 0000000..3219d6d
--- /dev/null
+++ b/configs/pisa/pisa_ssd512_coco.py
@@ -0,0 +1,8 @@
+_base_ = '../ssd/ssd512_coco.py'
+
+model = dict(
+    bbox_head=dict(type='PISASSDHead'),
+    train_cfg=dict(isr=dict(k=2., bias=0.), carl=dict(k=1., bias=0.2)))
+
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/point_rend/README.md b/configs/point_rend/README.md
new file mode 100755
index 0000000..183e83d
--- /dev/null
+++ b/configs/point_rend/README.md
@@ -0,0 +1,33 @@
+# PointRend
+
+> [PointRend: Image Segmentation as Rendering](https://arxiv.org/abs/1912.08193)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We present a new method for efficient high-quality image segmentation of objects and scenes. By analogizing classical computer graphics methods for efficient rendering with over- and undersampling challenges faced in pixel labeling tasks, we develop a unique perspective of image segmentation as a rendering problem. From this vantage, we present the PointRend (Point-based Rendering) neural network module: a module that performs point-based segmentation predictions at adaptively selected locations based on an iterative subdivision algorithm. PointRend can be flexibly applied to both instance and semantic segmentation tasks by building on top of existing state-of-the-art models. While many concrete implementations of the general idea are possible, we show that a simple design already achieves excellent results. Qualitatively, PointRend outputs crisp object boundaries in regions that are over-smoothed by previous methods. Quantitatively, PointRend yields significant gains on COCO and Cityscapes, for both instance and semantic segmentation. PointRend's efficiency enables output resolutions that are otherwise impractical in terms of memory or computation compared to existing approaches.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143970097-d38b6801-d3c8-468f-b8b0-639be3689907.png"/>
+</div>
+
+## Results and Models
+
+| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                             Config                                                             |                                                                                                                                                                      Download                                                                                                                                                                      |
+| :------: | :---: | :-----: | :------: | :------------: | :----: | :-----: | :----------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R-50-FPN | caffe |   1x    |   4.6    |                |  38.4  |  36.3   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco/point_rend_r50_caffe_fpn_mstrain_1x_coco-1bcb5fb4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco/point_rend_r50_caffe_fpn_mstrain_1x_coco_20200612_161407.log.json) |
+| R-50-FPN | caffe |   3x    |   4.6    |                |  41.0  |  38.0   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco/point_rend_r50_caffe_fpn_mstrain_3x_coco-e0ebb6b7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco/point_rend_r50_caffe_fpn_mstrain_3x_coco_20200614_002632.log.json) |
+
+Note: All models are trained with multi-scale augmentation: the shorter side of the input image is randomly resized to one of (640, 672, 704, 736, 768, 800).
+
+## Citation
+
+```latex
+@InProceedings{kirillov2019pointrend,
+  title={{PointRend}: Image Segmentation as Rendering},
+  author={Alexander Kirillov and Yuxin Wu and Kaiming He and Ross Girshick},
+  journal={ArXiv:1912.08193},
+  year={2019}
+}
+```
diff --git a/configs/point_rend/metafile.yml b/configs/point_rend/metafile.yml
new file mode 100755
index 0000000..82aea05
--- /dev/null
+++ b/configs/point_rend/metafile.yml
@@ -0,0 +1,54 @@
+Collections:
+  - Name: PointRend
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - PointRend
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1912.08193
+      Title: 'PointRend: Image Segmentation as Rendering'
+    README: configs/point_rend/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/detectors/point_rend.py#L6
+      Version: v2.2.0
+
+Models:
+  - Name: point_rend_r50_caffe_fpn_mstrain_1x_coco
+    In Collection: PointRend
+    Config: configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.6
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco/point_rend_r50_caffe_fpn_mstrain_1x_coco-1bcb5fb4.pth
+
+  - Name: point_rend_r50_caffe_fpn_mstrain_3x_coco
+    In Collection: PointRend
+    Config: configs/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.6
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco/point_rend_r50_caffe_fpn_mstrain_3x_coco-e0ebb6b7.pth
diff --git a/configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py b/configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py
new file mode 100755
index 0000000..0c0e563
--- /dev/null
+++ b/configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py
@@ -0,0 +1,44 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain_1x_coco.py'
+# model settings
+model = dict(
+    type='PointRend',
+    roi_head=dict(
+        type='PointRendRoIHead',
+        mask_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            aggregation='concat',
+            roi_layer=dict(
+                _delete_=True, type='SimpleRoIAlign', output_size=14),
+            out_channels=256,
+            featmap_strides=[4]),
+        mask_head=dict(
+            _delete_=True,
+            type='CoarseMaskHead',
+            num_fcs=2,
+            in_channels=256,
+            conv_out_channels=256,
+            fc_out_channels=1024,
+            num_classes=80,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)),
+        point_head=dict(
+            type='MaskPointHead',
+            num_fcs=3,
+            in_channels=256,
+            fc_channels=256,
+            num_classes=80,
+            coarse_pred_each_layer=True,
+            loss_point=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rcnn=dict(
+            mask_size=7,
+            num_points=14 * 14,
+            oversample_ratio=3,
+            importance_sample_ratio=0.75)),
+    test_cfg=dict(
+        rcnn=dict(
+            subdivision_steps=5,
+            subdivision_num_points=28 * 28,
+            scale_factor=2)))
diff --git a/configs/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco.py b/configs/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..169278e
--- /dev/null
+++ b/configs/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './point_rend_r50_caffe_fpn_mstrain_1x_coco.py'
+# learning policy
+lr_config = dict(step=[28, 34])
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/configs/pvt/README.md b/configs/pvt/README.md
new file mode 100755
index 0000000..1fd090b
--- /dev/null
+++ b/configs/pvt/README.md
@@ -0,0 +1,57 @@
+# PVT
+
+> [Pyramid vision transformer: A versatile backbone for dense prediction without convolutions](https://arxiv.org/abs/2102.12122)
+
+<!-- [BACKBONE] -->
+
+## Abstract
+
+Although using convolutional neural networks (CNNs) as backbones achieves great successes in computer vision, this work investigates a simple backbone network useful for many dense prediction tasks without convolutions. Unlike the recently-proposed Transformer model (e.g., ViT) that is specially designed for image classification, we propose Pyramid Vision Transformer (PVT), which overcomes the difficulties of porting Transformer to various dense prediction tasks. PVT has several merits compared to prior arts. (1) Different from ViT that typically has low-resolution outputs and high computational and memory cost, PVT can be not only trained on dense partitions of the image to achieve high output resolution, which is important for dense predictions but also using a progressive shrinking pyramid to reduce computations of large feature maps. (2) PVT inherits the advantages from both CNN and Transformer, making it a unified backbone in various vision tasks without convolutions by simply replacing CNN backbones. (3) We validate PVT by conducting extensive experiments, showing that it boosts the performance of many downstream tasks, e.g., object detection, semantic, and instance segmentation. For example, with a comparable number of parameters, RetinaNet+PVT achieves 40.4 AP on the COCO dataset, surpassing RetinaNet+ResNet50 (36.3 AP) by 4.1 absolute AP. We hope PVT could serve as an alternative and useful backbone for pixel-level predictions and facilitate future researches.
+
+Transformer recently has shown encouraging progresses in computer vision. In this work, we present new baselines by improving the original Pyramid Vision Transformer (abbreviated as PVTv1) by adding three designs, including (1) overlapping patch embedding, (2) convolutional feed-forward networks, and (3) linear complexity attention layers.
+With these modifications, our PVTv2 significantly improves PVTv1 on three tasks e.g., classification, detection, and segmentation. Moreover, PVTv2 achieves comparable or better performances than recent works such as Swin Transformer. We hope this work will facilitate state-of-the-art Transformer researches in computer vision.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143969989-6f94e695-23b1-4f8f-b406-d589fdc3cfb2.png"/>
+</div>
+
+## Results and Models
+
+### RetinaNet (PVTv1)
+
+|  Backbone  | Lr schd | Mem (GB) | box AP |                                                   Config                                                   |                                                                                                                                             Download                                                                                                                                             |
+| :--------: | :-----: | :------: | :----: | :--------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  PVT-Tiny  |   12e   |   8.5    |  36.6  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvt-t_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-t_fpn_1x_coco/retinanet_pvt-t_fpn_1x_coco_20210831_103110-17b566bd.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-t_fpn_1x_coco/retinanet_pvt-t_fpn_1x_coco_20210831_103110.log.json) |
+| PVT-Small  |   12e   |   14.5   |  40.4  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvt-s_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-s_fpn_1x_coco/retinanet_pvt-s_fpn_1x_coco_20210906_142921-b6c94a5b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-s_fpn_1x_coco/retinanet_pvt-s_fpn_1x_coco_20210906_142921.log.json) |
+| PVT-Medium |   12e   |   20.9   |  41.7  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvt-m_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-m_fpn_1x_coco/retinanet_pvt-m_fpn_1x_coco_20210831_103243-55effa1b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-m_fpn_1x_coco/retinanet_pvt-m_fpn_1x_coco_20210831_103243.log.json) |
+
+### RetinaNet (PVTv2)
+
+| Backbone | Lr schd | Mem (GB) | box AP |                                                     Config                                                     |                                                                                                                                                   Download                                                                                                                                                   |
+| :------: | :-----: | :------: | :----: | :------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| PVTv2-B0 |   12e   |   7.4    |  37.1  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b0_fpn_1x_coco/retinanet_pvtv2-b0_fpn_1x_coco_20210831_103157-13e9aabe.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b0_fpn_1x_coco/retinanet_pvtv2-b0_fpn_1x_coco_20210831_103157.log.json) |
+| PVTv2-B1 |   12e   |   9.5    |  41.2  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b1_fpn_1x_coco/retinanet_pvtv2-b1_fpn_1x_coco_20210831_103318-7e169a7d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b1_fpn_1x_coco/retinanet_pvtv2-b1_fpn_1x_coco_20210831_103318.log.json) |
+| PVTv2-B2 |   12e   |   16.2   |  44.6  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b2_fpn_1x_coco/retinanet_pvtv2-b2_fpn_1x_coco_20210901_174843-529f0b9a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b2_fpn_1x_coco/retinanet_pvtv2-b2_fpn_1x_coco_20210901_174843.log.json) |
+| PVTv2-B3 |   12e   |   23.0   |  46.0  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b3_fpn_1x_coco/retinanet_pvtv2-b3_fpn_1x_coco_20210903_151512-8357deff.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b3_fpn_1x_coco/retinanet_pvtv2-b3_fpn_1x_coco_20210903_151512.log.json) |
+| PVTv2-B4 |   12e   |   17.0   |  46.3  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b4_fpn_1x_coco/retinanet_pvtv2-b4_fpn_1x_coco_20210901_170151-83795c86.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b4_fpn_1x_coco/retinanet_pvtv2-b4_fpn_1x_coco_20210901_170151.log.json) |
+| PVTv2-B5 |   12e   |   18.7   |  46.1  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b5_fpn_1x_coco/retinanet_pvtv2-b5_fpn_1x_coco_20210902_201800-3420eb57.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b5_fpn_1x_coco/retinanet_pvtv2-b5_fpn_1x_coco_20210902_201800.log.json) |
+
+## Citation
+
+```latex
+@article{wang2021pyramid,
+  title={Pyramid vision transformer: A versatile backbone for dense prediction without convolutions},
+  author={Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling},
+  journal={arXiv preprint arXiv:2102.12122},
+  year={2021}
+}
+```
+
+```latex
+@article{wang2021pvtv2,
+  title={PVTv2: Improved Baselines with Pyramid Vision Transformer},
+  author={Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling},
+  journal={arXiv preprint arXiv:2106.13797},
+  year={2021}
+}
+```
diff --git a/configs/pvt/metafile.yml b/configs/pvt/metafile.yml
new file mode 100755
index 0000000..5884378
--- /dev/null
+++ b/configs/pvt/metafile.yml
@@ -0,0 +1,243 @@
+Models:
+  - Name: retinanet_pvt-t_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvt-t_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformer
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-t_fpn_1x_coco/retinanet_pvt-t_fpn_1x_coco_20210831_103110-17b566bd.pth
+    Paper:
+      URL: https://arxiv.org/abs/2102.12122
+      Title: "Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L315
+      Version: 2.17.0
+
+  - Name: retinanet_pvt-s_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvt-s_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 14.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformer
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-s_fpn_1x_coco/retinanet_pvt-s_fpn_1x_coco_20210906_142921-b6c94a5b.pth
+    Paper:
+      URL: https://arxiv.org/abs/2102.12122
+      Title: "Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L315
+      Version: 2.17.0
+
+  - Name: retinanet_pvt-m_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvt-m_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 20.9
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformer
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-m_fpn_1x_coco/retinanet_pvt-m_fpn_1x_coco_20210831_103243-55effa1b.pth
+    Paper:
+      URL: https://arxiv.org/abs/2102.12122
+      Title: "Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L315
+      Version: 2.17.0
+
+  - Name: retinanet_pvtv2-b0_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.4
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformerV2
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b0_fpn_1x_coco/retinanet_pvtv2-b0_fpn_1x_coco_20210831_103157-13e9aabe.pth
+    Paper:
+      URL: https://arxiv.org/abs/2106.13797
+      Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543
+      Version: 2.17.0
+
+  - Name: retinanet_pvtv2-b1_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformerV2
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b1_fpn_1x_coco/retinanet_pvtv2-b1_fpn_1x_coco_20210831_103318-7e169a7d.pth
+    Paper:
+      URL: https://arxiv.org/abs/2106.13797
+      Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543
+      Version: 2.17.0
+
+  - Name: retinanet_pvtv2-b2_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 16.2
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformerV2
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b2_fpn_1x_coco/retinanet_pvtv2-b2_fpn_1x_coco_20210901_174843-529f0b9a.pth
+    Paper:
+      URL: https://arxiv.org/abs/2106.13797
+      Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543
+      Version: 2.17.0
+
+  - Name: retinanet_pvtv2-b3_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 23.0
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformerV2
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b3_fpn_1x_coco/retinanet_pvtv2-b3_fpn_1x_coco_20210903_151512-8357deff.pth
+    Paper:
+      URL: https://arxiv.org/abs/2106.13797
+      Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543
+      Version: 2.17.0
+
+  - Name: retinanet_pvtv2-b4_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 17.0
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformerV2
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b4_fpn_1x_coco/retinanet_pvtv2-b4_fpn_1x_coco_20210901_170151-83795c86.pth
+    Paper:
+      URL: https://arxiv.org/abs/2106.13797
+      Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543
+      Version: 2.17.0
+
+  - Name: retinanet_pvtv2-b5_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 18.7
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformerV2
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b5_fpn_1x_coco/retinanet_pvtv2-b5_fpn_1x_coco_20210902_201800-3420eb57.pth
+    Paper:
+      URL: https://arxiv.org/abs/2106.13797
+      Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543
+      Version: 2.17.0
diff --git a/configs/pvt/retinanet_pvt-l_fpn_1x_coco.py b/configs/pvt/retinanet_pvt-l_fpn_1x_coco.py
new file mode 100755
index 0000000..e299f2a
--- /dev/null
+++ b/configs/pvt/retinanet_pvt-l_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = 'retinanet_pvt-t_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        num_layers=[3, 8, 27, 3],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_large.pth')))
+fp16 = dict(loss_scale=dict(init_scale=512))
diff --git a/configs/pvt/retinanet_pvt-m_fpn_1x_coco.py b/configs/pvt/retinanet_pvt-m_fpn_1x_coco.py
new file mode 100755
index 0000000..b888f78
--- /dev/null
+++ b/configs/pvt/retinanet_pvt-m_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = 'retinanet_pvt-t_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        num_layers=[3, 4, 18, 3],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_medium.pth')))
diff --git a/configs/pvt/retinanet_pvt-s_fpn_1x_coco.py b/configs/pvt/retinanet_pvt-s_fpn_1x_coco.py
new file mode 100755
index 0000000..4660348
--- /dev/null
+++ b/configs/pvt/retinanet_pvt-s_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = 'retinanet_pvt-t_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        num_layers=[3, 4, 6, 3],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_small.pth')))
diff --git a/configs/pvt/retinanet_pvt-t_fpn_1x_coco.py b/configs/pvt/retinanet_pvt-t_fpn_1x_coco.py
new file mode 100755
index 0000000..a6cff7d
--- /dev/null
+++ b/configs/pvt/retinanet_pvt-t_fpn_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='RetinaNet',
+    backbone=dict(
+        _delete_=True,
+        type='PyramidVisionTransformer',
+        num_layers=[2, 2, 2, 2],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_tiny.pth')),
+    neck=dict(in_channels=[64, 128, 320, 512]))
+# optimizer
+optimizer = dict(_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001)
diff --git a/configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py b/configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py
new file mode 100755
index 0000000..cbe2295
--- /dev/null
+++ b/configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py
@@ -0,0 +1,17 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='RetinaNet',
+    backbone=dict(
+        _delete_=True,
+        type='PyramidVisionTransformerV2',
+        embed_dims=32,
+        num_layers=[2, 2, 2, 2],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_v2_b0.pth')),
+    neck=dict(in_channels=[32, 64, 160, 256]))
+# optimizer
+optimizer = dict(_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001)
diff --git a/configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py b/configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py
new file mode 100755
index 0000000..5374c50
--- /dev/null
+++ b/configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        embed_dims=64,
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_v2_b1.pth')),
+    neck=dict(in_channels=[64, 128, 320, 512]))
diff --git a/configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py b/configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py
new file mode 100755
index 0000000..cf9a18d
--- /dev/null
+++ b/configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        embed_dims=64,
+        num_layers=[3, 4, 6, 3],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_v2_b2.pth')),
+    neck=dict(in_channels=[64, 128, 320, 512]))
diff --git a/configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py b/configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py
new file mode 100755
index 0000000..7a47f82
--- /dev/null
+++ b/configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        embed_dims=64,
+        num_layers=[3, 4, 18, 3],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_v2_b3.pth')),
+    neck=dict(in_channels=[64, 128, 320, 512]))
diff --git a/configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py b/configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py
new file mode 100755
index 0000000..9891d7b
--- /dev/null
+++ b/configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py
@@ -0,0 +1,18 @@
+_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py'  # inherit the PVTv2-b0 config
+model = dict(
+    backbone=dict(
+        embed_dims=64,
+        num_layers=[3, 8, 27, 3],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_v2_b4.pth')),
+    neck=dict(in_channels=[64, 128, 320, 512]))
+# optimizer: `_delete_=True` replaces the inherited optimizer dict entirely
+optimizer = dict(
+    _delete_=True, type='AdamW', lr=0.0001 / 1.4, weight_decay=0.0001)
+# dataset settings: 1 sample and 1 dataloader worker per GPU
+data = dict(samples_per_gpu=1, workers_per_gpu=1)
+
+# NOTE: `auto_scale_lr` is used for automatically scaling the LR;
+# USERS SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (1 sample per GPU)
+auto_scale_lr = dict(base_batch_size=8)
diff --git a/configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py b/configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py
new file mode 100755
index 0000000..a9fea2e
--- /dev/null
+++ b/configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py
@@ -0,0 +1,19 @@
+_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py'  # inherit the PVTv2-b0 config
+model = dict(
+    backbone=dict(
+        embed_dims=64,
+        num_layers=[3, 6, 40, 3],
+        mlp_ratios=(4, 4, 4, 4),  # only PVTv2 config in this series that sets mlp_ratios
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_v2_b5.pth')),
+    neck=dict(in_channels=[64, 128, 320, 512]))
+# optimizer: `_delete_=True` replaces the inherited optimizer dict entirely
+optimizer = dict(
+    _delete_=True, type='AdamW', lr=0.0001 / 1.4, weight_decay=0.0001)
+# dataset settings: 1 sample and 1 dataloader worker per GPU
+data = dict(samples_per_gpu=1, workers_per_gpu=1)
+
+# NOTE: `auto_scale_lr` is used for automatically scaling the LR;
+# USERS SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (1 sample per GPU)
+auto_scale_lr = dict(base_batch_size=8)
diff --git a/configs/queryinst/README.md b/configs/queryinst/README.md
new file mode 100755
index 0000000..ad6e0b3
--- /dev/null
+++ b/configs/queryinst/README.md
@@ -0,0 +1,36 @@
+# QueryInst
+
+> [Instances as Queries](https://openaccess.thecvf.com/content/ICCV2021/html/Fang_Instances_As_Queries_ICCV_2021_paper.html)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We present QueryInst, a new perspective for instance segmentation. QueryInst is a multi-stage end-to-end system that treats instances of interest as learnable queries, enabling query based object detectors, e.g., Sparse R-CNN, to have strong instance segmentation performance. The attributes of instances such as categories, bounding boxes, instance masks, and instance association embeddings are represented by queries in a unified manner. In QueryInst, a query is shared by both detection and segmentation via dynamic convolutions and driven by parallelly-supervised multi-stage learning. We conduct extensive experiments on three challenging benchmarks, i.e., COCO, CityScapes, and YouTube-VIS to evaluate the effectiveness of QueryInst in object detection, instance segmentation, and video instance segmentation tasks. For the first time, we demonstrate that a simple end-to-end query based framework can achieve the state-of-the-art performance in various instance-level recognition tasks.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143971527-c1b7ff78-e95f-4edb-9d5e-3d6d7d902999.png"/>
+</div>
+
+## Results and Models
+
+|   Model   | Backbone  |  Style  | Lr schd | Number of Proposals | Multi-Scale | RandomCrop | box AP | mask AP |                                                                       Config                                                                       |                                                                                                                                                                                                                       Download                                                                                                                                                                                                                       |
+| :-------: | :-------: | :-----: | :-----: | :-----------------: | :---------: | :--------: | :----: | :-----: | :------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| QueryInst | R-50-FPN  | pytorch |   1x    |         100         |    False    |   False    |  42.0  |  37.5   |                   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/queryinst/queryinst_r50_fpn_1x_coco.py)                   |                                                                         [model](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_1x_coco/queryinst_r50_fpn_1x_coco_20210907_084916-5a8f1998.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_1x_coco/queryinst_r50_fpn_1x_coco_20210907_084916.log.json)                                                                         |
+| QueryInst | R-50-FPN  | pytorch |   3x    |         100         |    True     |   False    |  44.8  |  39.8   |           [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco.py)           |                                         [model](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco/queryinst_r50_fpn_mstrain_480-800_3x_coco_20210901_103643-7837af86.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco/queryinst_r50_fpn_mstrain_480-800_3x_coco_20210901_103643.log.json)                                         |
+| QueryInst | R-50-FPN  | pytorch |   3x    |         300         |    True     |    True    |  47.5  |  41.7   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_101802-85cffbd8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_101802.log.json)   |
+| QueryInst | R-101-FPN | pytorch |   3x    |         100         |    True     |   False    |  46.4  |  41.0   |          [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco.py)           |                                       [model](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco/queryinst_r101_fpn_mstrain_480-800_3x_coco_20210904_104048-91f9995b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco/queryinst_r101_fpn_mstrain_480-800_3x_coco_20210904_104048.log.json)                                       |
+| QueryInst | R-101-FPN | pytorch |   3x    |         300         |    True     |    True    |  49.0  |  42.9   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_153621-76cce59f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_153621.log.json) |
+
+## Citation
+
+```latex
+@InProceedings{Fang_2021_ICCV,
+    author    = {Fang, Yuxin and Yang, Shusheng and Wang, Xinggang and Li, Yu and Fang, Chen and Shan, Ying and Feng, Bin and Liu, Wenyu},
+    title     = {Instances As Queries},
+    booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
+    month     = {October},
+    year      = {2021},
+    pages     = {6910-6919}
+}
+```
diff --git a/configs/queryinst/metafile.yml b/configs/queryinst/metafile.yml
new file mode 100755
index 0000000..da7f0a7
--- /dev/null
+++ b/configs/queryinst/metafile.yml
@@ -0,0 +1,100 @@
+Collections:
+  - Name: QueryInst
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+        - QueryInst
+    Paper:
+      URL: https://openaccess.thecvf.com/content/ICCV2021/papers/Fang_Instances_As_Queries_ICCV_2021_paper.pdf
+      Title: 'Instances as Queries'
+    README: configs/queryinst/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/queryinst.py
+      Version: v2.18.0
+
+Models:
+  - Name: queryinst_r50_fpn_1x_coco
+    In Collection: QueryInst
+    Config: configs/queryinst/queryinst_r50_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_1x_coco/queryinst_r50_fpn_1x_coco_20210907_084916-5a8f1998.pth
+
+  - Name: queryinst_r50_fpn_mstrain_480-800_3x_coco
+    In Collection: QueryInst
+    Config: configs/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco/queryinst_r50_fpn_mstrain_480-800_3x_coco_20210901_103643-7837af86.pth
+
+  - Name: queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco
+    In Collection: QueryInst
+    Config: configs/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_101802-85cffbd8.pth
+
+  - Name: queryinst_r101_fpn_mstrain_480-800_3x_coco
+    In Collection: QueryInst
+    Config: configs/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco/queryinst_r101_fpn_mstrain_480-800_3x_coco_20210904_104048-91f9995b.pth
+
+  - Name: queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco
+    In Collection: QueryInst
+    Config: configs/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 49.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 42.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_153621-76cce59f.pth
diff --git a/configs/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py b/configs/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
new file mode 100755
index 0000000..fd138f5
--- /dev/null
+++ b/configs/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,  # the only backbone change besides the checkpoint below
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco.py b/configs/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco.py
new file mode 100755
index 0000000..07cae19
--- /dev/null
+++ b/configs/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './queryinst_r50_fpn_mstrain_480-800_3x_coco.py'  # R-50 3x config
+
+model = dict(
+    backbone=dict(
+        depth=101,  # the only backbone change besides the checkpoint below
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/queryinst/queryinst_r50_fpn_1x_coco.py b/configs/queryinst/queryinst_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..48f5773
--- /dev/null
+++ b/configs/queryinst/queryinst_r50_fpn_1x_coco.py
@@ -0,0 +1,138 @@
+_base_ = [
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+num_stages = 6  # sizes every per-stage list below (heads, losses, train_cfg)
+num_proposals = 100  # used by rpn_head and as the test-time max_per_img
+model = dict(
+    type='QueryInst',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=0,
+        add_extra_convs='on_input',
+        num_outs=4),
+    rpn_head=dict(
+        type='EmbeddingRPNHead',
+        num_proposals=num_proposals,
+        proposal_feature_channel=256),
+    roi_head=dict(
+        type='SparseRoIHead',
+        num_stages=num_stages,
+        stage_loss_weights=[1] * num_stages,  # equal weight for every stage
+        proposal_feature_channel=256,
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=[  # one identical DIIHead config per stage
+            dict(
+                type='DIIHead',
+                num_classes=80,
+                num_ffn_fcs=2,
+                num_heads=8,
+                num_cls_fcs=1,
+                num_reg_fcs=3,
+                feedforward_channels=2048,
+                in_channels=256,
+                dropout=0.0,
+                ffn_act_cfg=dict(type='ReLU', inplace=True),
+                dynamic_conv_cfg=dict(
+                    type='DynamicConv',
+                    in_channels=256,
+                    feat_channels=64,
+                    out_channels=256,
+                    input_feat_shape=7,  # equals bbox RoIAlign output_size
+                    act_cfg=dict(type='ReLU', inplace=True),
+                    norm_cfg=dict(type='LN')),
+                loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+                loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+                loss_cls=dict(
+                    type='FocalLoss',
+                    use_sigmoid=True,
+                    gamma=2.0,
+                    alpha=0.25,
+                    loss_weight=2.0),
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    clip_border=False,
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.5, 0.5, 1., 1.])) for _ in range(num_stages)
+        ],
+        mask_head=[  # one identical DynamicMaskHead config per stage
+            dict(
+                type='DynamicMaskHead',
+                dynamic_conv_cfg=dict(
+                    type='DynamicConv',
+                    in_channels=256,
+                    feat_channels=64,
+                    out_channels=256,
+                    input_feat_shape=14,  # equals mask RoIAlign output_size
+                    with_proj=False,
+                    act_cfg=dict(type='ReLU', inplace=True),
+                    norm_cfg=dict(type='LN')),
+                num_convs=4,
+                num_classes=80,
+                roi_feat_size=14,
+                in_channels=256,
+                conv_kernel_size=3,
+                conv_out_channels=256,
+                class_agnostic=False,
+                norm_cfg=dict(type='BN'),
+                upsample_cfg=dict(type='deconv', scale_factor=2),
+                loss_mask=dict(
+                    type='DiceLoss',
+                    loss_weight=8.0,
+                    use_sigmoid=True,
+                    activate=False,
+                    eps=1e-5)) for _ in range(num_stages)
+        ]),
+    # training and testing settings
+    train_cfg=dict(
+        rpn=None,
+        rcnn=[  # one assigner/sampler config per stage
+            dict(
+                assigner=dict(
+                    type='HungarianAssigner',
+                    cls_cost=dict(type='FocalLossCost', weight=2.0),
+                    reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+                    iou_cost=dict(type='IoUCost', iou_mode='giou',
+                                  weight=2.0)),
+                sampler=dict(type='PseudoSampler'),
+                pos_weight=1,
+                mask_size=28,
+            ) for _ in range(num_stages)
+        ]),
+    test_cfg=dict(
+        rpn=None, rcnn=dict(max_per_img=num_proposals, mask_thr_binary=0.5)))
+
+# optimizer: AdamW replacing the inherited optimizer (`_delete_=True`)
+optimizer = dict(
+    _delete_=True,
+    type='AdamW',
+    lr=0.0001,
+    weight_decay=0.0001,
+    paramwise_cfg=dict(
+        custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}))
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
+# learning policy: step policy with steps [8, 11]; the runner runs 12 epochs
+lr_config = dict(policy='step', step=[8, 11], warmup_iters=1000)
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/configs/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py b/configs/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
new file mode 100755
index 0000000..3089b3c
--- /dev/null
+++ b/configs/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
@@ -0,0 +1,54 @@
+_base_ = './queryinst_r50_fpn_mstrain_480-800_3x_coco.py'
+num_proposals = 300  # the 1x base config further up the chain uses 100
+model = dict(
+    rpn_head=dict(num_proposals=num_proposals),
+    test_cfg=dict(
+        _delete_=True,  # replace the inherited test_cfg instead of merging
+        rpn=None,
+        rcnn=dict(max_per_img=num_proposals, mask_thr_binary=0.5)))
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# augmentation strategy originates from DETR.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='AutoAugment',
+        policies=[[  # policy 1: a single multi-value Resize
+            dict(
+                type='Resize',
+                img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                           (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                           (736, 1333), (768, 1333), (800, 1333)],
+                multiscale_mode='value',
+                keep_ratio=True)
+        ],
+                  [  # policy 2: Resize, then RandomCrop, then Resize again
+                      dict(
+                          type='Resize',
+                          img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+                          multiscale_mode='value',
+                          keep_ratio=True),
+                      dict(
+                          type='RandomCrop',
+                          crop_type='absolute_range',
+                          crop_size=(384, 600),
+                          allow_negative_crop=True),
+                      dict(
+                          type='Resize',
+                          img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                     (576, 1333), (608, 1333), (640, 1333),
+                                     (672, 1333), (704, 1333), (736, 1333),
+                                     (768, 1333), (800, 1333)],
+                          multiscale_mode='value',
+                          override=True,
+                          keep_ratio=True)
+                  ]]),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
+]
+data = dict(train=dict(pipeline=train_pipeline))  # only the train pipeline is overridden
diff --git a/configs/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco.py b/configs/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco.py
new file mode 100755
index 0000000..89e2cd1
--- /dev/null
+++ b/configs/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco.py
@@ -0,0 +1,23 @@
+_base_ = './queryinst_r50_fpn_1x_coco.py'  # inherit the 1x QueryInst config
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+min_values = (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)  # each paired with 1333 below
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, value) for value in min_values],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
+]
+
+data = dict(train=dict(pipeline=train_pipeline))  # only the train pipeline is overridden
+lr_config = dict(policy='step', step=[27, 33])  # steps for the 36-epoch run set below
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/configs/regnet/README.md b/configs/regnet/README.md
new file mode 100755
index 0000000..61dba42
--- /dev/null
+++ b/configs/regnet/README.md
@@ -0,0 +1,121 @@
+# RegNet
+
+> [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678)
+
+<!-- [BACKBONE] -->
+
+## Abstract
+
+In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143971942-da50f719-61e9-43bd-9468-0dbfbe80284e.png"/>
+</div>
+
+## Introduction
+
+We implement RegNetX and RegNetY models in detection systems and provide their first results on Mask R-CNN, Faster R-CNN and RetinaNet.
+
+The pre-trained models are converted from [model zoo of pycls](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md).
+
+## Usage
+
+To use a RegNet model, follow two steps:
+
+1. Convert the model to ResNet-style supported by MMDetection
+2. Modify backbone and neck in config accordingly
+
+### Convert model
+
+We have already prepared models with FLOPs ranging from 400M to 12G in our model zoo.
+
+For more general usage, we also provide the script `regnet2mmdet.py` in the `tools/model_converters` directory to convert the keys of models pretrained by [pycls](https://github.com/facebookresearch/pycls/) to
+ResNet-style checkpoints used in MMDetection.
+
+```bash
+python -u tools/model_converters/regnet2mmdet.py ${PRETRAIN_PATH} ${STORE_PATH}
+```
+
+This script converts the model from `PRETRAIN_PATH` and stores the converted model in `STORE_PATH`.
+
+### Modify config
+
+Users can modify the backbone's `depth` and the corresponding keys in `arch` in the config according to the configs in the [pycls model zoo](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md).
+The parameter `in_channels` in FPN can be found in Figures 15 & 16 of the paper (`wi` in the legend).
+This directory already provides some configs with their performance, using RegNetX from the 400MF to the 12GF level.
+For other pre-trained models or self-implemented RegNet models, users are responsible for checking these parameters themselves.
+
+**Note**: Although Fig. 15 & 16 also provide `w0`, `wa`, `wm`, `group_w`, and `bot_mul` for `arch`, these values are quantized and thus inaccurate; using them sometimes produces a different backbone whose keys do not match those in the pre-trained model.
+
+## Results and Models
+
+### Mask R-CNN
+
+|                                       Backbone                                       |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                               Config                                                               |                                                                                                                                                                                          Download                                                                                                                                                                                          |
+| :----------------------------------------------------------------------------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :--------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|                [R-50-FPN](../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py)                 | pytorch |   1x    |   4.4    |      12.0      |  38.2  |  34.7   |           [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py)           |                                               [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205_050542.log.json)                                                |
+|            [RegNetX-3.2GF-FPN](./mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py)             | pytorch |   1x    |   5.0    |                |  40.3  |  36.6   |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py)        |                           [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_1x_coco_20200520_163141-2a9d1814.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_1x_coco_20200520_163141.log.json)                           |
+|             [RegNetX-4.0GF-FPN](./mask_rcnn_regnetx-4GF_fpn_1x_coco.py)              | pytorch |   1x    |   5.5    |                |  41.5  |  37.4   |        [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco.py)         |                               [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco/mask_rcnn_regnetx-4GF_fpn_1x_coco_20200517_180217-32e9c92d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco/mask_rcnn_regnetx-4GF_fpn_1x_coco_20200517_180217.log.json)                               |
+|               [R-101-FPN](../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py)                | pytorch |   1x    |   6.4    |      10.3      |  40.0  |  36.1   |          [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py)           |                                             [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204-1efe0ed5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204_144809.log.json)                                              |
+|            [RegNetX-6.4GF-FPN](./mask_rcnn_regnetx-6.4GF_fpn_1x_coco.py)             | pytorch |   1x    |   6.1    |                |  41.0  |  37.1   |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco.py)        |                           [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco/mask_rcnn_regnetx-6.4GF_fpn_1x_coco_20200517_180439-3a7aae83.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco/mask_rcnn_regnetx-6.4GF_fpn_1x_coco_20200517_180439.log.json)                           |
+|         [X-101-32x4d-FPN](../mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py)          | pytorch |   1x    |   7.6    |      9.4       |  41.9  |  37.5   |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py)        |                                 [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205-478d0b67.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205_034906.log.json)                                  |
+|             [RegNetX-8.0GF-FPN](./mask_rcnn_regnetx-8GF_fpn_1x_coco.py)              | pytorch |   1x    |   6.4    |                |  41.7  |  37.5   |        [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco.py)         |                               [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco/mask_rcnn_regnetx-8GF_fpn_1x_coco_20200517_180515-09daa87e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco/mask_rcnn_regnetx-8GF_fpn_1x_coco_20200517_180515.log.json)                               |
+|             [RegNetX-12GF-FPN](./mask_rcnn_regnetx-12GF_fpn_1x_coco.py)              | pytorch |   1x    |   7.4    |                |  42.2  |   38    |        [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco.py)        |                             [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco/mask_rcnn_regnetx-12GF_fpn_1x_coco_20200517_180552-b538bd8b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco/mask_rcnn_regnetx-12GF_fpn_1x_coco_20200517_180552.log.json)                             |
+| [RegNetX-3.2GF-FPN-DCN-C3-C5](./mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py) | pytorch |   1x    |   5.0    |                |  40.3  |  36.6   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco_20200520_172726-75f40794.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco_20200520_172726.log.json) |
+
+### Faster R-CNN
+
+|                            Backbone                             |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                         Config                                                          |                                                                                                                                                                    Download                                                                                                                                                                    |
+| :-------------------------------------------------------------: | :-----: | :-----: | :------: | :------------: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    [R-50-FPN](../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py)    | pytorch |   1x    |   4.0    |      18.2      |  37.4  |   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py)    |                   [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130_204655.log.json)                    |
+| [RegNetX-3.2GF-FPN](./faster_rcnn_regnetx-3.2GF_fpn_1x_coco.py) | pytorch |   1x    |   4.5    |                |  39.9  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco/faster_rcnn_regnetx-3.2GF_fpn_1x_coco_20200517_175927-126fd9bf.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco/faster_rcnn_regnetx-3.2GF_fpn_1x_coco_20200517_175927.log.json) |
+| [RegNetX-3.2GF-FPN](./faster_rcnn_regnetx-3.2GF_fpn_2x_coco.py) | pytorch |   2x    |   4.5    |                |  41.1  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco/faster_rcnn_regnetx-3.2GF_fpn_2x_coco_20200520_223955-e2081918.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco/faster_rcnn_regnetx-3.2GF_fpn_2x_coco_20200520_223955.log.json) |
+
+### RetinaNet
+
+|                           Backbone                            |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                        Config                                                         |                                                                                                                                                                Download                                                                                                                                                                |
+| :-----------------------------------------------------------: | :-----: | :-----: | :------: | :------------: | :----: | :-------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|     [R-50-FPN](../retinanet/retinanet_r50_fpn_1x_coco.py)     | pytorch |   1x    |   3.8    |      16.6      |  36.5  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r50_fpn_1x_coco.py)     |                     [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130_002941.log.json)                      |
+| [RegNetX-800MF-FPN](./retinanet_regnetx-800MF_fpn_1x_coco.py) | pytorch |   1x    |   2.5    |                |  35.6  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-800MF_fpn_1x_coco/retinanet_regnetx-800MF_fpn_1x_coco_20200517_191403-f6f91d10.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-800MF_fpn_1x_coco/retinanet_regnetx-800MF_fpn_1x_coco_20200517_191403.log.json) |
+| [RegNetX-1.6GF-FPN](./retinanet_regnetx-1.6GF_fpn_1x_coco.py) | pytorch |   1x    |   3.3    |                |  37.3  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco/retinanet_regnetx-1.6GF_fpn_1x_coco_20200517_191403-37009a9d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco/retinanet_regnetx-1.6GF_fpn_1x_coco_20200517_191403.log.json) |
+| [RegNetX-3.2GF-FPN](./retinanet_regnetx-3.2GF_fpn_1x_coco.py) | pytorch |   1x    |   4.2    |                |  39.1  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco/retinanet_regnetx-3.2GF_fpn_1x_coco_20200520_163141-cb1509e8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco/retinanet_regnetx-3.2GF_fpn_1x_coco_20200520_163141.log.json) |
+
+### Pre-trained models
+
+We also train some models with longer schedules and multi-scale training. Users can fine-tune them for downstream tasks.
+
+|      Method       |                                   Backbone                                    |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                                Config                                                                 |                                                                                                                                                                                                Download                                                                                                                                                                                                |
+| :---------------: | :---------------------------------------------------------------------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-----------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    Faster RCNN    |    [RegNetX-400MF-FPN](./faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py)    | pytorch |   3x    |   2.3    |                |  37.1  |    -    |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210526_095112-e1967c37.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210526_095112.log.json)             |
+|    Faster RCNN    |    [RegNetX-800MF-FPN](./faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py)    | pytorch |   3x    |   2.8    |                |  38.8  |    -    |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210526_095118-a2c70b20.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210526_095118.log.json)             |
+|    Faster RCNN    |    [RegNetX-1.6GF-FPN](./faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py)    | pytorch |   3x    |   3.4    |                |  40.5  |    -    |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py)    |                                     [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-1_20210526_095325-94aa46cc.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-1_20210526_095325.log.json)                                     |
+|    Faster RCNN    |    [RegNetX-3.2GF-FPN](./faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py)    | pytorch |   3x    |   4.4    |                |  42.3  |    -    |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py)    |                                     [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-3_20210526_095152-e16a5227.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-3_20210526_095152.log.json)                                     |
+|    Faster RCNN    |      [RegNetX-4GF-FPN](./faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py)      | pytorch |   3x    |   4.9    |                |  42.8  |    -    |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py)     |                 [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210526_095201-65eaf841.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210526_095201.log.json)                 |
+|     Mask RCNN     |  [RegNetX-400MF-FPN](./mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco.py)   | pytorch |   3x    |   2.5    |                |  37.6  |  34.4   |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco.py)   |       [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco_20210601_235443-8aac57a4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco_20210601_235443.log.json)       |
+|     Mask RCNN     |  [RegNetX-800MF-FPN](./mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco.py)   | pytorch |   3x    |   2.9    |                |  39.5  |  36.1   |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco.py)   |       [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco_20210602_210641-715d51f5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco_20210602_210641.log.json)       |
+|     Mask RCNN     |  [RegNetX-1.6GF-FPN](./mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco.py)   | pytorch |   3x    |   3.6    |                |  40.9  |  37.5   |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco.py)   |                                    [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-1_20210602_210641-6764cff5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-1_20210602_210641.log.json)                                    |
+|     Mask RCNN     |     [RegNetX-3.2GF-FPN](./mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py)     | pytorch |   3x    |   5.0    |                |  43.1  |  38.7   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py)     |                 [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco_20200521_202221-99879813.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco_20200521_202221.log.json)                 |
+|     Mask RCNN     |    [RegNetX-4GF-FPN](./mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco.py)     | pytorch |   3x    |   5.1    |                |  43.4  |  39.2   |   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco.py)    |           [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco_20210602_032621-00f0331c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco_20210602_032621.log.json)           |
+| Cascade Mask RCNN | [RegNetX-400MF-FPN](./cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py) | pytorch |   3x    |   4.3    |                |  41.6  |  36.4   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210715_211619-5142f449.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210715_211619.log.json) |
+| Cascade Mask RCNN | [RegNetX-800MF-FPN](./cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py) | pytorch |   3x    |   4.8    |                |  42.8  |  37.6   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210715_211616-dcbd13f4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210715_211616.log.json) |
+| Cascade Mask RCNN | [RegNetX-1.6GF-FPN](./cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py) | pytorch |   3x    |   5.4    |                |  44.5  |  39.0   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py) |                         [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-1_20210715_211616-75f29a61.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-1_20210715_211616.log.json)                         |
+| Cascade Mask RCNN | [RegNetX-3.2GF-FPN](./cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py) | pytorch |   3x    |   6.4    |                |  45.8  |  40.0   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py) |                         [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-3_20210715_211616-b9c2c58b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-3_20210715_211616.log.json)                         |
+| Cascade Mask RCNN |   [RegNetX-4GF-FPN](./cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py)   | pytorch |   3x    |   6.9    |                |  45.8  |  40.0   |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py)  |     [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210715_212034-cbb1be4c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210715_212034.log.json)     |
+
+### Notice
+
+1. The models are trained using a different weight decay, i.e., `weight_decay=5e-5`, according to the setting in ImageNet training. This brings an absolute improvement of at least 0.7 AP but does not improve the model using ResNet-50.
+2. RetinaNets using RegNets are trained with a learning rate of 0.02 and gradient clipping. We find that using a learning rate of 0.02 improves the results by at least 0.7 AP (absolute) and that gradient clipping is necessary to stabilize the training. However, this does not improve the performance of ResNet-50-FPN RetinaNet.
+
+## Citation
+
+```latex
+@article{radosavovic2020designing,
+    title={Designing Network Design Spaces},
+    author={Ilija Radosavovic and Raj Prateek Kosaraju and Ross Girshick and Kaiming He and Piotr Dollár},
+    year={2020},
+    eprint={2003.13678},
+    archivePrefix={arXiv},
+    primaryClass={cs.CV}
+}
+```
diff --git a/configs/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py b/configs/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..358d85a
--- /dev/null
+++ b/configs/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py'
+model = dict(  # only backbone and neck are set in this file
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_1.6gf',  # base config uses regnetx_3.2gf
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[72, 168, 408, 912],  # arch-specific; see regnet README
+        out_channels=256,
+        num_outs=5))
diff --git a/configs/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py b/configs/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..8464571
--- /dev/null
+++ b/configs/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py
@@ -0,0 +1,63 @@
+_base_ = [
+    '../common/mstrain_3x_coco_instance.py',
+    '../_base_/models/cascade_mask_rcnn_r50_fpn.py'
+]
+model = dict(  # replace the base model's backbone with RegNetX-3.2GF
+    backbone=dict(
+        _delete_=True,  # discard inherited backbone keys instead of merging
+        type='RegNet',
+        arch='regnetx_3.2gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[96, 192, 432, 1008],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
+img_norm_cfg = dict(
+    # The mean and std are used in PyCls when training RegNets
+    mean=[103.53, 116.28, 123.675],  # presumably BGR channel order (verify)
+    std=[57.375, 57.12, 58.395],
+    to_rgb=False)  # Normalize does no channel-order conversion
+train_pipeline = [
+    # Images are converted to float32 directly after loading in PyCls
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',  # sample a scale between the two bounds
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),  # shared with test_pipeline
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # a single test scale
+        flip=False,  # no flip augmentation at test time
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+data = dict(  # train uses a nested `dataset` key (wrapper in base?)
+    train=dict(dataset=dict(pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+
+optimizer = dict(weight_decay=0.00005)  # 5e-5; other keys come from _base_
diff --git a/configs/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py b/configs/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..2a8990a
--- /dev/null
+++ b/configs/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py'
+model = dict(  # overrides applied on top of the 3.2GF _base_ config
+    backbone=dict(  # no _delete_: merged into the parent's backbone
+        type='RegNet',
+        arch='regnetx_400mf',  # RegNetX variant used by this config
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_400mf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[32, 64, 160, 384],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
diff --git a/configs/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py b/configs/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..3157863
--- /dev/null
+++ b/configs/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py'
+model = dict(  # overrides applied on top of the 3.2GF _base_ config
+    backbone=dict(  # no _delete_: merged into the parent's backbone
+        type='RegNet',
+        arch='regnetx_4.0gf',  # RegNetX variant used by this config
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[80, 240, 560, 1360],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
diff --git a/configs/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py b/configs/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..41376ad
--- /dev/null
+++ b/configs/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py'
+model = dict(  # overrides applied on top of the 3.2GF _base_ config
+    backbone=dict(  # no _delete_: merged into the parent's backbone
+        type='RegNet',
+        arch='regnetx_800mf',  # RegNetX variant used by this config
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[64, 128, 288, 672],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
diff --git a/configs/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py b/configs/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..385b5ca
--- /dev/null
+++ b/configs/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py'
+model = dict(  # overrides applied on top of the 3.2GF _base_ config
+    backbone=dict(  # no _delete_: merged into the parent's backbone
+        type='RegNet',
+        arch='regnetx_1.6gf',  # RegNetX variant used by this config
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[72, 168, 408, 912],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
diff --git a/configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco.py b/configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco.py
new file mode 100755
index 0000000..88d270e
--- /dev/null
+++ b/configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco.py
@@ -0,0 +1,57 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(  # replace the base model's backbone with RegNetX-3.2GF
+    backbone=dict(
+        _delete_=True,  # discard inherited backbone keys instead of merging
+        type='RegNet',
+        arch='regnetx_3.2gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[96, 192, 432, 1008],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
+img_norm_cfg = dict(
+    # The mean and std are used in PyCls when training RegNets
+    mean=[103.53, 116.28, 123.675],  # presumably BGR channel order (verify)
+    std=[57.375, 57.12, 58.395],
+    to_rgb=False)  # Normalize does no channel-order conversion
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),  # boxes only, no masks
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),  # shared with test_pipeline
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # a single test scale
+        flip=False,  # no flip augmentation at test time
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # point every split at the pipelines defined above
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005)
diff --git a/configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco.py b/configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco.py
new file mode 100755
index 0000000..612490b
--- /dev/null
+++ b/configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco.py
@@ -0,0 +1,3 @@
+_base_ = './faster_rcnn_regnetx-3.2GF_fpn_1x_coco.py'  # parent (1x) config
+lr_config = dict(step=[16, 22])  # override the parent's LR step epochs
+runner = dict(type='EpochBasedRunner', max_epochs=24)  # 2x schedule
diff --git a/configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py b/configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..b7e6e1a
--- /dev/null
+++ b/configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py
@@ -0,0 +1,61 @@
+_base_ = [
+    '../common/mstrain_3x_coco.py', '../_base_/models/faster_rcnn_r50_fpn.py'
+]
+model = dict(  # replace the base model's backbone with RegNetX-3.2GF
+    backbone=dict(
+        _delete_=True,  # discard inherited backbone keys instead of merging
+        type='RegNet',
+        arch='regnetx_3.2gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[96, 192, 432, 1008],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
+img_norm_cfg = dict(
+    # The mean and std are used in PyCls when training RegNets
+    mean=[103.53, 116.28, 123.675],  # presumably BGR channel order (verify)
+    std=[57.375, 57.12, 58.395],
+    to_rgb=False)  # Normalize does no channel-order conversion
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),  # boxes only, no masks
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',  # sample a scale between the two bounds
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),  # shared with test_pipeline
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # a single test scale
+        flip=False,  # no flip augmentation at test time
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+data = dict(  # train uses a nested `dataset` key (wrapper in base?)
+    train=dict(dataset=dict(pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+
+optimizer = dict(weight_decay=0.00005)  # 5e-5; other keys come from _base_
diff --git a/configs/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py b/configs/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..0a05f6e
--- /dev/null
+++ b/configs/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py'
+model = dict(  # overrides applied on top of the 3.2GF _base_ config
+    backbone=dict(  # no _delete_: merged into the parent's backbone
+        type='RegNet',
+        arch='regnetx_400mf',  # RegNetX variant used by this config
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_400mf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[32, 64, 160, 384],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
diff --git a/configs/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py b/configs/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..98b3fc2
--- /dev/null
+++ b/configs/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py'
+model = dict(  # overrides applied on top of the 3.2GF _base_ config
+    backbone=dict(  # no _delete_: merged into the parent's backbone
+        type='RegNet',
+        arch='regnetx_4.0gf',  # RegNetX variant used by this config
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[80, 240, 560, 1360],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
diff --git a/configs/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py b/configs/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..67f448b
--- /dev/null
+++ b/configs/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py'
+model = dict(  # overrides applied on top of the 3.2GF _base_ config
+    backbone=dict(  # no _delete_: merged into the parent's backbone
+        type='RegNet',
+        arch='regnetx_800mf',  # RegNetX variant used by this config
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[64, 128, 288, 672],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
diff --git a/configs/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco.py b/configs/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco.py
new file mode 100755
index 0000000..7970c3c
--- /dev/null
+++ b/configs/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco.py
@@ -0,0 +1,26 @@
+_base_ = [
+    '../common/mstrain-poly_3x_coco_instance.py',
+    '../_base_/models/mask_rcnn_r50_fpn.py'
+]
+
+model = dict(  # replace the base model's backbone with RegNetX-1.6GF
+    backbone=dict(
+        _delete_=True,  # discard inherited backbone keys instead of merging
+        type='RegNet',
+        arch='regnetx_1.6gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[72, 168, 408, 912],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
+
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005)
+optimizer_config = dict(  # replace inherited config; clip grad L2 norm at 35
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco.py b/configs/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco.py
new file mode 100755
index 0000000..ce3661c
--- /dev/null
+++ b/configs/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py'
+model = dict(  # overrides applied on top of the 3.2GF _base_ config
+    backbone=dict(  # no _delete_: merged into the parent's backbone
+        type='RegNet',
+        arch='regnetx_12gf',  # RegNetX variant used by this config
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_12gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[224, 448, 896, 2240],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
diff --git a/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py b/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py
new file mode 100755
index 0000000..44bf0d1
--- /dev/null
+++ b/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py
@@ -0,0 +1,58 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(  # replace the base model's backbone with RegNetX-3.2GF
+    backbone=dict(
+        _delete_=True,  # discard inherited backbone keys instead of merging
+        type='RegNet',
+        arch='regnetx_3.2gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[96, 192, 432, 1008],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
+img_norm_cfg = dict(
+    # The mean and std are used in PyCls when training RegNets
+    mean=[103.53, 116.28, 123.675],  # presumably BGR channel order (verify)
+    std=[57.375, 57.12, 58.395],
+    to_rgb=False)  # Normalize does no channel-order conversion
+train_pipeline = [
+    # Images are converted to float32 directly after loading in PyCls
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),  # shared with test_pipeline
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # a single test scale
+        flip=False,  # no flip augmentation at test time
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # point every split at the pipelines defined above
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005)
diff --git a/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py b/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py
new file mode 100755
index 0000000..5b53428
--- /dev/null
+++ b/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = 'mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py'  # parent config
+model = dict(  # add DCNv2 settings to the inherited RegNetX backbone
+    backbone=dict(  # no _delete_: merged into the parent's backbone
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),  # DCN off in stage 0 only
+        init_cfg=dict(  # same checkpoint as the parent config
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')))
diff --git a/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py b/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..aca64d3
--- /dev/null
+++ b/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py
@@ -0,0 +1,66 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(  # replace the base model's backbone with RegNetX-3.2GF
+    backbone=dict(
+        _delete_=True,  # discard inherited backbone keys instead of merging
+        type='RegNet',
+        arch='regnetx_3.2gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[96, 192, 432, 1008],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
+img_norm_cfg = dict(
+    # The mean and std are used in PyCls when training RegNets
+    mean=[103.53, 116.28, 123.675],  # presumably BGR channel order (verify)
+    std=[57.375, 57.12, 58.395],
+    to_rgb=False)  # Normalize does no channel-order conversion
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',  # pick one of the listed scales
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),  # shared with test_pipeline
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # a single test scale
+        flip=False,  # no flip augmentation at test time
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # point every split at the pipelines defined above
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005)
+lr_config = dict(step=[28, 34])  # override the 1x schedule's LR steps
+runner = dict(type='EpochBasedRunner', max_epochs=36)  # 3x schedule
+optimizer_config = dict(  # replace inherited config; clip grad L2 norm at 35
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco.py b/configs/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco.py
new file mode 100755
index 0000000..c38dfa6
--- /dev/null
+++ b/configs/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco.py
@@ -0,0 +1,26 @@
+_base_ = [
+    '../common/mstrain-poly_3x_coco_instance.py',
+    '../_base_/models/mask_rcnn_r50_fpn.py'
+]
+
+model = dict(  # replace the base model's backbone with RegNetX-400MF
+    backbone=dict(
+        _delete_=True,  # discard inherited backbone keys instead of merging
+        type='RegNet',
+        arch='regnetx_400mf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_400mf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[32, 64, 160, 384],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
+
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005)
+optimizer_config = dict(  # replace inherited config; clip grad L2 norm at 35
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco.py b/configs/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco.py
new file mode 100755
index 0000000..874d485
--- /dev/null
+++ b/configs/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py'
+model = dict(  # overrides applied on top of the 3.2GF _base_ config
+    backbone=dict(  # no _delete_: merged into the parent's backbone
+        type='RegNet',
+        arch='regnetx_4.0gf',  # RegNetX variant used by this config
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[80, 240, 560, 1360],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
diff --git a/configs/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco.py b/configs/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco.py
new file mode 100755
index 0000000..f0b65ea
--- /dev/null
+++ b/configs/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco.py
@@ -0,0 +1,26 @@
+_base_ = [
+    '../common/mstrain-poly_3x_coco_instance.py',
+    '../_base_/models/mask_rcnn_r50_fpn.py'
+]
+
+model = dict(  # replace the base model's backbone with RegNetX-4.0GF
+    backbone=dict(
+        _delete_=True,  # discard inherited backbone keys instead of merging
+        type='RegNet',
+        arch='regnetx_4.0gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[80, 240, 560, 1360],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
+
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005)
+optimizer_config = dict(  # replace inherited config; clip grad L2 norm at 35
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco.py b/configs/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco.py
new file mode 100755
index 0000000..99387d8
--- /dev/null
+++ b/configs/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py'
+model = dict(  # overrides applied on top of the 3.2GF _base_ config
+    backbone=dict(  # no _delete_: merged into the parent's backbone
+        type='RegNet',
+        arch='regnetx_6.4gf',  # RegNetX variant used by this config
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_6.4gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[168, 392, 784, 1624],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
diff --git a/configs/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco.py b/configs/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco.py
new file mode 100755
index 0000000..335ebab
--- /dev/null
+++ b/configs/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco.py
@@ -0,0 +1,26 @@
+_base_ = [
+    '../common/mstrain-poly_3x_coco_instance.py',
+    '../_base_/models/mask_rcnn_r50_fpn.py'
+]
+
+model = dict(  # replace the base model's backbone with RegNetX-800MF
+    backbone=dict(
+        _delete_=True,  # discard inherited backbone keys instead of merging
+        type='RegNet',
+        arch='regnetx_800mf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[64, 128, 288, 672],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
+
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005)
+optimizer_config = dict(  # replace inherited config; clip grad L2 norm at 35
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco.py b/configs/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco.py
new file mode 100755
index 0000000..1e7832f
--- /dev/null
+++ b/configs/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py'
+model = dict(  # overrides applied on top of the 3.2GF _base_ config
+    backbone=dict(  # no _delete_: merged into the parent's backbone
+        type='RegNet',
+        arch='regnetx_8.0gf',  # RegNetX variant used by this config
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(  # start from the pretrained backbone checkpoint
+            type='Pretrained', checkpoint='open-mmlab://regnetx_8.0gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[80, 240, 720, 1920],  # assumed backbone output widths
+        out_channels=256,
+        num_outs=5))
diff --git a/configs/regnet/metafile.yml b/configs/regnet/metafile.yml
new file mode 100755
index 0000000..28bd82f
--- /dev/null
+++ b/configs/regnet/metafile.yml
@@ -0,0 +1,797 @@
+Models:
+  - Name: mask_rcnn_regnetx-3.2GF_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_1x_coco_20200520_163141-2a9d1814.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask_rcnn_regnetx-4GF_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco/mask_rcnn_regnetx-4GF_fpn_1x_coco_20200517_180217-32e9c92d.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask_rcnn_regnetx-6.4GF_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.1
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco/mask_rcnn_regnetx-6.4GF_fpn_1x_coco_20200517_180439-3a7aae83.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask_rcnn_regnetx-8GF_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco/mask_rcnn_regnetx-8GF_fpn_1x_coco_20200517_180515-09daa87e.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask_rcnn_regnetx-12GF_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.4
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco/mask_rcnn_regnetx-12GF_fpn_1x_coco_20200517_180552-b538bd8b.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco_20200520_172726-75f40794.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: faster_rcnn_regnetx-3.2GF_fpn_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco/faster_rcnn_regnetx-3.2GF_fpn_1x_coco_20200517_175927-126fd9bf.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: faster_rcnn_regnetx-3.2GF_fpn_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 4.5
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco/faster_rcnn_regnetx-3.2GF_fpn_2x_coco_20200520_223955-e2081918.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: retinanet_regnetx-800MF_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 2.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 35.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-800MF_fpn_1x_coco/retinanet_regnetx-800MF_fpn_1x_coco_20200517_191403-f6f91d10.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: retinanet_regnetx-1.6GF_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.3
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco/retinanet_regnetx-1.6GF_fpn_1x_coco_20200517_191403-37009a9d.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: retinanet_regnetx-3.2GF_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.2
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco/retinanet_regnetx-3.2GF_fpn_1x_coco_20200520_163141-cb1509e8.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 2.3
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210526_095112-e1967c37.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 2.8
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210526_095118-a2c70b20.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 3.4
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-1_20210526_095325-94aa46cc.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.4
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-3_20210526_095152-e16a5227.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.9
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210526_095201-65eaf841.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco_20200521_202221-99879813.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco.py
+    Metadata:
+      Training Memory (GB): 2.5
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 34.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco_20210601_235443-8aac57a4.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco.py
+    Metadata:
+      Training Memory (GB): 2.9
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco_20210602_210641-715d51f5.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco.py
+    Metadata:
+      Training Memory (GB): 3.6
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-1_20210602_210641-6764cff5.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco_20200521_202221-99879813.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.1
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco_20210602_032621-00f0331c.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.3
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210715_211619-5142f449.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.8
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210715_211616-dcbd13f4.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.4
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-1_20210715_211616-75f29a61.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-3_20210715_211616-b9c2c58b.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco.py
+    Metadata:
+      Training Memory (GB): 6.9
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210715_212034-cbb1be4c.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
diff --git a/configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py b/configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py
new file mode 100755
index 0000000..7395c1b
--- /dev/null
+++ b/configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './retinanet_regnetx-3.2GF_fpn_1x_coco.py'  # builds on the 3.2GF RetinaNet config
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_1.6gf',  # 1.6GF variant instead of the base file's 3.2GF
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[72, 168, 408, 912],  # NOTE(review): presumably regnetx_1.6gf stage widths - confirm
+        out_channels=256,
+        num_outs=5))
diff --git a/configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py b/configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py
new file mode 100755
index 0000000..f05307c
--- /dev/null
+++ b/configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py
@@ -0,0 +1,59 @@
+_base_ = [  # base model, dataset, schedule and runtime configs this file builds on
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    backbone=dict(
+        _delete_=True,  # NOTE(review): presumably drops the inherited backbone keys instead of merging - confirm
+        type='RegNet',
+        arch='regnetx_3.2gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[96, 192, 432, 1008],  # NOTE(review): presumably regnetx_3.2gf stage widths - confirm
+        out_channels=256,
+        num_outs=5))
+img_norm_cfg = dict(
+    # The mean and std are used in PyCls when training RegNets
+    mean=[103.53, 116.28, 123.675],
+    std=[57.375, 57.12, 58.395],
+    to_rgb=False)  # NOTE(review): mean/std look BGR-ordered, consistent with to_rgb=False - confirm
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # one scale only, same as training
+        flip=False,  # no flip augmentation at test time
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # re-bind the pipelines so they use this file's img_norm_cfg
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),  # val and test share the same pipeline
+    test=dict(pipeline=test_pipeline))
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005)
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))  # NOTE(review): presumably replaces the inherited optimizer_config - confirm
diff --git a/configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py b/configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py
new file mode 100755
index 0000000..f6f8989
--- /dev/null
+++ b/configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './retinanet_regnetx-3.2GF_fpn_1x_coco.py'  # builds on the 3.2GF RetinaNet config
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_800mf',  # 800MF variant instead of the base file's 3.2GF
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[64, 128, 288, 672],  # NOTE(review): presumably regnetx_800mf stage widths - confirm
+        out_channels=256,
+        num_outs=5))
diff --git a/configs/reppoints/README.md b/configs/reppoints/README.md
new file mode 100755
index 0000000..5e71ae5
--- /dev/null
+++ b/configs/reppoints/README.md
@@ -0,0 +1,59 @@
+# RepPoints
+
+> [RepPoints: Point Set Representation for Object Detection](https://arxiv.org/abs/1904.11490)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Modern object detectors rely heavily on rectangular bounding boxes, such as anchors, proposals and the final predictions, to represent objects at various recognition stages. The bounding box is convenient to use but provides only a coarse localization of objects and leads to a correspondingly coarse extraction of object features. In this paper, we present RepPoints (representative points), a new finer representation of objects as a set of sample points useful for both localization and recognition. Given ground truth localization and recognition targets for training, RepPoints learn to automatically arrange themselves in a manner that bounds the spatial extent of an object and indicates semantically significant local areas. They furthermore do not require the use of anchors to sample a space of bounding boxes. We show that an anchor-free object detector based on RepPoints can be as effective as the state-of-the-art anchor-based detection methods, with 46.5 AP and 67.4 AP50 on the COCO test-dev detection benchmark, using ResNet-101 model.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143972514-93247220-4dad-4eb3-a51b-a1115dc7d449.png"/>
+</div>
+
+## Introduction
+
+By [Ze Yang](https://yangze.tech/), [Shaohui Liu](http://b1ueber2y.me/), and [Han Hu](https://ancientmooner.github.io/).
+
+We provide code support and configuration files to reproduce the results in the paper for
+["RepPoints: Point Set Representation for Object Detection"](https://arxiv.org/abs/1904.11490) on COCO object detection.
+
+**RepPoints**, initially described in [arXiv](https://arxiv.org/abs/1904.11490), is a new representation method for visual objects, on which visual understanding tasks are typically centered. Visual object representation, aiming at both geometric description and appearance feature extraction, is conventionally achieved by `bounding box + RoIPool (RoIAlign)`. The bounding box representation is convenient to use; however, it provides only a rectangular localization of objects that lacks geometric precision and may consequently degrade feature quality. Our new representation, RepPoints, models objects by a `point set` instead of a `bounding box`; the points learn to adaptively position themselves over an object in a manner that circumscribes the object’s `spatial extent` and enables `semantically aligned feature extraction`. This richer and more flexible representation maintains the convenience of bounding boxes while facilitating various visual understanding applications. This repo demonstrates the effectiveness of RepPoints for COCO object detection.
+
+Another feature of this repo is the demonstration of an `anchor-free detector`, which can be as effective as state-of-the-art anchor-based detection methods. The anchor-free detector can utilize either `bounding box` or `RepPoints` as the basic object representation.
+
+## Results and Models
+
+The results on COCO 2017val are shown in the table below.
+
+|  Method   |   Backbone    | GN  | Anchor | convert func | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                                     Config                                                                      |                                                                                                                                                                                                                 Download                                                                                                                                                                                                                  |
+| :-------: | :-----------: | :-: | :----: | :----------: | :-----: | :------: | :------------: | :----: | :---------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   BBox    |   R-50-FPN    |  Y  | single |      -       |   1x    |   3.9    |      15.9      |  36.4  |           [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/bbox_r50_grid_fpn_gn-neck+head_1x_coco.py)           |                                     [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329_145916-0eedf8d1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329_145916.log.json)                                      |
+|   BBox    |   R-50-FPN    |  Y  |  none  |      -       |   1x    |   3.9    |      15.4      |  37.4  |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/bbox_r50_grid_center_fpn_gn-neck+head_1x_coco.py)        |                                     [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329_145916-0eedf8d1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329_145916.log.json)                                      |
+| RepPoints |   R-50-FPN    |  N  |  none  |    moment    |   1x    |   3.3    |      18.5      |  37.0  |              [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/reppoints_moment_r50_fpn_1x_coco.py)              |                                                         [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_1x_coco/reppoints_moment_r50_fpn_1x_coco_20200330-b73db8d1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_1x_coco/reppoints_moment_r50_fpn_1x_coco_20200330_233609.log.json)                                                         |
+| RepPoints |   R-50-FPN    |  Y  |  none  |    moment    |   1x    |   3.9    |      17.5      |  38.1  |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py)        |                       [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco_20200329_145952-3e51b550.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco_20200329_145952.log.json)                        |
+| RepPoints |   R-50-FPN    |  Y  |  none  |    moment    |   2x    |   3.9    |       -        |  38.6  |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py)        |                           [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco_20200329-91babaa2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco_20200329_150020.log.json)                           |
+| RepPoints |   R-101-FPN   |  Y  |  none  |    moment    |   2x    |   5.8    |      13.7      |  40.5  |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/reppoints_moment_r101_fpn_gn-neck+head_2x_coco.py)       |                         [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco_20200329-4fbc7310.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco_20200329_132205.log.json)                         |
+| RepPoints | R-101-FPN-DCN |  Y  |  none  |    moment    |   2x    |   5.9    |      12.1      |  42.9  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329-3309fbf2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329_132134.log.json) |
+| RepPoints | X-101-FPN-DCN |  Y  |  none  |    moment    |   2x    |   7.1    |      9.3       |  44.2  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329-f87da1ea.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329_132201.log.json) |
+
+**Notes:**
+
+- `R-xx`, `X-xx` denote the ResNet and ResNeXt architectures, respectively.
+- `DCN` denotes replacing the 3x3 conv with a 3x3 deformable convolution in the `c3-c5` stages of the backbone.
+- `none` in the `anchor` column means that a 2-d `center point` (x,y) is used to represent the initial object hypothesis. `single` means that one 4-d anchor box (x,y,w,h) is used, with an IoU-based label assignment criterion.
+- `moment`, `partial MinMax`, `MinMax` in the `convert func` column are three functions to convert a point set to a pseudo box.
+- Note that the results here differ slightly from those reported in the paper because of a framework change: the original paper uses an [MXNet](https://mxnet.apache.org/) implementation, whereas here the method is re-implemented in [PyTorch](https://pytorch.org/) based on mmdetection.
+
+## Citation
+
+```latex
+@inproceedings{yang2019reppoints,
+  title={RepPoints: Point Set Representation for Object Detection},
+  author={Yang, Ze and Liu, Shaohui and Hu, Han and Wang, Liwei and Lin, Stephen},
+  booktitle={The IEEE International Conference on Computer Vision (ICCV)},
+  month={Oct},
+  year={2019}
+}
+```
diff --git a/configs/reppoints/bbox_r50_grid_center_fpn_gn-neck+head_1x_coco.py b/configs/reppoints/bbox_r50_grid_center_fpn_gn-neck+head_1x_coco.py
new file mode 100755
index 0000000..b24c8db
--- /dev/null
+++ b/configs/reppoints/bbox_r50_grid_center_fpn_gn-neck+head_1x_coco.py
@@ -0,0 +1,2 @@
+_base_ = './reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py'  # start from the GN neck+head 1x RepPoints config
+model = dict(bbox_head=dict(transform_method='minmax', use_grid_points=True))  # only the head is overridden: 'minmax' transform with use_grid_points enabled
diff --git a/configs/reppoints/bbox_r50_grid_fpn_gn-neck+head_1x_coco.py b/configs/reppoints/bbox_r50_grid_fpn_gn-neck+head_1x_coco.py
new file mode 100755
index 0000000..8d5013d
--- /dev/null
+++ b/configs/reppoints/bbox_r50_grid_fpn_gn-neck+head_1x_coco.py
@@ -0,0 +1,13 @@
+_base_ = './reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py'  # start from the GN neck+head 1x RepPoints config
+model = dict(
+    bbox_head=dict(transform_method='minmax', use_grid_points=True),  # 'minmax' transform with use_grid_points enabled
+    # training settings: only the init-stage assigner is overridden here
+    train_cfg=dict(
+        init=dict(
+            assigner=dict(
+                _delete_=True,  # NOTE(review): presumably drops the inherited assigner keys instead of merging with them - confirm against the mmcv Config docs
+                type='MaxIoUAssigner',  # the r50_fpn_1x base config uses PointAssigner for the init stage
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.4,
+                min_pos_iou=0,
+                ignore_iof_thr=-1))))  # same values as the refine-stage assigner in the r50_fpn_1x base config
diff --git a/configs/reppoints/metafile.yml b/configs/reppoints/metafile.yml
new file mode 100755
index 0000000..d94137e
--- /dev/null
+++ b/configs/reppoints/metafile.yml
@@ -0,0 +1,181 @@
+Collections:
+  - Name: RepPoints
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Group Normalization
+        - FPN
+        - RepPoints
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1904.11490
+      Title: 'RepPoints: Point Set Representation for Object Detection'
+    README: configs/reppoints/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/reppoints_detector.py#L9
+      Version: v2.0.0
+
+Models:
+  - Name: bbox_r50_grid_fpn_gn-neck+head_1x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/bbox_r50_grid_fpn_gn-neck+head_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.9
+      inference time (ms/im):
+        - value: 62.89
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329_145916-0eedf8d1.pth
+
+  - Name: bbox_r50_grid_center_fpn_gn-neck+head_1x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/bbox_r50_grid_center_fpn_gn-neck+head_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.9
+      inference time (ms/im):
+        - value: 64.94
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329_145916-0eedf8d1.pth
+
+  - Name: reppoints_moment_r50_fpn_1x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/reppoints_moment_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.3
+      inference time (ms/im):
+        - value: 54.05
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_1x_coco/reppoints_moment_r50_fpn_1x_coco_20200330-b73db8d1.pth
+
+  - Name: reppoints_moment_r50_fpn_gn-neck+head_1x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.9
+      inference time (ms/im):
+        - value: 57.14
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco_20200329_145952-3e51b550.pth
+
+  - Name: reppoints_moment_r50_fpn_gn-neck+head_2x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py
+    Metadata:
+      Training Memory (GB): 3.9
+      inference time (ms/im):
+        - value: 57.14
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco_20200329-91babaa2.pth
+
+  - Name: reppoints_moment_r101_fpn_gn-neck+head_2x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/reppoints_moment_r101_fpn_gn-neck+head_2x_coco.py
+    Metadata:
+      Training Memory (GB): 5.8
+      inference time (ms/im):
+        - value: 72.99
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco_20200329-4fbc7310.pth
+
+  - Name: reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py
+    Metadata:
+      Training Memory (GB): 5.9
+      inference time (ms/im):
+        - value: 82.64
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329-3309fbf2.pth
+
+  - Name: reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      inference time (ms/im):
+        - value: 107.53
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329-f87da1ea.pth
diff --git a/configs/reppoints/reppoints.png b/configs/reppoints/reppoints.png
new file mode 100755
index 0000000..a9306d9
Binary files /dev/null and b/configs/reppoints/reppoints.png differ
diff --git a/configs/reppoints/reppoints_minmax_r50_fpn_gn-neck+head_1x_coco.py b/configs/reppoints/reppoints_minmax_r50_fpn_gn-neck+head_1x_coco.py
new file mode 100755
index 0000000..0f56a46
--- /dev/null
+++ b/configs/reppoints/reppoints_minmax_r50_fpn_gn-neck+head_1x_coco.py
@@ -0,0 +1,2 @@
+_base_ = './reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py'  # start from the GN neck+head 1x RepPoints config
+model = dict(bbox_head=dict(transform_method='minmax'))  # the r50_fpn_1x base config sets 'moment'; this variant sets 'minmax'
diff --git a/configs/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py b/configs/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py
new file mode 100755
index 0000000..e223d80
--- /dev/null
+++ b/configs/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py
@@ -0,0 +1,8 @@
+_base_ = './reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py'  # start from the GN neck+head 2x config
+model = dict(
+    backbone=dict(
+        depth=101,  # the r50_fpn_1x base config sets depth=50
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),  # enabled for the last three of the four stages (the `c3-c5` of the file name)
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))  # checkpoint name switched to resnet101 alongside depth=101
diff --git a/configs/reppoints/reppoints_moment_r101_fpn_gn-neck+head_2x_coco.py b/configs/reppoints/reppoints_moment_r101_fpn_gn-neck+head_2x_coco.py
new file mode 100755
index 0000000..1185470
--- /dev/null
+++ b/configs/reppoints/reppoints_moment_r101_fpn_gn-neck+head_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py'  # start from the GN neck+head 2x config
+model = dict(
+    backbone=dict(
+        depth=101,  # the r50_fpn_1x base config sets depth=50
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))  # checkpoint name switched to resnet101 alongside depth=101
diff --git a/configs/reppoints/reppoints_moment_r50_fpn_1x_coco.py b/configs/reppoints/reppoints_moment_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..158a906
--- /dev/null
+++ b/configs/reppoints/reppoints_moment_r50_fpn_1x_coco.py
@@ -0,0 +1,67 @@
+_base_ = [  # root RepPoints config: dataset, 1x schedule and runtime defaults come from the shared _base_ files
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='RepPointsDetector',
+    backbone=dict(
+        type='ResNet',
+        depth=50,  # the r101 / x101 variants in this directory override depth to 101
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],  # four entries, like the four out_indices of the backbone
+        out_channels=256,
+        start_level=1,  # NOTE(review): presumably the first backbone output is not used by the neck - confirm against FPN
+        add_extra_convs='on_input',
+        num_outs=5),  # five outputs, the same count as point_strides in bbox_head
+    bbox_head=dict(
+        type='RepPointsHead',
+        num_classes=80,
+        in_channels=256,  # equals the neck's out_channels
+        feat_channels=256,
+        point_feat_channels=256,
+        stacked_convs=3,
+        num_points=9,
+        gradient_mul=0.1,
+        point_strides=[8, 16, 32, 64, 128],
+        point_base_scale=4,
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5),  # init-stage loss gets half the weight of the refine-stage loss
+        loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0),
+        transform_method='moment'),  # the minmax / partial_minmax / bbox variants in this directory override this key
+    # training and testing settings
+    train_cfg=dict(
+        init=dict(
+            assigner=dict(type='PointAssigner', scale=4, pos_num=1),  # init stage uses PointAssigner; the bbox_r50_grid variant swaps in MaxIoUAssigner
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        refine=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',  # refine stage uses MaxIoUAssigner
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.4,
+                min_pos_iou=0,
+                ignore_iof_thr=-1),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.5),
+        max_per_img=100))
+optimizer = dict(lr=0.01)  # NOTE(review): presumably overrides only lr of the optimizer defined in schedule_1x.py - confirm
diff --git a/configs/reppoints/reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py b/configs/reppoints/reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py
new file mode 100755
index 0000000..337f167
--- /dev/null
+++ b/configs/reppoints/reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './reppoints_moment_r50_fpn_1x_coco.py'  # start from the root RepPoints R-50 1x config
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)  # one GN setting, reused for neck and head below
+model = dict(neck=dict(norm_cfg=norm_cfg), bbox_head=dict(norm_cfg=norm_cfg))  # adds norm_cfg to both neck and bbox_head
+optimizer = dict(lr=0.01)  # same lr value as the base config already sets
diff --git a/configs/reppoints/reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py b/configs/reppoints/reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py
new file mode 100755
index 0000000..feca44a
--- /dev/null
+++ b/configs/reppoints/reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py
@@ -0,0 +1,3 @@
+_base_ = './reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py'  # start from the GN neck+head 1x config
+lr_config = dict(step=[16, 22])  # NOTE(review): presumably the epochs at which the learning rate is stepped - confirm against schedule_1x.py
+runner = dict(type='EpochBasedRunner', max_epochs=24)  # 24 epochs, listed as the `2x` schedule in this directory's README
diff --git a/configs/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py b/configs/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py
new file mode 100755
index 0000000..c0a12d0
--- /dev/null
+++ b/configs/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './reppoints_moment_r50_fpn_gn-neck+head_2x_coco.py'  # start from the GN neck+head 2x config
+model = dict(
+    backbone=dict(
+        type='ResNeXt',  # the r50_fpn_1x base config uses type='ResNet'
+        depth=101,
+        groups=32,
+        base_width=4,  # groups=32 / base_width=4 agree with the `32x4d` suffix of the checkpoint name below
+        num_stages=4,  # num_stages, out_indices, frozen_stages, norm_cfg and style repeat the base config's values
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),  # enabled for the last three of the four stages (the `c3-c5` of the file name)
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/reppoints/reppoints_partial_minmax_r50_fpn_gn-neck+head_1x_coco.py b/configs/reppoints/reppoints_partial_minmax_r50_fpn_gn-neck+head_1x_coco.py
new file mode 100755
index 0000000..9a63bd0
--- /dev/null
+++ b/configs/reppoints/reppoints_partial_minmax_r50_fpn_gn-neck+head_1x_coco.py
@@ -0,0 +1,2 @@
+_base_ = './reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py'  # start from the GN neck+head 1x RepPoints config
+model = dict(bbox_head=dict(transform_method='partial_minmax'))  # the r50_fpn_1x base config sets 'moment'; this variant sets 'partial_minmax'
diff --git a/configs/res2net/README.md b/configs/res2net/README.md
new file mode 100755
index 0000000..1285870
--- /dev/null
+++ b/configs/res2net/README.md
@@ -0,0 +1,77 @@
+# Res2Net
+
+> [Res2Net: A New Multi-scale Backbone Architecture](https://arxiv.org/abs/1904.01169)
+
+<!-- [BACKBONE] -->
+
+## Abstract
+
+Representing features at multiple scales is of great importance for numerous vision tasks. Recent advances in backbone convolutional neural networks (CNNs) continually demonstrate stronger multi-scale representation ability, leading to consistent performance gains on a wide range of applications. However, most existing methods represent the multi-scale features in a layer-wise manner. In this paper, we propose a novel building block for CNNs, namely Res2Net, by constructing hierarchical residual-like connections within one single residual block. The Res2Net represents multi-scale features at a granular level and increases the range of receptive fields for each network layer. The proposed Res2Net block can be plugged into the state-of-the-art backbone CNN models, e.g., ResNet, ResNeXt, and DLA. We evaluate the Res2Net block on all these models and demonstrate consistent performance gains over baseline models on widely-used datasets, e.g., CIFAR-100 and ImageNet. Further ablation studies and experimental results on representative computer vision tasks, i.e., object detection, class activation mapping, and salient object detection, further verify the superiority of the Res2Net over the state-of-the-art baseline methods.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143972411-8d08113f-9fce-4d24-a138-4fadf2c54f9a.png" height="300"/>
+</div>
+
+## Introduction
+
+We propose a novel building block for CNNs, namely Res2Net, by constructing hierarchical residual-like connections within one single residual block. The Res2Net represents multi-scale features at a granular level and increases the range of receptive fields for each network layer.
+
+|     Backbone      | Params. | GFLOPs | top-1 err. | top-5 err. |
+| :---------------: | :-----: | :----: | :--------: | :--------: |
+|    ResNet-101     |  44.6M  |  7.8   |   22.63    |    6.44    |
+| ResNeXt-101-64x4d |  83.5M  |  15.5  |   20.40    |     -      |
+|   HRNetV2p-W48    |  77.5M  |  16.1  |   20.70    |    5.50    |
+|    Res2Net-101    |  45.2M  |  8.3   |   18.77    |    4.64    |
+
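+Compared with the larger ResNeXt-101-64x4d and HRNetV2p-W48 backbones, Res2Net-101 needs roughly half the parameters and FLOPs while reaching lower error rates; compared with ResNet-101, it lowers the error rates at a small increase in parameters and FLOPs.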
+
+**Note:**
+
+- GFLOPs for classification are calculated with image size (224x224).
+
+## Results and Models
+
+### Faster R-CNN
+
+|  Backbone  |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                      Config                                                       |                                                                                                                                               Download                                                                                                                                               |
+| :--------: | :-----: | :-----: | :------: | :------------: | :----: | :---------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R2-101-FPN | pytorch |   2x    |   7.4    |       -        |  43.0  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/res2net/faster_rcnn_r2_101_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/res2net/faster_rcnn_r2_101_fpn_2x_coco/faster_rcnn_r2_101_fpn_2x_coco-175f1da6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/res2net/faster_rcnn_r2_101_fpn_2x_coco/faster_rcnn_r2_101_fpn_2x_coco_20200514_231734.log.json) |
+
+### Mask R-CNN
+
+|  Backbone  |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                     Config                                                      |                                                                                                                                           Download                                                                                                                                           |
+| :--------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R2-101-FPN | pytorch |   2x    |   7.9    |       -        |  43.6  |  38.7   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/res2net/mask_rcnn_r2_101_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/res2net/mask_rcnn_r2_101_fpn_2x_coco/mask_rcnn_r2_101_fpn_2x_coco-17f061e8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/res2net/mask_rcnn_r2_101_fpn_2x_coco/mask_rcnn_r2_101_fpn_2x_coco_20200515_002413.log.json) |
+
+### Cascade R-CNN
+
+|  Backbone  |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                       Config                                                        |                                                                                                                                                   Download                                                                                                                                                   |
+| :--------: | :-----: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R2-101-FPN | pytorch |   20e   |   7.8    |       -        |  45.7  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/res2net/cascade_rcnn_r2_101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_rcnn_r2_101_fpn_20e_coco/cascade_rcnn_r2_101_fpn_20e_coco-f4b7b7db.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_rcnn_r2_101_fpn_20e_coco/cascade_rcnn_r2_101_fpn_20e_coco_20200515_091644.log.json) |
+
+### Cascade Mask R-CNN
+
+|  Backbone  |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                          Config                                                          |                                                                                                                                                             Download                                                                                                                                                             |
+| :--------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :----------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R2-101-FPN | pytorch |   20e   |   9.5    |       -        |  46.4  |  40.0   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco/cascade_mask_rcnn_r2_101_fpn_20e_coco-8a7b41e1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco/cascade_mask_rcnn_r2_101_fpn_20e_coco_20200515_091645.log.json) |
+
+### Hybrid Task Cascade (HTC)
+
+|  Backbone  |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                   Config                                                   |                                                                                                                                 Download                                                                                                                                 |
+| :--------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :--------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R2-101-FPN | pytorch |   20e   |    -     |       -        |  47.5  |  41.6   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/res2net/htc_r2_101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/res2net/htc_r2_101_fpn_20e_coco/htc_r2_101_fpn_20e_coco-3a8d2112.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/res2net/htc_r2_101_fpn_20e_coco/htc_r2_101_fpn_20e_coco_20200515_150029.log.json) |
+
+- Res2Net ImageNet pretrained models are in [Res2Net-PretrainedModels](https://github.com/Res2Net/Res2Net-PretrainedModels).
+- More applications of Res2Net are in [Res2Net-Github](https://github.com/Res2Net/).
+
+## Citation
+
+```latex
+@article{gao2019res2net,
+  title={Res2Net: A New Multi-scale Backbone Architecture},
+  author={Gao, Shang-Hua and Cheng, Ming-Ming and Zhao, Kai and Zhang, Xin-Yu and Yang, Ming-Hsuan and Torr, Philip},
+  journal={IEEE TPAMI},
+  year={2020},
+  doi={10.1109/TPAMI.2019.2938758},
+}
+```
diff --git a/configs/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco.py b/configs/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco.py
new file mode 100755
index 0000000..6b6c001
--- /dev/null
+++ b/configs/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco.py
@@ -0,0 +1,10 @@
+_base_ = '../cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco.py'  # everything except the backbone keys below comes from this base config
+model = dict(
+    backbone=dict(
+        type='Res2Net',
+        depth=101,
+        scales=4,
+        base_width=26,  # scales=4 / base_width=26 agree with the `26w_4s` suffix of the checkpoint name below
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://res2net101_v1d_26w_4s')))
diff --git a/configs/res2net/cascade_rcnn_r2_101_fpn_20e_coco.py b/configs/res2net/cascade_rcnn_r2_101_fpn_20e_coco.py
new file mode 100755
index 0000000..10dddbb
--- /dev/null
+++ b/configs/res2net/cascade_rcnn_r2_101_fpn_20e_coco.py
@@ -0,0 +1,10 @@
+_base_ = '../cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco.py'  # everything except the backbone keys below comes from this base config
+model = dict(
+    backbone=dict(
+        type='Res2Net',
+        depth=101,
+        scales=4,
+        base_width=26,  # scales=4 / base_width=26 agree with the `26w_4s` suffix of the checkpoint name below
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://res2net101_v1d_26w_4s')))
diff --git a/configs/res2net/faster_rcnn_r2_101_fpn_2x_coco.py b/configs/res2net/faster_rcnn_r2_101_fpn_2x_coco.py
new file mode 100755
index 0000000..fc2221c
--- /dev/null
+++ b/configs/res2net/faster_rcnn_r2_101_fpn_2x_coco.py
@@ -0,0 +1,10 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_2x_coco.py'  # everything except the backbone keys below comes from this base config
+model = dict(
+    backbone=dict(
+        type='Res2Net',
+        depth=101,
+        scales=4,
+        base_width=26,  # scales=4 / base_width=26 agree with the `26w_4s` suffix of the checkpoint name below
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://res2net101_v1d_26w_4s')))
diff --git a/configs/res2net/htc_r2_101_fpn_20e_coco.py b/configs/res2net/htc_r2_101_fpn_20e_coco.py
new file mode 100755
index 0000000..22d0c5d
--- /dev/null
+++ b/configs/res2net/htc_r2_101_fpn_20e_coco.py
@@ -0,0 +1,13 @@
+_base_ = '../htc/htc_r50_fpn_1x_coco.py'  # parent is the 1x config; the schedule is redefined at the bottom of this file
+model = dict(
+    backbone=dict(
+        type='Res2Net',
+        depth=101,  # depth / scales / base_width mirror the "res2net101 ... 26w_4s" checkpoint name below
+        scales=4,
+        base_width=26,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://res2net101_v1d_26w_4s')))
+# learning policy: the "20e" schedule in the file name (steps at 16 and 19, 20 epochs in total)
+lr_config = dict(step=[16, 19])
+runner = dict(type='EpochBasedRunner', max_epochs=20)
diff --git a/configs/res2net/mask_rcnn_r2_101_fpn_2x_coco.py b/configs/res2net/mask_rcnn_r2_101_fpn_2x_coco.py
new file mode 100755
index 0000000..33aef1a
--- /dev/null
+++ b/configs/res2net/mask_rcnn_r2_101_fpn_2x_coco.py
@@ -0,0 +1,10 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py'  # parent config; only the backbone keys below are set here
+model = dict(
+    backbone=dict(
+        type='Res2Net',
+        depth=101,  # depth / scales / base_width mirror the "res2net101 ... 26w_4s" checkpoint name below
+        scales=4,
+        base_width=26,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://res2net101_v1d_26w_4s')))
diff --git a/configs/res2net/metafile.yml b/configs/res2net/metafile.yml
new file mode 100755
index 0000000..27bac8c
--- /dev/null
+++ b/configs/res2net/metafile.yml
@@ -0,0 +1,146 @@
+Models:
+  - Name: faster_rcnn_r2_101_fpn_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/res2net/faster_rcnn_r2_101_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.4
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Res2Net
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/faster_rcnn_r2_101_fpn_2x_coco/faster_rcnn_r2_101_fpn_2x_coco-175f1da6.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.01169
+      Title: 'Res2Net for object detection and instance segmentation'
+    README: configs/res2net/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239
+      Version: v2.1.0
+
+  - Name: mask_rcnn_r2_101_fpn_2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/res2net/mask_rcnn_r2_101_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.9
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Res2Net
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/mask_rcnn_r2_101_fpn_2x_coco/mask_rcnn_r2_101_fpn_2x_coco-17f061e8.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.01169
+      Title: 'Res2Net for object detection and instance segmentation'
+    README: configs/res2net/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239
+      Version: v2.1.0
+
+  - Name: cascade_rcnn_r2_101_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/res2net/cascade_rcnn_r2_101_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Res2Net
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_rcnn_r2_101_fpn_20e_coco/cascade_rcnn_r2_101_fpn_20e_coco-f4b7b7db.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.01169
+      Title: 'Res2Net for object detection and instance segmentation'
+    README: configs/res2net/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239
+      Version: v2.1.0
+
+  - Name: cascade_mask_rcnn_r2_101_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 9.5
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Res2Net
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco/cascade_mask_rcnn_r2_101_fpn_20e_coco-8a7b41e1.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.01169
+      Title: 'Res2Net for object detection and instance segmentation'
+    README: configs/res2net/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239
+      Version: v2.1.0
+
+  - Name: htc_r2_101_fpn_20e_coco
+    In Collection: HTC
+    Config: configs/res2net/htc_r2_101_fpn_20e_coco.py
+    Metadata:
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Res2Net
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/htc_r2_101_fpn_20e_coco/htc_r2_101_fpn_20e_coco-3a8d2112.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.01169
+      Title: 'Res2Net for object detection and instance segmentation'
+    README: configs/res2net/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239
+      Version: v2.1.0
diff --git a/configs/resnest/README.md b/configs/resnest/README.md
new file mode 100755
index 0000000..3676e56
--- /dev/null
+++ b/configs/resnest/README.md
@@ -0,0 +1,54 @@
+# ResNeSt
+
+> [ResNeSt: Split-Attention Networks](https://arxiv.org/abs/2004.08955)
+
+<!-- [BACKBONE] -->
+
+## Abstract
+
+It is well known that featuremap attention and multi-path representation are important for visual recognition. In this paper, we present a modularized architecture, which applies the channel-wise attention on different network branches to leverage their success in capturing cross-feature interactions and learning diverse representations. Our design results in a simple and unified computation block, which can be parameterized using only a few variables. Our model, named ResNeSt, outperforms EfficientNet in accuracy and latency trade-off on image classification. In addition, ResNeSt has achieved superior transfer learning results on several public benchmarks serving as the backbone, and has been adopted by the winning entries of COCO-LVIS challenge.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143973475-b5b33b15-ed04-4fc6-890a-521f1a62bc52.png"/>
+</div>
+
+## Results and Models
+
+### Faster R-CNN
+
+| Backbone  |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                                       Config                                                                       |                                                                                                                                                                                                                             Download                                                                                                                                                                                                                             |
+| :-------: | :-----: | :-----: | :------: | :------------: | :----: | :------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| S-50-FPN  | pytorch |   1x    |   4.8    |       -        |  42.0  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/faster_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20200926_125502-20289c16.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco-20200926_125502.log.json)   |
+| S-101-FPN | pytorch |   1x    |   7.1    |       -        |  44.5  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/faster_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201006_021058-421517f1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco-20201006_021058.log.json) |
+
+### Mask R-CNN
+
+| Backbone  |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                                   Config                                                                   |                                                                                                                                                                                                             Download                                                                                                                                                                                                             |
+| :-------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :----------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| S-50-FPN  | pytorch |   1x    |   5.5    |       -        |  42.6  |  38.1   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20200926_125503-8a2c3d47.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco-20200926_125503.log.json)   |
+| S-101-FPN | pytorch |   1x    |   7.8    |       -        |  45.2  |  40.2   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201005_215831-af60cdf9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco-20201005_215831.log.json) |
+
+### Cascade R-CNN
+
+| Backbone  |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                                       Config                                                                        |                                                                                                                                                                                                                              Download                                                                                                                                                                                                                              |
+| :-------: | :-----: | :-----: | :------: | :------------: | :----: | :-------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| S-50-FPN  | pytorch |   1x    |    -     |       -        |  44.5  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/cascade_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py)  |  [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201122_213640-763cc7b5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco-20201122_213640.log.json)  |
+| S-101-FPN | pytorch |   1x    |   8.4    |       -        |  46.8  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/cascade_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201005_113242-b9459f8f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco-20201005_113242.log.json) |
+
+### Cascade Mask R-CNN
+
+| Backbone  |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                                       Config                                                                       |                                                                                                                                                                                                                             Download                                                                                                                                                                                                                             |
+| :-------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| S-50-FPN  | pytorch |   1x    |    -     |       -        |  45.4  |  39.5   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201122_104428-99eca4c7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco-20201122_104428.log.json)   |
+| S-101-FPN | pytorch |   1x    |   10.5   |       -        |  47.7  |  41.4   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201005_113243-42607475.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco-20201005_113243.log.json) |
+
+## Citation
+
+```latex
+@article{zhang2020resnest,
+title={ResNeSt: Split-Attention Networks},
+author={Zhang, Hang and Wu, Chongruo and Zhang, Zhongyue and Zhu, Yi and Zhang, Zhi and Lin, Haibin and Sun, Yue and He, Tong and Muller, Jonas and Manmatha, R. and Li, Mu and Smola, Alexander},
+journal={arXiv preprint arXiv:2004.08955},
+year={2020}
+}
+```
diff --git a/configs/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py b/configs/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py
new file mode 100755
index 0000000..406f39d
--- /dev/null
+++ b/configs/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './cascade_mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py'  # S-50 config; only the backbone keys below differ
+model = dict(
+    backbone=dict(
+        stem_channels=128,  # the S-50 parent sets 64
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='open-mmlab://resnest101')))
diff --git a/configs/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py b/configs/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py
new file mode 100755
index 0000000..83d7537
--- /dev/null
+++ b/configs/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py
@@ -0,0 +1,118 @@
+_base_ = '../cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py'  # parent config this file builds on
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # reused by the backbone, all three bbox heads and the mask head
+model = dict(
+    backbone=dict(
+        type='ResNeSt',
+        stem_channels=64,
+        depth=50,
+        radix=2,
+        reduction_factor=4,
+        avg_down_stride=True,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnest50')),
+    roi_head=dict(
+        bbox_head=[  # three heads, identical except for bbox_coder.target_stds, which shrink head by head
+            dict(  # head 1: target_stds=[0.1, 0.1, 0.2, 0.2]
+                type='Shared4Conv1FCBBoxHead',
+                in_channels=256,
+                conv_out_channels=256,
+                fc_out_channels=1024,
+                norm_cfg=norm_cfg,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(  # head 2: target_stds=[0.05, 0.05, 0.1, 0.1]
+                type='Shared4Conv1FCBBoxHead',
+                in_channels=256,
+                conv_out_channels=256,
+                fc_out_channels=1024,
+                norm_cfg=norm_cfg,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(  # head 3: target_stds=[0.033, 0.033, 0.067, 0.067]
+                type='Shared4Conv1FCBBoxHead',
+                in_channels=256,
+                conv_out_channels=256,
+                fc_out_channels=1024,
+                norm_cfg=norm_cfg,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_head=dict(norm_cfg=norm_cfg)))
+# use ResNeSt img_norm; these values feed the Normalize step of both pipelines below
+img_norm_cfg = dict(
+    mean=[123.68, 116.779, 103.939], std=[58.393, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',  # presumably picks one of the listed scales per sample - confirm against the Resize transform
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),  # validation reuses the test-time pipeline
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/resnest/cascade_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py b/configs/resnest/cascade_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py
new file mode 100755
index 0000000..0a7476a
--- /dev/null
+++ b/configs/resnest/cascade_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './cascade_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py'  # S-50 config; only the backbone keys below differ
+model = dict(
+    backbone=dict(
+        stem_channels=128,  # the S-50 parent sets 64
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='open-mmlab://resnest101')))
diff --git a/configs/resnest/cascade_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py b/configs/resnest/cascade_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py
new file mode 100755
index 0000000..6ed7730
--- /dev/null
+++ b/configs/resnest/cascade_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py
@@ -0,0 +1,116 @@
+_base_ = '../cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py'  # parent config this file builds on
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # reused by the backbone and all three bbox heads
+model = dict(
+    backbone=dict(
+        type='ResNeSt',
+        stem_channels=64,
+        depth=50,
+        radix=2,
+        reduction_factor=4,
+        avg_down_stride=True,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnest50')),
+    roi_head=dict(
+        bbox_head=[  # three heads, identical except for bbox_coder.target_stds, which shrink head by head
+            dict(  # head 1: target_stds=[0.1, 0.1, 0.2, 0.2]
+                type='Shared4Conv1FCBBoxHead',
+                in_channels=256,
+                conv_out_channels=256,
+                fc_out_channels=1024,
+                norm_cfg=norm_cfg,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(  # head 2: target_stds=[0.05, 0.05, 0.1, 0.1]
+                type='Shared4Conv1FCBBoxHead',
+                in_channels=256,
+                conv_out_channels=256,
+                fc_out_channels=1024,
+                norm_cfg=norm_cfg,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(  # head 3: target_stds=[0.033, 0.033, 0.067, 0.067]
+                type='Shared4Conv1FCBBoxHead',
+                in_channels=256,
+                conv_out_channels=256,
+                fc_out_channels=1024,
+                norm_cfg=norm_cfg,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ], ))
+# use ResNeSt img_norm; these values feed the Normalize step of both pipelines below
+img_norm_cfg = dict(
+    mean=[123.68, 116.779, 103.939], std=[58.393, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=False,
+        poly2mask=False),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',  # presumably samples a scale between the two listed ones - confirm against the Resize transform
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),  # validation reuses the test-time pipeline
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/resnest/faster_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py b/configs/resnest/faster_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py
new file mode 100755
index 0000000..40a2f1f
--- /dev/null
+++ b/configs/resnest/faster_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './faster_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py'  # S-50 config; only the backbone keys below differ
+model = dict(
+    backbone=dict(
+        stem_channels=128,  # the S-50 parent sets 64
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='open-mmlab://resnest101')))
diff --git a/configs/resnest/faster_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py b/configs/resnest/faster_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py
new file mode 100755
index 0000000..eb1ecd2
--- /dev/null
+++ b/configs/resnest/faster_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py
@@ -0,0 +1,62 @@
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'  # parent config this file builds on
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # reused by the backbone and the bbox head
+model = dict(
+    backbone=dict(
+        type='ResNeSt',
+        stem_channels=64,
+        depth=50,
+        radix=2,
+        reduction_factor=4,
+        avg_down_stride=True,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnest50')),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=norm_cfg)))
+# use ResNeSt img_norm; these values feed the Normalize step of both pipelines below
+img_norm_cfg = dict(
+    mean=[123.68, 116.779, 103.939], std=[58.393, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=False,
+        poly2mask=False),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',  # presumably samples a scale between the two listed ones - confirm against the Resize transform
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),  # validation reuses the test-time pipeline
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/resnest/mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py b/configs/resnest/mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py
new file mode 100755
index 0000000..c882ba1
--- /dev/null
+++ b/configs/resnest/mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py'
+model = dict(
+    backbone=dict(
+        stem_channels=128,  # the S50 base config sets 64
+        depth=101,  # the S50 base config sets 50
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='open-mmlab://resnest101')))
diff --git a/configs/resnest/mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py b/configs/resnest/mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py
new file mode 100755
index 0000000..4e50dea
--- /dev/null
+++ b/configs/resnest/mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py
@@ -0,0 +1,64 @@
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # backbone + both heads
+model = dict(
+    backbone=dict(
+        type='ResNeSt',
+        stem_channels=64,
+        depth=50,
+        radix=2,
+        reduction_factor=4,
+        avg_down_stride=True,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnest50')),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=norm_cfg),
+        mask_head=dict(norm_cfg=norm_cfg)))  # mask head gets SyncBN too
+# Use the ResNeSt img_norm statistics (mean/std below) in both pipelines.
+img_norm_cfg = dict(
+    mean=[123.68, 116.779, 103.939], std=[58.393, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,  # masks are loaded: Collect below emits gt_masks
+        poly2mask=False),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',  # multi-scale training over listed scales
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # a single test scale
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),  # val and test share one pipeline
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/resnest/metafile.yml b/configs/resnest/metafile.yml
new file mode 100755
index 0000000..cfeec71
--- /dev/null
+++ b/configs/resnest/metafile.yml
@@ -0,0 +1,230 @@
+Models:
+  - Name: faster_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/resnest/faster_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.8
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20200926_125502-20289c16.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
+
+  - Name: faster_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/resnest/faster_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201006_021058-421517f1.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
+
+  - Name: mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/resnest/mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20200926_125503-8a2c3d47.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
+
+  - Name: mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/resnest/mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201005_215831-af60cdf9.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
+
+  - Name: cascade_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/resnest/cascade_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py
+    Metadata:
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201122_213640-763cc7b5.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
+
+  - Name: cascade_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/resnest/cascade_rcnn_s101_fpn_syncbn-backbone+head_mstrain-range_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.4
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201005_113242-b9459f8f.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
+
+  - Name: cascade_mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py
+    Metadata:
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201122_104428-99eca4c7.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
+
+  - Name: cascade_mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201005_113243-42607475.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
diff --git a/configs/resnet_strikes_back/README.md b/configs/resnet_strikes_back/README.md
new file mode 100755
index 0000000..dd00b20
--- /dev/null
+++ b/configs/resnet_strikes_back/README.md
@@ -0,0 +1,40 @@
+# ResNet strikes back
+
+> [ResNet strikes back: An improved training procedure in timm](https://arxiv.org/abs/2110.00476)
+
+<!-- [OTHERS] -->
+
+## Abstract
+
+The influential Residual Networks designed by He et al. remain the gold-standard architecture in numerous scientific publications. They typically serve as the default architecture in studies, or as baselines when new architectures are proposed. Yet there has been significant progress on best practices for training neural networks since the inception of the ResNet architecture in 2015. Novel optimization & data augmentation have increased the effectiveness of the training recipes.
+
+In this paper, we re-evaluate the performance of the vanilla ResNet-50 when trained with a procedure that integrates such advances. We share competitive training settings and pre-trained models in the timm open-source library, with the hope that they will serve as better baselines for future work. For instance, with our more demanding training setting, a vanilla ResNet-50 reaches 80.4% top-1 accuracy at resolution 224×224 on ImageNet-val without extra data or distillation. We also report the performance achieved with popular models with our training procedure.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/12907710/149324625-4546a5a7-704f-406c-982f-0376a20d03d8.png"/>
+</div>
+
+## Results and Models
+
+|       Method       | Backbone | Lr schd | Mem (GB) | Inf time (fps) |   box AP    |   mask AP   |                            Config                             |                                                                                                                                                                                                   Download                                                                                                                                                                                                   |
+| :----------------: | :------: | :-----: | :------: | :------------: | :---------: | :---------: | :-----------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    Faster R-CNN    | R-50 rsb |   1x    |   3.9    |       -        | 40.8 (+3.4) |      -      |    [Config](./faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_162229-32ae82a9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_162229.log.json)             |
+|     Mask R-CNN     | R-50 rsb |   1x    |   4.5    |       -        | 41.2 (+3.0) | 38.2 (+3.0) |     [Config](./mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py)     |                 [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_174054-06ce8ba0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_174054.log.json)                 |
+| Cascade Mask R-CNN | R-50 rsb |   1x    |   6.2    |       -        | 44.8 (+3.6) | 39.9 (+3.6) | [Config](./cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_193636-8b9ad50f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_193636.log.json) |
+|     RetinaNet      | R-50 rsb |   1x    |   3.8    |       -        | 39.0 (+2.5) |      -      |     [Config](./retinanet_r50_fpn_rsb-pretrain_1x_coco.py)     |                 [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco/retinanet_r50_fpn_rsb-pretrain_1x_coco_20220113_175432-bd24aae9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco/retinanet_r50_fpn_rsb-pretrain_1x_coco_20220113_175432.log.json)                 |
+
+**Notes:**
+
+- 'rsb' is short for 'resnet strikes back'
+- We ran grid searches over learning rate and weight decay to obtain these optimal hyper-parameters.
+
+## Citation
+
+```latex
+@article{wightman2021resnet,
+title={Resnet strikes back: An improved training procedure in timm},
+author={Ross Wightman and Hugo Touvron and Hervé Jégou},
+journal={arXiv preprint arXiv:2110.00476},
+year={2021}
+}
+```
diff --git a/configs/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py b/configs/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py
new file mode 100755
index 0000000..8b601f0
--- /dev/null
+++ b/configs/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py
@@ -0,0 +1,18 @@
+_base_ = [
+    '../_base_/models/cascade_mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth'  # noqa
+model = dict(
+    backbone=dict(
+        init_cfg=dict(
+            type='Pretrained', prefix='backbone.', checkpoint=checkpoint)))
+
+optimizer = dict(
+    _delete_=True,  # replace the inherited optimizer instead of merging
+    type='AdamW',
+    lr=0.0002,
+    weight_decay=0.05,  # norm layers are scaled by norm_decay_mult=0.
+    paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True))
diff --git a/configs/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py b/configs/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py
new file mode 100755
index 0000000..fe86684
--- /dev/null
+++ b/configs/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py
@@ -0,0 +1,18 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth'  # noqa
+model = dict(
+    backbone=dict(
+        init_cfg=dict(
+            type='Pretrained', prefix='backbone.', checkpoint=checkpoint)))
+
+optimizer = dict(
+    _delete_=True,  # replace the inherited optimizer instead of merging
+    type='AdamW',
+    lr=0.0002,
+    weight_decay=0.05,  # norm layers are scaled by norm_decay_mult=0.
+    paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True))
diff --git a/configs/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py b/configs/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py
new file mode 100755
index 0000000..321d98e
--- /dev/null
+++ b/configs/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py
@@ -0,0 +1,18 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth'  # noqa
+model = dict(
+    backbone=dict(
+        init_cfg=dict(
+            type='Pretrained', prefix='backbone.', checkpoint=checkpoint)))
+
+optimizer = dict(
+    _delete_=True,  # replace the inherited optimizer instead of merging
+    type='AdamW',
+    lr=0.0002,
+    weight_decay=0.05,  # norm layers are scaled by norm_decay_mult=0.
+    paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True))
diff --git a/configs/resnet_strikes_back/metafile.yml b/configs/resnet_strikes_back/metafile.yml
new file mode 100755
index 0000000..4c85a16
--- /dev/null
+++ b/configs/resnet_strikes_back/metafile.yml
@@ -0,0 +1,116 @@
+Models:
+  - Name: faster_rcnn_r50_fpn_rsb-pretrain_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.9
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_162229-32ae82a9.pth
+    Paper:
+      URL: https://arxiv.org/abs/2110.00476
+      Title: 'ResNet strikes back: An improved training procedure in timm'
+    README: configs/resnet_strikes_back/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md
+      Version: v2.22.0
+
+  - Name: cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.2
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_193636-8b9ad50f.pth
+    Paper:
+      URL: https://arxiv.org/abs/2110.00476
+      Title: 'ResNet strikes back: An improved training procedure in timm'
+    README: configs/resnet_strikes_back/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md
+      Version: v2.22.0
+
+  - Name: retinanet_r50_fpn_rsb-pretrain_1x_coco
+    In Collection: RetinaNet
+    Config: configs/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.8
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco/retinanet_r50_fpn_rsb-pretrain_1x_coco_20220113_175432-bd24aae9.pth
+    Paper:
+      URL: https://arxiv.org/abs/2110.00476
+      Title: 'ResNet strikes back: An improved training procedure in timm'
+    README: configs/resnet_strikes_back/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md
+      Version: v2.22.0
+
+  - Name: mask_rcnn_r50_fpn_rsb-pretrain_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_174054-06ce8ba0.pth
+    Paper:
+      URL: https://arxiv.org/abs/2110.00476
+      Title: 'ResNet strikes back: An improved training procedure in timm'
+    README: configs/resnet_strikes_back/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md
+      Version: v2.22.0
diff --git a/configs/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco.py b/configs/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco.py
new file mode 100755
index 0000000..480697a
--- /dev/null
+++ b/configs/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco.py
@@ -0,0 +1,18 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth'  # noqa
+model = dict(
+    backbone=dict(
+        init_cfg=dict(
+            type='Pretrained', prefix='backbone.', checkpoint=checkpoint)))
+
+optimizer = dict(
+    _delete_=True,  # replace the inherited optimizer instead of merging
+    type='AdamW',
+    lr=0.0001,  # the sibling R-CNN rsb configs use 0.0002
+    weight_decay=0.05,  # norm layers are scaled by norm_decay_mult=0.
+    paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True))
diff --git a/configs/retinanet/README.md b/configs/retinanet/README.md
new file mode 100755
index 0000000..b9e0a2a
--- /dev/null
+++ b/configs/retinanet/README.md
@@ -0,0 +1,53 @@
+# RetinaNet
+
+> [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+The highest accuracy object detectors to date are based on a two-stage approach popularized by R-CNN, where a classifier is applied to a sparse set of candidate object locations. In contrast, one-stage detectors that are applied over a regular, dense sampling of possible object locations have the potential to be faster and simpler, but have trailed the accuracy of two-stage detectors thus far. In this paper, we investigate why this is the case. We discover that the extreme foreground-background class imbalance encountered during training of dense detectors is the central cause. We propose to address this class imbalance by reshaping the standard cross entropy loss such that it down-weights the loss assigned to well-classified examples. Our novel Focal Loss focuses training on a sparse set of hard examples and prevents the vast number of easy negatives from overwhelming the detector during training. To evaluate the effectiveness of our loss, we design and train a simple dense detector we call RetinaNet. Our results show that when trained with the focal loss, RetinaNet is able to match the speed of previous one-stage detectors while surpassing the accuracy of all existing state-of-the-art two-stage detectors.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143973551-2b8e766a-1677-4f6d-953d-2e6d2a3c67b5.png" height="300"/>
+</div>
+
+## Results and Models
+
+|    Backbone     |  Style  |   Lr schd    | Mem (GB) | Inf time (fps) | box AP |                                                        Config                                                         |                                                                                                                                                         Download                                                                                                                                                          |
+| :-------------: | :-----: | :----------: | :------: | :------------: | :----: | :-------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-18-FPN     | pytorch |      1x      |   1.7    |                |  31.7  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r18_fpn_1x_coco.py)     |           [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x_coco/retinanet_r18_fpn_1x_coco_20220407_171055-614fd399.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x_coco/retinanet_r18_fpn_1x_coco_20220407_171055.log.json)            |
+|    R-18-FPN     | pytorch | 1x(1 x 8 BS) |   5.0    |                |  31.7  |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r18_fpn_1x8_1x_coco.py)   |   [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x8_1x_coco/retinanet_r18_fpn_1x8_1x_coco_20220407_171255-4ea310d7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x8_1x_coco/retinanet_r18_fpn_1x8_1x_coco_20220407_171255.log.json)    |
+|    R-50-FPN     |  caffe  |      1x      |   3.5    |      18.6      |  36.3  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r50_caffe_fpn_1x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_caffe_fpn_1x_coco/retinanet_r50_caffe_fpn_1x_coco_20200531-f11027c5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_caffe_fpn_1x_coco/retinanet_r50_caffe_fpn_1x_coco_20200531_012518.log.json)   |
+|    R-50-FPN     | pytorch |      1x      |   3.8    |      19.0      |  36.5  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r50_fpn_1x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130_002941.log.json)               |
+| R-50-FPN (FP16) | pytorch |      1x      |   2.8    |      31.6      |  36.4  |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r50_fpn_fp16_1x_coco.py)  |          [model](https://download.openmmlab.com/mmdetection/v2.0/fp16/retinanet_r50_fpn_fp16_1x_coco/retinanet_r50_fpn_fp16_1x_coco_20200702-0dbfb212.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fp16/retinanet_r50_fpn_fp16_1x_coco/retinanet_r50_fpn_fp16_1x_coco_20200702_020127.log.json)          |
+|    R-50-FPN     | pytorch |      2x      |    -     |       -        |  37.4  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r50_fpn_2x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_2x_coco/retinanet_r50_fpn_2x_coco_20200131-fdb43119.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_2x_coco/retinanet_r50_fpn_2x_coco_20200131_114738.log.json)               |
+|    R-101-FPN    |  caffe  |      1x      |   5.5    |      14.7      |  38.5  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r101_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_1x_coco/retinanet_r101_caffe_fpn_1x_coco_20200531-b428fa0f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_1x_coco/retinanet_r101_caffe_fpn_1x_coco_20200531_012536.log.json) |
+|    R-101-FPN    | pytorch |      1x      |   5.7    |      15.0      |  38.5  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r101_fpn_1x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_1x_coco/retinanet_r101_fpn_1x_coco_20200130-7a93545f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_1x_coco/retinanet_r101_fpn_1x_coco_20200130_003055.log.json)             |
+|    R-101-FPN    | pytorch |      2x      |    -     |       -        |  38.9  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r101_fpn_2x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_2x_coco/retinanet_r101_fpn_2x_coco_20200131-5560aee8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_2x_coco/retinanet_r101_fpn_2x_coco_20200131_114859.log.json)             |
+| X-101-32x4d-FPN | pytorch |      1x      |   7.0    |      12.1      |  39.9  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_1x_coco/retinanet_x101_32x4d_fpn_1x_coco_20200130-5c8b7ec4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_1x_coco/retinanet_x101_32x4d_fpn_1x_coco_20200130_003004.log.json) |
+| X-101-32x4d-FPN | pytorch |      2x      |    -     |       -        |  40.1  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_x101_32x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_2x_coco/retinanet_x101_32x4d_fpn_2x_coco_20200131-237fc5e1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_2x_coco/retinanet_x101_32x4d_fpn_2x_coco_20200131_114812.log.json) |
+| X-101-64x4d-FPN | pytorch |      1x      |   10.0   |      8.7       |  41.0  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_1x_coco/retinanet_x101_64x4d_fpn_1x_coco_20200130-366f5af1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_1x_coco/retinanet_x101_64x4d_fpn_1x_coco_20200130_003008.log.json) |
+| X-101-64x4d-FPN | pytorch |      2x      |    -     |       -        |  40.8  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_x101_64x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_2x_coco/retinanet_x101_64x4d_fpn_2x_coco_20200131-bca068ab.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_2x_coco/retinanet_x101_64x4d_fpn_2x_coco_20200131_114833.log.json) |
+
+## Pre-trained Models
+
+We also provide models trained with longer schedules and multi-scale training. Users can fine-tune them for downstream tasks.
+
+|    Backbone     |  Style  | Lr schd | Mem (GB) | box AP |                                                                Config                                                                 |                                                                                                                                                                                 Download                                                                                                                                                                                  |
+| :-------------: | :-----: | :-----: | :------: | :----: | :-----------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50-FPN     | pytorch |   3x    |   3.5    |  39.5  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r50_fpn_mstrain_640-800_3x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_mstrain_3x_coco/retinanet_r50_fpn_mstrain_3x_coco_20210718_220633-88476508.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_mstrain_3x_coco/retinanet_r50_fpn_mstrain_3x_coco_20210718_220633-88476508.log.json)               |
+|    R-101-FPN    |  caffe  |   3x    |   5.4    |  40.7  |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco.py)     | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco/retinanet_r101_caffe_fpn_mstrain_3x_coco_20210721_063439-88a8a944.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco/retinanet_r101_caffe_fpn_mstrain_3x_coco_20210721_063439-88a8a944.log.json) |
+|    R-101-FPN    | pytorch |   3x    |   5.4    |  41.0  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_r101_fpn_mstrain_640-800_3x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_mstrain_3x_coco/retinanet_r101_fpn_mstrain_3x_coco_20210720_214650-7ee888e0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_mstrain_3x_coco/retinanet_r101_fpn_mstrain_3x_coco_20210720_214650-7ee888e0.log.json)             |
+| X-101-64x4d-FPN | pytorch |   3x    |   9.8    |  41.6  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/retinanet/retinanet_x101_64x4d_fpn_mstrain_640-800_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_mstrain_3x_coco/retinanet_x101_64x4d_fpn_mstrain_3x_coco_20210719_051838-022c2187.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_mstrain_3x_coco/retinanet_x101_64x4d_fpn_mstrain_3x_coco_20210719_051838-022c2187.log.json) |
+
+## Citation
+
+```latex
+@inproceedings{lin2017focal,
+  title={Focal loss for dense object detection},
+  author={Lin, Tsung-Yi and Goyal, Priya and Girshick, Ross and He, Kaiming and Doll{\'a}r, Piotr},
+  booktitle={Proceedings of the IEEE international conference on computer vision},
+  year={2017}
+}
+```
diff --git a/configs/retinanet/ascend_retinanet_r18_fpn_1x8_1x_coco.py b/configs/retinanet/ascend_retinanet_r18_fpn_1x8_1x_coco.py
new file mode 100755
index 0000000..8643c60
--- /dev/null
+++ b/configs/retinanet/ascend_retinanet_r18_fpn_1x8_1x_coco.py
@@ -0,0 +1,23 @@
+_base_ = [
+    '../_base_/models/ascend_retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# data
+data = dict(samples_per_gpu=8)
+
+# model: ResNet-18 backbone, with the FPN in_channels overridden alongside
+model = dict(
+    backbone=dict(
+        depth=18,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
+
+# optimizer. Note: If the learning rate is 0.0025, the mAP will be 32.4.
+optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (1 GPU) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=8)
diff --git a/configs/retinanet/metafile.yml b/configs/retinanet/metafile.yml
new file mode 100755
index 0000000..8751cbb
--- /dev/null
+++ b/configs/retinanet/metafile.yml
@@ -0,0 +1,312 @@
+Collections:
+  - Name: RetinaNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Focal Loss
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1708.02002
+      Title: "Focal Loss for Dense Object Detection"
+    README: configs/retinanet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/retinanet.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: retinanet_r18_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r18_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 1.7
+      Training Resources: 8x V100 GPUs
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 31.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x_coco/retinanet_r18_fpn_1x_coco_20220407_171055-614fd399.pth
+
+  - Name: retinanet_r18_fpn_1x8_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r18_fpn_1x8_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      Training Resources: 1x V100 GPUs
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 31.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x8_1x_coco/retinanet_r18_fpn_1x8_1x_coco_20220407_171255-4ea310d7.pth
+
+  - Name: retinanet_r50_caffe_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r50_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.5
+      inference time (ms/im):
+        - value: 53.76
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_caffe_fpn_1x_coco/retinanet_r50_caffe_fpn_1x_coco_20200531-f11027c5.pth
+
+  - Name: retinanet_r50_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.8
+      inference time (ms/im):
+        - value: 52.63
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth
+
+  - Name: retinanet_r50_fpn_fp16_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r50_fpn_fp16_1x_coco.py
+    Metadata:
+      Training Memory (GB): 2.8
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+        - Mixed Precision Training
+      inference time (ms/im):
+        - value: 31.65
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP16
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/retinanet_r50_fpn_fp16_1x_coco/retinanet_r50_fpn_fp16_1x_coco_20200702-0dbfb212.pth
+
+  - Name: retinanet_r50_fpn_2x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r50_fpn_2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_2x_coco/retinanet_r50_fpn_2x_coco_20200131-fdb43119.pth
+
+  - Name: retinanet_r50_fpn_mstrain_640-800_3x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r50_fpn_mstrain_640-800_3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_mstrain_3x_coco/retinanet_r50_fpn_mstrain_3x_coco_20210718_220633-88476508.pth
+
+  - Name: retinanet_r101_caffe_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r101_caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.5
+      inference time (ms/im):
+        - value: 68.03
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_1x_coco/retinanet_r101_caffe_fpn_1x_coco_20200531-b428fa0f.pth
+
+  - Name: retinanet_r101_caffe_fpn_mstrain_3x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco/retinanet_r101_caffe_fpn_mstrain_3x_coco_20210721_063439-88a8a944.pth
+
+  - Name: retinanet_r101_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.7
+      inference time (ms/im):
+        - value: 66.67
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_1x_coco/retinanet_r101_fpn_1x_coco_20200130-7a93545f.pth
+
+  - Name: retinanet_r101_fpn_2x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r101_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 5.7
+      inference time (ms/im):
+        - value: 66.67
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_2x_coco/retinanet_r101_fpn_2x_coco_20200131-5560aee8.pth
+
+  - Name: retinanet_r101_fpn_mstrain_640-800_3x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r101_fpn_mstrain_640-800_3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_mstrain_3x_coco/retinanet_r101_fpn_mstrain_3x_coco_20210720_214650-7ee888e0.pth
+
+  - Name: retinanet_x101_32x4d_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_x101_32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 82.64
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_1x_coco/retinanet_x101_32x4d_fpn_1x_coco_20200130-5c8b7ec4.pth
+
+  - Name: retinanet_x101_32x4d_fpn_2x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_x101_32x4d_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 82.64
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_2x_coco/retinanet_x101_32x4d_fpn_2x_coco_20200131-237fc5e1.pth
+
+  - Name: retinanet_x101_64x4d_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_x101_64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.0
+      inference time (ms/im):
+        - value: 114.94
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_1x_coco/retinanet_x101_64x4d_fpn_1x_coco_20200130-366f5af1.pth
+
+  - Name: retinanet_x101_64x4d_fpn_2x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_x101_64x4d_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.0
+      inference time (ms/im):
+        - value: 114.94
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_2x_coco/retinanet_x101_64x4d_fpn_2x_coco_20200131-bca068ab.pth
+
+  - Name: retinanet_x101_64x4d_fpn_mstrain_640-800_3x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_x101_64x4d_fpn_mstrain_640-800_3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_mstrain_3x_coco/retinanet_x101_64x4d_fpn_mstrain_3x_coco_20210719_051838-022c2187.pth
diff --git a/configs/retinanet/retinanet_r101_caffe_fpn_1x_coco.py b/configs/retinanet/retinanet_r101_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..56eaae2
--- /dev/null
+++ b/configs/retinanet/retinanet_r101_caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './retinanet_r50_caffe_fpn_1x_coco.py'  # R-50 caffe 1x base
+model = dict(
+    backbone=dict(
+        depth=101,  # ResNet-101 instead of the base's ResNet-50
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/configs/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco.py b/configs/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..b87295e
--- /dev/null
+++ b/configs/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './retinanet_r50_caffe_fpn_mstrain_1x_coco.py'
+# model: switch the backbone to ResNet-101. The weights are given through
+# `backbone.init_cfg` (as in retinanet_r101_caffe_fpn_1x_coco.py) so that they
+# replace the ResNet-50 checkpoint inherited from the base config; the
+# deprecated top-level `pretrained` key would be set alongside it.
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
+# learning policy: 36 epochs, with lr_config steps at epochs 28 and 34
+lr_config = dict(step=[28, 34])
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/configs/retinanet/retinanet_r101_fpn_1x_coco.py b/configs/retinanet/retinanet_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..a7f0600
--- /dev/null
+++ b/configs/retinanet/retinanet_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './retinanet_r50_fpn_1x_coco.py'  # R-50 1x base
+model = dict(
+    backbone=dict(
+        depth=101,  # ResNet-101 instead of the base's ResNet-50
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/retinanet/retinanet_r101_fpn_2x_coco.py b/configs/retinanet/retinanet_r101_fpn_2x_coco.py
new file mode 100755
index 0000000..721112a
--- /dev/null
+++ b/configs/retinanet/retinanet_r101_fpn_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './retinanet_r50_fpn_2x_coco.py'  # R-50 2x base
+model = dict(
+    backbone=dict(
+        depth=101,  # ResNet-101 instead of the base's ResNet-50
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/retinanet/retinanet_r101_fpn_mstrain_640-800_3x_coco.py b/configs/retinanet/retinanet_r101_fpn_mstrain_640-800_3x_coco.py
new file mode 100755
index 0000000..6bbcac4
--- /dev/null
+++ b/configs/retinanet/retinanet_r101_fpn_mstrain_640-800_3x_coco.py
@@ -0,0 +1,13 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py', '../common/mstrain_3x_coco.py'
+]
+# model: switch the backbone to ResNet-101. The weights are given through
+# `backbone.init_cfg`, matching retinanet_r101_fpn_1x_coco.py, rather than
+# the deprecated top-level `pretrained` key.
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/retinanet/retinanet_r18_fpn_1x8_1x_coco.py b/configs/retinanet/retinanet_r18_fpn_1x8_1x_coco.py
new file mode 100755
index 0000000..01a35f2
--- /dev/null
+++ b/configs/retinanet/retinanet_r18_fpn_1x8_1x_coco.py
@@ -0,0 +1,23 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# data
+data = dict(samples_per_gpu=8)
+
+# model: ResNet-18 backbone, with the FPN in_channels overridden alongside
+model = dict(
+    backbone=dict(
+        depth=18,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
+
+# optimizer. Note: If the learning rate is 0.0025, the mAP will be 32.4.
+optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (1 GPU) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=8)
diff --git a/configs/retinanet/retinanet_r18_fpn_1x_coco.py b/configs/retinanet/retinanet_r18_fpn_1x_coco.py
new file mode 100755
index 0000000..6197b32
--- /dev/null
+++ b/configs/retinanet/retinanet_r18_fpn_1x_coco.py
@@ -0,0 +1,18 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# model (ResNet-18 backbone, FPN in_channels overridden) and optimizer
+model = dict(
+    backbone=dict(
+        depth=18,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/configs/retinanet/retinanet_r50_caffe_fpn_1x_coco.py b/configs/retinanet/retinanet_r50_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..04c9af5
--- /dev/null
+++ b/configs/retinanet/retinanet_r50_caffe_fpn_1x_coco.py
@@ -0,0 +1,41 @@
+_base_ = './retinanet_r50_fpn_1x_coco.py'  # R-50 1x config to adapt
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),  # norm params are not trained
+        norm_eval=True,
+        style='caffe',  # caffe-style ResNet, paired with the caffe weights
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+# use caffe img_norm: per-channel mean, unit std, no to_rgb conversion
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [  # single-scale (1333, 800) training with random flip
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [  # single-scale (1333, 800) testing
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,  # flip augmentation disabled at test time
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # use the caffe-normalised pipelines for train, val and test
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/retinanet/retinanet_r50_caffe_fpn_mstrain_1x_coco.py b/configs/retinanet/retinanet_r50_caffe_fpn_mstrain_1x_coco.py
new file mode 100755
index 0000000..4d7b8f2
--- /dev/null
+++ b/configs/retinanet/retinanet_r50_caffe_fpn_mstrain_1x_coco.py
@@ -0,0 +1,46 @@
+_base_ = './retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+# use caffe img_norm
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/retinanet/retinanet_r50_caffe_fpn_mstrain_2x_coco.py b/configs/retinanet/retinanet_r50_caffe_fpn_mstrain_2x_coco.py
new file mode 100755
index 0000000..eea9690
--- /dev/null
+++ b/configs/retinanet/retinanet_r50_caffe_fpn_mstrain_2x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './retinanet_r50_caffe_fpn_mstrain_1x_coco.py'
+# learning policy (NOTE(review): retinanet_r50_fpn_2x_coco.py uses step=[16, 22]; confirm 23 is intended here)
+lr_config = dict(step=[16, 23])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/retinanet/retinanet_r50_caffe_fpn_mstrain_3x_coco.py b/configs/retinanet/retinanet_r50_caffe_fpn_mstrain_3x_coco.py
new file mode 100755
index 0000000..8057650
--- /dev/null
+++ b/configs/retinanet/retinanet_r50_caffe_fpn_mstrain_3x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './retinanet_r50_caffe_fpn_mstrain_1x_coco.py'
+# learning policy
+lr_config = dict(step=[28, 34])
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/configs/retinanet/retinanet_r50_fpn_1x_coco.py b/configs/retinanet/retinanet_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..04bd696
--- /dev/null
+++ b/configs/retinanet/retinanet_r50_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/retinanet/retinanet_r50_fpn_2x_coco.py b/configs/retinanet/retinanet_r50_fpn_2x_coco.py
new file mode 100755
index 0000000..927915f
--- /dev/null
+++ b/configs/retinanet/retinanet_r50_fpn_2x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './retinanet_r50_fpn_1x_coco.py'
+# learning policy
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/retinanet/retinanet_r50_fpn_90k_coco.py b/configs/retinanet/retinanet_r50_fpn_90k_coco.py
new file mode 100755
index 0000000..ceda327
--- /dev/null
+++ b/configs/retinanet/retinanet_r50_fpn_90k_coco.py
@@ -0,0 +1,15 @@
+_base_ = 'retinanet_r50_fpn_1x_coco.py'
+
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[60000, 80000])
+
+# Runner type: iteration-based; _delete_=True replaces the inherited runner dict instead of merging into it
+runner = dict(_delete_=True, type='IterBasedRunner', max_iters=90000)
+
+checkpoint_config = dict(interval=10000)
+evaluation = dict(interval=10000, metric='bbox')
diff --git a/configs/retinanet/retinanet_r50_fpn_fp16_1x_coco.py b/configs/retinanet/retinanet_r50_fpn_fp16_1x_coco.py
new file mode 100755
index 0000000..cf598ad
--- /dev/null
+++ b/configs/retinanet/retinanet_r50_fpn_fp16_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './retinanet_r50_fpn_1x_coco.py'
+# fp16 settings
+fp16 = dict(loss_scale=512.)
+
+# clip gradients by norm (grad_clip) for stability during mixed-precision training
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/retinanet/retinanet_r50_fpn_mstrain_640-800_3x_coco.py b/configs/retinanet/retinanet_r50_fpn_mstrain_640-800_3x_coco.py
new file mode 100755
index 0000000..02a2c29
--- /dev/null
+++ b/configs/retinanet/retinanet_r50_fpn_mstrain_640-800_3x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py', '../common/mstrain_3x_coco.py'
+]
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/retinanet/retinanet_x101_32x4d_fpn_1x_coco.py b/configs/retinanet/retinanet_x101_32x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..765a4c2
--- /dev/null
+++ b/configs/retinanet/retinanet_x101_32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/retinanet/retinanet_x101_32x4d_fpn_2x_coco.py b/configs/retinanet/retinanet_x101_32x4d_fpn_2x_coco.py
new file mode 100755
index 0000000..14de96f
--- /dev/null
+++ b/configs/retinanet/retinanet_x101_32x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './retinanet_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/retinanet/retinanet_x101_64x4d_fpn_1x_coco.py b/configs/retinanet/retinanet_x101_64x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..948cd18
--- /dev/null
+++ b/configs/retinanet/retinanet_x101_64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/retinanet/retinanet_x101_64x4d_fpn_2x_coco.py b/configs/retinanet/retinanet_x101_64x4d_fpn_2x_coco.py
new file mode 100755
index 0000000..ad04b6e
--- /dev/null
+++ b/configs/retinanet/retinanet_x101_64x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './retinanet_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/retinanet/retinanet_x101_64x4d_fpn_mstrain_640-800_3x_coco.py b/configs/retinanet/retinanet_x101_64x4d_fpn_mstrain_640-800_3x_coco.py
new file mode 100755
index 0000000..f6ab512
--- /dev/null
+++ b/configs/retinanet/retinanet_x101_64x4d_fpn_mstrain_640-800_3x_coco.py
@@ -0,0 +1,8 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py', '../common/mstrain_3x_coco.py'
+]
+# backbone (ResNeXt-101 64x4d) and optimizer overrides
+model = dict(
+    pretrained='open-mmlab://resnext101_64x4d',
+    backbone=dict(type='ResNeXt', depth=101, groups=64, base_width=4))
+optimizer = dict(type='SGD', lr=0.01)
diff --git a/configs/rfnext/README.md b/configs/rfnext/README.md
new file mode 100755
index 0000000..13f3991
--- /dev/null
+++ b/configs/rfnext/README.md
@@ -0,0 +1,131 @@
+# RF-Next: Efficient Receptive Field Search for CNN
+
+> [RF-Next: Efficient Receptive Field Search for Convolutional Neural Networks](http://mftp.mmcheng.net/Papers/22TPAMI-ActionSeg.pdf)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Temporal/spatial receptive fields of models play an important role in sequential/spatial tasks. Large receptive fields facilitate long-term relations, while small receptive fields help to capture the local details. Existing methods construct models with hand-designed receptive fields in layers. Can we effectively search for receptive field combinations to replace hand-designed patterns? To answer this question, we propose to find better receptive field combinations through a global-to-local search scheme. Our search scheme exploits both global search to find the coarse combinations and local search to get the refined receptive field combinations further. The global search finds possible coarse combinations other than human-designed patterns. On top of the global search, we propose an expectation-guided iterative local search scheme to refine combinations effectively. Our RF-Next models, plugging receptive field search to various models, boost the performance on many tasks, e.g., temporal action segmentation, object detection, instance segmentation, and speech synthesis.
+The source code is publicly available on [http://mmcheng.net/rfnext](http://mmcheng.net/rfnext).
+
+## Results and Models
+
+### ConvNeXt on COCO
+
+|   Backbone    |       Method       |     RFNext      | Lr Schd | box mAP | mask mAP |                                                                                                                                                                             Config                                                                                                                                                                              |                                                                                                                                                                                                                                                                                                                                    Download                                                                                                                                                                                                                                                                                                                                    |
+| :-----------: | :----------------: | :-------------: | :-----: | :-----: | :------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  ConvNeXt-T   | Cascade Mask R-CNN |       NO        |   3x    |  50.3   |   43.6   |                                                                                                             [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/convnext/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco.py)                                                                                                             |                                                                                                                                   [model](https://download.openmmlab.com/mmdetection/v2.0/convnext/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco_20220426_154953-050731f4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/convnext/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco_20220426_154953.log.json)                                                                                                                                   |
+| RF-ConvNeXt-T | Cascade Mask R-CNN |  Single-Branch  |   3x    |  50.6   |   44.0   | [search](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_search_cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py) [retrain](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_giou_4conv1f_adamw_3x_coco_in1k/rfnext_fixed_single_branch_cascade_mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_giou_4conv1f_adamw_3x_coco_in1k-71aeb991.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_giou_4conv1f_adamw_3x_coco_in1k/rfnext_fixed_single_branch_cascade_mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_giou_4conv1f_adamw_3x_coco_in1k_20220131_091748.log.json) |
+| RF-ConvNeXt-T | Cascade Mask R-CNN | Multiple-Branch |   3x    |  50.9   |   44.3   | [search](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_search_cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py) [retrain](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_fixed_multi_branch_cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py)  |                           [model](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_search_cascade_mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_giou_4conv1f_adamw_3x_coco_in1k/rfnext_search_cascade_mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_giou_4conv1f_adamw_3x_coco_in1k-f47db42b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_search_cascade_mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_giou_4conv1f_adamw_3x_coco_in1k/rfnext_search_cascade_mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_giou_4conv1f_adamw_3x_coco_in1k_20220128_200900.log.json)                           |
+
+### PVTv2 on COCO
+
+|  Backbone   |   Method   |     RFNext      | Lr Schd | box mAP | mask mAP |                                                                                                                                   Config                                                                                                                                    |                                                                                                                                                                                                    Download                                                                                                                                                                                                    |
+| :---------: | :--------: | :-------------: | :-----: | :-----: | :------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  PVTv2-b0   | Mask R-CNN |       NO        |   1x    |  38.2   |   36.2   |                                                                                                                                      -                                                                                                                                      |                                                                                                                                                                                                       -                                                                                                                                                                                                        |
+| RF-PVTv2-b0 | Mask R-CNN |  Single-Branch  |   1x    |  38.9   |   36.8   | [search](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_search_mask_rcnn_pvtv2-b0_fpn_1x_coco.py) [retrain](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_fixed_single_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_fixed_single_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco/rfnext_fixed_single_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco-7b25d72e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_fixed_single_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco/rfnext_fixed_single_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco_20221120_213845.log.json) |
+| RF-PVTv2-b0 | Mask R-CNN | Multiple-Branch |   1x    |  39.3   |   37.1   | [search](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_search_mask_rcnn_pvtv2-b0_fpn_1x_coco.py) [retrain](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_fixed_multi_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco.py)  |                           [model](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_search_mask_rcnn_pvtv2-b0_fpn_1x_coco/rfnext_search_mask_rcnn_pvtv2-b0_fpn_1x_coco-dc8fd5de.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_search_mask_rcnn_pvtv2-b0_fpn_1x_coco/rfnext_search_mask_rcnn_pvtv2-b0_fpn_1x_coco_20221119_204703.log.json)                           |
+
+The results of PVTv2-b0 are from [PVT](https://github.com/whai362/PVT/tree/v2/detection).
+
+### Res2Net on COCO
+
+|    Backbone    |       Method       |     RFNext      | Lr Schd | box mAP | mask mAP |                                                                                                                                           Config                                                                                                                                           |                                                                                                                                                                                                                  Download                                                                                                                                                                                                                  |
+| :------------: | :----------------: | :-------------: | :-----: | :-----: | :------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  Res2Net-101   | Cascade Mask R-CNN |       NO        |   20e   |  46.4   |   40.0   |                                                                                  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco.py)                                                                                  |                                                      [model](https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco/cascade_mask_rcnn_r2_101_fpn_20e_coco-8a7b41e1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco/cascade_mask_rcnn_r2_101_fpn_20e_coco_20200515_091645.log.json)                                                      |
+| RF-Res2Net-101 | Cascade Mask R-CNN |  Single-Branch  |   20e   |  46.9   |   40.7   | [search](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_search_cascade_mask_rcnn_r2_101_fpn_20e_coco.py)  [retrain](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco/rfnext_fixed_single_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco-e22d5257.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco/rfnext_fixed_single_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco_20220402_141321.log.json) |
+| RF-Res2Net-101 | Cascade Mask R-CNN | Multiple-Branch |   20e   |  47.9   |   41.5   | [search](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_search_cascade_mask_rcnn_r2_101_fpn_20e_coco.py)  [retrain](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_fixed_multi_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco.py)  |                           [model](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_search_cascade_mask_rcnn_r2_101_fpn_20e_coco/rfnext_search_cascade_mask_rcnn_r2_101_fpn_20e_coco-e17510a0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_search_cascade_mask_rcnn_r2_101_fpn_20e_coco/rfnext_search_cascade_mask_rcnn_r2_101_fpn_20e_coco_20220327_221419.log.json)                           |
+
+### HRNet on COCO
+
+|    Backbone     |       Method       |     RFNext      | Lr Schd | box mAP | mask mAP |                                                                                                                                                Config                                                                                                                                                 |                                                                                                                                                                                                                      Download                                                                                                                                                                                                                      |
+| :-------------: | :----------------: | :-------------: | :-----: | :-----: | :------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  HRNetV2p-W18   | Cascade Mask R-CNN |       NO        |   20e   |  41.6   |   36.4   |                                                                                       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py)                                                                                        |                                                   [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/cascade_mask_rcnn_hrnetv2p_w18_20e_coco_20200210-b543cd2b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/cascade_mask_rcnn_hrnetv2p_w18_20e_coco_20200210_093149.log.json)                                                    |
+| RF-HRNetV2p-W18 | Cascade Mask R-CNN |  Single-Branch  |   20e   |  43.0   |   37.6   | [search](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfsearched_search_cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py) [retrain](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfsearched_fixed_single_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco/rfnext_fixed_single_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco-682f121d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco/rfnext_fixed_single_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco_20221118_141400.log.json) |
+| RF-HRNetV2p-W18 | Cascade Mask R-CNN | Multiple-Branch |   20e   |  43.7   |   38.2   | [search](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfsearched_search_cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py) [retrain](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfsearched_fixed_multi_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py)  |                           [model](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_search_cascade_mask_rcnn_hrnetv2p_w18_20e_coco/rfnext_search_cascade_mask_rcnn_hrnetv2p_w18_20e_coco-7b9c7885.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_search_cascade_mask_rcnn_hrnetv2p_w18_20e_coco/rfnext_search_cascade_mask_rcnn_hrnetv2p_w18_20e_coco_20221115_230113.log.json)                           |
+
+Note: the performance of the multi-branch models listed above is evaluated during searching to save computational cost; retraining would achieve similar or better performance.
+
+### Res2Net on COCO panoptic
+
+|   Backbone    |    Method    |     RFNext      | Lr schd |  PQ  |  SQ  |  RQ  |                                                                                                                                        Config                                                                                                                                         |                                                                                                                                                                                                              Download                                                                                                                                                                                                              |
+| :-----------: | :----------: | :-------------: | :-----: | :--: | :--: | :--: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  Res2Net-50   | Panoptic FPN |       NO        |   1x    | 42.5 | 78.0 | 51.8 |                                                                              [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/panoptic_fpn/panoptic_fpn_r2_50_fpn_fp16_1x_coco.py)                                                                              |                                                       [model](https://download.openmmlab.com/mmdetection/v2.0/rfnext/panoptic_fpn_r2_50_fpn_fp16_1x_coco/panoptic_fpn_r2_50_fpn_fp16_1x_coco-fa6c51f0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rfnext/panoptic_fpn_r2_50_fpn_fp16_1x_coco/panoptic_fpn_r2_50_fpn_fp16_1x_coco_20221114_224729.log.json)                                                       |
+| RF-Res2Net-50 | Panoptic FPN |  Single-Branch  |   1x    | 44.0 | 78.7 | 53.6 | [search](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_search_panoptic_fpn_r2_50_fpn_fp16_1x_coco.py) [retrain](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_fixed_single_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_fixed_single_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco/rfnext_fixed_single_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco-52181d5b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_fixed_single_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco/rfnext_fixed_single_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco_20221115_152436.log.json) |
+| RF-Res2Net-50 | Panoptic FPN | Multiple-Branch |   1x    | 44.3 | 79.0 | 53.9 | [search](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_search_panoptic_fpn_r2_50_fpn_fp16_1x_coco.py) [retrain](https://github.com/open-mmlab/mmdetection/tree/master/configs/rfnext/rfnext_fixed_multi_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco.py)  |                           [model](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_search_panoptic_fpn_r2_50_fpn_fp16_1x_coco/rfnext_search_panoptic_fpn_r2_50_fpn_fp16_1x_coco-34a893a0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_search_panoptic_fpn_r2_50_fpn_fp16_1x_coco/rfnext_search_panoptic_fpn_r2_50_fpn_fp16_1x_coco_20221114_224722.log.json)                           |
+
+## Configs
+
+If you want to search receptive fields on an existing model, you need to define a `RFSearchHook` in the `custom_hooks` of the config file.
+
+```python
+custom_hooks = [
+    dict(
+        type='RFSearchHook',
+        mode='search',
+        rfstructure_file=None,
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(
+                step=0,
+                max_step=11,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=[]))
+        ),
+]
+```
+
+Arguments:
+
+- `max_step`: The maximum number of steps to update the structures.
+- `search_interval`: The interval (epoch) between two updates.
+- `exp_rate`:  The controller of the sparsity of search space. For a conv with an initial dilation rate of `D`, dilation rates will be sampled with an interval of `exp_rate * D`.
+- `num_branches`: The controller of the size of search space (the number of branches). If you set `num_branches=3`, the dilations are `[D - exp_rate * D, D, D + exp_rate * D]` for three branches. If you set `num_branches=2`, the dilations are `[D - exp_rate * D, D + exp_rate * D]`. With `num_branches=2`, you can achieve similar performance with less memory and fewer FLOPs.
+- `skip_layer`: The modules in skip_layer will be ignored during the receptive field search.
+
+## Training
+
+### 1. Searching Jobs
+
+You can launch searching jobs by using config files with prefix `rfnext_search`. The json files of searched structures will be saved to `work_dir`.
+
+If you want to further search receptive fields upon a searched structure, please set `rfsearch_cfg.rfstructure_file` in the config file to the corresponding json file.
+
+### 2. Training Jobs
+
+Setting `rfsearch_cfg.rfstructure_file` to the searched structure file (.json) and setting `rfsearch_cfg.mode` to `fixed_single_branch` or `fixed_multi_branch`, you can retrain a model with the searched structure.
+You can launch fixed_single_branch/fixed_multi_branch training jobs by using config files with prefix `rfnext_fixed_single_branch` or `rfnext_fixed_multi_branch`.
+
+Note that the model after the searching stage is already a `fixed_multi_branch` version, which achieves better performance than `fixed_single_branch`, without any retraining.
+
+## Inference
+
+`rfsearch_cfg.rfstructure_file` and `rfsearch_cfg.mode` should be set for the inference stage.
+
+**Note: For the models trained with modes of `fixed_single_branch` or `fixed_multi_branch`, you can just use the training config for inference.**
+**But if you want to run inference on the models trained with the mode of `search`, please use the config with the prefix `rfnext_fixed_multi_branch`. (Otherwise, you should set `rfsearch_cfg.mode` to `fixed_multi_branch` and set `rfsearch_cfg.rfstructure_file` to the searched structure file.)**
+
+## Citation
+
+```
+@article{gao2022rfnext,
+title={RF-Next: Efficient Receptive Field Search for Convolutional Neural Networks},
+author={Gao, Shanghua and Li, Zhong-Yu and Han, Qi and Cheng, Ming-Ming and Wang, Liang},
+journal={TPAMI},
+year={2022}
+}
+
+@inproceedings{gao2021global2local,
+  title     = {Global2Local: Efficient Structure Search for Video Action Segmentation},
+  author    = {Gao, Shanghua and Han, Qi and Li, Zhong-Yu and Peng, Pai and Wang, Liang and Cheng, Ming-Ming},
+  booktitle = {CVPR},
+  year      = {2021}
+}
+```
diff --git a/configs/rfnext/metafile.yml b/configs/rfnext/metafile.yml
new file mode 100755
index 0000000..59469d7
--- /dev/null
+++ b/configs/rfnext/metafile.yml
@@ -0,0 +1,249 @@
+Collections:
+  - Name: RF-Next
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - RF-Next
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RF-Next
+    Paper:
+      URL: http://mftp.mmcheng.net/Papers/22TPAMI-ActionSeg.pdf
+      Title: "RF-Next: Efficient Receptive Field Search for Convolutional Neural Networks"
+    README: configs/rfnext/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.27.0/mmdet/utils/rfnext.py
+      Version: v2.27.0
+
+Models:
+  - Name: rfnext_search_cascade_mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_giou_4conv1f_adamw_3x_coco_in1k
+    In Collection: RF-Next
+    Config: configs/rfnext/rfnext_search_cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py
+    Metadata:
+      Training Memory (GB): 11.9
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - RF-Next (search)
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 44.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_search_cascade_mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_giou_4conv1f_adamw_3x_coco_in1k/rfnext_search_cascade_mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_giou_4conv1f_adamw_3x_coco_in1k-f47db42b.pth
+    Paper:
+      URL: https://arxiv.org/pdf/2206.06637.pdf
+      Title: 'RF-Next: Efficient Receptive Field Search for CNN'
+    README: configs/rfnext/README.md
+
+  - Name: rfnext_fixed_single_branch_cascade_mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_giou_4conv1f_adamw_3x_coco_in1k
+    In Collection: RF-Next
+    Config: configs/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py
+    Metadata:
+      Training Memory (GB): 9.4
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - RF-Next (fixed_single_branch)
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 44.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_giou_4conv1f_adamw_3x_coco_in1k/rfnext_fixed_single_branch_cascade_mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_giou_4conv1f_adamw_3x_coco_in1k-71aeb991.pth
+    Paper:
+      URL: https://arxiv.org/pdf/2206.06637.pdf
+      Title: 'RF-Next: Efficient Receptive Field Search for CNN'
+    README: configs/rfnext/README.md
+
+  - Name: rfnext_search_cascade_mask_rcnn_hrnetv2p_w18_20e_coco
+    In Collection: RF-Next
+    Config: configs/rfnext/rfnext_search_cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py
+    Metadata:
+      Training Memory (GB): 12.9
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - RF-Next (search)
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_search_cascade_mask_rcnn_hrnetv2p_w18_20e_coco/rfnext_search_cascade_mask_rcnn_hrnetv2p_w18_20e_coco-7b9c7885.pth
+    Paper:
+      URL: https://arxiv.org/pdf/2206.06637.pdf
+      Title: 'RF-Next: Efficient Receptive Field Search for CNN'
+    README: configs/rfnext/README.md
+
+  - Name: rfnext_fixed_single_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco
+    In Collection: RF-Next
+    Config: configs/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py
+    Metadata:
+      Training Memory (GB): 8.4
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - RF-Next (fixed_single_branch)
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco/rfnext_fixed_single_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco-682f121d.pth
+    Paper:
+      URL: https://arxiv.org/pdf/2206.06637.pdf
+      Title: 'RF-Next: Efficient Receptive Field Search for CNN'
+    README: configs/rfnext/README.md
+
+  - Name: rfnext_search_cascade_mask_rcnn_r2_101_fpn_20e_coco
+    In Collection: RF-Next
+    Config: configs/rfnext/rfnext_search_cascade_mask_rcnn_r2_101_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 11.9
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - RF-Next (search)
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_search_cascade_mask_rcnn_r2_101_fpn_20e_coco/rfnext_search_cascade_mask_rcnn_r2_101_fpn_20e_coco-e17510a0.pth
+    Paper:
+      URL: https://arxiv.org/pdf/2206.06637.pdf
+      Title: 'RF-Next: Efficient Receptive Field Search for CNN'
+    README: configs/rfnext/README.md
+
+  - Name: rfnext_fixed_single_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco
+    In Collection: RF-Next
+    Config: configs/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 9.3
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - RF-Next (fixed_single_branch)
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco/rfnext_fixed_single_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco-e22d5257.pth
+    Paper:
+      URL: https://arxiv.org/pdf/2206.06637.pdf
+      Title: 'RF-Next: Efficient Receptive Field Search for CNN'
+    README: configs/rfnext/README.md
+
+  - Name: rfnext_search_mask_rcnn_pvtv2-b0_fpn_1x_coco
+    In Collection: RF-Next
+    Config: configs/rfnext/rfnext_search_mask_rcnn_pvtv2-b0_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.3
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - RF-Next (search)
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_search_mask_rcnn_pvtv2-b0_fpn_1x_coco/rfnext_search_mask_rcnn_pvtv2-b0_fpn_1x_coco-dc8fd5de.pth
+    Paper:
+      URL: https://arxiv.org/pdf/2206.06637.pdf
+      Title: 'RF-Next: Efficient Receptive Field Search for CNN'
+    README: configs/rfnext/README.md
+
+  - Name: rfnext_fixed_single_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco
+    In Collection: RF-Next
+    Config: configs/rfnext/rfnext_fixed_single_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.3
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - RF-Next (fixed_single_branch)
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_fixed_single_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco/rfnext_fixed_single_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco-7b25d72e.pth
+    Paper:
+      URL: https://arxiv.org/pdf/2206.06637.pdf
+      Title: 'RF-Next: Efficient Receptive Field Search for CNN'
+    README: configs/rfnext/README.md
+
+  - Name: rfnext_search_panoptic_fpn_r2_50_fpn_fp16_1x_coco
+    In Collection: RF-Next
+    Config: configs/rfnext/rfnext_search_panoptic_fpn_r2_50_fpn_fp16_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.3
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - RF-Next (search)
+    Results:
+      - Task: Panoptic Segmentation
+        Dataset: COCO
+        Metrics:
+          PQ: 44.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_search_panoptic_fpn_r2_50_fpn_fp16_1x_coco/rfnext_search_panoptic_fpn_r2_50_fpn_fp16_1x_coco-34a893a0.pth
+    Paper:
+      URL: https://arxiv.org/pdf/2206.06637.pdf
+      Title: 'RF-Next: Efficient Receptive Field Search for CNN'
+    README: configs/rfnext/README.md
+
+  - Name: rfnext_fixed_single_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco
+    In Collection: RF-Next
+    Config: configs/rfnext/rfnext_fixed_single_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - RF-Next (fixed_single_branch)
+    Results:
+      - Task: Panoptic Segmentation
+        Dataset: COCO
+        Metrics:
+          PQ: 44.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rfnext/rfnext_fixed_single_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco/rfnext_fixed_single_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco-52181d5b.pth
+    Paper:
+      URL: https://arxiv.org/pdf/2206.06637.pdf
+      Title: 'RF-Next: Efficient Receptive Field Search for CNN'
+    README: configs/rfnext/README.md
diff --git a/configs/rfnext/rfnext_fixed_multi_branch_cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py b/configs/rfnext/rfnext_fixed_multi_branch_cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py
new file mode 100755
index 0000000..76b499c
--- /dev/null
+++ b/configs/rfnext/rfnext_fixed_multi_branch_cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py
@@ -0,0 +1,23 @@
+_base_ = '../convnext/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py'  # noqa
+
+custom_hooks = [
+    dict(
+        type='RFSearchHook',
+        mode='fixed_multi_branch',  # reuse a searched structure
+        rfstructure_file=  # noqa
+        './configs/rfnext/search_log/convnext_cascade_maskrcnn/local_search_config_step11.json',  # noqa
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(  # hyper-parameters of the RF search
+                step=0,
+                max_step=12,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                normlize='absavg',  # NOTE(review): sibling configs omit it
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=[])))
+]
diff --git a/configs/rfnext/rfnext_fixed_multi_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py b/configs/rfnext/rfnext_fixed_multi_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py
new file mode 100755
index 0000000..1751427
--- /dev/null
+++ b/configs/rfnext/rfnext_fixed_multi_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py
@@ -0,0 +1,21 @@
+_base_ = '../hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py'
+
+custom_hooks = [
+    dict(
+        mode='fixed_multi_branch',
+        rfstructure_file=  # noqa
+        './configs/rfnext/search_log/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/local_search_config_step11.json',  # noqa
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(
+                step=0,
+                max_step=12,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=[])))
+]
diff --git a/configs/rfnext/rfnext_fixed_multi_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco.py b/configs/rfnext/rfnext_fixed_multi_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco.py
new file mode 100755
index 0000000..d8bc9e2
--- /dev/null
+++ b/configs/rfnext/rfnext_fixed_multi_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco.py
@@ -0,0 +1,23 @@
+_base_ = '../res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco.py'
+
+custom_hooks = [
+    dict(type='NumClassCheckHook'),
+    dict(
+        type='RFSearchHook',
+        mode='fixed_multi_branch',  # reuse a searched structure
+        rfstructure_file=  # noqa
+        './configs/rfnext/search_log/cascade_mask_rcnn_r2_101_fpn_20e_coco/local_search_config_step11.json',  # noqa
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(  # hyper-parameters of the RF search
+                step=0,
+                max_step=12,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=['stem', 'layer1'])))  # presumably not searched
+]
diff --git a/configs/rfnext/rfnext_fixed_multi_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco.py b/configs/rfnext/rfnext_fixed_multi_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco.py
new file mode 100755
index 0000000..b98a8f9
--- /dev/null
+++ b/configs/rfnext/rfnext_fixed_multi_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco.py
@@ -0,0 +1,46 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings: PVTv2 backbone with an FPN neck
+model = dict(
+    backbone=dict(
+        _delete_=True,  # discard the backbone keys inherited from _base_
+        type='PyramidVisionTransformerV2',
+        embed_dims=32,
+        num_layers=[2, 2, 2, 2],
+        init_cfg=dict(
+            checkpoint=  # noqa
+            'https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b0.pth'  # noqa
+        )),
+    neck=dict(
+        type='FPN',
+        in_channels=[32, 64, 160, 256],
+        out_channels=256,
+        num_outs=5))
+
+# optimizer: AdamW replaces the optimizer inherited from _base_
+optimizer = dict(_delete_=True, type='AdamW', lr=0.0002, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping
+
+custom_hooks = [
+    dict(
+        type='RFSearchHook',
+        mode='fixed_multi_branch',  # reuse a searched structure
+        rfstructure_file=  # noqa
+        './configs/rfnext/search_log/mask_rcnn_pvtv2-b0_fpn_1x_coco/local_search_config_step10.json',  # noqa
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(  # hyper-parameters of the RF search
+                step=0,
+                max_step=11,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=[])))
+]
diff --git a/configs/rfnext/rfnext_fixed_multi_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco.py b/configs/rfnext/rfnext_fixed_multi_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco.py
new file mode 100755
index 0000000..1471f0a
--- /dev/null
+++ b/configs/rfnext/rfnext_fixed_multi_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco.py
@@ -0,0 +1,22 @@
+_base_ = '../panoptic_fpn/panoptic_fpn_r2_50_fpn_fp16_1x_coco.py'
+
+custom_hooks = [
+    dict(
+        type='RFSearchHook',
+        mode='fixed_multi_branch',  # reuse a searched structure
+        rfstructure_file=  # noqa
+        './configs/rfnext/search_log/panoptic_fpn_r2_50_fpn_fp16_1x_coco/local_search_config_step10.json',  # noqa
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(  # hyper-parameters of the RF search
+                step=0,
+                max_step=11,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=['stem', 'layer1'])))  # presumably not searched
+]
diff --git a/configs/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py b/configs/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py
new file mode 100755
index 0000000..19db83b
--- /dev/null
+++ b/configs/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py
@@ -0,0 +1,22 @@
+_base_ = '../convnext/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py'  # noqa
+
+custom_hooks = [
+    dict(
+        type='RFSearchHook',
+        mode='fixed_single_branch',  # reuse a searched structure
+        rfstructure_file=  # noqa
+        './configs/rfnext/search_log/convnext_cascade_maskrcnn/local_search_config_step11.json',  # noqa
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(  # hyper-parameters of the RF search
+                step=0,
+                max_step=12,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=[])))
+]
diff --git a/configs/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py b/configs/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py
new file mode 100755
index 0000000..071f510
--- /dev/null
+++ b/configs/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py
@@ -0,0 +1,22 @@
+_base_ = '../hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py'
+
+custom_hooks = [
+    dict(
+        type='RFSearchHook',
+        mode='fixed_single_branch',
+        rfstructure_file=  # noqa
+        './configs/rfnext/search_log/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/local_search_config_step11.json',  # noqa
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(
+                step=0,
+                max_step=12,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=[])))
+]
diff --git a/configs/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco.py b/configs/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco.py
new file mode 100755
index 0000000..3813b38
--- /dev/null
+++ b/configs/rfnext/rfnext_fixed_single_branch_cascade_mask_rcnn_r2_101_fpn_20e_coco.py
@@ -0,0 +1,23 @@
+_base_ = '../res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco.py'
+
+custom_hooks = [
+    dict(type='NumClassCheckHook'),
+    dict(
+        type='RFSearchHook',
+        mode='fixed_single_branch',  # reuse a searched structure
+        rfstructure_file=  # noqa
+        './configs/rfnext/search_log/cascade_mask_rcnn_r2_101_fpn_20e_coco/local_search_config_step11.json',  # noqa
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(  # hyper-parameters of the RF search
+                step=0,
+                max_step=12,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=['stem', 'layer1'])))  # presumably not searched
+]
diff --git a/configs/rfnext/rfnext_fixed_single_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco.py b/configs/rfnext/rfnext_fixed_single_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco.py
new file mode 100755
index 0000000..dc12384
--- /dev/null
+++ b/configs/rfnext/rfnext_fixed_single_branch_mask_rcnn_pvtv2-b0_fpn_1x_coco.py
@@ -0,0 +1,46 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings: PVTv2 backbone with an FPN neck
+model = dict(
+    backbone=dict(
+        _delete_=True,  # discard the backbone keys inherited from _base_
+        type='PyramidVisionTransformerV2',
+        embed_dims=32,
+        num_layers=[2, 2, 2, 2],
+        init_cfg=dict(
+            checkpoint=  # noqa
+            'https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b0.pth'  # noqa
+        )),
+    neck=dict(
+        type='FPN',
+        in_channels=[32, 64, 160, 256],
+        out_channels=256,
+        num_outs=5))
+
+# optimizer: AdamW replaces the optimizer inherited from _base_
+optimizer = dict(_delete_=True, type='AdamW', lr=0.0002, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping
+
+custom_hooks = [
+    dict(
+        type='RFSearchHook',
+        mode='fixed_single_branch',  # reuse a searched structure
+        rfstructure_file=  # noqa
+        './configs/rfnext/search_log/mask_rcnn_pvtv2-b0_fpn_1x_coco/local_search_config_step10.json',  # noqa
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(  # hyper-parameters of the RF search
+                step=0,
+                max_step=11,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=[])))
+]
diff --git a/configs/rfnext/rfnext_fixed_single_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco.py b/configs/rfnext/rfnext_fixed_single_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco.py
new file mode 100755
index 0000000..da45eba
--- /dev/null
+++ b/configs/rfnext/rfnext_fixed_single_branch_panoptic_fpn_r2_50_fpn_fp16_1x_coco.py
@@ -0,0 +1,22 @@
+_base_ = '../panoptic_fpn/panoptic_fpn_r2_50_fpn_fp16_1x_coco.py'
+
+custom_hooks = [
+    dict(
+        type='RFSearchHook',
+        mode='fixed_single_branch',  # reuse a searched structure
+        rfstructure_file=  # noqa
+        './configs/rfnext/search_log/panoptic_fpn_r2_50_fpn_fp16_1x_coco/local_search_config_step10.json',  # noqa
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(  # hyper-parameters of the RF search
+                step=0,
+                max_step=11,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=['stem', 'layer1'])))  # presumably not searched
+]
diff --git a/configs/rfnext/rfnext_search_cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py b/configs/rfnext/rfnext_search_cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py
new file mode 100755
index 0000000..3a99123
--- /dev/null
+++ b/configs/rfnext/rfnext_search_cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py
@@ -0,0 +1,21 @@
+_base_ = '../convnext/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco.py'  # noqa
+
+custom_hooks = [
+    dict(
+        type='RFSearchHook',
+        mode='search',  # run the receptive-field search
+        rfstructure_file=None,  # no previously searched structure
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(  # hyper-parameters of the RF search
+                step=0,
+                max_step=12,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=[])))
+]
diff --git a/configs/rfnext/rfnext_search_cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py b/configs/rfnext/rfnext_search_cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py
new file mode 100755
index 0000000..3a76939
--- /dev/null
+++ b/configs/rfnext/rfnext_search_cascade_mask_rcnn_hrnetv2p_w18_20e_coco.py
@@ -0,0 +1,21 @@
+_base_ = '../hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py'
+
+custom_hooks = [
+    dict(
+        type='RFSearchHook',
+        mode='search',
+        rfstructure_file=None,
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(
+                step=0,
+                max_step=12,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=[])))
+]
diff --git a/configs/rfnext/rfnext_search_cascade_mask_rcnn_r2_101_fpn_20e_coco.py b/configs/rfnext/rfnext_search_cascade_mask_rcnn_r2_101_fpn_20e_coco.py
new file mode 100755
index 0000000..364ce28
--- /dev/null
+++ b/configs/rfnext/rfnext_search_cascade_mask_rcnn_r2_101_fpn_20e_coco.py
@@ -0,0 +1,22 @@
+_base_ = '../res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco.py'
+
+custom_hooks = [
+    dict(type='NumClassCheckHook'),
+    dict(
+        type='RFSearchHook',
+        mode='search',  # run the receptive-field search
+        rfstructure_file=None,  # no previously searched structure
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(  # hyper-parameters of the RF search
+                step=0,
+                max_step=12,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=['stem', 'layer1'])))  # presumably not searched
+]
diff --git a/configs/rfnext/rfnext_search_mask_rcnn_pvtv2-b0_fpn_1x_coco.py b/configs/rfnext/rfnext_search_mask_rcnn_pvtv2-b0_fpn_1x_coco.py
new file mode 100755
index 0000000..ee2e288
--- /dev/null
+++ b/configs/rfnext/rfnext_search_mask_rcnn_pvtv2-b0_fpn_1x_coco.py
@@ -0,0 +1,45 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings: PVTv2 backbone with an FPN neck
+model = dict(
+    backbone=dict(
+        _delete_=True,  # discard the backbone keys inherited from _base_
+        type='PyramidVisionTransformerV2',
+        embed_dims=32,
+        num_layers=[2, 2, 2, 2],
+        init_cfg=dict(
+            checkpoint=  # noqa
+            'https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b0.pth'  # noqa
+        )),
+    neck=dict(
+        type='FPN',
+        in_channels=[32, 64, 160, 256],
+        out_channels=256,
+        num_outs=5))
+
+# optimizer: AdamW replaces the optimizer inherited from _base_
+optimizer = dict(_delete_=True, type='AdamW', lr=0.0002, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping
+
+custom_hooks = [
+    dict(
+        type='RFSearchHook',
+        mode='search',  # run the receptive-field search
+        rfstructure_file=None,  # no previously searched structure
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(  # hyper-parameters of the RF search
+                step=0,
+                max_step=11,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=[])))
+]
diff --git a/configs/rfnext/rfnext_search_panoptic_fpn_r2_50_fpn_fp16_1x_coco.py b/configs/rfnext/rfnext_search_panoptic_fpn_r2_50_fpn_fp16_1x_coco.py
new file mode 100755
index 0000000..4c9816d
--- /dev/null
+++ b/configs/rfnext/rfnext_search_panoptic_fpn_r2_50_fpn_fp16_1x_coco.py
@@ -0,0 +1,21 @@
+_base_ = '../panoptic_fpn/panoptic_fpn_r2_50_fpn_fp16_1x_coco.py'
+
+custom_hooks = [
+    dict(
+        type='RFSearchHook',
+        mode='search',  # run the receptive-field search
+        rfstructure_file=None,  # no previously searched structure
+        verbose=True,
+        by_epoch=True,
+        config=dict(
+            search=dict(  # hyper-parameters of the RF search
+                step=0,
+                max_step=11,
+                search_interval=1,
+                exp_rate=0.5,
+                init_alphas=0.01,
+                mmin=1,
+                mmax=24,
+                num_branches=2,
+                skip_layer=['stem', 'layer1'])))  # presumably not searched
+]
diff --git a/configs/rfnext/search_log/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/local_search_config_step11.json b/configs/rfnext/search_log/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/local_search_config_step11.json
new file mode 100755
index 0000000..e5dfef7
--- /dev/null
+++ b/configs/rfnext/search_log/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/local_search_config_step11.json
@@ -0,0 +1,1133 @@
+{
+    "search": {
+        "step": 11,
+        "max_step": 12,
+        "search_interval": 1,
+        "exp_rate": 0.5,
+        "init_alphas": 0.01,
+        "mmin": 1,
+        "mmax": 24,
+        "num_branches": 2,
+        "skip_layer": [
+            "layer1"
+        ]
+    },
+    "structure": {
+        "module.backbone.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.transition1.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.transition1.1.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage2.0.branches.0.0.conv1": [
+            2,
+            2
+        ],
+        "module.backbone.stage2.0.branches.0.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage2.0.branches.0.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage2.0.branches.0.1.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage2.0.branches.0.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage2.0.branches.0.2.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage2.0.branches.0.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage2.0.branches.0.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage2.0.branches.1.0.conv1": [
+            3,
+            3
+        ],
+        "module.backbone.stage2.0.branches.1.0.conv2": [
+            4,
+            4
+        ],
+        "module.backbone.stage2.0.branches.1.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage2.0.branches.1.1.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage2.0.branches.1.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage2.0.branches.1.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage2.0.branches.1.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage2.0.branches.1.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage2.0.fuse_layers.1.0.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.transition2.2.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.0.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.0.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.0.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.0.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.0.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.0.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.0.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.0.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.1.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.1.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.1.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.1.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.1.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.1.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.1.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.1.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.2.0.conv1": [
+            2,
+            2
+        ],
+        "module.backbone.stage3.0.branches.2.0.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage3.0.branches.2.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.2.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.branches.2.2.conv1": [
+            3,
+            3
+        ],
+        "module.backbone.stage3.0.branches.2.2.conv2": [
+            3,
+            3
+        ],
+        "module.backbone.stage3.0.branches.2.3.conv1": [
+            3,
+            3
+        ],
+        "module.backbone.stage3.0.branches.2.3.conv2": [
+            7,
+            7
+        ],
+        "module.backbone.stage3.0.fuse_layers.1.0.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.fuse_layers.2.0.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.fuse_layers.2.0.1.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.0.fuse_layers.2.1.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.0.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.0.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.0.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.0.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.0.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.0.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.0.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.0.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.1.0.conv1": [
+            2,
+            2
+        ],
+        "module.backbone.stage3.1.branches.1.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.1.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.1.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.1.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.1.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.1.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.1.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.2.0.conv1": [
+            2,
+            2
+        ],
+        "module.backbone.stage3.1.branches.2.0.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage3.1.branches.2.1.conv1": [
+            3,
+            3
+        ],
+        "module.backbone.stage3.1.branches.2.1.conv2": [
+            5,
+            5
+        ],
+        "module.backbone.stage3.1.branches.2.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.2.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.2.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.branches.2.3.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage3.1.fuse_layers.1.0.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.fuse_layers.2.0.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.fuse_layers.2.0.1.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.1.fuse_layers.2.1.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.0.0.conv1": [
+            2,
+            2
+        ],
+        "module.backbone.stage3.2.branches.0.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.0.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.0.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.0.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.0.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.0.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.0.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.1.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.1.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.1.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.1.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.1.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.1.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.1.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.1.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.2.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.2.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.2.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.2.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.2.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.2.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.2.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.branches.2.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.fuse_layers.1.0.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.fuse_layers.2.0.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.fuse_layers.2.0.1.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.2.fuse_layers.2.1.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.0.0.conv1": [
+            2,
+            2
+        ],
+        "module.backbone.stage3.3.branches.0.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.0.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.0.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.0.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.0.2.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage3.3.branches.0.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.0.3.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage3.3.branches.1.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.1.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.1.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.1.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.1.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.1.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.1.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.1.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.2.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.2.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.2.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.2.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.2.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.2.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.2.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.branches.2.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.fuse_layers.1.0.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.fuse_layers.2.0.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.fuse_layers.2.0.1.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage3.3.fuse_layers.2.1.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.transition3.3.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.0.0.conv1": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.0.branches.0.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.0.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.0.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.0.2.conv1": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.0.branches.0.2.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.0.branches.0.3.conv1": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.0.branches.0.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.1.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.1.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.1.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.1.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.1.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.1.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.1.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.1.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.2.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.2.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.2.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.2.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.2.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.2.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.2.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.2.3.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.0.branches.3.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.3.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.3.1.conv1": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.0.branches.3.1.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.0.branches.3.2.conv1": [
+            3,
+            3
+        ],
+        "module.backbone.stage4.0.branches.3.2.conv2": [
+            3,
+            3
+        ],
+        "module.backbone.stage4.0.branches.3.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.branches.3.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.fuse_layers.1.0.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.fuse_layers.2.0.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.fuse_layers.2.0.1.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.fuse_layers.2.1.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.fuse_layers.3.0.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.fuse_layers.3.0.1.0": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.0.fuse_layers.3.0.2.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.fuse_layers.3.1.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.fuse_layers.3.1.1.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.0.fuse_layers.3.2.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.0.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.0.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.0.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.0.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.0.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.0.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.0.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.0.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.1.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.1.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.1.1.conv1": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.1.branches.1.1.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.1.branches.1.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.1.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.1.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.1.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.2.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.2.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.2.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.2.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.2.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.2.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.2.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.2.3.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.1.branches.3.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.3.0.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.1.branches.3.1.conv1": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.1.branches.3.1.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.1.branches.3.2.conv1": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.1.branches.3.2.conv2": [
+            3,
+            3
+        ],
+        "module.backbone.stage4.1.branches.3.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.branches.3.3.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.1.fuse_layers.1.0.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.fuse_layers.2.0.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.fuse_layers.2.0.1.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.fuse_layers.2.1.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.fuse_layers.3.0.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.fuse_layers.3.0.1.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.fuse_layers.3.0.2.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.fuse_layers.3.1.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.fuse_layers.3.1.1.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.1.fuse_layers.3.2.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.0.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.0.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.0.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.0.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.0.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.0.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.0.3.conv1": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.2.branches.0.3.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.2.branches.1.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.1.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.1.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.1.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.1.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.1.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.1.3.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.1.3.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.2.0.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.2.0.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.2.1.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.2.1.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.2.2.conv1": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.2.2.conv2": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.branches.2.3.conv1": [
+            3,
+            3
+        ],
+        "module.backbone.stage4.2.branches.2.3.conv2": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.2.branches.3.0.conv1": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.2.branches.3.0.conv2": [
+            3,
+            3
+        ],
+        "module.backbone.stage4.2.branches.3.1.conv1": [
+            3,
+            3
+        ],
+        "module.backbone.stage4.2.branches.3.1.conv2": [
+            3,
+            3
+        ],
+        "module.backbone.stage4.2.branches.3.2.conv1": [
+            3,
+            3
+        ],
+        "module.backbone.stage4.2.branches.3.2.conv2": [
+            8,
+            8
+        ],
+        "module.backbone.stage4.2.branches.3.3.conv1": [
+            10,
+            10
+        ],
+        "module.backbone.stage4.2.branches.3.3.conv2": [
+            10,
+            10
+        ],
+        "module.backbone.stage4.2.fuse_layers.1.0.0.0": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.2.fuse_layers.2.0.0.0": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.2.fuse_layers.2.0.1.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.fuse_layers.2.1.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.fuse_layers.3.0.0.0": [
+            2,
+            2
+        ],
+        "module.backbone.stage4.2.fuse_layers.3.0.1.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.fuse_layers.3.0.2.0": [
+            3,
+            3
+        ],
+        "module.backbone.stage4.2.fuse_layers.3.1.0.0": [
+            1,
+            1
+        ],
+        "module.backbone.stage4.2.fuse_layers.3.1.1.0": [
+            3,
+            3
+        ],
+        "module.backbone.stage4.2.fuse_layers.3.2.0.0": [
+            4,
+            4
+        ],
+        "module.neck.fpn_convs.0.conv": [
+            1,
+            1
+        ],
+        "module.neck.fpn_convs.1.conv": [
+            1,
+            1
+        ],
+        "module.neck.fpn_convs.2.conv": [
+            1,
+            1
+        ],
+        "module.neck.fpn_convs.3.conv": [
+            1,
+            1
+        ],
+        "module.neck.fpn_convs.4.conv": [
+            3,
+            3
+        ],
+        "module.rpn_head.rpn_conv": [
+            4,
+            4
+        ],
+        "module.roi_head.mask_head.0.convs.0.conv": [
+            1,
+            1
+        ],
+        "module.roi_head.mask_head.0.convs.1.conv": [
+            1,
+            1
+        ],
+        "module.roi_head.mask_head.0.convs.2.conv": [
+            3,
+            3
+        ],
+        "module.roi_head.mask_head.0.convs.3.conv": [
+            2,
+            2
+        ],
+        "module.roi_head.mask_head.1.convs.0.conv": [
+            1,
+            1
+        ],
+        "module.roi_head.mask_head.1.convs.1.conv": [
+            1,
+            1
+        ],
+        "module.roi_head.mask_head.1.convs.2.conv": [
+            2,
+            2
+        ],
+        "module.roi_head.mask_head.1.convs.3.conv": [
+            1,
+            1
+        ],
+        "module.roi_head.mask_head.2.convs.0.conv": [
+            1,
+            1
+        ],
+        "module.roi_head.mask_head.2.convs.1.conv": [
+            1,
+            1
+        ],
+        "module.roi_head.mask_head.2.convs.2.conv": [
+            1,
+            1
+        ],
+        "module.roi_head.mask_head.2.convs.3.conv": [
+            1,
+            1
+        ]
+    }
+}
diff --git a/configs/rfnext/search_log/cascade_mask_rcnn_r2_101_fpn_20e_coco/local_search_config_step11.json b/configs/rfnext/search_log/cascade_mask_rcnn_r2_101_fpn_20e_coco/local_search_config_step11.json
new file mode 100755
index 0000000..64757c6
--- /dev/null
+++ b/configs/rfnext/search_log/cascade_mask_rcnn_r2_101_fpn_20e_coco/local_search_config_step11.json
@@ -0,0 +1,124 @@
+{
+    "search": {
+        "step": 11,
+        "max_step": 12,
+        "search_interval": 1,
+        "exp_rate": 0.5,
+        "init_alphas": 0.01,
+        "mmin": 1,
+        "mmax": 24,
+        "num_branches": 2,
+        "skip_layer": [
+            "stem", "layer1"
+        ]
+    },
+    "structure": {
+        "module.backbone.layer2.0.convs.0": 1,
+        "module.backbone.layer2.0.convs.1": 1,
+        "module.backbone.layer2.0.convs.2": 1,
+        "module.backbone.layer2.1.convs.0": 1,
+        "module.backbone.layer2.1.convs.1": 1,
+        "module.backbone.layer2.1.convs.2": 1,
+        "module.backbone.layer2.2.convs.0": 1,
+        "module.backbone.layer2.2.convs.1": 1,
+        "module.backbone.layer2.2.convs.2": 1,
+        "module.backbone.layer2.3.convs.0": 1,
+        "module.backbone.layer2.3.convs.1": 1,
+        "module.backbone.layer2.3.convs.2": 1,
+        "module.backbone.layer3.0.convs.0": 1,
+        "module.backbone.layer3.0.convs.1": 1,
+        "module.backbone.layer3.0.convs.2": 1,
+        "module.backbone.layer3.1.convs.0": 2,
+        "module.backbone.layer3.1.convs.1": 1,
+        "module.backbone.layer3.1.convs.2": 2,
+        "module.backbone.layer3.2.convs.0": 2,
+        "module.backbone.layer3.2.convs.1": 1,
+        "module.backbone.layer3.2.convs.2": 2,
+        "module.backbone.layer3.3.convs.0": 2,
+        "module.backbone.layer3.3.convs.1": 2,
+        "module.backbone.layer3.3.convs.2": 2,
+        "module.backbone.layer3.4.convs.0": 1,
+        "module.backbone.layer3.4.convs.1": 2,
+        "module.backbone.layer3.4.convs.2": 2,
+        "module.backbone.layer3.5.convs.0": 2,
+        "module.backbone.layer3.5.convs.1": 2,
+        "module.backbone.layer3.5.convs.2": 2,
+        "module.backbone.layer3.6.convs.0": 2,
+        "module.backbone.layer3.6.convs.1": 2,
+        "module.backbone.layer3.6.convs.2": 3,
+        "module.backbone.layer3.7.convs.0": 1,
+        "module.backbone.layer3.7.convs.1": 1,
+        "module.backbone.layer3.7.convs.2": 2,
+        "module.backbone.layer3.8.convs.0": 1,
+        "module.backbone.layer3.8.convs.1": 2,
+        "module.backbone.layer3.8.convs.2": 3,
+        "module.backbone.layer3.9.convs.0": 1,
+        "module.backbone.layer3.9.convs.1": 1,
+        "module.backbone.layer3.9.convs.2": 3,
+        "module.backbone.layer3.10.convs.0": 1,
+        "module.backbone.layer3.10.convs.1": 2,
+        "module.backbone.layer3.10.convs.2": 2,
+        "module.backbone.layer3.11.convs.0": 1,
+        "module.backbone.layer3.11.convs.1": 1,
+        "module.backbone.layer3.11.convs.2": 2,
+        "module.backbone.layer3.12.convs.0": 1,
+        "module.backbone.layer3.12.convs.1": 1,
+        "module.backbone.layer3.12.convs.2": 2,
+        "module.backbone.layer3.13.convs.0": 2,
+        "module.backbone.layer3.13.convs.1": 2,
+        "module.backbone.layer3.13.convs.2": 2,
+        "module.backbone.layer3.14.convs.0": 2,
+        "module.backbone.layer3.14.convs.1": 2,
+        "module.backbone.layer3.14.convs.2": 2,
+        "module.backbone.layer3.15.convs.0": 2,
+        "module.backbone.layer3.15.convs.1": 2,
+        "module.backbone.layer3.15.convs.2": 2,
+        "module.backbone.layer3.16.convs.0": 3,
+        "module.backbone.layer3.16.convs.1": 4,
+        "module.backbone.layer3.16.convs.2": 3,
+        "module.backbone.layer3.17.convs.0": 10,
+        "module.backbone.layer3.17.convs.1": 3,
+        "module.backbone.layer3.17.convs.2": 9,
+        "module.backbone.layer3.18.convs.0": 3,
+        "module.backbone.layer3.18.convs.1": 6,
+        "module.backbone.layer3.18.convs.2": 3,
+        "module.backbone.layer3.19.convs.0": 1,
+        "module.backbone.layer3.19.convs.1": 1,
+        "module.backbone.layer3.19.convs.2": 2,
+        "module.backbone.layer3.20.convs.0": 2,
+        "module.backbone.layer3.20.convs.1": 2,
+        "module.backbone.layer3.20.convs.2": 1,
+        "module.backbone.layer3.21.convs.0": 2,
+        "module.backbone.layer3.21.convs.1": 1,
+        "module.backbone.layer3.21.convs.2": 1,
+        "module.backbone.layer3.22.convs.0": 2,
+        "module.backbone.layer3.22.convs.1": 3,
+        "module.backbone.layer3.22.convs.2": 1,
+        "module.backbone.layer4.0.convs.0": 3,
+        "module.backbone.layer4.0.convs.1": 3,
+        "module.backbone.layer4.0.convs.2": 3,
+        "module.backbone.layer4.1.convs.0": 1,
+        "module.backbone.layer4.1.convs.1": 2,
+        "module.backbone.layer4.1.convs.2": 3,
+        "module.backbone.layer4.2.convs.0": 1,
+        "module.backbone.layer4.2.convs.1": 3,
+        "module.backbone.layer4.2.convs.2": 8,
+        "module.neck.fpn_convs.0.conv": 1,
+        "module.neck.fpn_convs.1.conv": 1,
+        "module.neck.fpn_convs.2.conv": 1,
+        "module.neck.fpn_convs.3.conv": 1,
+        "module.rpn_head.rpn_conv": 3,
+        "module.roi_head.mask_head.0.convs.0.conv": 1,
+        "module.roi_head.mask_head.0.convs.1.conv": 1,
+        "module.roi_head.mask_head.0.convs.2.conv": 3,
+        "module.roi_head.mask_head.0.convs.3.conv": 1,
+        "module.roi_head.mask_head.1.convs.0.conv": 1,
+        "module.roi_head.mask_head.1.convs.1.conv": 1,
+        "module.roi_head.mask_head.1.convs.2.conv": 2,
+        "module.roi_head.mask_head.1.convs.3.conv": 1,
+        "module.roi_head.mask_head.2.convs.0.conv": 1,
+        "module.roi_head.mask_head.2.convs.1.conv": 1,
+        "module.roi_head.mask_head.2.convs.2.conv": 2,
+        "module.roi_head.mask_head.2.convs.3.conv": 1
+    }
+}
diff --git a/configs/rfnext/search_log/convnext_cascade_maskrcnn/local_search_config_step11.json b/configs/rfnext/search_log/convnext_cascade_maskrcnn/local_search_config_step11.json
new file mode 100755
index 0000000..fd115c2
--- /dev/null
+++ b/configs/rfnext/search_log/convnext_cascade_maskrcnn/local_search_config_step11.json
@@ -0,0 +1,62 @@
+{
+    "search": {
+        "step": 11,
+        "max_step": 12,
+        "search_interval": 1,
+        "exp_rate": 0.5,
+        "init_alphas": 0.01,
+        "mmin": 1,
+        "mmax": 24,
+        "num_branches": 2,
+        "skip_layer": []
+    },
+    "structure": {
+        "module.backbone.stages.0.0.depthwise_conv": 1,
+        "module.backbone.stages.0.1.depthwise_conv": 1,
+        "module.backbone.stages.0.2.depthwise_conv": 1,
+        "module.backbone.stages.1.0.depthwise_conv": 1,
+        "module.backbone.stages.1.1.depthwise_conv": 1,
+        "module.backbone.stages.1.2.depthwise_conv": 1,
+        "module.backbone.stages.2.0.depthwise_conv": 1,
+        "module.backbone.stages.2.1.depthwise_conv": 1,
+        "module.backbone.stages.2.2.depthwise_conv": 1,
+        "module.backbone.stages.2.3.depthwise_conv": 1,
+        "module.backbone.stages.2.4.depthwise_conv": 1,
+        "module.backbone.stages.2.5.depthwise_conv": 1,
+        "module.backbone.stages.2.6.depthwise_conv": 2,
+        "module.backbone.stages.2.7.depthwise_conv": 1,
+        "module.backbone.stages.2.8.depthwise_conv": 1,
+        "module.backbone.stages.3.0.depthwise_conv": 2,
+        "module.backbone.stages.3.1.depthwise_conv": 2,
+        "module.backbone.stages.3.2.depthwise_conv": 2,
+        "module.neck.fpn_convs.0.conv": 1,
+        "module.neck.fpn_convs.1.conv": 1,
+        "module.neck.fpn_convs.2.conv": 1,
+        "module.neck.fpn_convs.3.conv": 1,
+        "module.rpn_head.rpn_conv": 3,
+        "module.roi_head.bbox_head.0.shared_convs.0.conv": 1,
+        "module.roi_head.bbox_head.0.shared_convs.1.conv": 1,
+        "module.roi_head.bbox_head.0.shared_convs.2.conv": 1,
+        "module.roi_head.bbox_head.0.shared_convs.3.conv": 2,
+        "module.roi_head.bbox_head.1.shared_convs.0.conv": 1,
+        "module.roi_head.bbox_head.1.shared_convs.1.conv": 2,
+        "module.roi_head.bbox_head.1.shared_convs.2.conv": 1,
+        "module.roi_head.bbox_head.1.shared_convs.3.conv": 1,
+        "module.roi_head.bbox_head.2.shared_convs.0.conv": 1,
+        "module.roi_head.bbox_head.2.shared_convs.1.conv": 2,
+        "module.roi_head.bbox_head.2.shared_convs.2.conv": 2,
+        "module.roi_head.bbox_head.2.shared_convs.3.conv": 1,
+        "module.roi_head.mask_head.0.convs.0.conv": 1,
+        "module.roi_head.mask_head.0.convs.1.conv": 3,
+        "module.roi_head.mask_head.0.convs.2.conv": 3,
+        "module.roi_head.mask_head.0.convs.3.conv": 2,
+        "module.roi_head.mask_head.1.convs.0.conv": 1,
+        "module.roi_head.mask_head.1.convs.1.conv": 3,
+        "module.roi_head.mask_head.1.convs.2.conv": 2,
+        "module.roi_head.mask_head.1.convs.3.conv": 1,
+        "module.roi_head.mask_head.2.convs.0.conv": 1,
+        "module.roi_head.mask_head.2.convs.1.conv": 2,
+        "module.roi_head.mask_head.2.convs.2.conv": 2,
+        "module.roi_head.mask_head.2.convs.3.conv": 1
+    }
+}
diff --git a/configs/rfnext/search_log/mask_rcnn_pvtv2-b0_fpn_1x_coco/local_search_config_step10.json b/configs/rfnext/search_log/mask_rcnn_pvtv2-b0_fpn_1x_coco/local_search_config_step10.json
new file mode 100755
index 0000000..f7ad2e8
--- /dev/null
+++ b/configs/rfnext/search_log/mask_rcnn_pvtv2-b0_fpn_1x_coco/local_search_config_step10.json
@@ -0,0 +1,99 @@
+{
+    "search": {
+        "step": 10,
+        "max_step": 11,
+        "search_interval": 1,
+        "exp_rate": 0.5,
+        "init_alphas": 0.01,
+        "mmin": 1,
+        "mmax": 24,
+        "num_branches": 2,
+        "skip_layer": []
+    },
+    "structure": {
+        "module.backbone.layers.0.0.projection": [
+            1,
+            1
+        ],
+        "module.backbone.layers.0.1.0.ffn.layers.1": [
+            1,
+            1
+        ],
+        "module.backbone.layers.0.1.1.ffn.layers.1": [
+            1,
+            1
+        ],
+        "module.backbone.layers.1.0.projection": [
+            1,
+            1
+        ],
+        "module.backbone.layers.1.1.0.ffn.layers.1": [
+            1,
+            1
+        ],
+        "module.backbone.layers.1.1.1.ffn.layers.1": [
+            1,
+            1
+        ],
+        "module.backbone.layers.2.0.projection": [
+            1,
+            1
+        ],
+        "module.backbone.layers.2.1.0.ffn.layers.1": [
+            1,
+            1
+        ],
+        "module.backbone.layers.2.1.1.ffn.layers.1": [
+            1,
+            1
+        ],
+        "module.backbone.layers.3.0.projection": [
+            1,
+            1
+        ],
+        "module.backbone.layers.3.1.0.ffn.layers.1": [
+            2,
+            2
+        ],
+        "module.backbone.layers.3.1.1.ffn.layers.1": [
+            1,
+            1
+        ],
+        "module.neck.fpn_convs.0.conv": [
+            1,
+            1
+        ],
+        "module.neck.fpn_convs.1.conv": [
+            2,
+            2
+        ],
+        "module.neck.fpn_convs.2.conv": [
+            2,
+            2
+        ],
+        "module.neck.fpn_convs.3.conv": [
+            2,
+            2
+        ],
+        "module.rpn_head.rpn_conv": [
+            3,
+            3
+        ],
+        "module.roi_head.mask_head.convs.0.conv": [
+            1,
+            1
+        ],
+        "module.roi_head.mask_head.convs.1.conv": [
+            2,
+            2
+        ],
+        "module.roi_head.mask_head.convs.2.conv": [
+            2,
+            2
+        ],
+        "module.roi_head.mask_head.convs.3.conv": [
+            2,
+            2
+        ]
+    }
+}
diff --git a/configs/rfnext/search_log/panoptic_fpn_r2_50_fpn_fp16_1x_coco/local_search_config_step10.json b/configs/rfnext/search_log/panoptic_fpn_r2_50_fpn_fp16_1x_coco/local_search_config_step10.json
new file mode 100755
index 0000000..dce0275
--- /dev/null
+++ b/configs/rfnext/search_log/panoptic_fpn_r2_50_fpn_fp16_1x_coco/local_search_config_step10.json
@@ -0,0 +1,238 @@
+{
+    "search": {
+        "step": 10,
+        "max_step": 11,
+        "search_interval": 1,
+        "exp_rate": 0.5,
+        "init_alphas": 0.01,
+        "mmin": 1,
+        "mmax": 24,
+        "num_branches": 2,
+        "skip_layer": [
+            "stem",
+            "layer1"
+        ]
+    },
+    "structure": {
+        "module.backbone.layer2.0.convs.0": [
+            1,
+            1
+        ],
+        "module.backbone.layer2.0.convs.1": [
+            1,
+            1
+        ],
+        "module.backbone.layer2.0.convs.2": [
+            1,
+            1
+        ],
+        "module.backbone.layer2.1.convs.0": [
+            2,
+            2
+        ],
+        "module.backbone.layer2.1.convs.1": [
+            2,
+            2
+        ],
+        "module.backbone.layer2.1.convs.2": [
+            2,
+            2
+        ],
+        "module.backbone.layer2.2.convs.0": [
+            1,
+            1
+        ],
+        "module.backbone.layer2.2.convs.1": [
+            1,
+            1
+        ],
+        "module.backbone.layer2.2.convs.2": [
+            2,
+            2
+        ],
+        "module.backbone.layer2.3.convs.0": [
+            1,
+            1
+        ],
+        "module.backbone.layer2.3.convs.1": [
+            1,
+            1
+        ],
+        "module.backbone.layer2.3.convs.2": [
+            1,
+            1
+        ],
+        "module.backbone.layer3.0.convs.0": [
+            2,
+            2
+        ],
+        "module.backbone.layer3.0.convs.1": [
+            2,
+            2
+        ],
+        "module.backbone.layer3.0.convs.2": [
+            2,
+            2
+        ],
+        "module.backbone.layer3.1.convs.0": [
+            2,
+            2
+        ],
+        "module.backbone.layer3.1.convs.1": [
+            2,
+            2
+        ],
+        "module.backbone.layer3.1.convs.2": [
+            3,
+            3
+        ],
+        "module.backbone.layer3.2.convs.0": [
+            1,
+            1
+        ],
+        "module.backbone.layer3.2.convs.1": [
+            2,
+            2
+        ],
+        "module.backbone.layer3.2.convs.2": [
+            2,
+            2
+        ],
+        "module.backbone.layer3.3.convs.0": [
+            3,
+            3
+        ],
+        "module.backbone.layer3.3.convs.1": [
+            3,
+            3
+        ],
+        "module.backbone.layer3.3.convs.2": [
+            3,
+            3
+        ],
+        "module.backbone.layer3.4.convs.0": [
+            2,
+            2
+        ],
+        "module.backbone.layer3.4.convs.1": [
+            1,
+            1
+        ],
+        "module.backbone.layer3.4.convs.2": [
+            2,
+            2
+        ],
+        "module.backbone.layer3.5.convs.0": [
+            2,
+            2
+        ],
+        "module.backbone.layer3.5.convs.1": [
+            2,
+            2
+        ],
+        "module.backbone.layer3.5.convs.2": [
+            3,
+            3
+        ],
+        "module.backbone.layer4.0.convs.0": [
+            3,
+            3
+        ],
+        "module.backbone.layer4.0.convs.1": [
+            3,
+            3
+        ],
+        "module.backbone.layer4.0.convs.2": [
+            3,
+            3
+        ],
+        "module.backbone.layer4.1.convs.0": [
+            3,
+            3
+        ],
+        "module.backbone.layer4.1.convs.1": [
+            8,
+            8
+        ],
+        "module.backbone.layer4.1.convs.2": [
+            10,
+            10
+        ],
+        "module.backbone.layer4.2.convs.0": [
+            3,
+            3
+        ],
+        "module.backbone.layer4.2.convs.1": [
+            2,
+            2
+        ],
+        "module.backbone.layer4.2.convs.2": [
+            5,
+            5
+        ],
+        "module.neck.fpn_convs.0.conv": [
+            1,
+            1
+        ],
+        "module.neck.fpn_convs.1.conv": [
+            2,
+            2
+        ],
+        "module.neck.fpn_convs.2.conv": [
+            1,
+            1
+        ],
+        "module.neck.fpn_convs.3.conv": [
+            2,
+            2
+        ],
+        "module.rpn_head.rpn_conv": [
+            3,
+            3
+        ],
+        "module.roi_head.mask_head.convs.0.conv": [
+            1,
+            1
+        ],
+        "module.roi_head.mask_head.convs.1.conv": [
+            2,
+            2
+        ],
+        "module.roi_head.mask_head.convs.2.conv": [
+            2,
+            2
+        ],
+        "module.roi_head.mask_head.convs.3.conv": [
+            1,
+            1
+        ],
+        "module.semantic_head.conv_upsample_layers.0.conv.0.conv": [
+            2,
+            2
+        ],
+        "module.semantic_head.conv_upsample_layers.1.conv.0.conv": [
+            6,
+            6
+        ],
+        "module.semantic_head.conv_upsample_layers.2.conv.0.conv": [
+            2,
+            2
+        ],
+        "module.semantic_head.conv_upsample_layers.2.conv.1.conv": [
+            1,
+            1
+        ],
+        "module.semantic_head.conv_upsample_layers.3.conv.0.conv": [
+            5,
+            5
+        ],
+        "module.semantic_head.conv_upsample_layers.3.conv.1.conv": [
+            3,
+            3
+        ],
+        "module.semantic_head.conv_upsample_layers.3.conv.2.conv": [
+            1,
+            1
+        ]
+    }
+}
diff --git a/configs/rpn/README.md b/configs/rpn/README.md
new file mode 100755
index 0000000..99addc0
--- /dev/null
+++ b/configs/rpn/README.md
@@ -0,0 +1,39 @@
+# RPN
+
+> [Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks](https://arxiv.org/abs/1506.01497)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. Advances like SPPnet and Fast R-CNN have reduced the running time of these detection networks, exposing region proposal computation as a bottleneck. In this work, we introduce a Region Proposal Network (RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals. An RPN is a fully convolutional network that simultaneously predicts object bounds and objectness scores at each position. The RPN is trained end-to-end to generate high-quality region proposals, which are used by Fast R-CNN for detection. We further merge RPN and Fast R-CNN into a single network by sharing their convolutional features---using the recently popular terminology of neural networks with 'attention' mechanisms, the RPN component tells the unified network where to look. For the very deep VGG-16 model, our detection system has a frame rate of 5fps (including all steps) on a GPU, while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007, 2012, and MS COCO datasets with only 300 proposals per image. In ILSVRC and COCO 2015 competitions, Faster R-CNN and RPN are the foundations of the 1st-place winning entries in several tracks.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143973617-387c7561-82f4-40b2-b78e-4776394b1b8b.png" height="300"/>
+</div>
+
+## Results and Models
+
+|    Backbone     |  Style  | Lr schd | Mem (GB) | Inf time (fps) | AR1000 |                                                  Config                                                   |                                                                                                                                       Download                                                                                                                                        |
+| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50-FPN     |  caffe  |   1x    |   3.5    |      22.6      |  58.7  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_r50_caffe_fpn_1x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_caffe_fpn_1x_coco/rpn_r50_caffe_fpn_1x_coco_20200531-5b903a37.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_caffe_fpn_1x_coco/rpn_r50_caffe_fpn_1x_coco_20200531_012334.log.json)   |
+|    R-50-FPN     | pytorch |   1x    |   3.8    |      22.3      |  58.2  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_r50_fpn_1x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_1x_coco/rpn_r50_fpn_1x_coco_20200218-5525fa2e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_1x_coco/rpn_r50_fpn_1x_coco_20200218_151240.log.json)               |
+|    R-50-FPN     | pytorch |   2x    |    -     |       -        |  58.6  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_r50_fpn_2x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_2x_coco/rpn_r50_fpn_2x_coco_20200131-0728c9b3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_2x_coco/rpn_r50_fpn_2x_coco_20200131_190631.log.json)               |
+|    R-101-FPN    |  caffe  |   1x    |   5.4    |      17.3      |  60.0  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_r101_caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_caffe_fpn_1x_coco/rpn_r101_caffe_fpn_1x_coco_20200531-0629a2e2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_caffe_fpn_1x_coco/rpn_r101_caffe_fpn_1x_coco_20200531_012345.log.json) |
+|    R-101-FPN    | pytorch |   1x    |   5.8    |      16.5      |  59.7  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_r101_fpn_1x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_fpn_1x_coco/rpn_r101_fpn_1x_coco_20200131-2ace2249.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_fpn_1x_coco/rpn_r101_fpn_1x_coco_20200131_191000.log.json)             |
+|    R-101-FPN    | pytorch |   2x    |    -     |       -        |  60.2  |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_r101_fpn_2x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_fpn_2x_coco/rpn_r101_fpn_2x_coco_20200131-24e3db1a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_fpn_2x_coco/rpn_r101_fpn_2x_coco_20200131_191106.log.json)             |
+| X-101-32x4d-FPN | pytorch |   1x    |   7.0    |      13.0      |  60.6  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_x101_32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_1x_coco/rpn_x101_32x4d_fpn_1x_coco_20200219-b02646c6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_1x_coco/rpn_x101_32x4d_fpn_1x_coco_20200219_012037.log.json) |
+| X-101-32x4d-FPN | pytorch |   2x    |    -     |       -        |  61.1  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_x101_32x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_2x_coco/rpn_x101_32x4d_fpn_2x_coco_20200208-d22bd0bb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_2x_coco/rpn_x101_32x4d_fpn_2x_coco_20200208_200752.log.json) |
+| X-101-64x4d-FPN | pytorch |   1x    |   10.1   |      9.1       |  61.0  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_x101_64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_1x_coco/rpn_x101_64x4d_fpn_1x_coco_20200208-cde6f7dd.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_1x_coco/rpn_x101_64x4d_fpn_1x_coco_20200208_200752.log.json) |
+| X-101-64x4d-FPN | pytorch |   2x    |    -     |       -        |  61.5  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/rpn/rpn_x101_64x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_2x_coco/rpn_x101_64x4d_fpn_2x_coco_20200208-c65f524f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_2x_coco/rpn_x101_64x4d_fpn_2x_coco_20200208_200752.log.json) |
+
+## Citation
+
+```latex
+@inproceedings{ren2015faster,
+  title={Faster r-cnn: Towards real-time object detection with region proposal networks},
+  author={Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian},
+  booktitle={Advances in neural information processing systems},
+  year={2015}
+}
+```
diff --git a/configs/rpn/rpn_r101_caffe_fpn_1x_coco.py b/configs/rpn/rpn_r101_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..27be946
--- /dev/null
+++ b/configs/rpn/rpn_r101_caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './rpn_r50_caffe_fpn_1x_coco.py'  # base config this file overrides
+model = dict(
+    backbone=dict(
+        depth=101,  # the "r101" in the file name
+        init_cfg=dict(
+            type='Pretrained',  # start from the checkpoint named below
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/configs/rpn/rpn_r101_fpn_1x_coco.py b/configs/rpn/rpn_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..962728f
--- /dev/null
+++ b/configs/rpn/rpn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './rpn_r50_fpn_1x_coco.py'  # base config this file overrides
+model = dict(
+    backbone=dict(
+        depth=101,  # the "r101" in the file name
+        init_cfg=dict(type='Pretrained',  # start from the checkpoint below
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/rpn/rpn_r101_fpn_2x_coco.py b/configs/rpn/rpn_r101_fpn_2x_coco.py
new file mode 100755
index 0000000..ac7671c
--- /dev/null
+++ b/configs/rpn/rpn_r101_fpn_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './rpn_r50_fpn_2x_coco.py'  # base config this file overrides
+model = dict(
+    backbone=dict(
+        depth=101,  # the "r101" in the file name
+        init_cfg=dict(type='Pretrained',  # start from the checkpoint below
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/rpn/rpn_r50_caffe_c4_1x_coco.py b/configs/rpn/rpn_r50_caffe_c4_1x_coco.py
new file mode 100755
index 0000000..6da0ee9
--- /dev/null
+++ b/configs/rpn/rpn_r50_caffe_c4_1x_coco.py
@@ -0,0 +1,38 @@
+_base_ = [  # base configs: model, dataset, schedule, runtime
+    '../_base_/models/rpn_r50_caffe_c4.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# dataset settings
+img_norm_cfg = dict(  # passed to every Normalize step below
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [  # training transforms
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_label=False),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes']),  # boxes only, no labels
+]
+test_pipeline = [  # transforms reused for both val and test below
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # one scale, same as the training Resize
+        flip=False,  # flip augmentation off
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # val and test share test_pipeline
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='proposal_fast')
diff --git a/configs/rpn/rpn_r50_caffe_fpn_1x_coco.py b/configs/rpn/rpn_r50_caffe_fpn_1x_coco.py
new file mode 100755
index 0000000..68c36fa
--- /dev/null
+++ b/configs/rpn/rpn_r50_caffe_fpn_1x_coco.py
@@ -0,0 +1,41 @@
+_base_ = './rpn_r50_fpn_1x_coco.py'  # base config this file overrides
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',  # pairs with the caffe checkpoint below
+        init_cfg=dict(
+            type='Pretrained',  # start from the checkpoint named below
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+# use caffe img_norm
+img_norm_cfg = dict(  # passed to every Normalize step below
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [  # training transforms
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_label=False),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes']),  # boxes only, no labels
+]
+test_pipeline = [  # transforms reused for both val and test below
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # one scale, same as the training Resize
+        flip=False,  # flip augmentation off
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # val and test share test_pipeline
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/rpn/rpn_r50_fpn_1x_coco.py b/configs/rpn/rpn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..26f95a3
--- /dev/null
+++ b/configs/rpn/rpn_r50_fpn_1x_coco.py
@@ -0,0 +1,18 @@
+_base_ = [  # base configs: model, dataset, schedule, runtime
+    '../_base_/models/rpn_r50_fpn.py', '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+img_norm_cfg = dict(  # passed to the Normalize step below
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [  # training transforms
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_label=False),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes']),  # boxes only, no labels
+]
+data = dict(train=dict(pipeline=train_pipeline))  # only train is set here
+evaluation = dict(interval=1, metric='proposal_fast')
diff --git a/configs/rpn/rpn_r50_fpn_2x_coco.py b/configs/rpn/rpn_r50_fpn_2x_coco.py
new file mode 100755
index 0000000..2f264bf
--- /dev/null
+++ b/configs/rpn/rpn_r50_fpn_2x_coco.py
@@ -0,0 +1,5 @@
+_base_ = './rpn_r50_fpn_1x_coco.py'  # base config this file overrides
+
+# learning policy
+lr_config = dict(step=[16, 22])  # presumably epoch indices; confirm
+runner = dict(type='EpochBasedRunner', max_epochs=24)  # "2x": 24 epochs
diff --git a/configs/rpn/rpn_x101_32x4d_fpn_1x_coco.py b/configs/rpn/rpn_x101_32x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..d0c7394
--- /dev/null
+++ b/configs/rpn/rpn_x101_32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './rpn_r50_fpn_1x_coco.py'  # base config this file overrides
+model = dict(
+    backbone=dict(
+        type='ResNeXt',  # backbone type set explicitly (base file is "r50")
+        depth=101,  # the "101" in x101
+        groups=32,  # the "32" in 32x4d
+        base_width=4,  # the "4d" in 32x4d
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(  # start from the checkpoint named below
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/rpn/rpn_x101_32x4d_fpn_2x_coco.py b/configs/rpn/rpn_x101_32x4d_fpn_2x_coco.py
new file mode 100755
index 0000000..c6880b7
--- /dev/null
+++ b/configs/rpn/rpn_x101_32x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './rpn_r50_fpn_2x_coco.py'  # base config this file overrides
+model = dict(
+    backbone=dict(
+        type='ResNeXt',  # backbone type set explicitly (base file is "r50")
+        depth=101,  # the "101" in x101
+        groups=32,  # the "32" in 32x4d
+        base_width=4,  # the "4d" in 32x4d
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(  # start from the checkpoint named below
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/rpn/rpn_x101_64x4d_fpn_1x_coco.py b/configs/rpn/rpn_x101_64x4d_fpn_1x_coco.py
new file mode 100755
index 0000000..96e691a
--- /dev/null
+++ b/configs/rpn/rpn_x101_64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './rpn_r50_fpn_1x_coco.py'  # base config this file overrides
+model = dict(
+    backbone=dict(
+        type='ResNeXt',  # backbone type set explicitly (base file is "r50")
+        depth=101,  # the "101" in x101
+        groups=64,  # the "64" in 64x4d
+        base_width=4,  # the "4d" in 64x4d
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(  # start from the checkpoint named below
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/rpn/rpn_x101_64x4d_fpn_2x_coco.py b/configs/rpn/rpn_x101_64x4d_fpn_2x_coco.py
new file mode 100755
index 0000000..4182a39
--- /dev/null
+++ b/configs/rpn/rpn_x101_64x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './rpn_r50_fpn_2x_coco.py'  # base config this file overrides
+model = dict(
+    backbone=dict(
+        type='ResNeXt',  # backbone type set explicitly (base file is "r50")
+        depth=101,  # the "101" in x101
+        groups=64,  # the "64" in 64x4d
+        base_width=4,  # the "4d" in 64x4d
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(  # start from the checkpoint named below
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/sabl/README.md b/configs/sabl/README.md
new file mode 100755
index 0000000..03992be
--- /dev/null
+++ b/configs/sabl/README.md
@@ -0,0 +1,47 @@
+# SABL
+
+> [Side-Aware Boundary Localization for More Precise Object Detection](https://arxiv.org/abs/1912.04260)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Current object detection frameworks mainly rely on bounding box regression to localize objects. Despite the remarkable progress in recent years, the precision of bounding box regression remains unsatisfactory, hence limiting performance in object detection. We observe that precise localization requires careful placement of each side of the bounding box. However, the mainstream approach, which focuses on predicting centers and sizes, is not the most effective way to accomplish this task, especially when there exists displacements with large variance between the anchors and the targets. In this paper, we propose an alternative approach, named as Side-Aware Boundary Localization (SABL), where each side of the bounding box is respectively localized with a dedicated network branch. To tackle the difficulty of precise localization in the presence of displacements with large variance, we further propose a two-step localization scheme, which first predicts a range of movement through bucket prediction and then pinpoints the precise position within the predicted bucket. We test the proposed method on both two-stage and single-stage detection frameworks. Replacing the standard bounding box regression branch with the proposed design leads to significant improvements on Faster R-CNN, RetinaNet, and Cascade R-CNN, by 3.0%, 1.7%, and 0.9%, respectively.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143973698-3dfaea91-4415-4818-9781-5017183e7489.png"/>
+</div>
+
+## Results and Models
+
+The results on COCO 2017 val are shown in the table below (results on test-dev are usually slightly higher than val).
+Single-scale testing (1333x800) is adopted in all results.
+
+|       Method       | Backbone  | Lr schd | ms-train | box AP |                                                       Config                                                       |                                                                                                                                  Download                                                                                                                                   |
+| :----------------: | :-------: | :-----: | :------: | :----: | :----------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| SABL Faster R-CNN  | R-50-FPN  |   1x    |    N     |  39.9  |  [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_faster_rcnn_r50_fpn_1x_coco.py)  |    [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r50_fpn_1x_coco/sabl_faster_rcnn_r50_fpn_1x_coco-e867595b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r50_fpn_1x_coco/20200830_130324.log.json)    |
+| SABL Faster R-CNN  | R-101-FPN |   1x    |    N     |  41.7  | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_faster_rcnn_r101_fpn_1x_coco.py)  |  [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r101_fpn_1x_coco/sabl_faster_rcnn_r101_fpn_1x_coco-f804c6c1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r101_fpn_1x_coco/20200830_183949.log.json)   |
+| SABL Cascade R-CNN | R-50-FPN  |   1x    |    N     |  41.6  | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco.py)  |  [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco/sabl_cascade_rcnn_r50_fpn_1x_coco-e1748e5e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco/20200831_033726.log.json)   |
+| SABL Cascade R-CNN | R-101-FPN |   1x    |    N     |  43.0  | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco/sabl_cascade_rcnn_r101_fpn_1x_coco-2b83e87c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco/20200831_141745.log.json) |
+
+|     Method     | Backbone  | GN  | Lr schd |  ms-train   | box AP |                                                            Config                                                             |                                                                                                                                                   Download                                                                                                                                                   |
+| :------------: | :-------: | :-: | :-----: | :---------: | :----: | :---------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| SABL RetinaNet | R-50-FPN  |  N  |   1x    |      N      |  37.7  |        [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_retinanet_r50_fpn_1x_coco.py)         |                       [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_1x_coco/sabl_retinanet_r50_fpn_1x_coco-6c54fd4f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_1x_coco/20200830_053451.log.json)                        |
+| SABL RetinaNet | R-50-FPN  |  Y  |   1x    |      N      |  38.8  |       [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_retinanet_r50_fpn_gn_1x_coco.py)       |                   [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_gn_1x_coco/sabl_retinanet_r50_fpn_gn_1x_coco-e16dfcf1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_gn_1x_coco/20200831_141955.log.json)                   |
+| SABL RetinaNet | R-101-FPN |  N  |   1x    |      N      |  39.7  |        [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_retinanet_r101_fpn_1x_coco.py)        |                      [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_1x_coco/sabl_retinanet_r101_fpn_1x_coco-42026904.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_1x_coco/20200831_034256.log.json)                      |
+| SABL RetinaNet | R-101-FPN |  Y  |   1x    |      N      |  40.5  |      [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_retinanet_r101_fpn_gn_1x_coco.py)       |                 [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_1x_coco/sabl_retinanet_r101_fpn_gn_1x_coco-40a893e8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_1x_coco/20200830_201422.log.json)                  |
+| SABL RetinaNet | R-101-FPN |  Y  |   2x    | Y (640~800) |  42.9  | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco-1e63382c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco/20200830_144807.log.json) |
+| SABL RetinaNet | R-101-FPN |  Y  |   2x    | Y (480~960) |  43.6  | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco-5342f857.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco/20200830_164537.log.json) |
+
+## Citation
+
+We provide config files to reproduce the object detection results of the ECCV 2020 Spotlight paper [Side-Aware Boundary Localization for More Precise Object Detection](https://arxiv.org/abs/1912.04260).
+
+```latex
+@inproceedings{Wang_2020_ECCV,
+    title = {Side-Aware Boundary Localization for More Precise Object Detection},
+    author = {Jiaqi Wang and Wenwei Zhang and Yuhang Cao and Kai Chen and Jiangmiao Pang and Tao Gong and Jianping Shi and Chen Change Loy and Dahua Lin},
+    booktitle = {ECCV},
+    year = {2020}
+}
+```
diff --git a/configs/sabl/metafile.yml b/configs/sabl/metafile.yml
new file mode 100755
index 0000000..23c51cf
--- /dev/null
+++ b/configs/sabl/metafile.yml
@@ -0,0 +1,140 @@
+Collections:
+  - Name: SABL
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+        - SABL
+    Paper:
+      URL: https://arxiv.org/abs/1912.04260
+      Title: 'Side-Aware Boundary Localization for More Precise Object Detection'
+    README: configs/sabl/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.4.0/mmdet/models/roi_heads/bbox_heads/sabl_head.py#L14
+      Version: v2.4.0
+
+Models:
+  - Name: sabl_faster_rcnn_r50_fpn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl_faster_rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r50_fpn_1x_coco/sabl_faster_rcnn_r50_fpn_1x_coco-e867595b.pth
+
+  - Name: sabl_faster_rcnn_r101_fpn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl_faster_rcnn_r101_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r101_fpn_1x_coco/sabl_faster_rcnn_r101_fpn_1x_coco-f804c6c1.pth
+
+  - Name: sabl_cascade_rcnn_r50_fpn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco/sabl_cascade_rcnn_r50_fpn_1x_coco-e1748e5e.pth
+
+  - Name: sabl_cascade_rcnn_r101_fpn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco/sabl_cascade_rcnn_r101_fpn_1x_coco-2b83e87c.pth
+
+  - Name: sabl_retinanet_r50_fpn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl_retinanet_r50_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_1x_coco/sabl_retinanet_r50_fpn_1x_coco-6c54fd4f.pth
+
+  - Name: sabl_retinanet_r50_fpn_gn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl_retinanet_r50_fpn_gn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_gn_1x_coco/sabl_retinanet_r50_fpn_gn_1x_coco-e16dfcf1.pth
+
+  - Name: sabl_retinanet_r101_fpn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl_retinanet_r101_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_1x_coco/sabl_retinanet_r101_fpn_1x_coco-42026904.pth
+
+  - Name: sabl_retinanet_r101_fpn_gn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl_retinanet_r101_fpn_gn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_1x_coco/sabl_retinanet_r101_fpn_gn_1x_coco-40a893e8.pth
+
+  - Name: sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco-1e63382c.pth
+
+  - Name: sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco-5342f857.pth
diff --git a/configs/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco.py b/configs/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..64fe230
--- /dev/null
+++ b/configs/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,90 @@
+_base_ = [  # base configs this file builds on: model, dataset, schedule, runtime
+    '../_base_/models/cascade_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings: depth-101 backbone and three SABLHead entries for roi_head.bbox_head
+model = dict(
+    backbone=dict(
+        depth=101,  # the base model file listed in _base_ is the r50 variant
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    roi_head=dict(bbox_head=[
+        dict(  # entry 1 of 3 (scale_factor=1.7)
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(  # scale_factor is the only value that differs between the 3 entries
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.7),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1,
+                               loss_weight=1.0)),
+        dict(  # entry 2 of 3 (scale_factor=1.5)
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.5),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1,
+                               loss_weight=1.0)),
+        dict(  # entry 3 of 3 (scale_factor=1.3)
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.3),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, loss_weight=1.0))
+    ]))
diff --git a/configs/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco.py b/configs/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..4b28a59
--- /dev/null
+++ b/configs/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,86 @@
+_base_ = [  # base configs this file builds on: model, dataset, schedule, runtime
+    '../_base_/models/cascade_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings: three SABLHead entries for roi_head.bbox_head; nothing else is overridden
+model = dict(
+    roi_head=dict(bbox_head=[
+        dict(  # entry 1 of 3 (scale_factor=1.7)
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(  # scale_factor is the only value that differs between the 3 entries
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.7),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1,
+                               loss_weight=1.0)),
+        dict(  # entry 2 of 3 (scale_factor=1.5)
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.5),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1,
+                               loss_weight=1.0)),
+        dict(  # entry 3 of 3 (scale_factor=1.3)
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.3),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, loss_weight=1.0))
+    ]))
diff --git a/configs/sabl/sabl_faster_rcnn_r101_fpn_1x_coco.py b/configs/sabl/sabl_faster_rcnn_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..e48d425
--- /dev/null
+++ b/configs/sabl/sabl_faster_rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,38 @@
+_base_ = [  # base configs this file builds on: model, dataset, schedule, runtime
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(  # overrides: depth-101 backbone and a SABLHead as roi_head.bbox_head
+    backbone=dict(
+        depth=101,  # the base model file listed in _base_ is the r50 variant
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    roi_head=dict(
+        bbox_head=dict(
+            _delete_=True,  # NOTE(review): presumably discards the inherited bbox_head keys instead of merging; confirm in mmcv Config docs
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.7),
+            loss_cls=dict(  # softmax-style classification loss (use_sigmoid=False)
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(  # sigmoid cross-entropy for the bbox_cls term
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1,
+                               loss_weight=1.0))))
diff --git a/configs/sabl/sabl_faster_rcnn_r50_fpn_1x_coco.py b/configs/sabl/sabl_faster_rcnn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..732c7ba
--- /dev/null
+++ b/configs/sabl/sabl_faster_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,34 @@
+_base_ = [  # base configs this file builds on: model, dataset, schedule, runtime
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(  # only roi_head.bbox_head is overridden, with a SABLHead
+    roi_head=dict(
+        bbox_head=dict(
+            _delete_=True,  # NOTE(review): presumably discards the inherited bbox_head keys instead of merging; confirm in mmcv Config docs
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.7),
+            loss_cls=dict(  # softmax-style classification loss (use_sigmoid=False)
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(  # sigmoid cross-entropy for the bbox_cls term
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1,
+                               loss_weight=1.0))))
diff --git a/configs/sabl/sabl_retinanet_r101_fpn_1x_coco.py b/configs/sabl/sabl_retinanet_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..b08e916
--- /dev/null
+++ b/configs/sabl/sabl_retinanet_r101_fpn_1x_coco.py
@@ -0,0 +1,54 @@
+_base_ = [  # base configs this file builds on: model, dataset, schedule, runtime
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings: depth-101 backbone, SABLRetinaHead as bbox_head, custom train_cfg
+model = dict(
+    backbone=dict(
+        depth=101,  # the base model file listed in _base_ is the r50 variant
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    bbox_head=dict(
+        _delete_=True,  # NOTE(review): presumably discards the inherited bbox_head keys instead of merging; confirm in mmcv Config docs
+        type='SABLRetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(  # 3 scales per octave, 3 aspect ratios, 5 strides
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(  # single ratio (1.0) and scale (4), same 5 strides
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox_cls=dict(  # both bbox losses are weighted 1.5, loss_cls is weighted 1.0
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5),
+        loss_bbox_reg=dict(  # beta is written as a fraction, i.e. 1/9 (about 0.111)
+            type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)),
+    # training settings (only train_cfg is set in this file)
+    train_cfg=dict(
+        assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False))
+# optimizer: SGD settings declared at the top level of this config
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/sabl/sabl_retinanet_r101_fpn_gn_1x_coco.py b/configs/sabl/sabl_retinanet_r101_fpn_gn_1x_coco.py
new file mode 100755
index 0000000..fc30d63
--- /dev/null
+++ b/configs/sabl/sabl_retinanet_r101_fpn_gn_1x_coco.py
@@ -0,0 +1,56 @@
+_base_ = [  # base configs this file builds on: model, dataset, schedule, runtime
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings: depth-101 backbone, SABLRetinaHead with a GN norm_cfg, custom train_cfg
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)  # passed to bbox_head below
+model = dict(
+    backbone=dict(
+        depth=101,  # the base model file listed in _base_ is the r50 variant
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    bbox_head=dict(
+        _delete_=True,  # NOTE(review): presumably discards the inherited bbox_head keys instead of merging; confirm in mmcv Config docs
+        type='SABLRetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(  # 3 scales per octave, 3 aspect ratios, 5 strides
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(  # single ratio (1.0) and scale (4), same 5 strides
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        norm_cfg=norm_cfg,  # the module-level GN dict defined above
+        bbox_coder=dict(
+            type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox_cls=dict(  # both bbox losses are weighted 1.5, loss_cls is weighted 1.0
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5),
+        loss_bbox_reg=dict(  # beta is written as a fraction, i.e. 1/9 (about 0.111)
+            type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)),
+    # training settings (only train_cfg is set in this file)
+    train_cfg=dict(
+        assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False))
+# optimizer: SGD settings declared at the top level of this config
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco.py b/configs/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco.py
new file mode 100755
index 0000000..e8fe166
--- /dev/null
+++ b/configs/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco.py
@@ -0,0 +1,73 @@
+_base_ = [  # base configs this file builds on; note the 2x schedule file
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+# model settings: depth-101 backbone, SABLRetinaHead with a GN norm_cfg, custom train_cfg
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)  # passed to bbox_head below
+model = dict(
+    backbone=dict(
+        depth=101,  # the base model file listed in _base_ is the r50 variant
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    bbox_head=dict(
+        _delete_=True,  # NOTE(review): presumably discards the inherited bbox_head keys instead of merging; confirm in mmcv Config docs
+        type='SABLRetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(  # 3 scales per octave, 3 aspect ratios, 5 strides
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(  # single ratio (1.0) and scale (4), same 5 strides
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        norm_cfg=norm_cfg,  # the module-level GN dict defined above
+        bbox_coder=dict(
+            type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox_cls=dict(  # both bbox losses are weighted 1.5, loss_cls is weighted 1.0
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5),
+        loss_bbox_reg=dict(  # beta is written as a fraction, i.e. 1/9 (about 0.111)
+            type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)),
+    # training settings (only train_cfg is set in this file)
+    train_cfg=dict(
+        assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False))
+img_norm_cfg = dict(  # per-channel mean/std unpacked into the Normalize step below
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [  # multi-scale training pipeline; the README lists this run as "Y (480~960)"
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 480), (1333, 960)],  # NOTE(review): presumably the two endpoints of the sampled range; confirm in Resize docs
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+data = dict(train=dict(pipeline=train_pipeline))  # only the train pipeline is set here
+# optimizer: SGD settings declared at the top level of this config
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco.py b/configs/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco.py
new file mode 100755
index 0000000..30c4339
--- /dev/null
+++ b/configs/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco.py
@@ -0,0 +1,73 @@
+_base_ = [  # base configs this file builds on; note the 2x schedule file
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+# model settings: depth-101 backbone, SABLRetinaHead with a GN norm_cfg, custom train_cfg
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)  # passed to bbox_head below
+model = dict(
+    backbone=dict(
+        depth=101,  # the base model file listed in _base_ is the r50 variant
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    bbox_head=dict(
+        _delete_=True,  # NOTE(review): presumably discards the inherited bbox_head keys instead of merging; confirm in mmcv Config docs
+        type='SABLRetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(  # 3 scales per octave, 3 aspect ratios, 5 strides
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(  # single ratio (1.0) and scale (4), same 5 strides
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        norm_cfg=norm_cfg,  # the module-level GN dict defined above
+        bbox_coder=dict(
+            type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox_cls=dict(  # both bbox losses are weighted 1.5, loss_cls is weighted 1.0
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5),
+        loss_bbox_reg=dict(  # beta is written as a fraction, i.e. 1/9 (about 0.111)
+            type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)),
+    # training settings (only train_cfg is set in this file)
+    train_cfg=dict(
+        assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False))
+img_norm_cfg = dict(  # per-channel mean/std unpacked into the Normalize step below
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [  # multi-scale training pipeline; the README lists this run as "Y (640~800)"
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],  # NOTE(review): presumably the two endpoints of the sampled range; confirm in Resize docs
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+data = dict(train=dict(pipeline=train_pipeline))  # only the train pipeline is set here
+# optimizer: SGD settings declared at the top level of this config
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/sabl/sabl_retinanet_r50_fpn_1x_coco.py b/configs/sabl/sabl_retinanet_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..6fe6bd6
--- /dev/null
+++ b/configs/sabl/sabl_retinanet_r50_fpn_1x_coco.py
@@ -0,0 +1,50 @@
+_base_ = [  # base configs this file builds on: model, dataset, schedule, runtime
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings: SABLRetinaHead as bbox_head plus a custom train_cfg
+model = dict(
+    bbox_head=dict(
+        _delete_=True,  # NOTE(review): presumably discards the inherited bbox_head keys instead of merging; confirm in mmcv Config docs
+        type='SABLRetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(  # 3 scales per octave, 3 aspect ratios, 5 strides
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(  # single ratio (1.0) and scale (4), same 5 strides
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox_cls=dict(  # both bbox losses are weighted 1.5, loss_cls is weighted 1.0
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5),
+        loss_bbox_reg=dict(  # beta is written as a fraction, i.e. 1/9 (about 0.111)
+            type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)),
+    # training settings (only train_cfg is set in this file)
+    train_cfg=dict(
+        assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False))
+# optimizer: SGD settings declared at the top level of this config
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/sabl/sabl_retinanet_r50_fpn_gn_1x_coco.py b/configs/sabl/sabl_retinanet_r50_fpn_gn_1x_coco.py
new file mode 100755
index 0000000..6acf080
--- /dev/null
+++ b/configs/sabl/sabl_retinanet_r50_fpn_gn_1x_coco.py
@@ -0,0 +1,52 @@
+_base_ = [  # base configs this file builds on: model, dataset, schedule, runtime
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings: SABLRetinaHead with a GN norm_cfg plus a custom train_cfg
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)  # passed to bbox_head below
+model = dict(
+    bbox_head=dict(
+        _delete_=True,  # NOTE(review): presumably discards the inherited bbox_head keys instead of merging; confirm in mmcv Config docs
+        type='SABLRetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(  # 3 scales per octave, 3 aspect ratios, 5 strides
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(  # single ratio (1.0) and scale (4), same 5 strides
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        norm_cfg=norm_cfg,  # the module-level GN dict defined above
+        bbox_coder=dict(
+            type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox_cls=dict(  # both bbox losses are weighted 1.5, loss_cls is weighted 1.0
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5),
+        loss_bbox_reg=dict(  # beta is written as a fraction, i.e. 1/9 (about 0.111)
+            type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)),
+    # training settings (only train_cfg is set in this file)
+    train_cfg=dict(
+        assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False))
+# optimizer: SGD settings declared at the top level of this config
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/scnet/README.md b/configs/scnet/README.md
new file mode 100755
index 0000000..773874a
--- /dev/null
+++ b/configs/scnet/README.md
@@ -0,0 +1,63 @@
+# SCNet
+
+> [SCNet: Training Inference Sample Consistency for Instance Segmentation](https://arxiv.org/abs/2012.10150)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+<!-- [ABSTRACT] -->
+
+Cascaded architectures have brought significant performance improvement in object detection and instance segmentation. However, there are lingering issues regarding the disparity in the Intersection-over-Union (IoU) distribution of the samples between training and inference. This disparity can potentially exacerbate detection accuracy. This paper proposes an architecture referred to as Sample Consistency Network (SCNet) to ensure that the IoU distribution of the samples at training time is close to that at inference time. Furthermore, SCNet incorporates feature relay and utilizes global contextual information to further reinforce the reciprocal relationships among classifying, detecting, and segmenting sub-tasks. Extensive experiments on the standard COCO dataset reveal the effectiveness of the proposed method over multiple evaluation metrics, including box AP, mask AP, and inference speed. In particular, while running 38% faster, the proposed SCNet improves the AP of the box and mask predictions by respectively 1.3 and 2.3 points compared to the strong Cascade Mask R-CNN baseline.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143974840-8fed75f3-661e-4e2a-a210-acf4ab5f42a3.png"/>
+</div>
+
+## Dataset
+
+SCNet requires the COCO and [COCO-stuff](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) datasets for training. Download the COCO-stuff archive and extract it into the COCO dataset path.
+The directory layout should look like this:
+
+```none
+mmdetection
+├── mmdet
+├── tools
+├── configs
+├── data
+│   ├── coco
+│   │   ├── annotations
+│   │   ├── train2017
+│   │   ├── val2017
+│   │   ├── test2017
+│   │   ├── stuffthingmaps
+```
+
+## Results and Models
+
+The results on COCO 2017val are shown in the table below. (Results on test-dev are usually slightly higher than on val.)
+
+|    Backbone     |  Style  | Lr schd | Mem (GB) | Inf speed (fps) | box AP | mask AP | TTA box AP | TTA mask AP |                                                     Config                                                     |                                                                                                                                           Download                                                                                                                                           |
+| :-------------: | :-----: | :-----: | :------: | :-------------: | :----: | :-----: | :--------: | :---------: | :------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50-FPN     | pytorch |   1x    |   7.0    |       6.2       |  43.5  |  39.2   |    44.8    |    40.9     |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/scnet/scnet_r50_fpn_1x_coco.py)     |                 [model](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_1x_coco/scnet_r50_fpn_1x_coco-c3f09857.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_1x_coco/scnet_r50_fpn_1x_coco_20210117_192725.log.json)                 |
+|    R-50-FPN     | pytorch |   20e   |   7.0    |       6.2       |  44.5  |  40.0   |    45.8    |    41.5     |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/scnet/scnet_r50_fpn_20e_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_20e_coco/scnet_r50_fpn_20e_coco-a569f645.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_20e_coco/scnet_r50_fpn_20e_coco_20210116_060148.log.json)               |
+|    R-101-FPN    | pytorch |   20e   |   8.9    |       5.8       |  45.8  |  40.9   |    47.3    |    42.7     |    [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/scnet/scnet_r101_fpn_20e_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r101_fpn_20e_coco/scnet_r101_fpn_20e_coco-294e312c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r101_fpn_20e_coco/scnet_r101_fpn_20e_coco_20210118_175824.log.json)             |
+| X-101-64x4d-FPN | pytorch |   20e   |   13.2   |       4.9       |  47.5  |  42.3   |    48.9    |    44.0     | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/scnet/scnet_x101_64x4d_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_x101_64x4d_fpn_20e_coco/scnet_x101_64x4d_fpn_20e_coco-fb09dec9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_x101_64x4d_fpn_20e_coco/scnet_x101_64x4d_fpn_20e_coco_20210120_045959.log.json) |
+
+### Notes
+
+- Training hyper-parameters are identical to those of [HTC](https://github.com/open-mmlab/mmdetection/tree/master/configs/htc).
+- TTA means Test Time Augmentation, which applies horizontal flip and multi-scale testing. Refer to [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/scnet/scnet_r50_fpn_1x_coco.py).
+
+## Citation
+
+We provide the code for reproducing experiment results of [SCNet](https://arxiv.org/abs/2012.10150).
+
+```latex
+@inproceedings{vu2019cascade,
+  title={SCNet: Training Inference Sample Consistency for Instance Segmentation},
+  author={Vu, Thang and Haeyong, Kang and Yoo, Chang D},
+  booktitle={AAAI},
+  year={2021}
+}
+```
diff --git a/configs/scnet/metafile.yml b/configs/scnet/metafile.yml
new file mode 100755
index 0000000..15eaebf
--- /dev/null
+++ b/configs/scnet/metafile.yml
@@ -0,0 +1,116 @@
+Collections:
+  - Name: SCNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+        - SCNet
+    Paper:
+      URL: https://arxiv.org/abs/2012.10150
+      Title: 'SCNet: Training Inference Sample Consistency for Instance Segmentation'
+    README: configs/scnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.9.0/mmdet/models/detectors/scnet.py#L6
+      Version: v2.9.0
+
+Models:
+  - Name: scnet_r50_fpn_1x_coco
+    In Collection: SCNet
+    Config: configs/scnet/scnet_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 161.29
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_1x_coco/scnet_r50_fpn_1x_coco-c3f09857.pth
+
+  - Name: scnet_r50_fpn_20e_coco
+    In Collection: SCNet
+    Config: configs/scnet/scnet_r50_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 161.29
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_20e_coco/scnet_r50_fpn_20e_coco-a569f645.pth
+
+  - Name: scnet_r101_fpn_20e_coco
+    In Collection: SCNet
+    Config: configs/scnet/scnet_r101_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 8.9
+      inference time (ms/im):
+        - value: 172.41
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r101_fpn_20e_coco/scnet_r101_fpn_20e_coco-294e312c.pth
+
+  - Name: scnet_x101_64x4d_fpn_20e_coco
+    In Collection: SCNet
+    Config: configs/scnet/scnet_x101_64x4d_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 13.2
+      inference time (ms/im):
+        - value: 204.08
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 42.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_x101_64x4d_fpn_20e_coco/scnet_x101_64x4d_fpn_20e_coco-fb09dec9.pth
diff --git a/configs/scnet/scnet_r101_fpn_20e_coco.py b/configs/scnet/scnet_r101_fpn_20e_coco.py
new file mode 100755
index 0000000..ebba529
--- /dev/null
+++ b/configs/scnet/scnet_r101_fpn_20e_coco.py
@@ -0,0 +1,6 @@
+_base_ = './scnet_r50_fpn_20e_coco.py'  # inherit the R-50 20e SCNet config
+model = dict(
+    backbone=dict(
+        depth=101,  # only the backbone depth and its checkpoint are overridden
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/scnet/scnet_r50_fpn_1x_coco.py b/configs/scnet/scnet_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..fe03b0d
--- /dev/null
+++ b/configs/scnet/scnet_r50_fpn_1x_coco.py
@@ -0,0 +1,136 @@
+_base_ = '../htc/htc_r50_fpn_1x_coco.py'  # start from the HTC R-50 1x config
+# model settings: SCNet detector and a 3-stage SCNetRoIHead
+model = dict(
+    type='SCNet',
+    roi_head=dict(
+        _delete_=True,  # drop the roi_head inherited from the HTC base
+        type='SCNetRoIHead',
+        num_stages=3,
+        stage_loss_weights=[1, 0.5, 0.25],
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=[  # one head per stage; target_stds tighten stage by stage
+            dict(
+                type='SCNetBBoxHead',
+                num_shared_fcs=2,
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='SCNetBBoxHead',
+                num_shared_fcs=2,
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='SCNetBBoxHead',
+                num_shared_fcs=2,
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        mask_head=dict(
+            type='SCNetMaskHead',
+            num_convs=12,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=80,
+            conv_to_res=True,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)),
+        semantic_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[8]),
+        semantic_head=dict(
+            type='SCNetSemanticHead',
+            num_ins=5,
+            fusion_level=1,
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=183,
+            loss_seg=dict(
+                type='CrossEntropyLoss', ignore_index=255, loss_weight=0.2),
+            conv_to_res=True),
+        glbctx_head=dict(
+            type='GlobalContextHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=80,
+            loss_weight=3.0,
+            conv_to_res=True),
+        feat_relay_head=dict(
+            type='FeatureRelayHead',
+            in_channels=1024,
+            out_conv_channels=256,
+            roi_feat_size=7,
+            scale_factor=2)))
+
+# Uncomment the code below to enable test-time augmentation (TTA).
+# img_norm_cfg = dict(
+#     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+# test_pipeline = [
+#     dict(type='LoadImageFromFile'),
+#     dict(
+#         type='MultiScaleFlipAug',
+#         img_scale=[(600, 900), (800, 1200), (1000, 1500), (1200, 1800),
+#                    (1400, 2100)],
+#         flip=True,
+#         transforms=[
+#             dict(type='Resize', keep_ratio=True),
+#             dict(type='RandomFlip', flip_ratio=0.5),
+#             dict(type='Normalize', **img_norm_cfg),
+#             dict(type='Pad', size_divisor=32),
+#             dict(type='ImageToTensor', keys=['img']),
+#             dict(type='Collect', keys=['img']),
+#         ])
+# ]
+# data = dict(
+#     val=dict(pipeline=test_pipeline),
+#     test=dict(pipeline=test_pipeline))
diff --git a/configs/scnet/scnet_r50_fpn_20e_coco.py b/configs/scnet/scnet_r50_fpn_20e_coco.py
new file mode 100755
index 0000000..3b121a6
--- /dev/null
+++ b/configs/scnet/scnet_r50_fpn_20e_coco.py
@@ -0,0 +1,4 @@
+_base_ = './scnet_r50_fpn_1x_coco.py'  # same model as the 1x config
+# learning policy for the 20e schedule named in this file
+lr_config = dict(step=[16, 19])
+runner = dict(type='EpochBasedRunner', max_epochs=20)
diff --git a/configs/scnet/scnet_x101_64x4d_fpn_20e_coco.py b/configs/scnet/scnet_x101_64x4d_fpn_20e_coco.py
new file mode 100755
index 0000000..1e54b03
--- /dev/null
+++ b/configs/scnet/scnet_x101_64x4d_fpn_20e_coco.py
@@ -0,0 +1,15 @@
+_base_ = './scnet_r50_fpn_20e_coco.py'  # inherit the R-50 20e SCNet config
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,  # the "64x4d" in the file name: groups=64, base_width=4
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/scnet/scnet_x101_64x4d_fpn_8x1_20e_coco.py b/configs/scnet/scnet_x101_64x4d_fpn_8x1_20e_coco.py
new file mode 100755
index 0000000..be8ddc5
--- /dev/null
+++ b/configs/scnet/scnet_x101_64x4d_fpn_8x1_20e_coco.py
@@ -0,0 +1,8 @@
+_base_ = './scnet_x101_64x4d_fpn_20e_coco.py'
+data = dict(samples_per_gpu=1, workers_per_gpu=1)  # the "8x1" variant
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+
+# NOTE: `auto_scale_lr` is used for automatic LR scaling;
+# users should not change its values.
+# base_batch_size = (8 GPUs) x (1 sample per GPU)
+auto_scale_lr = dict(base_batch_size=8)
diff --git a/configs/scratch/README.md b/configs/scratch/README.md
new file mode 100755
index 0000000..189f181
--- /dev/null
+++ b/configs/scratch/README.md
@@ -0,0 +1,35 @@
+# Scratch
+
+> [Rethinking ImageNet Pre-training](https://arxiv.org/abs/1811.08883)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We report competitive results on object detection and instance segmentation on the COCO dataset using standard models trained from random initialization. The results are no worse than their ImageNet pre-training counterparts even when using the hyper-parameters of the baseline system (Mask R-CNN) that were optimized for fine-tuning pre-trained models, with the sole exception of increasing the number of training iterations so the randomly initialized models may converge. Training from random initialization is surprisingly robust; our results hold even when: (i) using only 10% of the training data, (ii) for deeper and wider models, and (iii) for multiple tasks and metrics. Experiments show that ImageNet pre-training speeds up convergence early in training, but does not necessarily provide regularization or improve final target task accuracy. To push the envelope we demonstrate 50.9 AP on COCO object detection without using any external data---a result on par with the top COCO 2017 competition results that used ImageNet pre-training. These observations challenge the conventional wisdom of ImageNet pre-training for dependent tasks and we expect these discoveries will encourage people to rethink the current de facto paradigm of \`pre-training and fine-tuning' in computer vision.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143974572-69c4f57d-0d6d-4f56-ba91-23f8a65a2a77.png" height="300"/>
+</div>
+
+## Results and Models
+
+|    Model     | Backbone |  Style  | Lr schd | box AP | mask AP |                                                            Config                                                             |                                                                                                                                                                                 Download                                                                                                                                                                                  |
+| :----------: | :------: | :-----: | :-----: | :----: | :-----: | :---------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| Faster R-CNN | R-50-FPN | pytorch |   6x    |  40.7  |         | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco.py) |     [model](https://download.openmmlab.com/mmdetection/v2.0/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_faster_rcnn_r50_fpn_gn_6x_bbox_mAP-0.407_20200201_193013-90813d01.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_faster_rcnn_r50_fpn_gn_6x_20200201_193013.log.json)     |
+|  Mask R-CNN  | R-50-FPN | pytorch |   6x    |  41.2  |  37.4   |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco.py)  | [model](https://download.openmmlab.com/mmdetection/v2.0/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_mask_rcnn_r50_fpn_gn_6x_bbox_mAP-0.412__segm_mAP-0.374_20200201_193051-1e190a40.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_mask_rcnn_r50_fpn_gn_6x_20200201_193051.log.json) |
+
+Note:
+
+- The above models are trained with 16 GPUs.
+
+## Citation
+
+```latex
+@article{he2018rethinking,
+  title={Rethinking imagenet pre-training},
+  author={He, Kaiming and Girshick, Ross and Doll{\'a}r, Piotr},
+  journal={arXiv preprint arXiv:1811.08883},
+  year={2018}
+}
+```
diff --git a/configs/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco.py b/configs/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco.py
new file mode 100755
index 0000000..55aa3a6
--- /dev/null
+++ b/configs/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco.py
@@ -0,0 +1,24 @@
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# "gn-all": the same GroupNorm config is applied to backbone, neck and head
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        frozen_stages=-1,
+        zero_init_residual=False,
+        norm_cfg=norm_cfg,
+        init_cfg=None),  # no pretrained checkpoint: train from scratch
+    neck=dict(norm_cfg=norm_cfg),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=norm_cfg)))
+# optimizer (norm_decay_mult=0: presumably no weight decay on norm layers)
+optimizer = dict(paramwise_cfg=dict(norm_decay_mult=0))
+optimizer_config = dict(_delete_=True, grad_clip=None)
+# learning policy: overrides the schedule_1x steps and epoch count
+lr_config = dict(warmup_ratio=0.1, step=[65, 71])
+runner = dict(type='EpochBasedRunner', max_epochs=73)
diff --git a/configs/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco.py b/configs/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco.py
new file mode 100755
index 0000000..cc52cb8
--- /dev/null
+++ b/configs/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco.py
@@ -0,0 +1,25 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# "gn-all": the same GroupNorm config is applied to backbone, neck and heads
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        frozen_stages=-1,
+        zero_init_residual=False,
+        norm_cfg=norm_cfg,
+        init_cfg=None),  # no pretrained checkpoint: train from scratch
+    neck=dict(norm_cfg=norm_cfg),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=norm_cfg),
+        mask_head=dict(norm_cfg=norm_cfg)))
+# optimizer (norm_decay_mult=0: presumably no weight decay on norm layers)
+optimizer = dict(paramwise_cfg=dict(norm_decay_mult=0))
+optimizer_config = dict(_delete_=True, grad_clip=None)
+# learning policy: overrides the schedule_1x steps and epoch count
+lr_config = dict(warmup_ratio=0.1, step=[65, 71])
+runner = dict(type='EpochBasedRunner', max_epochs=73)
diff --git a/configs/scratch/metafile.yml b/configs/scratch/metafile.yml
new file mode 100755
index 0000000..65025fa
--- /dev/null
+++ b/configs/scratch/metafile.yml
@@ -0,0 +1,48 @@
+Collections:
+  - Name: Rethinking ImageNet Pre-training
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - RPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1811.08883
+      Title: 'Rethinking ImageNet Pre-training'
+    README: configs/scratch/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/configs/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco.py
+      Version: v2.0.0
+
+Models:
+  - Name: faster_rcnn_r50_fpn_gn-all_scratch_6x_coco
+    In Collection: Rethinking ImageNet Pre-training
+    Config: configs/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco.py
+    Metadata:
+      Epochs: 72
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_faster_rcnn_r50_fpn_gn_6x_bbox_mAP-0.407_20200201_193013-90813d01.pth
+
+  - Name: mask_rcnn_r50_fpn_gn-all_scratch_6x_coco
+    In Collection: Rethinking ImageNet Pre-training
+    Config: configs/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco.py
+    Metadata:
+      Epochs: 72
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_mask_rcnn_r50_fpn_gn_6x_bbox_mAP-0.412__segm_mAP-0.374_20200201_193051-1e190a40.pth
diff --git a/configs/seesaw_loss/README.md b/configs/seesaw_loss/README.md
new file mode 100755
index 0000000..696b008
--- /dev/null
+++ b/configs/seesaw_loss/README.md
@@ -0,0 +1,47 @@
+# Seesaw Loss
+
+> [Seesaw Loss for Long-Tailed Instance Segmentation](https://arxiv.org/abs/2008.10032)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Instance segmentation has witnessed a remarkable progress on class-balanced benchmarks. However, they fail to perform as accurately in real-world scenarios, where the category distribution of objects naturally comes with a long tail. Instances of head classes dominate a long-tailed dataset and they serve as negative samples of tail categories. The overwhelming gradients of negative samples on tail classes lead to a biased learning process for classifiers. Consequently, objects of tail categories are more likely to be misclassified as backgrounds or head categories. To tackle this problem, we propose Seesaw Loss to dynamically re-balance gradients of positive and negative samples for each category, with two complementary factors, i.e., mitigation factor and compensation factor. The mitigation factor reduces punishments to tail categories w.r.t. the ratio of cumulative training instances between different categories. Meanwhile, the compensation factor increases the penalty of misclassified instances to avoid false positives of tail categories. We conduct extensive experiments on Seesaw Loss with mainstream frameworks and different data sampling strategies. With a simple end-to-end training pipeline, Seesaw Loss obtains significant gains over Cross-Entropy Loss, and achieves state-of-the-art performance on LVIS dataset without bells and whistles.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143974715-d181abe5-d0a2-40d3-a2bd-17d8c60b89b8.png"/>
+</div>
+
+- Please set up the [LVIS dataset](../lvis/README.md) for MMDetection.
+
+- RFS indicates that the oversampling strategy described [here](../../docs/tutorials/customize_dataset.md#class-balanced-dataset) is used, with an oversample threshold of `1e-3`.
+
+## Results and models of Seesaw Loss on LVIS v1 dataset
+
+|       Method       | Backbone  |  Style  | Lr schd | Data Sampler | Norm Mask | box AP | mask AP |                                                                                 Config                                                                                  |                                                                                                                                                              Download                                                                                                                                                              |
+| :----------------: | :-------: | :-----: | :-----: | :----------: | :-------: | :----: | :-----: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|     Mask R-CNN     | R-50-FPN  | pytorch |   2x    |    random    |     N     |  25.6  |  25.0   |             [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py)              |                          [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-a698dd3d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.log.json)                          |
+|     Mask R-CNN     | R-50-FPN  | pytorch |   2x    |    random    |     Y     |  25.6  |  25.4   |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py)        |              [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-a1c11314.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json)              |
+|     Mask R-CNN     | R-101-FPN | pytorch |   2x    |    random    |     N     |  27.4  |  26.7   |             [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py)             |                         [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-8e6e6dd5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.log.json)                         |
+|     Mask R-CNN     | R-101-FPN | pytorch |   2x    |    random    |     Y     |  27.2  |  27.3   |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py)       |             [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-a0b59c42.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json)             |
+|     Mask R-CNN     | R-50-FPN  | pytorch |   2x    |     RFS      |     N     |  27.6  |  26.4   |           [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py)            |                      [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-392a804b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.log.json)                      |
+|     Mask R-CNN     | R-50-FPN  | pytorch |   2x    |     RFS      |     Y     |  27.6  |  26.8   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py)      |          [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-cd0f6a12.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json)          |
+|     Mask R-CNN     | R-101-FPN | pytorch |   2x    |     RFS      |     N     |  28.9  |  27.6   |           [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py)           |                     [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-e68eb464.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.log.json)                     |
+|     Mask R-CNN     | R-101-FPN | pytorch |   2x    |     RFS      |     Y     |  28.9  |  28.2   |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py)     |         [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-1d817139.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json)         |
+| Cascade Mask R-CNN | R-101-FPN | pytorch |   2x    |    random    |     N     |  33.1  |  29.2   |         [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py)         |                 [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-71e2215e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.log.json)                 |
+| Cascade Mask R-CNN | R-101-FPN | pytorch |   2x    |    random    |     Y     |  33.0  |  30.0   |   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py)   |     [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-8b5a6745.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json)     |
+| Cascade Mask R-CNN | R-101-FPN | pytorch |   2x    |     RFS      |     N     |  30.0  |  29.3   |       [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py)       |             [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-5d8ca2a4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.log.json)             |
+| Cascade Mask R-CNN | R-101-FPN | pytorch |   2x    |     RFS      |     Y     |  32.8  |  30.1   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-c8551505.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json) |
+
+## Citation
+
+We provide config files to reproduce the instance segmentation performance in the CVPR 2021 paper for [Seesaw Loss for Long-Tailed Instance Segmentation](https://arxiv.org/abs/2008.10032).
+
+```latex
+@inproceedings{wang2021seesaw,
+  title={Seesaw Loss for Long-Tailed Instance Segmentation},
+  author={Jiaqi Wang and Wenwei Zhang and Yuhang Zang and Yuhang Cao and Jiangmiao Pang and Tao Gong and Kai Chen and Ziwei Liu and Chen Change Loy and Dahua Lin},
+  booktitle={Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
+  year={2021}
+}
+```
diff --git a/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py b/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py
new file mode 100755
index 0000000..beeb0d1
--- /dev/null
+++ b/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py
@@ -0,0 +1,132 @@
+_base_ = [
+    '../_base_/models/cascade_mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    roi_head=dict(
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=1203,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+                loss_cls=dict(
+                    type='SeesawLoss',
+                    p=0.8,
+                    q=2.0,
+                    num_classes=1203,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=1203,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+                loss_cls=dict(
+                    type='SeesawLoss',
+                    p=0.8,
+                    q=2.0,
+                    num_classes=1203,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=1203,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+                loss_cls=dict(
+                    type='SeesawLoss',
+                    p=0.8,
+                    q=2.0,
+                    num_classes=1203,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_head=dict(num_classes=1203)),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.0001,
+            # LVIS allows up to 300
+            max_per_img=300)))
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+dataset_type = 'LVISV1Dataset'
+data_root = 'data/lvis_v1/'
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/lvis_v1_train.json',
+        img_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/lvis_v1_val.json',
+        img_prefix=data_root,
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/lvis_v1_val.json',
+        img_prefix=data_root,
+        pipeline=test_pipeline))
+evaluation = dict(interval=24, metric=['bbox', 'segm'])
diff --git a/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py b/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
new file mode 100755
index 0000000..0f29948
--- /dev/null
+++ b/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
@@ -0,0 +1,5 @@
+_base_ = './cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py'  # noqa: E501
+model = dict(
+    roi_head=dict(
+        mask_head=dict(
+            predictor_cfg=dict(type='NormedConv2d', tempearture=20))))
diff --git a/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py b/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py
new file mode 100755
index 0000000..bb88750
--- /dev/null
+++ b/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py
@@ -0,0 +1,98 @@
+_base_ = [
+    '../_base_/models/cascade_mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/lvis_v1_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    roi_head=dict(
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=1203,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+                loss_cls=dict(
+                    type='SeesawLoss',
+                    p=0.8,
+                    q=2.0,
+                    num_classes=1203,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=1203,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+                loss_cls=dict(
+                    type='SeesawLoss',
+                    p=0.8,
+                    q=2.0,
+                    num_classes=1203,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=1203,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+                loss_cls=dict(
+                    type='SeesawLoss',
+                    p=0.8,
+                    q=2.0,
+                    num_classes=1203,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_head=dict(num_classes=1203)),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.0001,
+            # LVIS allows up to 300
+            max_per_img=300)))
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+data = dict(train=dict(dataset=dict(pipeline=train_pipeline)))
+evaluation = dict(interval=24, metric=['bbox', 'segm'])
diff --git a/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py b/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
new file mode 100755
index 0000000..262e76b
--- /dev/null
+++ b/configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
@@ -0,0 +1,5 @@
+_base_ = './cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py'  # noqa: E501
+model = dict(
+    roi_head=dict(
+        mask_head=dict(
+            predictor_cfg=dict(type='NormedConv2d', tempearture=20))))
diff --git a/configs/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py b/configs/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py
new file mode 100755
index 0000000..57deab1
--- /dev/null
+++ b/configs/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py
@@ -0,0 +1,6 @@
+_base_ = './mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py b/configs/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
new file mode 100755
index 0000000..a539929
--- /dev/null
+++ b/configs/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
@@ -0,0 +1,6 @@
+_base_ = './mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py'  # noqa: E501
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py b/configs/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py
new file mode 100755
index 0000000..1f5065e
--- /dev/null
+++ b/configs/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py
@@ -0,0 +1,6 @@
+_base_ = './mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py b/configs/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
new file mode 100755
index 0000000..13d0b5f
--- /dev/null
+++ b/configs/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
@@ -0,0 +1,6 @@
+_base_ = './mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py'  # noqa: E501
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py b/configs/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py
new file mode 100755
index 0000000..743f5f2
--- /dev/null
+++ b/configs/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py
@@ -0,0 +1,75 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(
+            num_classes=1203,
+            cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+            loss_cls=dict(
+                type='SeesawLoss',
+                p=0.8,
+                q=2.0,
+                num_classes=1203,
+                loss_weight=1.0)),
+        mask_head=dict(num_classes=1203)),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.0001,
+            # LVIS allows up to 300
+            max_per_img=300)))
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+dataset_type = 'LVISV1Dataset'
+data_root = 'data/lvis_v1/'
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/lvis_v1_train.json',
+        img_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/lvis_v1_val.json',
+        img_prefix=data_root,
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/lvis_v1_val.json',
+        img_prefix=data_root,
+        pipeline=test_pipeline))
+evaluation = dict(interval=24, metric=['bbox', 'segm'])
diff --git a/configs/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py b/configs/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
new file mode 100755
index 0000000..0af8921
--- /dev/null
+++ b/configs/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
@@ -0,0 +1,5 @@
+_base_ = './mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py'
+model = dict(
+    roi_head=dict(
+        mask_head=dict(
+            predictor_cfg=dict(type='NormedConv2d', tempearture=20))))
diff --git a/configs/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py b/configs/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py
new file mode 100755
index 0000000..4fc1504
--- /dev/null
+++ b/configs/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py
@@ -0,0 +1,41 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/lvis_v1_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(
+            num_classes=1203,
+            cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+            loss_cls=dict(
+                type='SeesawLoss',
+                p=0.8,
+                q=2.0,
+                num_classes=1203,
+                loss_weight=1.0)),
+        mask_head=dict(num_classes=1203)),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.0001,
+            # LVIS allows up to 300
+            max_per_img=300)))
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+data = dict(train=dict(dataset=dict(pipeline=train_pipeline)))
+evaluation = dict(interval=12, metric=['bbox', 'segm'])
diff --git a/configs/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py b/configs/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
new file mode 100755
index 0000000..0ef6bd2
--- /dev/null
+++ b/configs/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
@@ -0,0 +1,5 @@
+_base_ = './mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py'
+model = dict(
+    roi_head=dict(
+        mask_head=dict(
+            predictor_cfg=dict(type='NormedConv2d', tempearture=20))))
diff --git a/configs/seesaw_loss/metafile.yml b/configs/seesaw_loss/metafile.yml
new file mode 100755
index 0000000..fb90aa5
--- /dev/null
+++ b/configs/seesaw_loss/metafile.yml
@@ -0,0 +1,203 @@
+Collections:
+  - Name: Seesaw Loss
+    Metadata:
+      Training Data: LVIS
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Softmax
+        - RPN
+        - Convolution
+        - Dense Connections
+        - FPN
+        - ResNet
+        - RoIAlign
+        - Seesaw Loss
+    Paper:
+      URL: https://arxiv.org/abs/2008.10032
+      Title: 'Seesaw Loss for Long-Tailed Instance Segmentation'
+    README: configs/seesaw_loss/README.md
+
+Models:
+  - Name: mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 25.6
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 25.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-a698dd3d.pth
+  - Name: mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 25.6
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 25.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-a1c11314.pth
+  - Name: mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 27.4
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 26.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-8e6e6dd5.pth
+  - Name: mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 27.2
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 27.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-a0b59c42.pth
+  - Name: mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 27.6
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 26.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-392a804b.pth
+  - Name: mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 27.6
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 26.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-cd0f6a12.pth
+  - Name: mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 28.9
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 27.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-e68eb464.pth
+  - Name: mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 28.9
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 28.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-1d817139.pth
+  - Name: cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 33.1
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 29.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-71e2215e.pth
+  - Name: cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 33.0
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 30.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-8b5a6745.pth
+  - Name: cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 30.0
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 29.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-5d8ca2a4.pth
+  - Name: cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 32.8
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 30.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-c8551505.pth
diff --git a/configs/selfsup_pretrain/README.md b/configs/selfsup_pretrain/README.md
new file mode 100755
index 0000000..9bd92cb
--- /dev/null
+++ b/configs/selfsup_pretrain/README.md
@@ -0,0 +1,109 @@
+# Backbones Trained by Self-Supervised Algorithms
+
+<!-- [OTHERS] -->
+
+## Abstract
+
+Unsupervised image representations have significantly reduced the gap with supervised pretraining, notably with the recent achievements of contrastive learning methods. These contrastive methods typically work online and rely on a large number of explicit pairwise feature comparisons, which is computationally challenging. In this paper, we propose an online algorithm, SwAV, that takes advantage of contrastive methods without requiring to compute pairwise comparisons. Specifically, our method simultaneously clusters the data while enforcing consistency between cluster assignments produced for different augmentations (or views) of the same image, instead of comparing features directly as in contrastive learning. Simply put, we use a swapped prediction mechanism where we predict the cluster assignment of a view from the representation of another view. Our method can be trained with large and small batches and can scale to unlimited amounts of data. Compared to previous contrastive methods, our method is more memory efficient since it does not require a large memory bank or a special momentum network. In addition, we also propose a new data augmentation strategy, multi-crop, that uses a mix of views with different resolutions in place of two full-resolution views, without increasing the memory or compute requirements much. We validate our findings by achieving 75.3% top-1 accuracy on ImageNet with ResNet-50, as well as surpassing supervised pretraining on all the considered transfer tasks.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143997246-ac40fd8a-9b48-4ff5-a0d9-ba10e1e333d2.png"/>
+</div>
+
+We present Momentum Contrast (MoCo) for unsupervised visual representation learning. From a perspective on contrastive learning as dictionary look-up, we build a dynamic dictionary with a queue and a moving-averaged encoder. This enables building a large and consistent dictionary on-the-fly that facilitates contrastive unsupervised learning. MoCo provides competitive results under the common linear protocol on ImageNet classification. More importantly, the representations learned by MoCo transfer well to downstream tasks. MoCo can outperform its supervised pre-training counterpart in 7 detection/segmentation tasks on PASCAL VOC, COCO, and other datasets, sometimes surpassing it by large margins. This suggests that the gap between unsupervised and supervised representation learning has been largely closed in many vision tasks.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143997315-5ff824d4-1007-4b59-8952-bc5a2c0bfd78.png" height="300"/>
+</div>
+
+## Usage
+
+To use a backbone pretrained with a self-supervised method, there are two steps:
+
+1. Download and convert the model to PyTorch-style supported by MMDetection
+2. Modify the config and change the training setting accordingly
+
+### Convert model
+
+For more general usage, we also provide the script `selfsup2mmdet.py` in the tools directory to convert the keys of models pretrained by different self-supervised methods to PyTorch-style checkpoints used in MMDetection.
+
+```bash
+python -u tools/model_converters/selfsup2mmdet.py ${PRETRAIN_PATH} ${STORE_PATH} --selfsup ${method}
+```
+
+This script converts the model from `PRETRAIN_PATH` and stores the converted model in `STORE_PATH`.
+
+For example, to use a ResNet-50 backbone released by MoCo, you can download it from [here](https://dl.fbaipublicfiles.com/moco/moco_checkpoints/moco_v2_800ep/moco_v2_800ep_pretrain.pth.tar) and use the following command
+
+```bash
+python -u tools/model_converters/selfsup2mmdet.py ./moco_v2_800ep_pretrain.pth.tar mocov2_r50_800ep_pretrain.pth --selfsup moco
+```
+
+To use the ResNet-50 backbone released by SwAV, you can download it from [here](https://dl.fbaipublicfiles.com/deepcluster/swav_800ep_pretrain.pth.tar)
+
+### Modify config
+
+The backbone requires SyncBN, and `frozen_stages` needs to be changed. A config that uses the MoCo backbone is shown below:
+
+```python
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    pretrained='./mocov2_r50_800ep_pretrain.pth',
+    backbone=dict(
+        frozen_stages=0,
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False))
+
+```
+
+## Results and Models
+
+|  Method   |                              Backbone                               |  Style  |    Lr schd     | Mem (GB) | Inf time (fps) | box AP | mask AP |                                                                  Config                                                                  |                                                                                                                                                                                            Download                                                                                                                                                                                            |
+| :-------: | :-----------------------------------------------------------------: | :-----: | :------------: | :------: | :------------: | :----: | :-----: | :--------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| Mask RCNN |  [R50 by MoCo v2](./mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco.py)   | pytorch |       1x       |          |                |  38.0  |  34.3   |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco.py)   |       [model](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco_20210604_114614-a8b63483.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco_20210604_114614.log.json)       |
+| Mask RCNN | [R50 by MoCo v2](./mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco.py) | pytorch | multi-scale 2x |          |                |  40.8  |  36.8   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco_20210605_163717-d95df20a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco_20210605_163717.log.json) |
+| Mask RCNN |     [R50 by SwAV](./mask_rcnn_r50_fpn_swav-pretrain_1x_coco.py)     | pytorch |       1x       |          |                |  39.1  |  35.7   |   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_1x_coco.py)    |           [model](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_1x_coco/mask_rcnn_r50_fpn_swav-pretrain_1x_coco_20210604_114640-7b9baf28.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_1x_coco/mask_rcnn_r50_fpn_swav-pretrain_1x_coco_20210604_114640.log.json)           |
+| Mask RCNN |   [R50 by SwAV](./mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco.py)    | pytorch | multi-scale 2x |          |                |  41.3  |  37.3   |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco.py)  |     [model](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco_20210605_163717-08e26fca.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco_20210605_163717.log.json)     |
+
+### Notice
+
+1. We only provide single-scale 1x and multi-scale 2x configs as examples to show how to use backbones trained by self-supervised algorithms. We will try to reproduce the results in their corresponding papers using the released backbones in the future. Please stay tuned.
+
+## Citation
+
+We support applying backbone models pre-trained by different self-supervised methods to detection systems and provide their results on Mask R-CNN.
+
+The pre-trained models are converted from [MoCo](https://github.com/facebookresearch/moco) and downloaded from [SwAV](https://github.com/facebookresearch/swav).
+
+For SwAV, please cite
+
+```latex
+@article{caron2020unsupervised,
+  title={Unsupervised Learning of Visual Features by Contrasting Cluster Assignments},
+  author={Caron, Mathilde and Misra, Ishan and Mairal, Julien and Goyal, Priya and Bojanowski, Piotr and Joulin, Armand},
+  booktitle={Proceedings of Advances in Neural Information Processing Systems (NeurIPS)},
+  year={2020}
+}
+```
+
+For MoCo, please cite
+
+```latex
+@Article{he2019moco,
+  author  = {Kaiming He and Haoqi Fan and Yuxin Wu and Saining Xie and Ross Girshick},
+  title   = {Momentum Contrast for Unsupervised Visual Representation Learning},
+  journal = {arXiv preprint arXiv:1911.05722},
+  year    = {2019},
+}
+@Article{chen2020mocov2,
+  author  = {Xinlei Chen and Haoqi Fan and Ross Girshick and Kaiming He},
+  title   = {Improved Baselines with Momentum Contrastive Learning},
+  journal = {arXiv preprint arXiv:2003.04297},
+  year    = {2020},
+}
+```
diff --git a/configs/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco.py b/configs/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco.py
new file mode 100755
index 0000000..f1e0615
--- /dev/null
+++ b/configs/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco.py
@@ -0,0 +1,13 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        frozen_stages=0,
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='./mocov2_r50_800ep_pretrain.pth')))
diff --git a/configs/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco.py b/configs/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco.py
new file mode 100755
index 0000000..09aa156
--- /dev/null
+++ b/configs/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco.py
@@ -0,0 +1,32 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        frozen_stages=0,
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='./mocov2_r50_800ep_pretrain.pth')))
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
+]
+
+data = dict(train=dict(pipeline=train_pipeline))
diff --git a/configs/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_1x_coco.py b/configs/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_1x_coco.py
new file mode 100755
index 0000000..f92a345
--- /dev/null
+++ b/configs/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_1x_coco.py
@@ -0,0 +1,13 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        frozen_stages=0,
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='./swav_800ep_pretrain.pth.tar')))
diff --git a/configs/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco.py b/configs/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco.py
new file mode 100755
index 0000000..fe47361
--- /dev/null
+++ b/configs/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco.py
@@ -0,0 +1,32 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        frozen_stages=0,
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='./swav_800ep_pretrain.pth.tar')))
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
+]
+
+data = dict(train=dict(pipeline=train_pipeline))
diff --git a/configs/simple_copy_paste/README.md b/configs/simple_copy_paste/README.md
new file mode 100755
index 0000000..46162aa
--- /dev/null
+++ b/configs/simple_copy_paste/README.md
@@ -0,0 +1,38 @@
+# SimpleCopyPaste
+
+> [Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation](https://arxiv.org/abs/2012.07177)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Building instance segmentation models that are data-efficient and can handle rare object categories is an important challenge in computer vision. Leveraging data augmentations is a promising direction towards addressing this challenge. Here, we perform a systematic study of the Copy-Paste augmentation (\[13, 12\]) for instance segmentation where we randomly paste objects onto an image. Prior studies on Copy-Paste relied on modeling the surrounding visual context for pasting the objects. However, we find that the simple mechanism of pasting objects randomly is good enough and can provide solid gains on top of strong baselines. Furthermore, we show Copy-Paste is additive with semi-supervised methods that leverage extra data through pseudo labeling (e.g. self-training). On COCO instance segmentation, we achieve 49.1 mask AP and 57.3 box AP, an improvement of +0.6 mask AP and +1.5 box AP over the previous state-of-the-art. We further demonstrate that Copy-Paste can lead to significant improvements on the LVIS benchmark. Our baseline model outperforms the LVIS 2020 Challenge winning entry by +3.6 mask AP on rare categories.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/161843866-c5b769da-58b2-4c1f-8078-db4a4ded3881.png"/>
+</div>
+
+## Results and Models
+
+### Mask R-CNN with Standard Scale Jittering (SSJ) and Simple Copy-Paste(SCP)
+
+Standard Scale Jittering(SSJ) resizes and crops an image with a resize range of 0.8 to 1.25 of the original image size, and Simple Copy-Paste(SCP) selects a random subset of objects from one of the images and pastes them onto the other image.
+
+| Backbone | Training schedule | Augmentation | batch size | box AP | mask AP |                                                                           Config                                                                           |                                                                                                                                                                                                                               Download                                                                                                                                                                                                                               |
+| :------: | :---------------: | :----------: | :--------: | :----: | :-----: | :--------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   |        90k        |     SSJ      |     64     |  43.3  |  39.0   |   [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco.py)    |           [model](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco_20220316_181409-f79c84c5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco_20220316_181409.log.json)           |
+|   R-50   |        90k        |   SSJ+SCP    |     64     |  43.8  |  39.2   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco_20220316_181307-6bc5726f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco_20220316_181307.log.json)   |
+|   R-50   |       270k        |     SSJ      |     64     |  43.5  |  39.1   |   [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco.py)   |         [model](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco_20220324_182940-33a100c5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco_20220324_182940.log.json)         |
+|   R-50   |       270k        |   SSJ+SCP    |     64     |  45.1  |  40.3   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco_20220324_201229-80ee90b7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco_20220324_201229.log.json) |
+
+## Citation
+
+```latex
+@inproceedings{ghiasi2021simple,
+  title={Simple copy-paste is a strong data augmentation method for instance segmentation},
+  author={Ghiasi, Golnaz and Cui, Yin and Srinivas, Aravind and Qian, Rui and Lin, Tsung-Yi and Cubuk, Ekin D and Le, Quoc V and Zoph, Barret},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  pages={2918--2928},
+  year={2021}
+}
+```
diff --git a/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco.py b/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco.py
new file mode 100755
index 0000000..d0ce917
--- /dev/null
+++ b/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco.py
@@ -0,0 +1,20 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    # 270k iterations with batch_size 64 is roughly equivalent to 144 epochs
+    '../common/ssj_270k_coco_instance.py',
+]
+
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+# Use MMSyncBN that handles empty tensor in head. It can be changed to
+# SyncBN after https://github.com/pytorch/pytorch/issues/36530 is fixed.
+head_norm_cfg = dict(type='MMSyncBN', requires_grad=True)
+model = dict(
+    backbone=dict(frozen_stages=-1, norm_eval=False, norm_cfg=norm_cfg),
+    neck=dict(norm_cfg=norm_cfg),
+    rpn_head=dict(num_convs=2),  # leads to 0.1+ mAP
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=head_norm_cfg),
+        mask_head=dict(norm_cfg=head_norm_cfg)))
diff --git a/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco.py b/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco.py
new file mode 100755
index 0000000..1eee95f
--- /dev/null
+++ b/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco.py
@@ -0,0 +1,7 @@
+_base_ = 'mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco.py'
+
+# lr steps at [0.9, 0.95, 0.975] of the maximum iterations
+lr_config = dict(
+    warmup_iters=500, warmup_ratio=0.067, step=[81000, 85500, 87750])
+# 90k iterations with batch_size 64 is roughly equivalent to 48 epochs
+runner = dict(type='IterBasedRunner', max_iters=90000)
diff --git a/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco.py b/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco.py
new file mode 100755
index 0000000..bd28ddd
--- /dev/null
+++ b/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco.py
@@ -0,0 +1,20 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    # 270k iterations with batch_size 64 is roughly equivalent to 144 epochs
+    '../common/ssj_scp_270k_coco_instance.py'
+]
+
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+# Use MMSyncBN that handles empty tensor in head. It can be changed to
+# SyncBN after https://github.com/pytorch/pytorch/issues/36530 is fixed.
+head_norm_cfg = dict(type='MMSyncBN', requires_grad=True)
+model = dict(
+    backbone=dict(frozen_stages=-1, norm_eval=False, norm_cfg=norm_cfg),
+    neck=dict(norm_cfg=norm_cfg),
+    rpn_head=dict(num_convs=2),  # leads to 0.1+ mAP
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=head_norm_cfg),
+        mask_head=dict(norm_cfg=head_norm_cfg)))
diff --git a/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco.py b/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco.py
new file mode 100755
index 0000000..b632c13
--- /dev/null
+++ b/configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco.py
@@ -0,0 +1,7 @@
+_base_ = 'mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco.py'
+
+# lr steps at [0.9, 0.95, 0.975] of the maximum iterations
+lr_config = dict(
+    warmup_iters=500, warmup_ratio=0.067, step=[81000, 85500, 87750])
+# 90k iterations with batch_size 64 is roughly equivalent to 48 epochs
+runner = dict(type='IterBasedRunner', max_iters=90000)
diff --git a/configs/simple_copy_paste/metafile.yml b/configs/simple_copy_paste/metafile.yml
new file mode 100755
index 0000000..bb6106c
--- /dev/null
+++ b/configs/simple_copy_paste/metafile.yml
@@ -0,0 +1,92 @@
+Collections:
+  - Name: SimpleCopyPaste
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 32x A100 GPUs
+      Architecture:
+        - Softmax
+        - RPN
+        - Convolution
+        - Dense Connections
+        - FPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/abs/2012.07177
+      Title: "Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation"
+    README: configs/simple_copy_paste/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.25.0/mmdet/datasets/pipelines/transforms.py#L2762
+      Version: v2.25.0
+
+Models:
+  - Name: mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco
+    In Collection: SimpleCopyPaste
+    Config: configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco.py
+    Metadata:
+      Training Memory (GB): 7.2
+      Iterations: 270000
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco_20220324_182940-33a100c5.pth
+
+  - Name: mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco
+    In Collection: SimpleCopyPaste
+    Config: configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco.py
+    Metadata:
+      Training Memory (GB): 7.2
+      Iterations: 90000
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco_20220316_181409-f79c84c5.pth
+
+  - Name: mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco
+    In Collection: SimpleCopyPaste
+    Config: configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco.py
+    Metadata:
+      Training Memory (GB): 7.2
+      Iterations: 270000
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco_20220324_201229-80ee90b7.pth
+
+  - Name: mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco
+    In Collection: SimpleCopyPaste
+    Config: configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco.py
+    Metadata:
+      Training Memory (GB): 7.2
+      Iterations: 90000
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco_20220316_181307-6bc5726f.pth
diff --git a/configs/solo/README.md b/configs/solo/README.md
new file mode 100755
index 0000000..4a36676
--- /dev/null
+++ b/configs/solo/README.md
@@ -0,0 +1,54 @@
+# SOLO
+
+> [SOLO: Segmenting Objects by Locations](https://arxiv.org/abs/1912.04488)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We present a new, embarrassingly simple approach to instance segmentation in images. Compared to many other dense prediction tasks, e.g., semantic segmentation, it is the arbitrary number of instances that have made instance segmentation much more challenging. In order to predict a mask for each instance, mainstream approaches either follow the 'detect-then-segment' strategy as used by Mask R-CNN, or predict category masks first then use clustering techniques to group pixels into individual instances. We view the task of instance segmentation from a completely new perspective by introducing the notion of "instance categories", which assigns categories to each pixel within an instance according to the instance's location and size, thus nicely converting instance mask segmentation into a classification-solvable problem. Now instance segmentation is decomposed into two classification tasks. We demonstrate a much simpler and flexible instance segmentation framework with strong performance, achieving on par accuracy with Mask R-CNN and outperforming recent single-shot instance segmenters in accuracy. We hope that this very simple and strong framework can serve as a baseline for many instance-level recognition tasks besides instance segmentation.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143998371-10e6f14b-4506-481d-91a7-5f8f58213307.png"/>
+</div>
+
+## Results and Models
+
+### SOLO
+
+| Backbone |  Style  | MS train | Lr schd | Mem (GB) | Inf time (fps) | mask AP |                                                                                                                                Download                                                                                                                                |
+| :------: | :-----: | :------: | :-----: | :------: | :------------: | :-----: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   | pytorch |    N     |   1x    |   8.0    |      14.0      |  33.1   | [model](https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_1x_coco/solo_r50_fpn_1x_coco_20210821_035055-2290a6b8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_1x_coco/solo_r50_fpn_1x_coco_20210821_035055.log.json) |
+|   R-50   | pytorch |    Y     |   3x    |   7.4    |      14.0      |  35.9   | [model](https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_3x_coco/solo_r50_fpn_3x_coco_20210901_012353-11d224d7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_3x_coco/solo_r50_fpn_3x_coco_20210901_012353.log.json) |
+
+### Decoupled SOLO
+
+| Backbone |  Style  | MS train | Lr schd | Mem (GB) | Inf time (fps) | mask AP |                                                                                                                                                    Download                                                                                                                                                    |
+| :------: | :-----: | :------: | :-----: | :------: | :------------: | :-----: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   | pytorch |    N     |   1x    |   7.8    |      12.5      |  33.9   | [model](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_1x_coco/decoupled_solo_r50_fpn_1x_coco_20210820_233348-6337c589.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_1x_coco/decoupled_solo_r50_fpn_1x_coco_20210820_233348.log.json) |
+|   R-50   | pytorch |    Y     |   3x    |   7.9    |      12.5      |  36.7   | [model](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_3x_coco/decoupled_solo_r50_fpn_3x_coco_20210821_042504-7b3301ec.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_3x_coco/decoupled_solo_r50_fpn_3x_coco_20210821_042504.log.json) |
+
+- Decoupled SOLO has a decoupled head which is different from the SOLO head.
+  Decoupled SOLO serves as an efficient variant of SOLO with equivalent
+  accuracy. Please refer to the corresponding config files for details.
+
+### Decoupled Light SOLO
+
+| Backbone |  Style  | MS train | Lr schd | Mem (GB) | Inf time (fps) | mask AP |                                                                                                                                                                Download                                                                                                                                                                |
+| :------: | :-----: | :------: | :-----: | :------: | :------------: | :-----: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   | pytorch |    Y     |   3x    |   2.2    |      31.2      |  32.9   | [model](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_light_r50_fpn_3x_coco/decoupled_solo_light_r50_fpn_3x_coco_20210906_142703-e70e226f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_light_r50_fpn_3x_coco/decoupled_solo_light_r50_fpn_3x_coco_20210906_142703.log.json) |
+
+- Decoupled Light SOLO uses a decoupled structure similar to the Decoupled
+  SOLO head, with a light-weight head and smaller input size. Please refer
+  to the corresponding config files for details.
+
+## Citation
+
+```latex
+@inproceedings{wang2020solo,
+  title     =  {{SOLO}: Segmenting Objects by Locations},
+  author    =  {Wang, Xinlong and Kong, Tao and Shen, Chunhua and Jiang, Yuning and Li, Lei},
+  booktitle =  {Proc. Eur. Conf. Computer Vision (ECCV)},
+  year      =  {2020}
+}
+```
diff --git a/configs/solo/decoupled_solo_light_r50_fpn_3x_coco.py b/configs/solo/decoupled_solo_light_r50_fpn_3x_coco.py
new file mode 100755
index 0000000..101f8f1
--- /dev/null
+++ b/configs/solo/decoupled_solo_light_r50_fpn_3x_coco.py
@@ -0,0 +1,63 @@
+_base_ = './decoupled_solo_r50_fpn_3x_coco.py'
+
+# model settings
+model = dict(
+    mask_head=dict(
+        type='DecoupledSOLOLightHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 8, 16, 32, 32],
+        scale_ranges=((1, 64), (32, 128), (64, 256), (128, 512), (256, 2048)),
+        pos_scale=0.2,
+        num_grids=[40, 36, 24, 16, 12],
+        cls_down_index=0,
+        loss_mask=dict(
+            type='DiceLoss', use_sigmoid=True, activate=False,
+            loss_weight=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)))
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(852, 512), (852, 480), (852, 448), (852, 416), (852, 384),
+                   (852, 352)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(852, 512),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/solo/decoupled_solo_r50_fpn_1x_coco.py b/configs/solo/decoupled_solo_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..b611cdf
--- /dev/null
+++ b/configs/solo/decoupled_solo_r50_fpn_1x_coco.py
@@ -0,0 +1,28 @@
+_base_ = [
+    './solo_r50_fpn_1x_coco.py',
+]
+# model settings: replace the base config's SOLOHead with DecoupledSOLOHead
+model = dict(
+    mask_head=dict(
+        type='DecoupledSOLOHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=7,
+        feat_channels=256,
+        strides=[8, 8, 16, 32, 32],
+        scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)),
+        pos_scale=0.2,
+        num_grids=[40, 36, 24, 16, 12],
+        cls_down_index=0,
+        loss_mask=dict(
+            type='DiceLoss', use_sigmoid=True, activate=False,
+            loss_weight=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)))
+
+optimizer = dict(type='SGD', lr=0.01)
diff --git a/configs/solo/decoupled_solo_r50_fpn_3x_coco.py b/configs/solo/decoupled_solo_r50_fpn_3x_coco.py
new file mode 100755
index 0000000..4a8c19d
--- /dev/null
+++ b/configs/solo/decoupled_solo_r50_fpn_3x_coco.py
@@ -0,0 +1,25 @@
+_base_ = './solo_r50_fpn_3x_coco.py'
+
+# model settings: replace the base config's SOLOHead with DecoupledSOLOHead
+model = dict(
+    mask_head=dict(
+        type='DecoupledSOLOHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=7,
+        feat_channels=256,
+        strides=[8, 8, 16, 32, 32],
+        scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)),
+        pos_scale=0.2,
+        num_grids=[40, 36, 24, 16, 12],
+        cls_down_index=0,
+        loss_mask=dict(
+            type='DiceLoss', use_sigmoid=True, activate=False,
+            loss_weight=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)))
diff --git a/configs/solo/metafile.yml b/configs/solo/metafile.yml
new file mode 100755
index 0000000..b6244e8
--- /dev/null
+++ b/configs/solo/metafile.yml
@@ -0,0 +1,115 @@
+Collections:
+  - Name: SOLO
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - Convolution
+        - ResNet
+    Paper: https://arxiv.org/abs/1912.04488
+    README: configs/solo/README.md
+
+Models:
+  - Name: decoupled_solo_r50_fpn_1x_coco
+    In Collection: SOLO
+    Config: configs/solo/decoupled_solo_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      Epochs: 12
+    inference time (ms/im):
+      - value: 116.4
+        hardware: V100
+        backend: PyTorch
+        batch size: 1
+        mode: FP32
+        resolution: (1333, 800)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 33.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_1x_coco/decoupled_solo_r50_fpn_1x_coco_20210820_233348-6337c589.pth
+
+  - Name: decoupled_solo_r50_fpn_3x_coco
+    In Collection: SOLO
+    Config: configs/solo/decoupled_solo_r50_fpn_3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.9
+      Epochs: 36
+    inference time (ms/im):
+      - value: 117.2
+        hardware: V100
+        backend: PyTorch
+        batch size: 1
+        mode: FP32
+        resolution: (1333, 800)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_3x_coco/decoupled_solo_r50_fpn_3x_coco_20210821_042504-7b3301ec.pth
+
+  - Name: decoupled_solo_light_r50_fpn_3x_coco
+    In Collection: SOLO
+    Config: configs/solo/decoupled_solo_light_r50_fpn_3x_coco.py
+    Metadata:
+      Training Memory (GB): 2.2
+      Epochs: 36
+    inference time (ms/im):
+      - value: 35.0
+        hardware: V100
+        backend: PyTorch
+        batch size: 1
+        mode: FP32
+        resolution: (852, 512)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 32.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_light_r50_fpn_3x_coco/decoupled_solo_light_r50_fpn_3x_coco_20210906_142703-e70e226f.pth
+
+  - Name: solo_r50_fpn_3x_coco
+    In Collection: SOLO
+    Config: configs/solo/solo_r50_fpn_3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.4
+      Epochs: 36
+    inference time (ms/im):
+      - value: 94.2
+        hardware: V100
+        backend: PyTorch
+        batch size: 1
+        mode: FP32
+        resolution: (1333, 800)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 35.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_3x_coco/solo_r50_fpn_3x_coco_20210901_012353-11d224d7.pth
+
+  - Name: solo_r50_fpn_1x_coco
+    In Collection: SOLO
+    Config: configs/solo/solo_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.0
+      Epochs: 12
+    inference time (ms/im):
+      - value: 95.1
+        hardware: V100
+        backend: PyTorch
+        batch size: 1
+        mode: FP32
+        resolution: (1333, 800)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 33.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_1x_coco/solo_r50_fpn_1x_coco_20210821_035055-2290a6b8.pth
diff --git a/configs/solo/solo_r50_fpn_1x_coco.py b/configs/solo/solo_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..9093a50
--- /dev/null
+++ b/configs/solo/solo_r50_fpn_1x_coco.py
@@ -0,0 +1,53 @@
+_base_ = [
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    type='SOLO',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=0,
+        num_outs=5),
+    mask_head=dict(
+        type='SOLOHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=7,
+        feat_channels=256,
+        strides=[8, 8, 16, 32, 32],
+        scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)),
+        pos_scale=0.2,
+        num_grids=[40, 36, 24, 16, 12],
+        cls_down_index=0,
+        loss_mask=dict(type='DiceLoss', use_sigmoid=True, loss_weight=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)),
+    # testing settings (this config defines no train_cfg)
+    test_cfg=dict(
+        nms_pre=500,
+        score_thr=0.1,
+        mask_thr=0.5,
+        filter_thr=0.05,
+        kernel='gaussian',  # gaussian/linear
+        sigma=2.0,
+        max_per_img=100))
+
+# optimizer
+optimizer = dict(type='SGD', lr=0.01)
diff --git a/configs/solo/solo_r50_fpn_3x_coco.py b/configs/solo/solo_r50_fpn_3x_coco.py
new file mode 100755
index 0000000..52302cd
--- /dev/null
+++ b/configs/solo/solo_r50_fpn_3x_coco.py
@@ -0,0 +1,28 @@
+_base_ = './solo_r50_fpn_1x_coco.py'
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 800), (1333, 768), (1333, 736), (1333, 704),
+                   (1333, 672), (1333, 640)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+data = dict(train=dict(pipeline=train_pipeline))
+
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[27, 33])
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/configs/solov2/README.md b/configs/solov2/README.md
new file mode 100755
index 0000000..2ffe70f
--- /dev/null
+++ b/configs/solov2/README.md
@@ -0,0 +1,59 @@
+# SOLOv2
+
+> [SOLOv2: Dynamic and Fast Instance Segmentation](https://arxiv.org/abs/2003.10152)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+In this work, we aim at building a simple, direct, and fast instance segmentation
+framework with strong performance. We follow the principle of the SOLO method of
+Wang et al. "SOLO: segmenting objects by locations". Importantly, we take one
+step further by dynamically learning the mask head of the object segmenter such
+that the mask head is conditioned on the location. Specifically, the mask branch
+is decoupled into a mask kernel branch and mask feature branch, which are
+responsible for learning the convolution kernel and the convolved features
+respectively. Moreover, we propose Matrix NMS (non maximum suppression) to
+significantly reduce the inference time overhead due to NMS of masks. Our
+Matrix NMS performs NMS with parallel matrix operations in one shot, and
+yields better results. We demonstrate a simple direct instance segmentation
+system, outperforming a few state-of-the-art methods in both speed and accuracy.
+A light-weight version of SOLOv2 executes at 31.3 FPS and yields 37.1% AP.
+Moreover, our state-of-the-art results in object detection (from our mask byproduct)
+and panoptic segmentation show the potential to serve as a new strong baseline
+for many instance-level recognition tasks besides instance segmentation.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/48282753/167235090-f20dab74-43a5-44ed-9f11-4e5f08866f45.png"/>
+</div>
+
+## Results and Models
+
+### SOLOv2
+
+|  Backbone  |  Style  | MS train | Lr schd | Mem (GB) | mask AP |                                                    Config                                                     |                                                                                                                                                Download                                                                                                                                                |
+| :--------: | :-----: | :------: | :-----: | :------: | :-----: | :-----------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50    | pytorch |    N     |   1x    |   5.1    |  34.8   |   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/solov2/solov2_r50_fpn_1x_coco.py)    |      [model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r50_fpn_1x_coco/solov2_r50_fpn_1x_coco_20220512_125858-a357fa23.pth)           \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r50_fpn_1x_coco/solov2_r50_fpn_1x_coco_20220512_125858.log.json)      |
+|    R-50    | pytorch |    Y     |   3x    |   5.1    |  37.5   |   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/solov2/solov2_r50_fpn_3x_coco.py)    |      [model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r50_fpn_3x_coco/solov2_r50_fpn_3x_coco_20220512_125856-fed092d4.pth)           \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r50_fpn_3x_coco/solov2_r50_fpn_3x_coco_20220512_125856.log.json)      |
+|   R-101    | pytorch |    Y     |   3x    |   6.9    |  39.1   |   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/solov2/solov2_r101_fpn_3x_coco.py)   |     [model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r101_fpn_3x_coco/solov2_r101_fpn_3x_coco_20220511_095119-c559a076.pth)         \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r101_fpn_3x_coco/solov2_r101_fpn_3x_coco_20220511_095119.log.json)     |
+| R-101(DCN) | pytorch |    Y     |   3x    |   7.1    |  41.2   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/solov2/solov2_r101_dcn_fpn_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r101_dcn_fpn_3x_coco/solov2_r101_dcn_fpn_3x_coco_20220513_214734-16c966cb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r101_dcn_fpn_3x_coco/solov2_r101_dcn_fpn_3x_coco_20220513_214734.log.json) |
+| X-101(DCN) | pytorch |    Y     |   3x    |   11.3   |  42.4   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/solov2/solov2_x101_dcn_fpn_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_x101_dcn_fpn_3x_coco/solov2_x101_dcn_fpn_3x_coco_20220513_214337-aef41095.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_x101_dcn_fpn_3x_coco/solov2_x101_dcn_fpn_3x_coco_20220513_214337.log.json) |
+
+### Light SOLOv2
+
+| Backbone |  Style  | MS train | Lr schd | Mem (GB) | mask AP |                                                     Config                                                     |                                                                                                                                                  Download                                                                                                                                                  |
+| :------: | :-----: | :------: | :-----: | :------: | :-----: | :------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-18   | pytorch |    Y     |   3x    |   9.1    |  29.7   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/solov2/solov2_light_r18_fpn_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r18_fpn_3x_coco/solov2_light_r18_fpn_3x_coco_20220511_083717-75fa355b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r18_fpn_3x_coco/solov2_light_r18_fpn_3x_coco_20220511_083717.log.json) |
+|   R-34   | pytorch |    Y     |   3x    |   9.3    |  31.9   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/solov2/solov2_light_r34_fpn_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r34_fpn_3x_coco/solov2_light_r34_fpn_3x_coco_20220511_091839-e51659d3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r34_fpn_3x_coco/solov2_light_r34_fpn_3x_coco_20220511_091839.log.json) |
+|   R-50   | pytorch |    Y     |   3x    |   9.9    |  33.7   | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/solov2/solov2_light_r50_fpn_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r50_fpn_3x_coco/solov2_light_r50_fpn_3x_coco_20220512_165256-c93a6074.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r50_fpn_3x_coco/solov2_light_r50_fpn_3x_coco_20220512_165256.log.json) |
+
+## Citation
+
+```latex
+@article{wang2020solov2,
+  title={SOLOv2: Dynamic and Fast Instance Segmentation},
+  author={Wang, Xinlong and Zhang, Rufeng and Kong, Tao and Li, Lei and Shen, Chunhua},
+  journal={Proc. Advances in Neural Information Processing Systems (NeurIPS)},
+  year={2020}
+}
+```
diff --git a/configs/solov2/metafile.yml b/configs/solov2/metafile.yml
new file mode 100755
index 0000000..656f66f
--- /dev/null
+++ b/configs/solov2/metafile.yml
@@ -0,0 +1,119 @@
+Collections:
+  - Name: SOLOv2
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - FPN
+        - Convolution
+        - ResNet
+    Paper: https://arxiv.org/abs/2003.10152
+    README: configs/solov2/README.md
+
+Models:
+  - Name: solov2_r50_fpn_1x_coco
+    In Collection: SOLOv2
+    Config: configs/solov2/solov2_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.1
+      Epochs: 12
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 34.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r50_fpn_1x_coco/solov2_r50_fpn_1x_coco_20220512_125858-a357fa23.pth
+
+  - Name: solov2_r50_fpn_3x_coco
+    In Collection: SOLOv2
+    Config: configs/solov2/solov2_r50_fpn_3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.1
+      Epochs: 36
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r50_fpn_3x_coco/solov2_r50_fpn_3x_coco_20220512_125856-fed092d4.pth
+
+  - Name: solov2_r101_fpn_3x_coco
+    In Collection: SOLOv2
+    Config: configs/solov2/solov2_r101_fpn_3x_coco.py
+    Metadata:
+      Training Memory (GB): 6.9
+      Epochs: 36
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r101_fpn_3x_coco/solov2_r101_fpn_3x_coco_20220511_095119-c559a076.pth
+
+  - Name: solov2_r101_dcn_fpn_3x_coco
+    In Collection: SOLOv2
+    Config: configs/solov2/solov2_r101_dcn_fpn_3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      Epochs: 36
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r101_dcn_fpn_3x_coco/solov2_r101_dcn_fpn_3x_coco_20220513_214734-16c966cb.pth
+
+  - Name: solov2_x101_dcn_fpn_3x_coco
+    In Collection: SOLOv2
+    Config: configs/solov2/solov2_x101_dcn_fpn_3x_coco.py
+    Metadata:
+      Training Memory (GB): 11.3
+      Epochs: 36
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 42.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_x101_dcn_fpn_3x_coco/solov2_x101_dcn_fpn_3x_coco_20220513_214337-aef41095.pth
+
+  - Name: solov2_light_r18_fpn_3x_coco
+    In Collection: SOLOv2
+    Config: configs/solov2/solov2_light_r18_fpn_3x_coco.py
+    Metadata:
+      Training Memory (GB): 9.1
+      Epochs: 36
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 29.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r18_fpn_3x_coco/solov2_light_r18_fpn_3x_coco_20220511_083717-75fa355b.pth
+
+  - Name: solov2_light_r34_fpn_3x_coco
+    In Collection: SOLOv2
+    Config: configs/solov2/solov2_light_r34_fpn_3x_coco.py
+    Metadata:
+      Training Memory (GB): 9.3
+      Epochs: 36
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 31.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r34_fpn_3x_coco/solov2_light_r34_fpn_3x_coco_20220511_091839-e51659d3.pth
+
+  - Name: solov2_light_r50_fpn_3x_coco
+    In Collection: SOLOv2
+    Config: configs/solov2/solov2_light_r50_fpn_3x_coco.py
+    Metadata:
+      Training Memory (GB): 9.9
+      Epochs: 36
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 33.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r50_fpn_3x_coco/solov2_light_r50_fpn_3x_coco_20220512_165256-c93a6074.pth
diff --git a/configs/solov2/solov2_light_r18_fpn_3x_coco.py b/configs/solov2/solov2_light_r18_fpn_3x_coco.py
new file mode 100755
index 0000000..6fb33b0
--- /dev/null
+++ b/configs/solov2/solov2_light_r18_fpn_3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = 'solov2_light_r50_fpn_3x_coco.py'
+
+# model settings: switch the backbone to ResNet-18 and update FPN in_channels
+model = dict(
+    backbone=dict(
+        depth=18, init_cfg=dict(checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
diff --git a/configs/solov2/solov2_light_r34_fpn_3x_coco.py b/configs/solov2/solov2_light_r34_fpn_3x_coco.py
new file mode 100755
index 0000000..ea082a1
--- /dev/null
+++ b/configs/solov2/solov2_light_r34_fpn_3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = 'solov2_light_r50_fpn_3x_coco.py'
+
+# model settings: switch the backbone to ResNet-34 and update FPN in_channels
+model = dict(
+    backbone=dict(
+        depth=34, init_cfg=dict(checkpoint='torchvision://resnet34')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
diff --git a/configs/solov2/solov2_light_r50_dcn_fpn_3x_coco.py b/configs/solov2/solov2_light_r50_dcn_fpn_3x_coco.py
new file mode 100755
index 0000000..4d758e2
--- /dev/null
+++ b/configs/solov2/solov2_light_r50_dcn_fpn_3x_coco.py
@@ -0,0 +1,62 @@
+_base_ = 'solov2_r50_fpn_3x_coco.py'
+
+# model settings: DCNv2 in the last three backbone stages, light mask head
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)),
+    mask_head=dict(
+        feat_channels=256,
+        stacked_convs=3,
+        scale_ranges=((1, 64), (32, 128), (64, 256), (128, 512), (256, 2048)),
+        mask_feature_head=dict(out_channels=128),
+        dcn_cfg=dict(type='DCNv2'),
+        dcn_apply_to_all_conv=False))  # light solov2 head
+
+# learning policy: 36 epochs, LR steps at epochs 27 and 33
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[27, 33])
+runner = dict(type='EpochBasedRunner', max_epochs=36)
+
+# data
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(768, 512), (768, 480), (768, 448), (768, 416), (768, 384),
+                   (768, 352)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(448, 768),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/solov2/solov2_light_r50_fpn_3x_coco.py b/configs/solov2/solov2_light_r50_fpn_3x_coco.py
new file mode 100755
index 0000000..e08f1db
--- /dev/null
+++ b/configs/solov2/solov2_light_r50_fpn_3x_coco.py
@@ -0,0 +1,57 @@
+_base_ = 'solov2_r50_fpn_1x_coco.py'
+
+# model settings: lighter mask head (fewer convs, narrower feature channels)
+model = dict(
+    mask_head=dict(
+        stacked_convs=2,
+        feat_channels=256,
+        scale_ranges=((1, 56), (28, 112), (56, 224), (112, 448), (224, 896)),
+        mask_feature_head=dict(out_channels=128)))
+
+# learning policy: 36 epochs, LR steps at epochs 27 and 33
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[27, 33])
+runner = dict(type='EpochBasedRunner', max_epochs=36)
+
+# data
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(768, 512), (768, 480), (768, 448), (768, 416), (768, 384),
+                   (768, 352)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(448, 768),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/solov2/solov2_r101_dcn_fpn_3x_coco.py b/configs/solov2/solov2_r101_dcn_fpn_3x_coco.py
new file mode 100755
index 0000000..1594118
--- /dev/null
+++ b/configs/solov2/solov2_r101_dcn_fpn_3x_coco.py
@@ -0,0 +1,13 @@
+_base_ = 'solov2_r50_fpn_3x_coco.py'
+
+# model settings: ResNet-101 backbone with DCNv2, plus DCNv2 in the mask head
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(checkpoint='torchvision://resnet101'),
+        dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)),
+    mask_head=dict(
+        mask_feature_head=dict(conv_cfg=dict(type='DCNv2')),
+        dcn_cfg=dict(type='DCNv2'),
+        dcn_apply_to_all_conv=True))
diff --git a/configs/solov2/solov2_r101_fpn_3x_coco.py b/configs/solov2/solov2_r101_fpn_3x_coco.py
new file mode 100755
index 0000000..6c248e5
--- /dev/null
+++ b/configs/solov2/solov2_r101_fpn_3x_coco.py
@@ -0,0 +1,6 @@
+_base_ = 'solov2_r50_fpn_3x_coco.py'
+
+# model settings: switch the backbone to ResNet-101
+model = dict(
+    backbone=dict(
+        depth=101, init_cfg=dict(checkpoint='torchvision://resnet101')))
diff --git a/configs/solov2/solov2_r50_fpn_1x_coco.py b/configs/solov2/solov2_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..9aee571
--- /dev/null
+++ b/configs/solov2/solov2_r50_fpn_1x_coco.py
@@ -0,0 +1,61 @@
+_base_ = [
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    type='SOLOv2',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=0,
+        num_outs=5),
+    mask_head=dict(
+        type='SOLOV2Head',
+        num_classes=80,
+        in_channels=256,
+        feat_channels=512,
+        stacked_convs=4,
+        strides=[8, 8, 16, 32, 32],
+        scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)),
+        pos_scale=0.2,
+        num_grids=[40, 36, 24, 16, 12],
+        cls_down_index=0,
+        mask_feature_head=dict(
+            feat_channels=128,
+            start_level=0,
+            end_level=3,
+            out_channels=256,
+            mask_stride=4,
+            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)),
+        loss_mask=dict(type='DiceLoss', use_sigmoid=True, loss_weight=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0)),
+    # testing settings (this config defines no train_cfg)
+    test_cfg=dict(
+        nms_pre=500,
+        score_thr=0.1,
+        mask_thr=0.5,
+        filter_thr=0.05,
+        kernel='gaussian',  # gaussian/linear
+        sigma=2.0,
+        max_per_img=100))
+
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/configs/solov2/solov2_r50_fpn_3x_coco.py b/configs/solov2/solov2_r50_fpn_3x_coco.py
new file mode 100755
index 0000000..640c730
--- /dev/null
+++ b/configs/solov2/solov2_r50_fpn_3x_coco.py
@@ -0,0 +1,28 @@
+_base_ = 'solov2_r50_fpn_1x_coco.py'
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 800), (1333, 768), (1333, 736), (1333, 704),
+                   (1333, 672), (1333, 640)],
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+data = dict(train=dict(pipeline=train_pipeline))
+
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[27, 33])
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/configs/solov2/solov2_x101_dcn_fpn_3x_coco.py b/configs/solov2/solov2_x101_dcn_fpn_3x_coco.py
new file mode 100755
index 0000000..6115fed
--- /dev/null
+++ b/configs/solov2/solov2_x101_dcn_fpn_3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'solov2_r50_fpn_3x_coco.py'
+
+# model settings: ResNeXt-101 64x4d backbone with DCNv2; DCNv2 in mask head
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')),
+    mask_head=dict(
+        mask_feature_head=dict(conv_cfg=dict(type='DCNv2')),
+        dcn_cfg=dict(type='DCNv2'),
+        dcn_apply_to_all_conv=True))
diff --git a/configs/sparse_rcnn/README.md b/configs/sparse_rcnn/README.md
new file mode 100755
index 0000000..d7912e0
--- /dev/null
+++ b/configs/sparse_rcnn/README.md
@@ -0,0 +1,38 @@
+# Sparse R-CNN
+
+> [Sparse R-CNN: End-to-End Object Detection with Learnable Proposals](https://arxiv.org/abs/2011.12450)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We present Sparse R-CNN, a purely sparse method for object detection in images. Existing works on object detection heavily rely on dense object candidates, such as k anchor boxes pre-defined on all grids of image feature map of size H×W. In our method, however, a fixed sparse set of learned object proposals, total length of N, are provided to object recognition head to perform classification and location. By eliminating HWk (up to hundreds of thousands) hand-designed object candidates to N (e.g. 100) learnable proposals, Sparse R-CNN completely avoids all efforts related to object candidates design and many-to-one label assignment. More importantly, final predictions are directly output without non-maximum suppression post-procedure. Sparse R-CNN demonstrates accuracy, run-time and training convergence performance on par with the well-established detector baselines on the challenging COCO dataset, e.g., achieving 45.0 AP in standard 3× training schedule and running at 22 fps using ResNet-50 FPN model. We hope our work could inspire re-thinking the convention of dense prior in object detectors.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143998489-8a5a687d-ceec-4590-8347-708e427e7dfe.png" height="300"/>
+</div>
+
+## Results and Models
+
+|    Model     | Backbone  |  Style  | Lr schd | Number of Proposals | Multi-Scale | RandomCrop | box AP |                                                                         Config                                                                         |                                                                                                                                                                                                                                 Download                                                                                                                                                                                                                                  |
+| :----------: | :-------: | :-----: | :-----: | :-----------------: | :---------: | :--------: | :----: | :----------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| Sparse R-CNN | R-50-FPN  | pytorch |   1x    |         100         |    False    |   False    |  37.9  |                   [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco.py)                   |                                                                         [model](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco/sparse_rcnn_r50_fpn_1x_coco_20201222_214453-dc79b137.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco/sparse_rcnn_r50_fpn_1x_coco_20201222_214453-dc79b137.log.json)                                                                         |
+| Sparse R-CNN | R-50-FPN  | pytorch |   3x    |         100         |    True     |   False    |  42.8  |           [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py)           |                                         [model](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco_20201218_154234-7bc5c054.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco_20201218_154234-7bc5c054.log.json)                                         |
+| Sparse R-CNN | R-50-FPN  | pytorch |   3x    |         300         |    True     |    True    |  45.0  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py)  |   [model](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_024605-9fe92701.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_024605-9fe92701.log.json)   |
+| Sparse R-CNN | R-101-FPN | pytorch |   3x    |         100         |    True     |   False    |  44.2  |          [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco.py)           |                                       [model](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco_20201223_121552-6c46c9d6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco_20201223_121552-6c46c9d6.log.json)                                       |
+| Sparse R-CNN | R-101-FPN | pytorch |   3x    |         300         |    True     |    True    |  46.2  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_023452-c23c3564.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_023452-c23c3564.log.json) |
+
+### Notes
+
+We observe about 0.3 AP noise, especially when using ResNet-101 as the backbone.
+
+## Citation
+
+```latex
+@article{peize2020sparse,
+  title   =  {{SparseR-CNN}: End-to-End Object Detection with Learnable Proposals},
+  author  =  {Peize Sun and Rufeng Zhang and Yi Jiang and Tao Kong and Chenfeng Xu and Wei Zhan and Masayoshi Tomizuka and Lei Li and Zehuan Yuan and Changhu Wang and Ping Luo},
+  journal =  {arXiv preprint arXiv:2011.12450},
+  year    =  {2020}
+}
+```
diff --git a/configs/sparse_rcnn/metafile.yml b/configs/sparse_rcnn/metafile.yml
new file mode 100755
index 0000000..bb1273e
--- /dev/null
+++ b/configs/sparse_rcnn/metafile.yml
@@ -0,0 +1,80 @@
+Collections:
+  - Name: Sparse R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+        - Sparse R-CNN
+    Paper:
+      URL: https://arxiv.org/abs/2011.12450
+      Title: 'Sparse R-CNN: End-to-End Object Detection with Learnable Proposals'
+    README: configs/sparse_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.9.0/mmdet/models/detectors/sparse_rcnn.py#L6
+      Version: v2.9.0
+
+Models:
+  - Name: sparse_rcnn_r50_fpn_1x_coco
+    In Collection: Sparse R-CNN
+    Config: configs/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco/sparse_rcnn_r50_fpn_1x_coco_20201222_214453-dc79b137.pth
+
+  - Name: sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco
+    In Collection: Sparse R-CNN
+    Config: configs/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco_20201218_154234-7bc5c054.pth
+
+  - Name: sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco
+    In Collection: Sparse R-CNN
+    Config: configs/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_024605-9fe92701.pth
+
+  - Name: sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco
+    In Collection: Sparse R-CNN
+    Config: configs/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco_20201223_121552-6c46c9d6.pth
+
+  - Name: sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco
+    In Collection: Sparse R-CNN
+    Config: configs/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_023452-c23c3564.pth
diff --git a/configs/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py b/configs/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
new file mode 100755
index 0000000..de323bd
--- /dev/null
+++ b/configs/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py'
+
+model = dict(  # sets only backbone depth and checkpoint here
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco.py b/configs/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco.py
new file mode 100755
index 0000000..ab4c5f6
--- /dev/null
+++ b/configs/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py'
+
+model = dict(  # sets only backbone depth and checkpoint here
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco.py b/configs/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..b383ee4
--- /dev/null
+++ b/configs/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,95 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+num_stages = 6  # reused for roi_head and the per-stage lists below
+num_proposals = 100  # reused by rpn_head and test_cfg.rcnn.max_per_img
+model = dict(
+    type='SparseRCNN',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=0,
+        add_extra_convs='on_input',
+        num_outs=4),
+    rpn_head=dict(
+        type='EmbeddingRPNHead',
+        num_proposals=num_proposals,
+        proposal_feature_channel=256),
+    roi_head=dict(
+        type='SparseRoIHead',
+        num_stages=num_stages,
+        stage_loss_weights=[1] * num_stages,  # weight 1 for every stage
+        proposal_feature_channel=256,
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=[  # one DIIHead cfg per stage
+            dict(
+                type='DIIHead',
+                num_classes=80,
+                num_ffn_fcs=2,
+                num_heads=8,
+                num_cls_fcs=1,
+                num_reg_fcs=3,
+                feedforward_channels=2048,
+                in_channels=256,
+                dropout=0.0,
+                ffn_act_cfg=dict(type='ReLU', inplace=True),
+                dynamic_conv_cfg=dict(
+                    type='DynamicConv',
+                    in_channels=256,
+                    feat_channels=64,
+                    out_channels=256,
+                    input_feat_shape=7,
+                    act_cfg=dict(type='ReLU', inplace=True),
+                    norm_cfg=dict(type='LN')),
+                loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+                loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+                loss_cls=dict(
+                    type='FocalLoss',
+                    use_sigmoid=True,
+                    gamma=2.0,
+                    alpha=0.25,
+                    loss_weight=2.0),
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    clip_border=False,
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.5, 0.5, 1., 1.])) for _ in range(num_stages)
+        ]),
+    # training and testing settings
+    train_cfg=dict(
+        rpn=None,
+        rcnn=[  # one assigner/sampler cfg per stage
+            dict(
+                assigner=dict(  # cost weights match loss weights above
+                    type='HungarianAssigner',
+                    cls_cost=dict(type='FocalLossCost', weight=2.0),
+                    reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+                    iou_cost=dict(type='IoUCost', iou_mode='giou',
+                                  weight=2.0)),
+                sampler=dict(type='PseudoSampler'),
+                pos_weight=1) for _ in range(num_stages)
+        ]),
+    test_cfg=dict(rpn=None, rcnn=dict(max_per_img=num_proposals)))
+
+# optimizer: AdamW with gradient-norm clipping (max_norm=1)
+optimizer = dict(_delete_=True, type='AdamW', lr=0.000025, weight_decay=0.0001)
+optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=1, norm_type=2))
+# learning policy: step schedule, runner capped at 12 epochs
+lr_config = dict(policy='step', step=[8, 11])
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/configs/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py b/configs/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
new file mode 100755
index 0000000..36f1d62
--- /dev/null
+++ b/configs/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
@@ -0,0 +1,52 @@
+_base_ = './sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py'
+num_proposals = 300  # reused by rpn_head and test_cfg.rcnn.max_per_img
+model = dict(
+    rpn_head=dict(num_proposals=num_proposals),
+    test_cfg=dict(
+        _delete_=True, rpn=None, rcnn=dict(max_per_img=num_proposals)))
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# augmentation strategy originates from DETR.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='AutoAugment',
+        policies=[[  # policy 1: a single multi-scale Resize
+            dict(
+                type='Resize',
+                img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                           (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                           (736, 1333), (768, 1333), (800, 1333)],
+                multiscale_mode='value',
+                keep_ratio=True)
+        ],
+                  [  # policy 2: Resize, RandomCrop, then Resize again
+                      dict(
+                          type='Resize',
+                          img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+                          multiscale_mode='value',
+                          keep_ratio=True),
+                      dict(
+                          type='RandomCrop',
+                          crop_type='absolute_range',
+                          crop_size=(384, 600),
+                          allow_negative_crop=True),
+                      dict(
+                          type='Resize',
+                          img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                     (576, 1333), (608, 1333), (640, 1333),
+                                     (672, 1333), (704, 1333), (736, 1333),
+                                     (768, 1333), (800, 1333)],
+                          multiscale_mode='value',
+                          override=True,
+                          keep_ratio=True)
+                  ]]),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+data = dict(train=dict(pipeline=train_pipeline))  # train pipeline only
diff --git a/configs/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py b/configs/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py
new file mode 100755
index 0000000..2fa2a80
--- /dev/null
+++ b/configs/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py
@@ -0,0 +1,23 @@
+_base_ = './sparse_rcnn_r50_fpn_1x_coco.py'  # base config this file extends
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+min_values = (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, value) for value in min_values],  # 11 scales
+        multiscale_mode='value',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+
+data = dict(train=dict(pipeline=train_pipeline))  # train pipeline only
+lr_config = dict(policy='step', step=[27, 33])  # runner below stops at 36
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/configs/ssd/README.md b/configs/ssd/README.md
new file mode 100755
index 0000000..463926b
--- /dev/null
+++ b/configs/ssd/README.md
@@ -0,0 +1,62 @@
+# SSD
+
+> [SSD: Single Shot MultiBox Detector](https://arxiv.org/abs/1512.02325)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We present a method for detecting objects in images using a single deep neural network. Our approach, named SSD, discretizes the output space of bounding boxes into a set of default boxes over different aspect ratios and scales per feature map location. At prediction time, the network generates scores for the presence of each object category in each default box and produces adjustments to the box to better match the object shape. Additionally, the network combines predictions from multiple feature maps with different resolutions to naturally handle objects of various sizes. Our SSD model is simple relative to methods that require object proposals because it completely eliminates proposal generation and subsequent pixel or feature resampling stage and encapsulates all computation in a single network. This makes SSD easy to train and straightforward to integrate into systems that require a detection component. Experimental results on the PASCAL VOC, MS COCO, and ILSVRC datasets confirm that SSD has comparable accuracy to methods that utilize an additional object proposal step and is much faster, while providing a unified framework for both training and inference. Compared to other single stage methods, SSD has much better accuracy, even with a smaller input image size. For 300×300 input, SSD achieves 72.1% mAP on VOC2007 test at 58 FPS on a Nvidia Titan X and for 500×500 input, SSD achieves 75.1% mAP, outperforming a comparable state of the art Faster R-CNN model.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143998553-4e12f681-6025-46b4-8410-9e2e1e53a8ec.png"/>
+</div>
+
+## Results and models of SSD
+
+| Backbone | Size | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                           Config                                           |                                                                                                             Download                                                                                                             |
+| :------: | :--: | :---: | :-----: | :------: | :------------: | :----: | :----------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  VGG16   | 300  | caffe |  120e   |   9.9    |      43.7      |  25.5  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ssd/ssd300_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd300_coco/ssd300_coco_20210803_015428-d231a06e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd300_coco/ssd300_coco_20210803_015428.log.json) |
+|  VGG16   | 512  | caffe |  120e   |   19.4   |      30.7      |  29.5  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ssd/ssd512_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd512_coco/ssd512_coco_20210803_022849-0a47a1ca.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd512_coco/ssd512_coco_20210803_022849.log.json) |
+
+## Results and models of SSD-Lite
+
+|  Backbone   | Size | Training from scratch | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                        Config                                                        |                                                                                                                                                                 Download                                                                                                                                                                 |
+| :---------: | :--: | :-------------------: | :-----: | :------: | :------------: | :----: | :------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| MobileNetV2 | 320  |          yes          |  600e   |   4.0    |      69.9      |  21.3  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_scratch_600e_coco_20210629_110627-974d9307.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_scratch_600e_coco_20210629_110627.log.json) |
+
+## Notice
+
+### Compatibility
+
+In v2.14.0, [PR5291](https://github.com/open-mmlab/mmdetection/pull/5291) refactored SSD neck and head for more
+flexible usage. If users want to use the SSD checkpoint trained in the older versions, we provide a script
+`tools/model_converters/upgrade_ssd_version.py` to convert the model weights.
+
+```bash
+python tools/model_converters/upgrade_ssd_version.py ${OLD_MODEL_PATH} ${NEW_MODEL_PATH}
+
+```
+
+- OLD_MODEL_PATH: the path to load the old version SSD model.
+- NEW_MODEL_PATH: the path to save the converted model weights.
+
+### SSD-Lite training settings
+
+There are some differences between our implementation of MobileNetV2 SSD-Lite and the one in [TensorFlow 1.x detection model zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf1_detection_zoo.md).
+
+1. Use 320x320 as input size instead of 300x300.
+2. The anchor sizes are different.
+3. The C4 feature map is taken from the last layer of stage 4 instead of the middle of the block.
+4. The model in TensorFlow 1.x is trained on COCO 2014 and validated on COCO minival2014, but we trained and validated the model on COCO 2017. The mAP on val2017 is usually a little lower than on minival2014 (refer to the results in TensorFlow Object Detection API, e.g., MobileNetV2 SSD gets 22 mAP on minival2014 but 20.2 mAP on val2017).
+
+## Citation
+
+```latex
+@article{Liu_2016,
+   title={SSD: Single Shot MultiBox Detector},
+   journal={ECCV},
+   author={Liu, Wei and Anguelov, Dragomir and Erhan, Dumitru and Szegedy, Christian and Reed, Scott and Fu, Cheng-Yang and Berg, Alexander C.},
+   year={2016},
+}
+```
diff --git a/configs/ssd/ascend_ssd300_coco.py b/configs/ssd/ascend_ssd300_coco.py
new file mode 100755
index 0000000..25457ee
--- /dev/null
+++ b/configs/ssd/ascend_ssd300_coco.py
@@ -0,0 +1,72 @@
+_base_ = [
+    '../_base_/models/ascend_ssd300.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+# dataset settings (COCO train2017 annotations, 300x300 inputs)
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],  # same mean the Normalize step uses
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', img_scale=(300, 300), keep_ratio=False),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(300, 300),  # single test scale, flipping disabled below
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=False),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=8,  # the 8 samples/GPU assumed by auto_scale_lr below
+    workers_per_gpu=3,
+    train=dict(
+        _delete_=True,
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root + 'annotations/instances_train2017.json',
+            img_prefix=data_root + 'train2017/',
+            pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),  # val and test share test_pipeline
+    test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4)
+optimizer_config = dict(_delete_=True)
+custom_hooks = [
+    dict(type='NumClassCheckHook'),
+    dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW')
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/ssd/metafile.yml b/configs/ssd/metafile.yml
new file mode 100755
index 0000000..b9ee79c
--- /dev/null
+++ b/configs/ssd/metafile.yml
@@ -0,0 +1,78 @@
+Collections:
+  - Name: SSD
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - VGG
+    Paper:
+      URL: https://arxiv.org/abs/1512.02325
+      Title: 'SSD: Single Shot MultiBox Detector'
+    README: configs/ssd/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.14.0/mmdet/models/dense_heads/ssd_head.py#L16
+      Version: v2.14.0
+
+Models:
+  - Name: ssd300_coco
+    In Collection: SSD
+    Config: configs/ssd/ssd300_coco.py
+    Metadata:
+      Training Memory (GB): 9.9
+      inference time (ms/im):
+        - value: 22.88
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (300, 300)
+      Epochs: 120
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 25.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd300_coco/ssd300_coco_20210803_015428-d231a06e.pth
+
+  - Name: ssd512_coco
+    In Collection: SSD
+    Config: configs/ssd/ssd512_coco.py
+    Metadata:
+      Training Memory (GB): 19.4
+      inference time (ms/im):
+        - value: 32.57
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (512, 512)
+      Epochs: 120
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 29.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd512_coco/ssd512_coco_20210803_022849-0a47a1ca.pth
+
+  - Name: ssdlite_mobilenetv2_scratch_600e_coco
+    In Collection: SSD
+    Config: configs/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py
+    Metadata:
+      Training Memory (GB): 4.0
+      inference time (ms/im):
+        - value: 14.3
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (320, 320)
+      Epochs: 600
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 21.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ssd/ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_scratch_600e_coco_20210629_110627-974d9307.pth
diff --git a/configs/ssd/ssd300_coco.py b/configs/ssd/ssd300_coco.py
new file mode 100755
index 0000000..1891bad
--- /dev/null
+++ b/configs/ssd/ssd300_coco.py
@@ -0,0 +1,71 @@
+_base_ = [
+    '../_base_/models/ssd300.py', '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', img_scale=(300, 300), keep_ratio=False),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(300, 300),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=False),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=3,
+    train=dict(
+        _delete_=True,
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root + 'annotations/instances_train2017.json',
+            img_prefix=data_root + 'train2017/',
+            pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer; lr=2e-3 corresponds to base_batch_size=64 (see auto_scale_lr)
+optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4)
+optimizer_config = dict(_delete_=True)  # drop the inherited optimizer_config
+custom_hooks = [
+    dict(type='NumClassCheckHook'),
+    dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW')
+]
+
+# NOTE: `auto_scale_lr` is used for automatically scaling the LR;
+# users should not change its values.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/ssd/ssd300_fp16_coco.py b/configs/ssd/ssd300_fp16_coco.py
new file mode 100755
index 0000000..7c53af4
--- /dev/null
+++ b/configs/ssd/ssd300_fp16_coco.py
@@ -0,0 +1,9 @@
+_base_ = ['./ssd300_coco.py']
+
+fp16 = dict(loss_scale='dynamic')  # fp16 settings
+
+# learning policy
+# To avoid non-convergence early in mixed-precision training, warmup is set
+# to linear with warmup_iters=1000 and warmup_ratio=0.1. NOTE(review): meant
+# as more iters / a lower ratio than the base config -- confirm vs. the base.
+lr_config = dict(warmup='linear', warmup_iters=1000, warmup_ratio=1.0 / 10)
diff --git a/configs/ssd/ssd512_coco.py b/configs/ssd/ssd512_coco.py
new file mode 100755
index 0000000..117777f
--- /dev/null
+++ b/configs/ssd/ssd512_coco.py
@@ -0,0 +1,84 @@
+_base_ = 'ssd300_coco.py'
+input_size = 512
+model = dict(
+    neck=dict(
+        out_channels=(512, 1024, 512, 256, 256, 256, 256),
+        level_strides=(2, 2, 2, 2, 1),
+        level_paddings=(1, 1, 1, 1, 1),
+        last_kernel_size=4),
+    bbox_head=dict(
+        in_channels=(512, 1024, 512, 256, 256, 256, 256),
+        anchor_generator=dict(
+            type='SSDAnchorGenerator',
+            scale_major=False,
+            input_size=input_size,
+            basesize_ratio_range=(0.1, 0.9),
+            strides=[8, 16, 32, 64, 128, 256, 512],
+            ratios=[[2], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2]])))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', img_scale=(512, 512), keep_ratio=False),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(512, 512),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=False),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=3,
+    train=dict(
+        _delete_=True,
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root + 'annotations/instances_train2017.json',
+            img_prefix=data_root + 'train2017/',
+            pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# optimizer (identical to the values inherited from ssd300_coco.py)
+optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4)
+optimizer_config = dict(_delete_=True)  # drop the inherited optimizer_config
+custom_hooks = [
+    dict(type='NumClassCheckHook'),
+    dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW')
+]
+
+# NOTE: `auto_scale_lr` is used for automatically scaling the LR;
+# users should not change its values.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/ssd/ssd512_fp16_coco.py b/configs/ssd/ssd512_fp16_coco.py
new file mode 100755
index 0000000..a74434e
--- /dev/null
+++ b/configs/ssd/ssd512_fp16_coco.py
@@ -0,0 +1,9 @@
+_base_ = ['./ssd512_coco.py']
+# fp16 settings: mixed precision with dynamic loss scaling
+fp16 = dict(loss_scale='dynamic')
+
+# learning policy
+# To avoid non-convergence early in mixed-precision training, warmup is set
+# to linear with warmup_iters=1000 and warmup_ratio=0.1. NOTE(review): meant
+# as more iters / a lower ratio than the base config -- confirm vs. the base.
+lr_config = dict(warmup='linear', warmup_iters=1000, warmup_ratio=1.0 / 10)
diff --git a/configs/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py b/configs/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py
new file mode 100755
index 0000000..929eb6c
--- /dev/null
+++ b/configs/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py
@@ -0,0 +1,150 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    type='SingleStageDetector',
+    backbone=dict(
+        type='MobileNetV2',
+        out_indices=(4, 7),
+        norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+        init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)),
+    neck=dict(
+        type='SSDNeck',
+        in_channels=(96, 1280),
+        out_channels=(96, 1280, 512, 256, 256, 128),
+        level_strides=(2, 2, 2, 2),
+        level_paddings=(1, 1, 1, 1),
+        l2_norm_scale=None,
+        use_depthwise=True,
+        norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+        act_cfg=dict(type='ReLU6'),
+        init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)),
+    bbox_head=dict(
+        type='SSDHead',
+        in_channels=(96, 1280, 512, 256, 256, 128),
+        num_classes=80,
+        use_depthwise=True,
+        norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+        act_cfg=dict(type='ReLU6'),
+        init_cfg=dict(type='Normal', layer='Conv2d', std=0.001),
+
+        # set anchor size manually instead of using the predefined
+        # SSD300 setting.
+        anchor_generator=dict(
+            type='SSDAnchorGenerator',
+            scale_major=False,
+            strides=[16, 32, 64, 107, 160, 320],
+            ratios=[[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]],
+            min_sizes=[48, 100, 150, 202, 253, 304],
+            max_sizes=[100, 150, 202, 253, 304, 320]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2])),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.5,
+            min_pos_iou=0.,
+            ignore_iof_thr=-1,
+            gt_max_assign_all=False),
+        smoothl1_beta=1.,
+        allowed_border=-1,
+        pos_weight=-1,
+        neg_pos_ratio=3,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        nms=dict(type='nms', iou_threshold=0.45),
+        min_bbox_size=0,
+        score_thr=0.02,
+        max_per_img=200))
+cudnn_benchmark = True
+
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', img_scale=(320, 320), keep_ratio=False),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=320),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(320, 320),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=False),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=320),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=24,
+    workers_per_gpu=4,
+    train=dict(
+        _delete_=True,
+        type='RepeatDataset',  # use RepeatDataset to speed up training
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root + 'annotations/instances_train2017.json',
+            img_prefix=data_root + 'train2017/',
+            pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+
+# optimizer; lr=0.015 corresponds to base_batch_size=192 (see auto_scale_lr)
+optimizer = dict(type='SGD', lr=0.015, momentum=0.9, weight_decay=4.0e-5)
+optimizer_config = dict(grad_clip=None)
+
+# learning policy (120 runner epochs x RepeatDataset times=5 = 600 epochs)
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    min_lr=0)
+runner = dict(type='EpochBasedRunner', max_epochs=120)
+
+# Evaluate and save checkpoints only every 5 runner epochs
+evaluation = dict(interval=5, metric='bbox')
+checkpoint_config = dict(interval=5)
+custom_hooks = [
+    dict(type='NumClassCheckHook'),
+    dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW')
+]
+
+# NOTE: `auto_scale_lr` is used for automatically scaling the LR;
+# users should not change its values.
+# base_batch_size = (8 GPUs) x (24 samples per GPU)
+auto_scale_lr = dict(base_batch_size=192)
diff --git a/configs/strong_baselines/README.md b/configs/strong_baselines/README.md
new file mode 100755
index 0000000..aa2550d
--- /dev/null
+++ b/configs/strong_baselines/README.md
@@ -0,0 +1,20 @@
+# Strong Baselines
+
+<!-- [OTHERS] -->
+
+We train Mask R-CNN with large-scale jittering and a longer schedule as strong baselines.
+The modifications follow those in [Detectron2](https://github.com/facebookresearch/detectron2/tree/master/configs/new_baselines).
+
+## Results and Models
+
+| Backbone |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP |                                  Config                                   |         Download         |
+| :------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-----------------------------------------------------------------------: | :----------------------: |
+| R-50-FPN | pytorch |   50e   |          |                |        |         |    [config](./mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_50e_coco.py)     | [model](<>) \| [log](<>) |
+| R-50-FPN | pytorch |  100e   |          |                |        |         |    [config](./mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py)    | [model](<>) \| [log](<>) |
+| R-50-FPN |  caffe  |  100e   |          |                |  44.7  |  40.4   | [config](./mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py) | [model](<>) \| [log](<>) |
+| R-50-FPN |  caffe  |  400e   |          |                |        |         | [config](./mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_400e_coco.py) | [model](<>) \| [log](<>) |
+
+## Notice
+
+With large-scale jittering, the box and mask heads sometimes receive empty proposals during training.
+Handling this requires MMSyncBN, which accepts empty tensors, so please use mmcv-full>=1.3.14 to train the models in this directory.
diff --git a/configs/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py b/configs/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py
new file mode 100755
index 0000000..a40d6a0
--- /dev/null
+++ b/configs/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py
@@ -0,0 +1,80 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../common/lsj_100e_coco_instance.py'
+]
+
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+# The heads use MMSyncBN because it handles empty tensors. It can be changed
+# to SyncBN once https://github.com/pytorch/pytorch/issues/36530 is fixed.
+# Needs MMCV-full with https://github.com/open-mmlab/mmcv/pull/1205 merged.
+head_norm_cfg = dict(type='MMSyncBN', requires_grad=True)
+model = dict(
+    backbone=dict(
+        frozen_stages=-1,
+        norm_eval=False,
+        norm_cfg=norm_cfg,
+        init_cfg=None,
+        style='caffe'),
+    neck=dict(norm_cfg=norm_cfg),
+    rpn_head=dict(num_convs=2),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=head_norm_cfg),
+        mask_head=dict(norm_cfg=head_norm_cfg)))
+
+file_client_args = dict(backend='disk')
+# file_client_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+image_size = (1024, 1024)
+train_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=image_size,
+        ratio_range=(0.1, 2.0),
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_type='absolute_range',
+        crop_size=image_size,
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=image_size),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+# `train` is a RepeatDataset wrapper: override the wrapped dataset's pipeline
+data = dict(
+    train=dict(dataset=dict(pipeline=train_pipeline)),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_fp16_coco.py b/configs/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_fp16_coco.py
new file mode 100755
index 0000000..31824eb
--- /dev/null
+++ b/configs/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_fp16_coco.py
@@ -0,0 +1,2 @@
+_base_ = 'mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py'
+fp16 = dict(loss_scale=512.)  # use FP16 with a loss scale of 512
diff --git a/configs/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_400e_coco.py b/configs/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_400e_coco.py
new file mode 100755
index 0000000..1211925
--- /dev/null
+++ b/configs/strong_baselines/mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_400e_coco.py
@@ -0,0 +1,6 @@
+_base_ = './mask_rcnn_r50_caffe_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py'
+
+# The train set is a RepeatDataset (used to speed up training).
+# Raise its repeat count from 4 (100 epochs) to 16 (400 epochs).
+data = dict(train=dict(times=4 * 4))
+lr_config = dict(warmup_iters=500 * 4)  # warmup scaled by the same factor of 4
diff --git a/configs/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py b/configs/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py
new file mode 100755
index 0000000..4a15d69
--- /dev/null
+++ b/configs/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py
@@ -0,0 +1,22 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../common/lsj_100e_coco_instance.py'
+]
+
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+# The heads use MMSyncBN because it handles empty tensors. It can be changed
+# to SyncBN once https://github.com/pytorch/pytorch/issues/36530 is fixed.
+# Needs MMCV-full with https://github.com/open-mmlab/mmcv/pull/1205 merged.
+head_norm_cfg = dict(type='MMSyncBN', requires_grad=True)
+model = dict(
+    # the model is trained from scratch, so init_cfg is None
+    backbone=dict(
+        frozen_stages=-1, norm_eval=False, norm_cfg=norm_cfg, init_cfg=None),
+    neck=dict(norm_cfg=norm_cfg),
+    rpn_head=dict(num_convs=2),  # adds 0.1+ mAP
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=head_norm_cfg),
+        mask_head=dict(norm_cfg=head_norm_cfg)))
diff --git a/configs/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_fp16_coco.py b/configs/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_fp16_coco.py
new file mode 100755
index 0000000..7b97960
--- /dev/null
+++ b/configs/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_fp16_coco.py
@@ -0,0 +1,3 @@
+_base_ = 'mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py'
+# use FP16 with a loss scale of 512
+fp16 = dict(loss_scale=512.)
diff --git a/configs/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_50e_coco.py b/configs/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_50e_coco.py
new file mode 100755
index 0000000..922579a
--- /dev/null
+++ b/configs/strong_baselines/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_50e_coco.py
@@ -0,0 +1,5 @@
+_base_ = 'mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_lsj_100e_coco.py'
+
+# The train set is a RepeatDataset (used to speed up training).
+# Halve its repeat count from 4 (100 epochs) to 2 (50 epochs).
+data = dict(train=dict(times=2))
diff --git a/configs/swin/README.md b/configs/swin/README.md
new file mode 100755
index 0000000..2136134
--- /dev/null
+++ b/configs/swin/README.md
@@ -0,0 +1,41 @@
+# Swin
+
+> [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030)
+
+<!-- [BACKBONE] -->
+
+## Abstract
+
+This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains, such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text. To address these differences, we propose a hierarchical Transformer whose representation is computed with Shifted windows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping local windows while also allowing for cross-window connection. This hierarchical architecture has the flexibility to model at various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it compatible with a broad range of vision tasks, including image classification (87.3 top-1 accuracy on ImageNet-1K) and dense prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO test-dev) and semantic segmentation (53.5 mIoU on ADE20K val). Its performance surpasses the previous state-of-the-art by a large margin of +2.7 box AP and +2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. The hierarchical design and the shifted window approach also prove beneficial for all-MLP architectures.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143999551-6a527048-de38-485c-a1b6-3133ffa5bfaa.png"/>
+</div>
+
+## Results and Models
+
+### Mask R-CNN
+
+| Backbone |  Pretrain   | Lr schd | Multi-scale crop | FP16 | Mem (GB) | Inf time (fps) | box AP | mask AP |                             Config                             |                                                                                                                                                                                      Download                                                                                                                                                                                       |
+| :------: | :---------: | :-----: | :--------------: | :--: | :------: | :------------: | :----: | :-----: | :------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  Swin-T  | ImageNet-1K |   1x    |        no        |  no  |   7.6    |                |  42.7  |  39.3   |       [config](./mask_rcnn_swin-t-p4-w7_fpn_1x_coco.py)        |                           [model](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco/mask_rcnn_swin-t-p4-w7_fpn_1x_coco_20210902_120937-9d6b7cfa.pth)  \| [log](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco/mask_rcnn_swin-t-p4-w7_fpn_1x_coco_20210902_120937.log.json)                           |
+|  Swin-T  | ImageNet-1K |   3x    |       yes        |  no  |   10.2   |                |  46.0  |  41.6   |   [config](./mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py)    |           [model](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco_20210906_131725-bacf6f7b.pth)  \| [log](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco_20210906_131725.log.json)           |
+|  Swin-T  | ImageNet-1K |   3x    |       yes        | yes  |   7.8    |                |  46.0  |  41.7   | [config](./mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco_20210908_165006-90a4008c.pth)  \| [log](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco_20210908_165006.log.json) |
+|  Swin-S  | ImageNet-1K |   3x    |       yes        | yes  |   11.9   |                |  48.2  |  43.2   | [config](./mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco_20210903_104808-b92c91f1.pth)  \| [log](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco_20210903_104808.log.json) |
+
+### Notice
+
+Please follow the example
+of `retinanet_swin-t-p4-w7_fpn_1x_coco.py` when you want to combine Swin Transformer with
+a one-stage detector. Because Swin Transformer applies a layer norm to its outputs, `start_level` must be set to 0 in FPN, so the backbone's `out_indices` has to be set to `[1,2,3]`.
+
+## Citation
+
+```latex
+@article{liu2021Swin,
+    title={Swin Transformer: Hierarchical Vision Transformer using Shifted Windows},
+    author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},
+    journal={arXiv preprint arXiv:2103.14030},
+    year={2021}
+}
+```
diff --git a/configs/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco.py b/configs/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco.py
new file mode 100755
index 0000000..15d50a0
--- /dev/null
+++ b/configs/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco.py'
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth'  # noqa
+model = dict(
+    backbone=dict(
+        depths=[2, 2, 18, 2],  # Swin-S: third entry 18 (Swin-T base: 6)
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)))
diff --git a/configs/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco.py b/configs/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco.py
new file mode 100755
index 0000000..337e858
--- /dev/null
+++ b/configs/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco.py
@@ -0,0 +1,42 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'  # noqa
+model = dict(
+    type='MaskRCNN',
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=dict(in_channels=[96, 192, 384, 768]))
+
+optimizer = dict(
+    _delete_=True,  # replace the inherited optimizer instead of merging
+    type='AdamW',
+    lr=0.0001,
+    betas=(0.9, 0.999),
+    weight_decay=0.05,
+    paramwise_cfg=dict(  # decay_mult=0. turns weight decay off for these
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'relative_position_bias_table': dict(decay_mult=0.),
+            'norm': dict(decay_mult=0.)
+        }))
+lr_config = dict(warmup_iters=1000, step=[8, 11])
+runner = dict(max_epochs=12)  # 1x schedule
diff --git a/configs/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco.py b/configs/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco.py
new file mode 100755
index 0000000..2be3114
--- /dev/null
+++ b/configs/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco.py
@@ -0,0 +1,3 @@
+_base_ = './mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py'
+# NOTE: with pytorch<=1.5.0 you also need to set mode='dynamic'
+fp16 = dict(loss_scale=dict(init_scale=512))
diff --git a/configs/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py b/configs/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py
new file mode 100755
index 0000000..2612f6e
--- /dev/null
+++ b/configs/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py
@@ -0,0 +1,91 @@
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'  # noqa
+
+model = dict(
+    type='MaskRCNN',
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=dict(in_channels=[96, 192, 384, 768]))
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# augmentation strategy originates from DETR / Sparse RCNN
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='AutoAugment',
+        policies=[[
+            dict(
+                type='Resize',
+                img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                           (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                           (736, 1333), (768, 1333), (800, 1333)],
+                multiscale_mode='value',
+                keep_ratio=True)
+        ],
+                  [
+                      dict(
+                          type='Resize',
+                          img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+                          multiscale_mode='value',
+                          keep_ratio=True),
+                      dict(
+                          type='RandomCrop',
+                          crop_type='absolute_range',
+                          crop_size=(384, 600),
+                          allow_negative_crop=True),
+                      dict(
+                          type='Resize',
+                          img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                     (576, 1333), (608, 1333), (640, 1333),
+                                     (672, 1333), (704, 1333), (736, 1333),
+                                     (768, 1333), (800, 1333)],
+                          multiscale_mode='value',
+                          override=True,
+                          keep_ratio=True)
+                  ]]),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+data = dict(train=dict(pipeline=train_pipeline))
+
+optimizer = dict(
+    _delete_=True,  # replace the inherited optimizer instead of merging
+    type='AdamW',
+    lr=0.0001,
+    betas=(0.9, 0.999),
+    weight_decay=0.05,
+    paramwise_cfg=dict(  # decay_mult=0. turns weight decay off for these
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'relative_position_bias_table': dict(decay_mult=0.),
+            'norm': dict(decay_mult=0.)
+        }))
+lr_config = dict(warmup_iters=1000, step=[27, 33])
+runner = dict(max_epochs=36)  # 3x schedule
diff --git a/configs/swin/metafile.yml b/configs/swin/metafile.yml
new file mode 100755
index 0000000..6c07f17
--- /dev/null
+++ b/configs/swin/metafile.yml
@@ -0,0 +1,120 @@
+Models:
+  - Name: mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco.py
+    Metadata:
+      Training Memory (GB): 11.9
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Swin Transformer
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 48.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 43.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco_20210903_104808-b92c91f1.pth
+    Paper:
+      URL: https://arxiv.org/abs/2103.14030
+      Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows'
+    README: configs/swin/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465
+      Version: v2.16.0
+
+  - Name: mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py
+    Metadata:
+      Training Memory (GB): 10.2
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Swin Transformer
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco_20210906_131725-bacf6f7b.pth
+    Paper:
+      URL: https://arxiv.org/abs/2103.14030
+      Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows'
+    README: configs/swin/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465
+      Version: v2.16.0
+
+  - Name: mask_rcnn_swin-t-p4-w7_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Swin Transformer
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco/mask_rcnn_swin-t-p4-w7_fpn_1x_coco_20210902_120937-9d6b7cfa.pth
+    Paper:
+      URL: https://arxiv.org/abs/2103.14030
+      Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows'
+    README: configs/swin/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465
+      Version: v2.16.0
+
+  - Name: mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Swin Transformer
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco_20210908_165006-90a4008c.pth
+    Paper:
+      URL: https://arxiv.org/abs/2103.14030
+      Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows'
+    README: configs/swin/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465
+      Version: v2.16.0
diff --git a/configs/swin/retinanet_swin-t-p4-w7_fpn_1x_coco.py b/configs/swin/retinanet_swin-t-p4-w7_fpn_1x_coco.py
new file mode 100755
index 0000000..3315093
--- /dev/null
+++ b/configs/swin/retinanet_swin-t-p4-w7_fpn_1x_coco.py
@@ -0,0 +1,30 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'  # noqa
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(1, 2, 3),
+        # Please only add indices that would be used
+        # in FPN, otherwise some parameters will not be used
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=dict(in_channels=[192, 384, 768], start_level=0, num_outs=5))
+
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/timm_example/README.md b/configs/timm_example/README.md
new file mode 100755
index 0000000..4374855
--- /dev/null
+++ b/configs/timm_example/README.md
@@ -0,0 +1,62 @@
+# Timm Example
+
+> [PyTorch Image Models](https://github.com/rwightman/pytorch-image-models)
+
+<!-- [OTHERS] -->
+
+## Abstract
+
+Py**T**orch **Im**age **M**odels (`timm`) is a collection of image models, layers, utilities, optimizers, schedulers, data-loaders / augmentations, and reference training / validation scripts that aim to pull together a wide variety of SOTA models with the ability to reproduce ImageNet training results.
+
+<!--
+<div align=center>
+<img src="" height="400" />
+</div>
+-->
+
+## Results and Models
+
+### RetinaNet
+
+|    Backbone     |  Style  | Lr schd | Mem (GB) | Inf time (fps) | box AP |                          Config                           | Download |
+| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-------------------------------------------------------: | :------: |
+|      R-50       | pytorch |   1x    |          |                |        |   [config](./retinanet_timm_tv_resnet50_fpn_1x_coco.py)   |          |
+| EfficientNet-B1 |    -    |   1x    |          |                |        | [config](./retinanet_timm_efficientnet_b1_fpn_1x_coco.py) |          |
+
+## Usage
+
+### Install additional requirements
+
+MMDetection supports timm backbones via `TIMMBackbone`, a wrapper class in MMClassification.
+Thus, you need to install `mmcls` in addition to timm.
+If you have already installed requirements for mmdet, run
+
+```shell
+pip install 'dataclasses; python_version<"3.7"'
+pip install timm
+pip install 'mmcls>=0.20.0'
+```
+
+See [this document](https://mmclassification.readthedocs.io/en/latest/install.html) for the details of MMClassification installation.
+
+### Edit config
+
+- See example configs for basic usage.
+- See the documents of [timm feature extraction](https://rwightman.github.io/pytorch-image-models/feature_extraction/#multi-scale-feature-maps-feature-pyramid) and [TIMMBackbone](https://mmclassification.readthedocs.io/en/latest/api.html#mmcls.models.backbones.TIMMBackbone) for details.
+- Which feature maps are output depends on the backbone.
+  Please check `backbone out_channels` and `backbone out_strides` in your log, and modify `model.neck.in_channels` and `model.backbone.out_indices` if necessary.
+- If you use Vision Transformer models that do not support `features_only=True`, add `custom_hooks = []` to your config to disable `NumClassCheckHook`.
+
+## Citation
+
+```latex
+@misc{rw2019timm,
+  author = {Ross Wightman},
+  title = {PyTorch Image Models},
+  year = {2019},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  doi = {10.5281/zenodo.4414861},
+  howpublished = {\url{https://github.com/rwightman/pytorch-image-models}}
+}
+```
diff --git a/configs/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py b/configs/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py
new file mode 100755
index 0000000..6500116
--- /dev/null
+++ b/configs/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py
@@ -0,0 +1,20 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# please install mmcls>=0.20.0
+# import mmcls.models to trigger register_module in mmcls
+custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False)
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='mmcls.TIMMBackbone',
+        model_name='efficientnet_b1',
+        features_only=True,
+        pretrained=True,
+        out_indices=(1, 2, 3, 4)),
+    neck=dict(in_channels=[24, 40, 112, 320]))
+
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/timm_example/retinanet_timm_tv_resnet50_fpn_1x_coco.py b/configs/timm_example/retinanet_timm_tv_resnet50_fpn_1x_coco.py
new file mode 100755
index 0000000..0c5b7a8
--- /dev/null
+++ b/configs/timm_example/retinanet_timm_tv_resnet50_fpn_1x_coco.py
@@ -0,0 +1,19 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# please install mmcls>=0.20.0
+# import mmcls.models to trigger register_module in mmcls
+custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False)
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='mmcls.TIMMBackbone',
+        model_name='tv_resnet50',  # ResNet-50 with torchvision weights
+        features_only=True,
+        pretrained=True,
+        out_indices=(1, 2, 3, 4)))
+
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
diff --git a/configs/tood/README.md b/configs/tood/README.md
new file mode 100755
index 0000000..925f0ed
--- /dev/null
+++ b/configs/tood/README.md
@@ -0,0 +1,40 @@
+# TOOD
+
+> [TOOD: Task-aligned One-stage Object Detection](https://arxiv.org/abs/2108.07755)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+One-stage object detection is commonly implemented by optimizing two sub-tasks: object classification and localization, using heads with two parallel branches, which might lead to a certain level of spatial misalignment in predictions between the two tasks. In this work, we propose a Task-aligned One-stage Object Detection (TOOD) that explicitly aligns the two tasks in a learning-based manner. First, we design a novel Task-aligned Head (T-Head) which offers a better balance between learning task-interactive and task-specific features, as well as a greater flexibility to learn the alignment via a task-aligned predictor. Second, we propose Task Alignment Learning (TAL) to explicitly pull closer (or even unify) the optimal anchors for the two tasks during training via a designed sample assignment scheme and a task-aligned loss. Extensive experiments are conducted on MS-COCO, where TOOD achieves a 51.1 AP at single-model single-scale testing. This surpasses the recent one-stage detectors by a large margin, such as ATSS (47.7 AP), GFL (48.2 AP), and PAA (49.0 AP), with fewer parameters and FLOPs. Qualitative results also demonstrate the effectiveness of TOOD for better aligning the tasks of object classification and localization.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/12907710/145400075-e08191f5-8afa-4335-9b3b-27926fc9a26e.png"/>
+</div>
+
+## Results and Models
+
+|     Backbone      |  Style  | Anchor Type  | Lr schd | Multi-scale Training | Mem (GB) | Inf time (fps) | box AP |                             Config                             |                                                                                                                                                                       Download                                                                                                                                                                        |
+| :---------------: | :-----: | :----------: | :-----: | :------------------: | :------: | :------------: | :----: | :------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|       R-50        | pytorch | Anchor-free  |   1x    |          N           |   4.1    |                |  42.4  |              [config](./tood_r50_fpn_1x_coco.py)               |                                           [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_1x_coco/tood_r50_fpn_1x_coco_20211210_103425-20e20746.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_1x_coco/tood_r50_fpn_1x_coco_20211210_103425.log)                                           |
+|       R-50        | pytorch | Anchor-based |   1x    |          N           |   4.1    |                |  42.4  |        [config](./tood_r50_fpn_anchor_based_1x_coco.py)        |                 [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_anchor_based_1x_coco/tood_r50_fpn_anchor_based_1x_coco_20211214_100105-b776c134.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_anchor_based_1x_coco/tood_r50_fpn_anchor_based_1x_coco_20211214_100105.log)                 |
+|       R-50        | pytorch | Anchor-free  |   2x    |          Y           |   4.1    |                |  44.5  |          [config](./tood_r50_fpn_mstrain_2x_coco.py)           |                           [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_mstrain_2x_coco/tood_r50_fpn_mstrain_2x_coco_20211210_144231-3b23174c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_mstrain_2x_coco/tood_r50_fpn_mstrain_2x_coco_20211210_144231.log)                           |
+|       R-101       | pytorch | Anchor-free  |   2x    |          Y           |   6.0    |                |  46.1  |          [config](./tood_r101_fpn_mstrain_2x_coco.py)          |                         [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_mstrain_2x_coco/tood_r101_fpn_mstrain_2x_coco_20211210_144232-a18f53c8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_mstrain_2x_coco/tood_r101_fpn_mstrain_2x_coco_20211210_144232.log)                         |
+|    R-101-dcnv2    | pytorch | Anchor-free  |   2x    |          Y           |   6.2    |                |  49.3  |    [config](./tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py)    | [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20211210_213728-4a824142.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20211210_213728.log) |
+|    X-101-64x4d    | pytorch | Anchor-free  |   2x    |          Y           |   10.2   |                |  47.6  |       [config](./tood_x101_64x4d_fpn_mstrain_2x_coco.py)       |             [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_x101_64x4d_fpn_mstrain_2x_coco/tood_x101_64x4d_fpn_mstrain_2x_coco_20211211_003519-a4f36113.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_x101_64x4d_fpn_mstrain_2x_coco/tood_x101_64x4d_fpn_mstrain_2x_coco_20211211_003519.log)             |
+| X-101-64x4d-dcnv2 | pytorch | Anchor-free  |   2x    |          Y           |          |                |        | [config](./tood_x101_64x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py) |                                                                                                                                                               [model](<>) \| [log](<>)                                                                                                                                                                |
+
+\[1\] *1x and 2x mean the model is trained for 90K and 180K iterations, respectively.* \
+\[2\] *All results are obtained with a single model and without any test-time data augmentation such as multi-scale testing, flipping, etc.* \
+\[3\] *`dcnv2` denotes deformable convolutional networks v2.*
+
+## Citation
+
+```latex
+@inproceedings{feng2021tood,
+    title={TOOD: Task-aligned One-stage Object Detection},
+    author={Feng, Chengjian and Zhong, Yujie and Gao, Yu and Scott, Matthew R and Huang, Weilin},
+    booktitle={ICCV},
+    year={2021}
+}
+```
diff --git a/configs/tood/metafile.yml b/configs/tood/metafile.yml
new file mode 100755
index 0000000..27a0f8d
--- /dev/null
+++ b/configs/tood/metafile.yml
@@ -0,0 +1,95 @@
+Collections:
+  - Name: TOOD
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - TOOD
+    Paper:
+      URL: https://arxiv.org/abs/2108.07755
+      Title: 'TOOD: Task-aligned One-stage Object Detection'
+    README: configs/tood/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.20.0/mmdet/models/detectors/tood.py#L7
+      Version: v2.20.0
+
+Models:
+  - Name: tood_r101_fpn_mstrain_2x_coco
+    In Collection: TOOD
+    Config: configs/tood/tood_r101_fpn_mstrain_2x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_mstrain_2x_coco/tood_r101_fpn_mstrain_2x_coco_20211210_144232-a18f53c8.pth
+
+  - Name: tood_x101_64x4d_fpn_mstrain_2x_coco
+    In Collection: TOOD
+    Config: configs/tood/tood_x101_64x4d_fpn_mstrain_2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.2
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_x101_64x4d_fpn_mstrain_2x_coco/tood_x101_64x4d_fpn_mstrain_2x_coco_20211211_003519-a4f36113.pth
+
+  - Name: tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco
+    In Collection: TOOD
+    Config: configs/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py
+    Metadata:
+      Training Memory (GB): 6.2
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 49.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20211210_213728-4a824142.pth
+
+  - Name: tood_r50_fpn_anchor_based_1x_coco
+    In Collection: TOOD
+    Config: configs/tood/tood_r50_fpn_anchor_based_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.1
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_anchor_based_1x_coco/tood_r50_fpn_anchor_based_1x_coco_20211214_100105-b776c134.pth
+
+  - Name: tood_r50_fpn_1x_coco
+    In Collection: TOOD
+    Config: configs/tood/tood_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.1
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_1x_coco/tood_r50_fpn_1x_coco_20211210_103425-20e20746.pth
+
+  - Name: tood_r50_fpn_mstrain_2x_coco
+    In Collection: TOOD
+    Config: configs/tood/tood_r50_fpn_mstrain_2x_coco.py
+    Metadata:
+      Training Memory (GB): 4.1
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_mstrain_2x_coco/tood_r50_fpn_mstrain_2x_coco_20211210_144231-3b23174c.pth
diff --git a/configs/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py b/configs/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py
new file mode 100755
index 0000000..c7f1bbc
--- /dev/null
+++ b/configs/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './tood_r101_fpn_mstrain_2x_coco.py'
+
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)),
+    bbox_head=dict(num_dcn=2))
diff --git a/configs/tood/tood_r101_fpn_mstrain_2x_coco.py b/configs/tood/tood_r101_fpn_mstrain_2x_coco.py
new file mode 100755
index 0000000..d9d2c32
--- /dev/null
+++ b/configs/tood/tood_r101_fpn_mstrain_2x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './tood_r50_fpn_mstrain_2x_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/tood/tood_r50_fpn_1x_coco.py b/configs/tood/tood_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..35a77a4
--- /dev/null
+++ b/configs/tood/tood_r50_fpn_1x_coco.py
@@ -0,0 +1,74 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='TOOD',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='TOODHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=6,
+        feat_channels=256,
+        anchor_type='anchor_free',
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        initial_loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            activated=True,  # use probability instead of logit as input
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_cls=dict(
+            type='QualityFocalLoss',
+            use_sigmoid=True,
+            activated=True,  # use probability instead of logit as input
+            beta=2.0,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0)),
+    train_cfg=dict(
+        initial_epoch=4,
+        initial_assigner=dict(type='ATSSAssigner', topk=9),
+        assigner=dict(type='TaskAlignedAssigner', topk=13),
+        alpha=1,
+        beta=6,
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+
+# custom hooks
+custom_hooks = [dict(type='SetEpochInfoHook')]
diff --git a/configs/tood/tood_r50_fpn_anchor_based_1x_coco.py b/configs/tood/tood_r50_fpn_anchor_based_1x_coco.py
new file mode 100755
index 0000000..c7fbf6a
--- /dev/null
+++ b/configs/tood/tood_r50_fpn_anchor_based_1x_coco.py
@@ -0,0 +1,2 @@
+_base_ = './tood_r50_fpn_1x_coco.py'
+model = dict(bbox_head=dict(anchor_type='anchor_based'))
diff --git a/configs/tood/tood_r50_fpn_mstrain_2x_coco.py b/configs/tood/tood_r50_fpn_mstrain_2x_coco.py
new file mode 100755
index 0000000..157d13a
--- /dev/null
+++ b/configs/tood/tood_r50_fpn_mstrain_2x_coco.py
@@ -0,0 +1,22 @@
+_base_ = './tood_r50_fpn_1x_coco.py'
+# learning policy
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+# multi-scale training
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 480), (1333, 800)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+data = dict(train=dict(pipeline=train_pipeline))
diff --git a/configs/tood/tood_x101_64x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py b/configs/tood/tood_x101_64x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py
new file mode 100755
index 0000000..47c9269
--- /dev/null
+++ b/configs/tood/tood_x101_64x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './tood_x101_64x4d_fpn_mstrain_2x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, False, True, True),
+    ),
+    bbox_head=dict(num_dcn=2))
diff --git a/configs/tood/tood_x101_64x4d_fpn_mstrain_2x_coco.py b/configs/tood/tood_x101_64x4d_fpn_mstrain_2x_coco.py
new file mode 100755
index 0000000..842f320
--- /dev/null
+++ b/configs/tood/tood_x101_64x4d_fpn_mstrain_2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './tood_r50_fpn_mstrain_2x_coco.py'
+
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/tridentnet/README.md b/configs/tridentnet/README.md
new file mode 100755
index 0000000..b972b3a
--- /dev/null
+++ b/configs/tridentnet/README.md
@@ -0,0 +1,38 @@
+# TridentNet
+
+> [Scale-Aware Trident Networks for Object Detection](https://arxiv.org/abs/1901.01892)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Scale variation is one of the key challenges in object detection. In this work, we first present a controlled experiment to investigate the effect of receptive fields for scale variation in object detection. Based on the findings from the exploration experiments, we propose a novel Trident Network (TridentNet) aiming to generate scale-specific feature maps with a uniform representational power. We construct a parallel multi-branch architecture in which each branch shares the same transformation parameters but with different receptive fields. Then, we adopt a scale-aware training scheme to specialize each branch by sampling object instances of proper scales for training. As a bonus, a fast approximation version of TridentNet could achieve significant improvements without any additional parameters and computational cost compared with the vanilla detector. On the COCO dataset, our TridentNet with ResNet-101 backbone achieves state-of-the-art single-model results of 48.4 mAP.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143999668-0927922e-efc2-45fa-8bfc-1e3df18720f5.png"/>
+</div>
+
+## Results and Models
+
+We report the test results using only one branch for inference.
+
+| Backbone | Style | mstrain | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                                                                                                                                      Download                                                                                                                                                                      |
+| :------: | :---: | :-----: | :-----: | :------: | :------------: | :----: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|   R-50   | caffe |    N    |   1x    |          |                |  37.7  |                 [model](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_1x_coco/tridentnet_r50_caffe_1x_coco_20201230_141838-2ec0b530.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_1x_coco/tridentnet_r50_caffe_1x_coco_20201230_141838.log.json)                 |
+|   R-50   | caffe |    Y    |   1x    |          |                |  37.6  | [model](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco/tridentnet_r50_caffe_mstrain_1x_coco_20201230_141839-6ce55ccb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco/tridentnet_r50_caffe_mstrain_1x_coco_20201230_141839.log.json) |
+|   R-50   | caffe |    Y    |   3x    |          |                |  40.3  | [model](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco/tridentnet_r50_caffe_mstrain_3x_coco_20201130_100539-46d227ba.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco/tridentnet_r50_caffe_mstrain_3x_coco_20201130_100539.log.json) |
+
+**Note**
+
+Similar to [Detectron2](https://github.com/facebookresearch/detectron2/tree/master/projects/TridentNet), we haven't implemented the Scale-aware Training Scheme in section 4.2 of the paper.
+
+## Citation
+
+```latex
+@InProceedings{li2019scale,
+  title={Scale-Aware Trident Networks for Object Detection},
+  author={Li, Yanghao and Chen, Yuntao and Wang, Naiyan and Zhang, Zhaoxiang},
+  journal={The International Conference on Computer Vision (ICCV)},
+  year={2019}
+}
+```
diff --git a/configs/tridentnet/metafile.yml b/configs/tridentnet/metafile.yml
new file mode 100755
index 0000000..2536f97
--- /dev/null
+++ b/configs/tridentnet/metafile.yml
@@ -0,0 +1,55 @@
+Collections:
+  - Name: TridentNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+        - TridentNet Block
+    Paper:
+      URL: https://arxiv.org/abs/1901.01892
+      Title: 'Scale-Aware Trident Networks for Object Detection'
+    README: configs/tridentnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.8.0/mmdet/models/detectors/trident_faster_rcnn.py#L6
+      Version: v2.8.0
+
+Models:
+  - Name: tridentnet_r50_caffe_1x_coco
+    In Collection: TridentNet
+    Config: configs/tridentnet/tridentnet_r50_caffe_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_1x_coco/tridentnet_r50_caffe_1x_coco_20201230_141838-2ec0b530.pth
+
+  - Name: tridentnet_r50_caffe_mstrain_1x_coco
+    In Collection: TridentNet
+    Config: configs/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco/tridentnet_r50_caffe_mstrain_1x_coco_20201230_141839-6ce55ccb.pth
+
+  - Name: tridentnet_r50_caffe_mstrain_3x_coco
+    In Collection: TridentNet
+    Config: configs/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco/tridentnet_r50_caffe_mstrain_3x_coco_20201130_100539-46d227ba.pth
diff --git a/configs/tridentnet/tridentnet_r50_caffe_1x_coco.py b/configs/tridentnet/tridentnet_r50_caffe_1x_coco.py
new file mode 100755
index 0000000..d779f75
--- /dev/null
+++ b/configs/tridentnet/tridentnet_r50_caffe_1x_coco.py
@@ -0,0 +1,55 @@
+_base_ = [  # model, dataset, schedule and runtime base configs
+    '../_base_/models/faster_rcnn_r50_caffe_c4.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    type='TridentFasterRCNN',
+    backbone=dict(
+        type='TridentResNet',
+        trident_dilations=(1, 2, 3),  # three values, matching num_branch=3
+        num_branch=3,
+        test_branch_idx=1,  # same value is passed to roi_head below
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    roi_head=dict(type='TridentRoIHead', num_branch=3, test_branch_idx=1),
+    train_cfg=dict(
+        rpn_proposal=dict(max_per_img=500),
+        rcnn=dict(
+            sampler=dict(num=128, pos_fraction=0.5,
+                         add_gt_as_proposals=False))))
+
+# use caffe img_norm: mean subtraction only (std=1.0), with to_rgb=False
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [  # single-scale training at (1333, 800) with random flip
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+test_pipeline = [  # single-scale testing at (1333, 800)
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,  # flip augmentation disabled at test time
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img'])
+        ])
+]
+data = dict(  # val and test share test_pipeline
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco.py b/configs/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco.py
new file mode 100755
index 0000000..c73d9ea
--- /dev/null
+++ b/configs/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco.py
@@ -0,0 +1,22 @@
+_base_ = 'tridentnet_r50_caffe_1x_coco.py'  # single-scale 1x base
+
+# use caffe img_norm (redefined here so the pipeline below can unpack it)
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [  # same steps as the base, but with a multi-scale Resize
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode='value',  # NOTE(review): presumably picks one scale
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+
+data = dict(train=dict(pipeline=train_pipeline))  # train pipeline only
diff --git a/configs/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco.py b/configs/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco.py
new file mode 100755
index 0000000..0f40282
--- /dev/null
+++ b/configs/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco.py
@@ -0,0 +1,4 @@
+_base_ = 'tridentnet_r50_caffe_mstrain_1x_coco.py'  # multi-scale 1x base
+
+lr_config = dict(step=[28, 34])  # step milestones for the 36-epoch run
+runner = dict(type='EpochBasedRunner', max_epochs=36)  # 3x schedule
diff --git a/configs/vfnet/README.md b/configs/vfnet/README.md
new file mode 100755
index 0000000..a492bec
--- /dev/null
+++ b/configs/vfnet/README.md
@@ -0,0 +1,48 @@
+# VarifocalNet
+
+> [VarifocalNet: An IoU-aware Dense Object Detector](https://arxiv.org/abs/2008.13367)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Accurately ranking the vast number of candidate detections is crucial for dense object detectors to achieve high performance. Prior work uses the classification score or a combination of classification and predicted localization scores to rank candidates. However, neither option results in a reliable ranking, thus degrading detection performance. In this paper, we propose to learn an IoU-aware Classification Score (IACS) as a joint representation of object presence confidence and localization accuracy. We show that dense object detectors can achieve a more accurate ranking of candidate detections based on the IACS. We design a new loss function, named Varifocal Loss, to train a dense object detector to predict the IACS, and propose a new star-shaped bounding box feature representation for IACS prediction and bounding box refinement. Combining these two new components and a bounding box refinement branch, we build an IoU-aware dense object detector based on the FCOS+ATSS architecture, that we call VarifocalNet or VFNet for short. Extensive experiments on MS COCO show that our VFNet consistently surpasses the strong baseline by ∼2.0 AP with different backbones. Our best model VFNet-X-1200 with Res2Net-101-DCN achieves a single-model single-scale AP of 55.1 on COCO test-dev, which is state-of-the-art among various object detectors.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/9102141/97464778-4b9ab000-197c-11eb-9283-ab2907ee0252.png"/>
+</div>
+
+## Introduction
+
+**VarifocalNet (VFNet)** learns to predict the IoU-aware classification score which mixes the object presence confidence and localization accuracy together as the detection score for a bounding box. The learning is supervised by the proposed Varifocal Loss (VFL), based on a new star-shaped bounding box feature representation (the features at nine yellow sampling points). Given the new representation, the object localization accuracy is further improved by refining the initially regressed bounding box. The full paper is available at: [https://arxiv.org/abs/2008.13367](https://arxiv.org/abs/2008.13367).
+
+## Results and Models
+
+|  Backbone   |  Style  | DCN | MS train | Lr schd | Inf time (fps) | box AP (val) | box AP (test-dev) |                                                               Config                                                               |                                                                                                                                                                               Download                                                                                                                                                                               |
+| :---------: | :-----: | :-: | :------: | :-----: | :------------: | :----------: | :---------------: | :--------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    R-50     | pytorch |  N  |    N     |   1x    |       -        |     41.6     |       41.6        |               [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_r50_fpn_1x_coco.py)               |                                                          [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_1x_coco/vfnet_r50_fpn_1x_coco_20201027-38db6f58.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_1x_coco/vfnet_r50_fpn_1x_coco.json)                                                           |
+|    R-50     | pytorch |  N  |    Y     |   2x    |       -        |     44.5     |       44.8        |           [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_r50_fpn_mstrain_2x_coco.py)           |                                          [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mstrain_2x_coco/vfnet_r50_fpn_mstrain_2x_coco_20201027-7cc75bd2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mstrain_2x_coco/vfnet_r50_fpn_mstrain_2x_coco.json)                                           |
+|    R-50     | pytorch |  Y  |    Y     |   2x    |       -        |     47.8     |       48.0        |    [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py)     |               [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-6879c318.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.json)               |
+|    R-101    | pytorch |  N  |    N     |   1x    |       -        |     43.0     |       43.6        |              [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_r101_fpn_1x_coco.py)               |                                                       [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_1x_coco/vfnet_r101_fpn_1x_coco_20201027pth-c831ece7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_1x_coco/vfnet_r101_fpn_1x_coco.json)                                                       |
+|    R-101    | pytorch |  N  |    Y     |   2x    |       -        |     46.2     |       46.7        |          [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_r101_fpn_mstrain_2x_coco.py)           |                                       [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mstrain_2x_coco/vfnet_r101_fpn_mstrain_2x_coco_20201027pth-4a5d53f1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mstrain_2x_coco/vfnet_r101_fpn_mstrain_2x_coco.json)                                       |
+|    R-101    | pytorch |  Y  |    Y     |   2x    |       -        |     49.0     |       49.2        |    [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco.py)    |             [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-7729adb5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco.json)             |
+| X-101-32x4d | pytorch |  Y  |    Y     |   2x    |       -        |     49.7     |       50.0        | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-d300a6fc.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.json) |
+| X-101-64x4d | pytorch |  Y  |    Y     |   2x    |       -        |     50.4     |       50.8        | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-b5f6da5e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.json) |
+
+**Notes:**
+
+- The MS-train scale range is 1333x\[480:960\] (`range` mode), and the inference scale is kept at 1333x800.
+- DCN means using `DCNv2` in both the backbone and the head.
+- Inference time will be updated soon.
+- More results and pre-trained models can be found in the [VarifocalNet GitHub repository](https://github.com/hyz-xmaster/VarifocalNet).
+
+## Citation
+
+```latex
+@article{zhang2020varifocalnet,
+  title={VarifocalNet: An IoU-aware Dense Object Detector},
+  author={Zhang, Haoyang and Wang, Ying and Dayoub, Feras and S{\"u}nderhauf, Niko},
+  journal={arXiv preprint arXiv:2008.13367},
+  year={2020}
+}
+```
diff --git a/configs/vfnet/metafile.yml b/configs/vfnet/metafile.yml
new file mode 100755
index 0000000..bcbe576
--- /dev/null
+++ b/configs/vfnet/metafile.yml
@@ -0,0 +1,116 @@
+Collections:
+  - Name: VFNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+        - Varifocal Loss
+    Paper:
+      URL: https://arxiv.org/abs/2008.13367
+      Title: 'VarifocalNet: An IoU-aware Dense Object Detector'
+    README: configs/vfnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.6.0/mmdet/models/detectors/vfnet.py#L6
+      Version: v2.6.0
+
+Models:
+  - Name: vfnet_r50_fpn_1x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_r50_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_1x_coco/vfnet_r50_fpn_1x_coco_20201027-38db6f58.pth
+
+  - Name: vfnet_r50_fpn_mstrain_2x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_r50_fpn_mstrain_2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mstrain_2x_coco/vfnet_r50_fpn_mstrain_2x_coco_20201027-7cc75bd2.pth
+
+  - Name: vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 48.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-6879c318.pth
+
+  - Name: vfnet_r101_fpn_1x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_r101_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_1x_coco/vfnet_r101_fpn_1x_coco_20201027pth-c831ece7.pth
+
+  - Name: vfnet_r101_fpn_mstrain_2x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_r101_fpn_mstrain_2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mstrain_2x_coco/vfnet_r101_fpn_mstrain_2x_coco_20201027pth-4a5d53f1.pth
+
+  - Name: vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 49.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-7729adb5.pth
+
+  - Name: vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-d300a6fc.pth
+
+  - Name: vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-b5f6da5e.pth
diff --git a/configs/vfnet/vfnet_r101_fpn_1x_coco.py b/configs/vfnet/vfnet_r101_fpn_1x_coco.py
new file mode 100755
index 0000000..b296a07
--- /dev/null
+++ b/configs/vfnet/vfnet_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './vfnet_r50_fpn_1x_coco.py'  # start from the R-50 1x config
+model = dict(
+    backbone=dict(
+        depth=101,  # base config uses depth=50
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/vfnet/vfnet_r101_fpn_2x_coco.py b/configs/vfnet/vfnet_r101_fpn_2x_coco.py
new file mode 100755
index 0000000..27962f3
--- /dev/null
+++ b/configs/vfnet/vfnet_r101_fpn_2x_coco.py
@@ -0,0 +1,8 @@
+_base_ = './vfnet_r50_fpn_1x_coco.py'  # start from the R-50 1x config
+model = dict(
+    backbone=dict(
+        depth=101,  # base config uses depth=50
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
+lr_config = dict(step=[16, 22])  # step milestones for the 24-epoch run
+runner = dict(type='EpochBasedRunner', max_epochs=24)  # 2x schedule
diff --git a/configs/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco.py b/configs/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco.py
new file mode 100755
index 0000000..e438c24
--- /dev/null
+++ b/configs/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco.py
@@ -0,0 +1,15 @@
+_base_ = './vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py'  # R-50 DCN base
+model = dict(  # full backbone spec; only depth/checkpoint differ from base
+    backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/vfnet/vfnet_r101_fpn_mstrain_2x_coco.py b/configs/vfnet/vfnet_r101_fpn_mstrain_2x_coco.py
new file mode 100755
index 0000000..eae69a0
--- /dev/null
+++ b/configs/vfnet/vfnet_r101_fpn_mstrain_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './vfnet_r50_fpn_mstrain_2x_coco.py'  # R-50 multi-scale 2x base
+model = dict(
+    backbone=dict(
+        depth=101,  # the R-50 configs use depth=50
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/vfnet/vfnet_r2_101_fpn_mdconv_c3-c5_mstrain_2x_coco.py b/configs/vfnet/vfnet_r2_101_fpn_mdconv_c3-c5_mstrain_2x_coco.py
new file mode 100755
index 0000000..815a36e
--- /dev/null
+++ b/configs/vfnet/vfnet_r2_101_fpn_mdconv_c3-c5_mstrain_2x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py'  # R-50 DCN base
+model = dict(
+    backbone=dict(
+        type='Res2Net',  # replaces the base ResNet backbone
+        depth=101,
+        scales=4,  # cf. "26w_4s" in the checkpoint name below
+        base_width=26,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://res2net101_v1d_26w_4s')))
diff --git a/configs/vfnet/vfnet_r2_101_fpn_mstrain_2x_coco.py b/configs/vfnet/vfnet_r2_101_fpn_mstrain_2x_coco.py
new file mode 100755
index 0000000..58022e0
--- /dev/null
+++ b/configs/vfnet/vfnet_r2_101_fpn_mstrain_2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './vfnet_r50_fpn_mstrain_2x_coco.py'  # R-50 multi-scale 2x base
+model = dict(
+    backbone=dict(
+        type='Res2Net',  # replaces the base ResNet backbone
+        depth=101,
+        scales=4,  # cf. "26w_4s" in the checkpoint name below
+        base_width=26,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://res2net101_v1d_26w_4s')))
diff --git a/configs/vfnet/vfnet_r50_fpn_1x_coco.py b/configs/vfnet/vfnet_r50_fpn_1x_coco.py
new file mode 100755
index 0000000..7de6429
--- /dev/null
+++ b/configs/vfnet/vfnet_r50_fpn_1x_coco.py
@@ -0,0 +1,107 @@
+_base_ = [  # dataset, schedule and runtime bases; the model is defined here
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    type='VFNet',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',  # use P5
+        num_outs=5,
+        relu_before_extra_convs=True),
+    bbox_head=dict(
+        type='VFNetHead',
+        num_classes=80,
+        in_channels=256,  # equals the neck's out_channels
+        stacked_convs=3,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],  # five entries, matching num_outs=5
+        center_sampling=False,
+        dcn_on_last_conv=False,  # the mdconv variant config sets this True
+        use_atss=True,
+        use_vfl=True,
+        loss_cls=dict(
+            type='VarifocalLoss',
+            use_sigmoid=True,
+            alpha=0.75,
+            gamma=2.0,
+            iou_weighted=True,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.5),
+        loss_bbox_refine=dict(type='GIoULoss', loss_weight=2.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+# data settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [  # single-scale training at (1333, 800) with random flip
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [  # single-scale testing at (1333, 800)
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,  # flip augmentation disabled at test time
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='DefaultFormatBundle'),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # val and test share test_pipeline
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+
+# optimizer (sets lr/paramwise_cfg; rest presumably from schedule_1x base)
+optimizer = dict(
+    lr=0.01, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
+optimizer_config = dict(grad_clip=None)  # no gradient clipping
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.1,
+    step=[8, 11])  # step milestones within the 12-epoch run
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/configs/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py b/configs/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py
new file mode 100755
index 0000000..24d2093
--- /dev/null
+++ b/configs/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './vfnet_r50_fpn_mstrain_2x_coco.py'  # R-50 multi-scale 2x base
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)),  # all but the first stage
+    bbox_head=dict(dcn_on_last_conv=True))  # DCN in the head as well
diff --git a/configs/vfnet/vfnet_r50_fpn_mstrain_2x_coco.py b/configs/vfnet/vfnet_r50_fpn_mstrain_2x_coco.py
new file mode 100755
index 0000000..6078bb9
--- /dev/null
+++ b/configs/vfnet/vfnet_r50_fpn_mstrain_2x_coco.py
@@ -0,0 +1,39 @@
+_base_ = './vfnet_r50_fpn_1x_coco.py'  # single-scale 1x base
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [  # same steps as the base, but with a multi-scale Resize
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 480), (1333, 960)],
+        multiscale_mode='range',  # MS-train scale range 1333x[480:960]
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [  # inference scale stays at (1333, 800)
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,  # flip augmentation disabled at test time
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='DefaultFormatBundle'),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(  # val and test share test_pipeline
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+# learning policy: 2x schedule (24 epochs)
+lr_config = dict(step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/configs/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py b/configs/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py
new file mode 100755
index 0000000..7efa051
--- /dev/null
+++ b/configs/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py'  # R-50 DCN base
+model = dict(
+    backbone=dict(
+        type='ResNeXt',  # replaces the base ResNet backbone
+        depth=101,
+        groups=32,  # the "32x4d" variant: groups=32, base_width=4
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/vfnet/vfnet_x101_32x4d_fpn_mstrain_2x_coco.py b/configs/vfnet/vfnet_x101_32x4d_fpn_mstrain_2x_coco.py
new file mode 100755
index 0000000..49a4312
--- /dev/null
+++ b/configs/vfnet/vfnet_x101_32x4d_fpn_mstrain_2x_coco.py
@@ -0,0 +1,15 @@
+_base_ = './vfnet_r50_fpn_mstrain_2x_coco.py'  # R-50 multi-scale 2x base
+model = dict(
+    backbone=dict(
+        type='ResNeXt',  # replaces the base ResNet backbone
+        depth=101,
+        groups=32,  # the "32x4d" variant: groups=32, base_width=4
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/configs/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py b/configs/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py
new file mode 100755
index 0000000..7e1ee42
--- /dev/null
+++ b/configs/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py'  # R-50 DCN base
+model = dict(
+    backbone=dict(
+        type='ResNeXt',  # replaces the base ResNet backbone
+        depth=101,
+        groups=64,  # the "64x4d" variant: groups=64, base_width=4
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/vfnet/vfnet_x101_64x4d_fpn_mstrain_2x_coco.py b/configs/vfnet/vfnet_x101_64x4d_fpn_mstrain_2x_coco.py
new file mode 100755
index 0000000..e51064e
--- /dev/null
+++ b/configs/vfnet/vfnet_x101_64x4d_fpn_mstrain_2x_coco.py
@@ -0,0 +1,15 @@
+_base_ = './vfnet_r50_fpn_mstrain_2x_coco.py'  # R-50 multi-scale 2x base
+model = dict(
+    backbone=dict(
+        type='ResNeXt',  # replaces the base ResNet backbone
+        depth=101,
+        groups=64,  # the "64x4d" variant: groups=64, base_width=4
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/configs/wider_face/README.md b/configs/wider_face/README.md
new file mode 100755
index 0000000..1904506
--- /dev/null
+++ b/configs/wider_face/README.md
@@ -0,0 +1,57 @@
+# WIDER FACE
+
+> [WIDER FACE: A Face Detection Benchmark](https://arxiv.org/abs/1511.06523)
+
+<!-- [DATASET] -->
+
+## Abstract
+
+Face detection is one of the most studied topics in the computer vision community. Much of the progresses have been made by the availability of face detection benchmark datasets. We show that there is a gap between current face detection performance and the real world requirements. To facilitate future face detection research, we introduce the WIDER FACE dataset, which is 10 times larger than existing datasets. The dataset contains rich annotations, including occlusions, poses, event categories, and face bounding boxes. Faces in the proposed dataset are extremely challenging due to large variations in scale, pose and occlusion, as shown in Fig. 1. Furthermore, we show that WIDER FACE dataset is an effective training source for face detection. We benchmark several representative detection systems, providing an overview of state-of-the-art performance and propose a solution to deal with large scale variation. Finally, we discuss common failure cases that worth to be further investigated.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/144000364-3320de79-34fc-40a6-938f-bb512f05a4bb.png" height="400"/>
+</div>
+
+## Introduction
+
+To use the WIDER FACE dataset, you need to download it
+and extract it to the `data/WIDERFace` folder. Annotations in the VOC format
+can be found in this [repo](https://github.com/sovrasov/wider-face-pascal-voc-annotations.git).
+You should move the annotation files from the `WIDER_train_annotations` and `WIDER_val_annotations` folders
+to the `Annotations` folders inside the corresponding directories `WIDER_train` and `WIDER_val`.
+Also, the annotation lists `val.txt` and `train.txt` should be copied to `data/WIDERFace` from `WIDER_train_annotations` and `WIDER_val_annotations`.
+The directory should be like this:
+
+```
+mmdetection
+├── mmdet
+├── tools
+├── configs
+├── data
+│   ├── WIDERFace
+│   │   ├── WIDER_train
+│   │   │   ├── 0--Parade
+│   │   │   ├── ...
+│   │   │   ├── Annotations
+│   │   ├── WIDER_val
+│   │   │   ├── 0--Parade
+│   │   │   ├── ...
+│   │   │   ├── Annotations
+│   │   ├── val.txt
+│   │   ├── train.txt
+
+```
+
+After that you can train the SSD300 on WIDER by launching training with the `ssd300_wider_face.py` config or
+create your own config based on the presented one.
+
+## Citation
+
+```latex
+@inproceedings{yang2016wider,
+   Author = {Yang, Shuo and Luo, Ping and Loy, Chen Change and Tang, Xiaoou},
+   Booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+   Title = {WIDER FACE: A Face Detection Benchmark},
+   Year = {2016}
+}
+```
diff --git a/configs/wider_face/ssd300_wider_face.py b/configs/wider_face/ssd300_wider_face.py
new file mode 100755
index 0000000..98d820a
--- /dev/null
+++ b/configs/wider_face/ssd300_wider_face.py
@@ -0,0 +1,18 @@
+_base_ = [
+    '../_base_/models/ssd300.py', '../_base_/datasets/wider_face.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(bbox_head=dict(num_classes=1))
+# optimizer
+optimizer = dict(type='SGD', lr=0.012, momentum=0.9, weight_decay=5e-4)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[16, 20])
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+log_config = dict(interval=1)
diff --git a/configs/yolact/README.md b/configs/yolact/README.md
new file mode 100755
index 0000000..9eb51b4
--- /dev/null
+++ b/configs/yolact/README.md
@@ -0,0 +1,75 @@
+# YOLACT
+
+> [YOLACT: Real-time Instance Segmentation](https://arxiv.org/abs/1904.02689)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We present a simple, fully-convolutional model for real-time instance segmentation that achieves 29.8 mAP on MS COCO at 33.5 fps evaluated on a single Titan Xp, which is significantly faster than any previous competitive approach. Moreover, we obtain this result after training on only one GPU. We accomplish this by breaking instance segmentation into two parallel subtasks: (1) generating a set of prototype masks and (2) predicting per-instance mask coefficients. Then we produce instance masks by linearly combining the prototypes with the mask coefficients. We find that because this process doesn't depend on repooling, this approach produces very high-quality masks and exhibits temporal stability for free. Furthermore, we analyze the emergent behavior of our prototypes and show they learn to localize instances on their own in a translation variant manner, despite being fully-convolutional. Finally, we also propose Fast NMS, a drop-in 12 ms faster replacement for standard NMS that only has a marginal performance penalty.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/144001225-8c62cad7-a330-4f60-873f-61aa17e99223.png"/>
+</div>
+
+## Introduction
+
+A simple, fully convolutional model for real-time instance segmentation. This is the code for our paper:
+
+- [YOLACT: Real-time Instance Segmentation](https://arxiv.org/abs/1904.02689)
+
+<!-- - [YOLACT++: Better Real-time Instance Segmentation](https://arxiv.org/abs/1912.06218) -->
+
+For a real-time demo, check out our ICCV video:
+[![IMAGE ALT TEXT HERE](https://img.youtube.com/vi/0pMfmo8qfpQ/0.jpg)](https://www.youtube.com/watch?v=0pMfmo8qfpQ)
+
+## Evaluation
+
+Here are our YOLACT models along with their FPS on a Titan Xp and mAP on COCO's `val`:
+
+| Image Size | GPU x BS |   Backbone    | \*FPS | mAP  | Weights |                                                Configs                                                 |                                                            Download                                                             |
+| :--------: | :------: | :-----------: | :---: | :--: | :-----: | :----------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------: |
+|    550     |   1x8    | Resnet50-FPN  | 42.5  | 29.0 |         | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolact/yolact_r50_1x8_coco.py)  |  [model](https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r50_1x8_coco/yolact_r50_1x8_coco_20200908-f38d58df.pth)  |
+|    550     |   8x8    | Resnet50-FPN  | 42.5  | 28.4 |         | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolact/yolact_r50_8x8_coco.py)  |  [model](https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r50_8x8_coco/yolact_r50_8x8_coco_20200908-ca34f5db.pth)  |
+|    550     |   1x8    | Resnet101-FPN | 33.5  | 30.4 |         | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolact/yolact_r101_1x8_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r101_1x8_coco/yolact_r101_1x8_coco_20200908-4cbe9101.pth) |
+
+\*Note: The FPS is evaluated by the [original implementation](https://github.com/dbolya/yolact). When calculating FPS, only the model inference time is taken into account. Data loading and post-processing operations such as converting masks to RLE code, generating COCO JSON results, image rendering are not included.
+
+## Training
+
+All the aforementioned models are trained with a single GPU. It typically takes ~12GB VRAM when using resnet-101 as the backbone. If you want to try multi-GPU training, you may have to modify the configuration files accordingly, such as adjusting the training schedule and freezing batch norm.
+
+```Shell
+# Trains using the resnet-101 backbone with a batch size of 8 on a single GPU.
+./tools/dist_train.sh configs/yolact/yolact_r101_1x8_coco.py 1
+```
+
+## Testing
+
+Please refer to [mmdetection/docs/getting_started.md](https://mmdetection.readthedocs.io/en/latest/1_exist_data_model.html#test-existing-models).
+
+## Citation
+
+If you use YOLACT or this code base in your work, please cite
+
+```latex
+@inproceedings{yolact-iccv2019,
+  author    = {Daniel Bolya and Chong Zhou and Fanyi Xiao and Yong Jae Lee},
+  title     = {YOLACT: {Real-time} Instance Segmentation},
+  booktitle = {ICCV},
+  year      = {2019},
+}
+```
+
+<!-- For YOLACT++, please cite
+
+```latex
+@misc{yolact-plus-arxiv2019,
+  title         = {YOLACT++: Better Real-time Instance Segmentation},
+  author        = {Daniel Bolya and Chong Zhou and Fanyi Xiao and Yong Jae Lee},
+  year          = {2019},
+  eprint        = {1912.06218},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CV}
+}
+``` -->
diff --git a/configs/yolact/metafile.yml b/configs/yolact/metafile.yml
new file mode 100755
index 0000000..e7019ae
--- /dev/null
+++ b/configs/yolact/metafile.yml
@@ -0,0 +1,78 @@
+Collections:
+  - Name: YOLACT
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1904.02689
+      Title: 'YOLACT: Real-time Instance Segmentation'
+    README: configs/yolact/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.5.0/mmdet/models/detectors/yolact.py#L9
+      Version: v2.5.0
+
+Models:
+  - Name: yolact_r50_1x8_coco
+    In Collection: YOLACT
+    Config: configs/yolact/yolact_r50_1x8_coco.py
+    Metadata:
+      Training Resources: 1x V100 GPU
+      Batch Size: 8
+      inference time (ms/im):
+        - value: 23.53
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (550, 550)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 29.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r50_1x8_coco/yolact_r50_1x8_coco_20200908-f38d58df.pth
+
+  - Name: yolact_r50_8x8_coco
+    In Collection: YOLACT
+    Config: configs/yolact/yolact_r50_8x8_coco.py
+    Metadata:
+      Batch Size: 64
+      inference time (ms/im):
+        - value: 23.53
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (550, 550)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 28.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r50_8x8_coco/yolact_r50_8x8_coco_20200908-ca34f5db.pth
+
+  - Name: yolact_r101_1x8_coco
+    In Collection: YOLACT
+    Config: configs/yolact/yolact_r101_1x8_coco.py
+    Metadata:
+      Training Resources: 1x V100 GPU
+      Batch Size: 8
+      inference time (ms/im):
+        - value: 29.85
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (550, 550)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 30.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r101_1x8_coco/yolact_r101_1x8_coco_20200908-4cbe9101.pth
diff --git a/configs/yolact/yolact_r101_1x8_coco.py b/configs/yolact/yolact_r101_1x8_coco.py
new file mode 100755
index 0000000..532631d
--- /dev/null
+++ b/configs/yolact/yolact_r101_1x8_coco.py
@@ -0,0 +1,7 @@
+_base_ = './yolact_r50_1x8_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/yolact/yolact_r50_1x8_coco.py b/configs/yolact/yolact_r50_1x8_coco.py
new file mode 100755
index 0000000..dbced5a
--- /dev/null
+++ b/configs/yolact/yolact_r50_1x8_coco.py
@@ -0,0 +1,165 @@
+_base_ = '../_base_/default_runtime.py'
+
+# model settings
+img_size = 550
+model = dict(
+    type='YOLACT',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=-1,  # do not freeze stem
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,  # update the statistics of bn
+        zero_init_residual=False,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_input',
+        num_outs=5,
+        upsample_cfg=dict(mode='bilinear')),
+    bbox_head=dict(
+        type='YOLACTHead',
+        num_classes=80,
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=3,
+            scales_per_octave=1,
+            base_sizes=[8, 16, 32, 64, 128],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[550.0 / x for x in [69, 35, 18, 9, 5]],
+            centers=[(550 * 0.5 / x, 550 * 0.5 / x)
+                     for x in [69, 35, 18, 9, 5]]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            reduction='none',
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.5),
+        num_head_convs=1,
+        num_protos=32,
+        use_ohem=True),
+    mask_head=dict(
+        type='YOLACTProtonet',
+        in_channels=256,
+        num_protos=32,
+        num_classes=80,
+        max_masks_to_train=100,
+        loss_mask_weight=6.125),
+    segm_head=dict(
+        type='YOLACTSegmHead',
+        num_classes=80,
+        in_channels=256,
+        loss_segm=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.,
+            ignore_iof_thr=-1,
+            gt_max_assign_all=False),
+        # smoothl1_beta=1.,
+        allowed_border=-1,
+        pos_weight=-1,
+        neg_pos_ratio=3,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        iou_thr=0.5,
+        top_k=200,
+        max_per_img=100))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.68, 116.78, 103.94], std=[58.40, 57.12, 57.38], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(4.0, 4.0)),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', img_scale=(img_size, img_size), keep_ratio=False),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(img_size, img_size),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=False),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(type='SGD', lr=1e-3, momentum=0.9, weight_decay=5e-4)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.1,
+    step=[20, 42, 49, 52])
+runner = dict(type='EpochBasedRunner', max_epochs=55)
+cudnn_benchmark = True
+evaluation = dict(metric=['bbox', 'segm'])
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (1 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=8)
diff --git a/configs/yolact/yolact_r50_8x8_coco.py b/configs/yolact/yolact_r50_8x8_coco.py
new file mode 100755
index 0000000..41003ab
--- /dev/null
+++ b/configs/yolact/yolact_r50_8x8_coco.py
@@ -0,0 +1,16 @@
+_base_ = 'yolact_r50_1x8_coco.py'
+
+optimizer = dict(type='SGD', lr=8e-3, momentum=0.9, weight_decay=5e-4)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.1,
+    step=[20, 42, 49, 52])
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/yolo/README.md b/configs/yolo/README.md
new file mode 100755
index 0000000..c9eb8a6
--- /dev/null
+++ b/configs/yolo/README.md
@@ -0,0 +1,55 @@
+# YOLOv3
+
+> [YOLOv3: An Incremental Improvement](https://arxiv.org/abs/1804.02767)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We present some updates to YOLO! We made a bunch of little design changes to make it better. We also trained this new network that's pretty swell. It's a little bigger than last time but more accurate. It's still fast though, don't worry. At 320x320 YOLOv3 runs in 22 ms at 28.2 mAP, as accurate as SSD but three times faster. When we look at the old .5 IOU mAP detection metric YOLOv3 is quite good. It achieves 57.9 mAP@50 in 51 ms on a Titan X, compared to 57.5 mAP@50 in 198 ms by RetinaNet, similar performance but 3.8x faster.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/144001433-b4f7fb5e-3b7a-414b-b949-93733213b670.png" height="300"/>
+</div>
+
+## Results and Models
+
+|  Backbone  | Scale | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                      Config                                                      |                                                                                                                                                        Download                                                                                                                                                        |
+| :--------: | :---: | :-----: | :------: | :------------: | :----: | :--------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| DarkNet-53 |  320  |  273e   |   2.7    |      63.9      |  27.9  |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolo/yolov3_d53_320_273e_coco.py)     |                         [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_320_273e_coco/yolov3_d53_320_273e_coco-421362b6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_320_273e_coco/yolov3_d53_320_273e_coco-20200819_172101.log.json)                         |
+| DarkNet-53 |  416  |  273e   |   3.8    |      61.2      |  30.9  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolo/yolov3_d53_mstrain-416_273e_coco.py) |         [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-416_273e_coco/yolov3_d53_mstrain-416_273e_coco-2b60fcd9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-416_273e_coco/yolov3_d53_mstrain-416_273e_coco-20200819_173424.log.json)         |
+| DarkNet-53 |  608  |  273e   |   7.4    |      48.1      |  33.7  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolo/yolov3_d53_mstrain-608_273e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-608_273e_coco/yolov3_d53_mstrain-608_273e_coco_20210518_115020-a2c3acb8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-608_273e_coco/yolov3_d53_mstrain-608_273e_coco_20210518_115020.log.json) |
+
+## Mixed Precision Training
+
+We also train YOLOv3 with mixed precision training.
+
+|  Backbone  | Scale | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                        Config                                                         |                                                                                                                                                                  Download                                                                                                                                                                  |
+| :--------: | :---: | :-----: | :------: | :------------: | :----: | :-------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| DarkNet-53 |  608  |  273e   |   4.7    |      48.1      |  33.8  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolo/yolov3_d53_fp16_mstrain-608_273e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_fp16_mstrain-608_273e_coco/yolov3_d53_fp16_mstrain-608_273e_coco_20210517_213542-4bc34944.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_fp16_mstrain-608_273e_coco/yolov3_d53_fp16_mstrain-608_273e_coco_20210517_213542.log.json) |
+
+## Lightweight models
+
+|  Backbone   | Scale | Lr schd | Mem (GB) | Inf time (fps) | box AP |                                                          Config                                                          |                                                                                                                                                                        Download                                                                                                                                                                        |
+| :---------: | :---: | :-----: | :------: | :------------: | :----: | :----------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| MobileNetV2 |  416  |  300e   |   5.3    |                |  23.9  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco/yolov3_mobilenetv2_mstrain-416_300e_coco_20210718_010823-f68a07b3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco/yolov3_mobilenetv2_mstrain-416_300e_coco_20210718_010823.log.json) |
+| MobileNetV2 |  320  |  300e   |   3.2    |                |  22.2  |     [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolo/yolov3_mobilenetv2_320_300e_coco.py)     |                 [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_320_300e_coco/yolov3_mobilenetv2_320_300e_coco_20210719_215349-d18dff72.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_320_300e_coco/yolov3_mobilenetv2_320_300e_coco_20210719_215349.log.json)                 |
+
+Notice: We reduce the number of channels to 96 in both the head and the neck. This reduces the FLOPs and the number of parameters, which makes these models more suitable for edge devices.
+
+## Credit
+
+This implementation originates from the project of Haoyu Wu (@wuhy08) at Western Digital.
+
+## Citation
+
+```latex
+@misc{redmon2018yolov3,
+    title={YOLOv3: An Incremental Improvement},
+    author={Joseph Redmon and Ali Farhadi},
+    year={2018},
+    eprint={1804.02767},
+    archivePrefix={arXiv},
+    primaryClass={cs.CV}
+}
+```
diff --git a/configs/yolo/metafile.yml b/configs/yolo/metafile.yml
new file mode 100755
index 0000000..22c35da
--- /dev/null
+++ b/configs/yolo/metafile.yml
@@ -0,0 +1,124 @@
+Collections:
+  - Name: YOLOv3
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - DarkNet
+    Paper:
+      URL: https://arxiv.org/abs/1804.02767
+      Title: 'YOLOv3: An Incremental Improvement'
+    README: configs/yolo/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.4.0/mmdet/models/detectors/yolo.py#L8
+      Version: v2.4.0
+
+Models:
+  - Name: yolov3_d53_320_273e_coco
+    In Collection: YOLOv3
+    Config: configs/yolo/yolov3_d53_320_273e_coco.py
+    Metadata:
+      Training Memory (GB): 2.7
+      inference time (ms/im):
+        - value: 15.65
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (320, 320)
+      Epochs: 273
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 27.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_320_273e_coco/yolov3_d53_320_273e_coco-421362b6.pth
+
+  - Name: yolov3_d53_mstrain-416_273e_coco
+    In Collection: YOLOv3
+    Config: configs/yolo/yolov3_d53_mstrain-416_273e_coco.py
+    Metadata:
+      Training Memory (GB): 3.8
+      inference time (ms/im):
+        - value: 16.34
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (416, 416)
+      Epochs: 273
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 30.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-416_273e_coco/yolov3_d53_mstrain-416_273e_coco-2b60fcd9.pth
+
+  - Name: yolov3_d53_mstrain-608_273e_coco
+    In Collection: YOLOv3
+    Config: configs/yolo/yolov3_d53_mstrain-608_273e_coco.py
+    Metadata:
+      Training Memory (GB): 7.4
+      inference time (ms/im):
+        - value: 20.79
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (608, 608)
+      Epochs: 273
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 33.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-608_273e_coco/yolov3_d53_mstrain-608_273e_coco_20210518_115020-a2c3acb8.pth
+
+  - Name: yolov3_d53_fp16_mstrain-608_273e_coco
+    In Collection: YOLOv3
+    Config: configs/yolo/yolov3_d53_fp16_mstrain-608_273e_coco.py
+    Metadata:
+      Training Memory (GB): 4.7
+      inference time (ms/im):
+        - value: 20.79
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP16
+          resolution: (608, 608)
+      Epochs: 273
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 33.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_fp16_mstrain-608_273e_coco/yolov3_d53_fp16_mstrain-608_273e_coco_20210517_213542-4bc34944.pth
+
+  - Name: yolov3_mobilenetv2_320_300e_coco
+    In Collection: YOLOv3
+    Config: configs/yolo/yolov3_mobilenetv2_320_300e_coco.py
+    Metadata:
+      Training Memory (GB): 3.2
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 22.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_320_300e_coco/yolov3_mobilenetv2_320_300e_coco_20210719_215349-d18dff72.pth
+
+  - Name: yolov3_mobilenetv2_mstrain-416_300e_coco
+    In Collection: YOLOv3
+    Config: configs/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco.py
+    Metadata:
+      Training Memory (GB): 5.3
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 23.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco/yolov3_mobilenetv2_mstrain-416_300e_coco_20210718_010823-f68a07b3.pth
diff --git a/configs/yolo/yolov3_d53_320_273e_coco.py b/configs/yolo/yolov3_d53_320_273e_coco.py
new file mode 100755
index 0000000..d4785e3
--- /dev/null
+++ b/configs/yolo/yolov3_d53_320_273e_coco.py
@@ -0,0 +1,42 @@
+_base_ = './yolov3_d53_mstrain-608_273e_coco.py'
+# dataset settings
+img_norm_cfg = dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 2)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', img_scale=(320, 320), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(320, 320),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img'])
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/yolo/yolov3_d53_fp16_mstrain-608_273e_coco.py b/configs/yolo/yolov3_d53_fp16_mstrain-608_273e_coco.py
new file mode 100755
index 0000000..4ef2422
--- /dev/null
+++ b/configs/yolo/yolov3_d53_fp16_mstrain-608_273e_coco.py
@@ -0,0 +1,3 @@
+_base_ = './yolov3_d53_mstrain-608_273e_coco.py'
+# fp16 settings: the base config plus dynamic loss scaling
+fp16 = dict(loss_scale='dynamic')
diff --git a/configs/yolo/yolov3_d53_mstrain-416_273e_coco.py b/configs/yolo/yolov3_d53_mstrain-416_273e_coco.py
new file mode 100755
index 0000000..94325c5
--- /dev/null
+++ b/configs/yolo/yolov3_d53_mstrain-416_273e_coco.py
@@ -0,0 +1,42 @@
+_base_ = './yolov3_d53_mstrain-608_273e_coco.py'
+# dataset settings: train scales (320, 320) and (416, 416), test at 416x416
+img_norm_cfg = dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 2)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', img_scale=[(320, 320), (416, 416)], keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(416, 416),  # largest of the training scales
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img'])
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),  # val and test share one pipeline
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/yolo/yolov3_d53_mstrain-608_273e_coco.py b/configs/yolo/yolov3_d53_mstrain-608_273e_coco.py
new file mode 100755
index 0000000..43aa2f0
--- /dev/null
+++ b/configs/yolo/yolov3_d53_mstrain-608_273e_coco.py
@@ -0,0 +1,132 @@
+_base_ = '../_base_/default_runtime.py'
+# model settings: YOLOV3 detector with a Darknet-53 backbone
+model = dict(
+    type='YOLOV3',
+    backbone=dict(
+        type='Darknet',
+        depth=53,
+        out_indices=(3, 4, 5),
+        init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://darknet53')),
+    neck=dict(
+        type='YOLOV3Neck',
+        num_scales=3,
+        in_channels=[1024, 512, 256],
+        out_channels=[512, 256, 128]),
+    bbox_head=dict(
+        type='YOLOV3Head',
+        num_classes=80,
+        in_channels=[512, 256, 128],
+        out_channels=[1024, 512, 256],
+        anchor_generator=dict(
+            type='YOLOAnchorGenerator',
+            base_sizes=[[(116, 90), (156, 198), (373, 326)],
+                        [(30, 61), (62, 45), (59, 119)],
+                        [(10, 13), (16, 30), (33, 23)]],
+            strides=[32, 16, 8]),  # one stride per base_sizes row
+        bbox_coder=dict(type='YOLOBBoxCoder'),
+        featmap_strides=[32, 16, 8],  # same values as anchor strides
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            loss_weight=1.0,
+            reduction='sum'),
+        loss_conf=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            loss_weight=1.0,
+            reduction='sum'),
+        loss_xy=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            loss_weight=2.0,
+            reduction='sum'),
+        loss_wh=dict(type='MSELoss', loss_weight=2.0, reduction='sum')),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='GridAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.5,
+            min_pos_iou=0)),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        conf_thr=0.005,
+        nms=dict(type='nms', iou_threshold=0.45),
+        max_per_img=100))
+# dataset settings: COCO 2017, training scales (320, 320) and (608, 608)
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 2)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', img_scale=[(320, 320), (608, 608)], keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(608, 608),  # largest of the training scales
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img'])
+        ])
+]
+data = dict(
+    samples_per_gpu=8,  # x 8 GPUs = base_batch_size 64 (see NOTE below)
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(  # uses the same val2017 files as `val`
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=2000,  # same as burn-in in darknet
+    warmup_ratio=0.1,
+    step=[218, 246])
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=273)
+evaluation = dict(interval=1, metric=['bbox'])
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/yolo/yolov3_mobilenetv2_320_300e_coco.py b/configs/yolo/yolov3_mobilenetv2_320_300e_coco.py
new file mode 100755
index 0000000..477d253
--- /dev/null
+++ b/configs/yolo/yolov3_mobilenetv2_320_300e_coco.py
@@ -0,0 +1,53 @@
+_base_ = ['./yolov3_mobilenetv2_mstrain-416_300e_coco.py']
+
+# yapf:disable
+model = dict(
+    bbox_head=dict(
+        anchor_generator=dict(  # replaces the base config's base_sizes
+            base_sizes=[[(220, 125), (128, 222), (264, 266)],
+                        [(35, 87), (102, 96), (60, 170)],
+                        [(10, 15), (24, 36), (72, 42)]])))
+# yapf:enable
+
+# dataset settings: pipelines rebuilt with a fixed 320x320 image scale
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 2)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', img_scale=(320, 320), keep_ratio=True),  # one scale
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(320, 320),  # same scale as training
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='DefaultFormatBundle'),
+            dict(type='Collect', keys=['img'])
+        ])
+]
+data = dict(
+    train=dict(dataset=dict(pipeline=train_pipeline)),  # inside RepeatDataset
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco.py b/configs/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco.py
new file mode 100755
index 0000000..18e0622
--- /dev/null
+++ b/configs/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco.py
@@ -0,0 +1,142 @@
+_base_ = '../_base_/default_runtime.py'
+# model settings: YOLOV3 detector with a MobileNetV2 backbone
+model = dict(
+    type='YOLOV3',
+    backbone=dict(
+        type='MobileNetV2',
+        out_indices=(2, 4, 6),
+        act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://mmdet/mobilenet_v2')),
+    neck=dict(
+        type='YOLOV3Neck',
+        num_scales=3,
+        in_channels=[320, 96, 32],
+        out_channels=[96, 96, 96]),
+    bbox_head=dict(
+        type='YOLOV3Head',
+        num_classes=80,
+        in_channels=[96, 96, 96],
+        out_channels=[96, 96, 96],
+        anchor_generator=dict(
+            type='YOLOAnchorGenerator',
+            base_sizes=[[(116, 90), (156, 198), (373, 326)],
+                        [(30, 61), (62, 45), (59, 119)],
+                        [(10, 13), (16, 30), (33, 23)]],
+            strides=[32, 16, 8]),  # one stride per base_sizes row
+        bbox_coder=dict(type='YOLOBBoxCoder'),
+        featmap_strides=[32, 16, 8],  # same values as anchor strides
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            loss_weight=1.0,
+            reduction='sum'),
+        loss_conf=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            loss_weight=1.0,
+            reduction='sum'),
+        loss_xy=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            loss_weight=2.0,
+            reduction='sum'),
+        loss_wh=dict(type='MSELoss', loss_weight=2.0, reduction='sum')),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='GridAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.5,
+            min_pos_iou=0)),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        conf_thr=0.005,
+        nms=dict(type='nms', iou_threshold=0.45),
+        max_per_img=100))
+# dataset settings: COCO 2017, training scale range 320x320 to 416x416
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 2)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
+        min_crop_size=0.3),
+    dict(
+        type='Resize',
+        img_scale=[(320, 320), (416, 416)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(416, 416),  # upper end of the training scale range
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='DefaultFormatBundle'),
+            dict(type='Collect', keys=['img'])
+        ])
+]
+data = dict(
+    samples_per_gpu=24,  # x 8 GPUs = base_batch_size 192 (see NOTE below)
+    workers_per_gpu=4,
+    train=dict(
+        type='RepeatDataset',  # use RepeatDataset to speed up training
+        times=10,
+        dataset=dict(
+            type=dataset_type,
+            ann_file=data_root + 'annotations/instances_train2017.json',
+            img_prefix=data_root + 'train2017/',
+            pipeline=train_pipeline)),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(  # uses the same val2017 files as `val`
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(type='SGD', lr=0.003, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=4000,
+    warmup_ratio=0.0001,
+    step=[24, 28])
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=30)  # dataset repeated 10x
+evaluation = dict(interval=1, metric=['bbox'])
+find_unused_parameters = True
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (24 samples per GPU)
+auto_scale_lr = dict(base_batch_size=192)
diff --git a/configs/yolof/README.md b/configs/yolof/README.md
new file mode 100755
index 0000000..e88da02
--- /dev/null
+++ b/configs/yolof/README.md
@@ -0,0 +1,35 @@
+# YOLOF
+
+> [You Only Look One-level Feature](https://arxiv.org/abs/2103.09460)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+This paper revisits feature pyramids networks (FPN) for one-stage detectors and points out that the success of FPN is due to its divide-and-conquer solution to the optimization problem in object detection rather than multi-scale feature fusion. From the perspective of optimization, we introduce an alternative way to address the problem instead of adopting the complex feature pyramids - *utilizing only one-level feature for detection*. Based on the simple and efficient solution, we present You Only Look One-level Feature (YOLOF). In our method, two key components, Dilated Encoder and Uniform Matching, are proposed and bring considerable improvements. Extensive experiments on the COCO benchmark prove the effectiveness of the proposed model. Our YOLOF achieves comparable results with its feature pyramids counterpart RetinaNet while being 2.5× faster. Without transformer layers, YOLOF can match the performance of DETR in a single-level feature manner with 7× less training epochs. With an image size of 608×608, YOLOF achieves 44.3 mAP running at 60 fps on 2080Ti, which is 13% faster than YOLOv4.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/144001639-257374ef-7d4f-412b-a783-88abdd22f277.png"/>
+</div>
+
+## Results and Models
+
+| Backbone | Style | Epoch | Lr schd | Mem (GB) | box AP |                                                  Config                                                   |                                                                                                                                         Download                                                                                                                                         |
+| :------: | :---: | :---: | :-----: | :------: | :----: | :-------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| R-50-C5  | caffe |   Y   |   1x    |   8.3    |  37.5  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolof/yolof_r50_c5_8x8_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolof/yolof_r50_c5_8x8_1x_coco/yolof_r50_c5_8x8_1x_coco_20210425_024427-8e864411.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolof/yolof_r50_c5_8x8_1x_coco/yolof_r50_c5_8x8_1x_coco_20210425_024427.log.json) |
+
+**Note**:
+
+1. We find that the performance is unstable and may fluctuate by about 0.3 mAP. mAP 37.4 ~ 37.7 is acceptable in YOLOF_R_50_C5_1x. Such fluctuation can also be found in the [original implementation](https://github.com/chensnathan/YOLOF).
+2. In addition to the instability issues, training sometimes shows large loss fluctuations and NaN losses, so there may still be problems with this implementation; these will be addressed in subsequent updates.
+
+## Citation
+
+```latex
+@inproceedings{chen2021you,
+  title={You Only Look One-level Feature},
+  author={Chen, Qiang and Wang, Yingming and Yang, Tong and Zhang, Xiangyu and Cheng, Jian and Sun, Jian},
+  booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
+  year={2021}
+}
+```
diff --git a/configs/yolof/metafile.yml b/configs/yolof/metafile.yml
new file mode 100755
index 0000000..9436fee
--- /dev/null
+++ b/configs/yolof/metafile.yml
@@ -0,0 +1,32 @@
+Collections:
+  - Name: YOLOF
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Dilated Encoder
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/2103.09460
+      Title: 'You Only Look One-level Feature'
+    README: configs/yolof/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/detectors/yolof.py#L6
+      Version: v2.12.0
+
+Models:
+  - Name: yolof_r50_c5_8x8_1x_coco
+    In Collection: YOLOF
+    Config: configs/yolof/yolof_r50_c5_8x8_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.3
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolof/yolof_r50_c5_8x8_1x_coco/yolof_r50_c5_8x8_1x_coco_20210425_024427-8e864411.pth
diff --git a/configs/yolof/yolof_r50_c5_8x8_1x_coco.py b/configs/yolof/yolof_r50_c5_8x8_1x_coco.py
new file mode 100755
index 0000000..d0b9649
--- /dev/null
+++ b/configs/yolof/yolof_r50_c5_8x8_1x_coco.py
@@ -0,0 +1,111 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(  # YOLOF: ResNet-50 backbone, DilatedEncoder neck, YOLOFHead
+    type='YOLOF',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3, ),  # single output: stage index 3 of 4
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron/resnet50_caffe')),
+    neck=dict(
+        type='DilatedEncoder',
+        in_channels=2048,
+        out_channels=512,
+        block_mid_channels=128,
+        num_residual_blocks=4,
+        block_dilations=[2, 4, 6, 8]),  # one dilation per residual block
+    bbox_head=dict(
+        type='YOLOFHead',
+        num_classes=80,
+        in_channels=512,  # equals the neck's out_channels
+        reg_decoded_bbox=True,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[1, 2, 4, 8, 16],
+            strides=[32]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1., 1., 1., 1.],
+            add_ctr_clamp=True,
+            ctr_clamp=32),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='UniformAssigner', pos_ignore_thr=0.15, neg_ignore_thr=0.7),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+# optimizer: SGD with a 1/3 learning-rate multiplier on the backbone
+optimizer = dict(
+    type='SGD',
+    lr=0.12,
+    momentum=0.9,
+    weight_decay=0.0001,
+    paramwise_cfg=dict(
+        norm_decay_mult=0., custom_keys={'backbone': dict(lr_mult=1. / 3)}))
+lr_config = dict(warmup_iters=1500, warmup_ratio=0.00066667)
+
+# use caffe img_norm: no RGB conversion, mean subtraction only (std=1)
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='RandomShift', shift_ratio=0.5, max_shift_px=32),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),  # same scale as training
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=8,  # x 8 GPUs = base_batch_size 64 (see NOTE below)
+    workers_per_gpu=8,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/yolof/yolof_r50_c5_8x8_iter-1x_coco.py b/configs/yolof/yolof_r50_c5_8x8_iter-1x_coco.py
new file mode 100755
index 0000000..c95c02d
--- /dev/null
+++ b/configs/yolof/yolof_r50_c5_8x8_iter-1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './yolof_r50_c5_8x8_1x_coco.py'
+
+# We implemented the iter-based config according to the source code.
+# COCO dataset has 117266 images after filtering. We use 8 GPUs with
+# 8 samples per GPU, so 22500 iterations are equivalent to
+# 22500/(117266/(8x8))=12.3 epochs, 15000 to 8.2 epochs and
+# 20000 to 10.9 epochs. Because the lr (0.12) is large,
+# the iter-based and epoch-based settings differ by about 0.2 on
+# the mAP evaluation value.
+lr_config = dict(step=[15000, 20000])
+runner = dict(_delete_=True, type='IterBasedRunner', max_iters=22500)
+checkpoint_config = dict(interval=2500)
+evaluation = dict(interval=4500)
+log_config = dict(interval=20)
diff --git a/configs/yolox/README.md b/configs/yolox/README.md
new file mode 100755
index 0000000..4890fbd
--- /dev/null
+++ b/configs/yolox/README.md
@@ -0,0 +1,39 @@
+# YOLOX
+
+> [YOLOX: Exceeding YOLO Series in 2021](https://arxiv.org/abs/2107.08430)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+In this report, we present some experienced improvements to YOLO series, forming a new high-performance detector -- YOLOX. We switch the YOLO detector to an anchor-free manner and conduct other advanced detection techniques, i.e., a decoupled head and the leading label assignment strategy SimOTA to achieve state-of-the-art results across a large scale range of models: For YOLO-Nano with only 0.91M parameters and 1.08G FLOPs, we get 25.3% AP on COCO, surpassing NanoDet by 1.8% AP; for YOLOv3, one of the most widely used detectors in industry, we boost it to 47.3% AP on COCO, outperforming the current best practice by 3.0% AP; for YOLOX-L with roughly the same amount of parameters as YOLOv4-CSP, YOLOv5-L, we achieve 50.0% AP on COCO at a speed of 68.9 FPS on Tesla V100, exceeding YOLOv5-L by 1.8% AP. Further, we won the 1st Place on Streaming Perception Challenge (Workshop on Autonomous Driving at CVPR 2021) using a single YOLOX-L model. We hope this report can provide useful experience for developers and researchers in practical scenes, and we also provide deploy versions with ONNX, TensorRT, NCNN, and Openvino supported.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/144001736-9fb303dd-eac7-46b0-ad45-214cfa51e928.png"/>
+</div>
+
+## Results and Models
+
+|  Backbone  | size | Mem (GB) | box AP |                                                  Config                                                   |                                                                                                                                         Download                                                                                                                                         |
+| :--------: | :--: | :------: | :----: | :-------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| YOLOX-tiny | 416  |   3.5    |  32.0  | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolox/yolox_tiny_8x8_300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234.log.json) |
+|  YOLOX-s   | 640  |   7.6    |  40.5  |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolox/yolox_s_8x8_300e_coco.py)   |       [model](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_20211121_095711.log.json)       |
+|  YOLOX-l   | 640  |   19.9   |  49.4  |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolox/yolox_l_8x8_300e_coco.py)   |       [model](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236.log.json)       |
+|  YOLOX-x   | 640  |   28.1   |  50.9  |  [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/yolox/yolox_x_8x8_300e_coco.py)   |       [model](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254.log.json)       |
+
+**Note**:
+
+1. The test score threshold is 0.001, and the box AP indicates the best AP.
+2. Due to the need for pre-training weights, we cannot reproduce the performance of the `yolox-nano` model. Please refer to https://github.com/Megvii-BaseDetection/YOLOX/issues/674 for more information.
+3. We also trained the model by the official release of YOLOX based on [Megvii-BaseDetection/YOLOX#735](https://github.com/Megvii-BaseDetection/YOLOX/issues/735) with commit ID [38c633](https://github.com/Megvii-BaseDetection/YOLOX/tree/38c633bf176462ee42b110c70e4ffe17b5753208). We found that the best AP of `YOLOX-tiny`, `YOLOX-s`, `YOLOX-l`, and `YOLOX-x` is 31.8, 40.3, 49.2, and 50.9, respectively. The performance is consistent with that of our re-implementation (see Table above) but still has a gap (0.3~0.8 AP) in comparison with the reported performance in their [README](https://github.com/Megvii-BaseDetection/YOLOX/blob/38c633bf176462ee42b110c70e4ffe17b5753208/README.md#benchmark).
+
+## Citation
+
+```latex
+@article{yolox2021,
+  title={{YOLOX}: Exceeding YOLO Series in 2021},
+  author={Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian},
+  journal={arXiv preprint arXiv:2107.08430},
+  year={2021}
+}
+```
diff --git a/configs/yolox/metafile.yml b/configs/yolox/metafile.yml
new file mode 100755
index 0000000..845cb0a
--- /dev/null
+++ b/configs/yolox/metafile.yml
@@ -0,0 +1,70 @@
+Collections:
+  - Name: YOLOX
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Nesterov
+        - Weight Decay
+        - Cosine Annealing Lr Updater
+      Training Resources: 8x TITANXp GPUs
+      Architecture:
+        - CSPDarkNet
+        - PAFPN
+    Paper:
+      URL: https://arxiv.org/abs/2107.08430
+      Title: 'YOLOX: Exceeding YOLO Series in 2021'
+    README: configs/yolox/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.15.1/mmdet/models/detectors/yolox.py#L6
+      Version: v2.15.1
+
+
+Models:
+  - Name: yolox_s_8x8_300e_coco
+    In Collection: YOLOX
+    Config: configs/yolox/yolox_s_8x8_300e_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth
+  - Name: yolox_l_8x8_300e_coco
+    In Collection: YOLOX
+    Config: configs/yolox/yolox_l_8x8_300e_coco.py
+    Metadata:
+      Training Memory (GB): 19.9
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 49.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth
+  - Name: yolox_x_8x8_300e_coco
+    In Collection: YOLOX
+    Config: configs/yolox/yolox_x_8x8_300e_coco.py
+    Metadata:
+      Training Memory (GB): 28.1
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth
+  - Name: yolox_tiny_8x8_300e_coco
+    In Collection: YOLOX
+    Config: configs/yolox/yolox_tiny_8x8_300e_coco.py
+    Metadata:
+      Training Memory (GB): 3.5
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 32.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth
diff --git a/configs/yolox/yolox_l_8x8_300e_coco.py b/configs/yolox/yolox_l_8x8_300e_coco.py
new file mode 100755
index 0000000..dcbfa18
--- /dev/null
+++ b/configs/yolox/yolox_l_8x8_300e_coco.py
@@ -0,0 +1,8 @@
+_base_ = './yolox_s_8x8_300e_coco.py'
+
+# model settings: larger factors and channels than the YOLOX-s base config
+model = dict(
+    backbone=dict(deepen_factor=1.0, widen_factor=1.0),
+    neck=dict(
+        in_channels=[256, 512, 1024], out_channels=256, num_csp_blocks=3),
+    bbox_head=dict(in_channels=256, feat_channels=256))
diff --git a/configs/yolox/yolox_m_8x8_300e_coco.py b/configs/yolox/yolox_m_8x8_300e_coco.py
new file mode 100755
index 0000000..3048c95
--- /dev/null
+++ b/configs/yolox/yolox_m_8x8_300e_coco.py
@@ -0,0 +1,8 @@
+_base_ = './yolox_s_8x8_300e_coco.py'
+
+# YOLOX-M: overrides the YOLOX-S base with a deeper and wider model
+model = dict(
+    backbone=dict(deepen_factor=0.67, widen_factor=0.75),
+    neck=dict(in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2),
+    bbox_head=dict(in_channels=192, feat_channels=192),
+)
diff --git a/configs/yolox/yolox_nano_8x8_300e_coco.py b/configs/yolox/yolox_nano_8x8_300e_coco.py
new file mode 100755
index 0000000..d33ed04
--- /dev/null
+++ b/configs/yolox/yolox_nano_8x8_300e_coco.py
@@ -0,0 +1,11 @@
+_base_ = './yolox_tiny_8x8_300e_coco.py'
+
+# YOLOX-Nano: narrower than the YOLOX-Tiny base, use_depthwise=True throughout
+model = dict(
+    backbone=dict(deepen_factor=0.33, widen_factor=0.25, use_depthwise=True),
+    neck=dict(
+        in_channels=[64, 128, 256],
+        out_channels=64,
+        num_csp_blocks=1,
+        use_depthwise=True),
+    bbox_head=dict(in_channels=64, feat_channels=64, use_depthwise=True))
diff --git a/configs/yolox/yolox_s_8x8_300e_coco.py b/configs/yolox/yolox_s_8x8_300e_coco.py
new file mode 100755
index 0000000..97ff23e
--- /dev/null
+++ b/configs/yolox/yolox_s_8x8_300e_coco.py
@@ -0,0 +1,165 @@
+_base_ = ['../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py']
+
+img_scale = (640, 640)  # height, width
+
+# model settings
+model = dict(
+    type='YOLOX',
+    input_size=img_scale,
+    random_size_range=(15, 25),
+    random_size_interval=10,
+    backbone=dict(type='CSPDarknet', deepen_factor=0.33, widen_factor=0.5),
+    neck=dict(
+        type='YOLOXPAFPN',
+        in_channels=[128, 256, 512],
+        out_channels=128,
+        num_csp_blocks=1),
+    bbox_head=dict(
+        type='YOLOXHead', num_classes=80, in_channels=128, feat_channels=128),
+    train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)),
+    # To align with the official implementation, the score threshold is
+    # 0.01 for the val phase and 0.001 for the test phase.
+    test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65)))
+
+# dataset settings
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+
+train_pipeline = [
+    dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
+    dict(
+        type='RandomAffine',
+        scaling_ratio_range=(0.1, 2),
+        border=(-img_scale[0] // 2, -img_scale[1] // 2)),
+    dict(
+        type='MixUp',
+        img_scale=img_scale,
+        ratio_range=(0.8, 1.6),
+        pad_val=114.0),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    # According to the official implementation, multi-scale
+    # training is not considered here but in the
+    # 'mmdet/models/detectors/yolox.py'.
+    dict(type='Resize', img_scale=img_scale, keep_ratio=True),
+    dict(
+        type='Pad',
+        pad_to_square=True,
+        # If the image is three-channel, the pad value needs
+        # to be set separately for each channel.
+        pad_val=dict(img=(114.0, 114.0, 114.0))),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+
+train_dataset = dict(
+    type='MultiImageMixDataset',
+    dataset=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=[
+            dict(type='LoadImageFromFile'),
+            dict(type='LoadAnnotations', with_bbox=True)
+        ],
+        filter_empty_gt=False,
+    ),
+    pipeline=train_pipeline)
+
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=img_scale,
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(
+                type='Pad',
+                pad_to_square=True,
+                pad_val=dict(img=(114.0, 114.0, 114.0))),
+            dict(type='DefaultFormatBundle'),
+            dict(type='Collect', keys=['img'])
+        ])
+]
+
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=4,
+    persistent_workers=True,
+    train=train_dataset,
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+
+# optimizer
+# default 8 gpu
+optimizer = dict(
+    type='SGD',
+    lr=0.01,
+    momentum=0.9,
+    weight_decay=5e-4,
+    nesterov=True,
+    paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.))
+optimizer_config = dict(grad_clip=None)
+
+max_epochs = 300
+num_last_epochs = 15
+resume_from = None
+interval = 10
+
+# learning policy
+lr_config = dict(
+    _delete_=True,
+    policy='YOLOX',
+    warmup='exp',
+    by_epoch=False,
+    warmup_by_epoch=True,
+    warmup_ratio=1,
+    warmup_iters=5,  # 5 epoch
+    num_last_epochs=num_last_epochs,
+    min_lr_ratio=0.05)
+
+runner = dict(type='EpochBasedRunner', max_epochs=max_epochs)
+
+custom_hooks = [
+    dict(
+        type='YOLOXModeSwitchHook',
+        num_last_epochs=num_last_epochs,
+        priority=48),
+    dict(
+        type='SyncNormHook',
+        num_last_epochs=num_last_epochs,
+        interval=interval,
+        priority=48),
+    dict(
+        type='ExpMomentumEMAHook',
+        resume_from=resume_from,
+        momentum=0.0001,
+        priority=49)
+]
+checkpoint_config = dict(interval=interval)
+evaluation = dict(
+    save_best='auto',
+    # The evaluation interval is `interval` when the running epoch is
+    # less than `max_epochs - num_last_epochs`.
+    # The evaluation interval is 1 when the running epoch is greater than
+    # or equal to `max_epochs - num_last_epochs`.
+    interval=interval,
+    dynamic_intervals=[(max_epochs - num_last_epochs, 1)],
+    metric='bbox')
+log_config = dict(interval=50)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/yolox/yolox_tiny_8x8_300e_coco.py b/configs/yolox/yolox_tiny_8x8_300e_coco.py
new file mode 100755
index 0000000..75931ba
--- /dev/null
+++ b/configs/yolox/yolox_tiny_8x8_300e_coco.py
@@ -0,0 +1,58 @@
+_base_ = './yolox_s_8x8_300e_coco.py'
+
+# model settings: narrower than the YOLOX-S base (widen_factor 0.375 vs 0.5)
+model = dict(
+    random_size_range=(10, 20),
+    backbone=dict(deepen_factor=0.33, widen_factor=0.375),
+    neck=dict(in_channels=[96, 192, 384], out_channels=96),
+    bbox_head=dict(in_channels=96, feat_channels=96))
+
+img_scale = (640, 640)  # height, width
+# Compared with the YOLOX-S base: no MixUp, narrower RandomAffine scaling range.
+train_pipeline = [
+    dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
+    dict(
+        type='RandomAffine',
+        scaling_ratio_range=(0.5, 1.5),
+        border=(-img_scale[0] // 2, -img_scale[1] // 2)),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Resize', img_scale=img_scale, keep_ratio=True),
+    dict(
+        type='Pad',
+        pad_to_square=True,
+        pad_val=dict(img=(114.0, 114.0, 114.0))),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+# Testing uses a 416x416 scale, not the 640x640 training img_scale.
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(416, 416),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(
+                type='Pad',
+                pad_to_square=True,
+                pad_val=dict(img=(114.0, 114.0, 114.0))),
+            dict(type='DefaultFormatBundle'),
+            dict(type='Collect', keys=['img'])
+        ])
+]
+
+train_dataset = dict(pipeline=train_pipeline)
+
+data = dict(
+    train=train_dataset,
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/configs/yolox/yolox_x_8x8_300e_coco.py b/configs/yolox/yolox_x_8x8_300e_coco.py
new file mode 100755
index 0000000..65c0b75
--- /dev/null
+++ b/configs/yolox/yolox_x_8x8_300e_coco.py
@@ -0,0 +1,8 @@
+_base_ = './yolox_s_8x8_300e_coco.py'
+
+# YOLOX-X: overrides the YOLOX-S base with a deeper and wider model
+model = dict(
+    backbone=dict(deepen_factor=1.33, widen_factor=1.25),
+    neck=dict(
+        in_channels=[320, 640, 1280], out_channels=320, num_csp_blocks=4),
+    bbox_head=dict(in_channels=320, feat_channels=320))
diff --git a/data/cityscapes b/data/cityscapes
new file mode 120000
index 0000000..db3d63f
--- /dev/null
+++ b/data/cityscapes
@@ -0,0 +1 @@
+/datasets_master/cityscapes
\ No newline at end of file
diff --git a/data/diverse_weather b/data/diverse_weather
new file mode 120000
index 0000000..a98bec7
--- /dev/null
+++ b/data/diverse_weather
@@ -0,0 +1 @@
+/datasets_master/Diverse-Weather
\ No newline at end of file
diff --git a/data/mapillary_trafficsign b/data/mapillary_trafficsign
new file mode 120000
index 0000000..9161068
--- /dev/null
+++ b/data/mapillary_trafficsign
@@ -0,0 +1 @@
+/datasets_master/mapillary_trafficsign/
\ No newline at end of file
diff --git a/data/target_domains b/data/target_domains
new file mode 120000
index 0000000..e04b829
--- /dev/null
+++ b/data/target_domains
@@ -0,0 +1 @@
+/home/thvu/shared/thvu/cache/mmdetection/PODA/target_domains/
\ No newline at end of file
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100755
index 0000000..af53cb5
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,29 @@
+ARG PYTORCH="1.6.0"
+ARG CUDA="10.1"
+ARG CUDNN="7"
+
+FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
+
+ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX"
+ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
+# ENV performs no shell command substitution, so name the conda prefix directly.
+ENV CMAKE_PREFIX_PATH="/opt/conda"
+# To fix GPG key error when running apt-get update
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+
+RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install MMCV
+RUN pip install --no-cache-dir --upgrade pip wheel setuptools
+RUN pip install --no-cache-dir mmcv-full==1.3.17 -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
+
+# Install MMDetection
+RUN conda clean --all
+RUN git clone https://github.com/open-mmlab/mmdetection.git /mmdetection
+WORKDIR /mmdetection
+ENV FORCE_CUDA="1"
+RUN pip install --no-cache-dir -r requirements/build.txt
+RUN pip install --no-cache-dir -e .
diff --git a/docker/serve/Dockerfile b/docker/serve/Dockerfile
new file mode 100755
index 0000000..c53613c
--- /dev/null
+++ b/docker/serve/Dockerfile
@@ -0,0 +1,49 @@
+ARG PYTORCH="1.6.0"
+ARG CUDA="10.1"
+ARG CUDNN="7"
+FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
+
+ARG MMCV="1.3.17"
+ARG MMDET="2.28.2"
+
+ENV PYTHONUNBUFFERED TRUE
+
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
+    ca-certificates \
+    g++ \
+    openjdk-11-jre-headless \
+    # MMDet Requirements
+    ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV PATH="/opt/conda/bin:$PATH"
+# `RUN export` is lost when its layer ends; ENV keeps FORCE_CUDA set afterwards.
+ENV FORCE_CUDA="1"
+# TORCHSERVE
+RUN pip install torchserve torch-model-archiver
+
+# MMLAB
+ARG PYTORCH
+ARG CUDA
+RUN ["/bin/bash", "-c", "pip install mmcv-full==${MMCV} -f https://download.openmmlab.com/mmcv/dist/cu${CUDA//./}/torch${PYTORCH}/index.html"]
+RUN pip install mmdet==${MMDET}
+
+RUN useradd -m model-server \
+    && mkdir -p /home/model-server/tmp
+
+COPY entrypoint.sh /usr/local/bin/entrypoint.sh
+
+RUN chmod +x /usr/local/bin/entrypoint.sh \
+    && chown -R model-server /home/model-server
+
+COPY config.properties /home/model-server/config.properties
+RUN mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store
+
+EXPOSE 8080 8081 8082
+
+USER model-server
+WORKDIR /home/model-server
+ENV TEMP=/home/model-server/tmp
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
+CMD ["serve"]
diff --git a/docker/serve/config.properties b/docker/serve/config.properties
new file mode 100755
index 0000000..efb9c47
--- /dev/null
+++ b/docker/serve/config.properties
@@ -0,0 +1,5 @@
+inference_address=http://0.0.0.0:8080
+management_address=http://0.0.0.0:8081
+metrics_address=http://0.0.0.0:8082
+model_store=/home/model-server/model-store
+load_models=all
diff --git a/docker/serve/entrypoint.sh b/docker/serve/entrypoint.sh
new file mode 100755
index 0000000..41ba00b
--- /dev/null
+++ b/docker/serve/entrypoint.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+
+if [[ "$1" = "serve" ]]; then
+    shift 1
+    torchserve --start --ts-config /home/model-server/config.properties
+else
+    eval "$@"
+fi
+
+# prevent docker exit
+tail -f /dev/null
diff --git a/docs/en/1_exist_data_model.md b/docs/en/1_exist_data_model.md
new file mode 100755
index 0000000..28cd39a
--- /dev/null
+++ b/docs/en/1_exist_data_model.md
@@ -0,0 +1,697 @@
+# 1: Inference and train with existing models and standard datasets
+
+MMDetection provides hundreds of existing detection models in the [Model Zoo](https://mmdetection.readthedocs.io/en/latest/model_zoo.html), and supports multiple standard datasets, including Pascal VOC, COCO, CityScapes, LVIS, etc. This note will show how to perform common tasks on these existing models and standard datasets, including:
+
+- Use existing models to inference on given images.
+- Test existing models on standard datasets.
+- Train predefined models on standard datasets.
+
+## Inference with existing models
+
+By inference, we mean using trained models to detect objects on images. In MMDetection, a model is defined by a configuration file and existing model parameters are saved in a checkpoint file.
+
+To start with, we recommend [Faster RCNN](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn) with this [configuration file](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py) and this [checkpoint file](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth). It is recommended to download the checkpoint file to `checkpoints` directory.
+
+### High-level APIs for inference
+
+MMDetection provides high-level Python APIs for inference on images. Here is an example of building the model and running inference on given images or videos.
+
+```python
+from mmdet.apis import init_detector, inference_detector
+import mmcv
+
+# Specify the path to model config and checkpoint file
+config_file = 'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+checkpoint_file = 'checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'
+
+# build the model from a config file and a checkpoint file
+model = init_detector(config_file, checkpoint_file, device='cuda:0')
+
+# test a single image and show the results
+img = 'test.jpg'  # or img = mmcv.imread(img), which will only load it once
+result = inference_detector(model, img)
+# visualize the results in a new window
+model.show_result(img, result)
+# or save the visualization results to image files
+model.show_result(img, result, out_file='result.jpg')
+
+# test a video and show the results
+video = mmcv.VideoReader('video.mp4')
+for frame in video:
+    result = inference_detector(model, frame)
+    model.show_result(frame, result, wait_time=1)
+```
+
+A notebook demo can be found in [demo/inference_demo.ipynb](https://github.com/open-mmlab/mmdetection/blob/master/demo/inference_demo.ipynb).
+
+Note:  `inference_detector` only supports single-image inference for now.
+
+### Asynchronous interface - supported for Python 3.7+
+
+For Python 3.7+, MMDetection also supports async interfaces.
+By utilizing CUDA streams, it avoids blocking the CPU on GPU-bound inference code and enables better CPU/GPU utilization for single-threaded applications. Inference can be done concurrently either between different input data samples or between different models of some inference pipeline.
+
+See `tests/async_benchmark.py` to compare the speed of synchronous and asynchronous interfaces.
+
+```python
+import asyncio
+import torch
+from mmdet.apis import init_detector, async_inference_detector
+from mmdet.utils.contextmanagers import concurrent
+
+async def main():
+    config_file = 'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+    checkpoint_file = 'checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'
+    device = 'cuda:0'
+    model = init_detector(config_file, checkpoint=checkpoint_file, device=device)
+
+    # queue is used for concurrent inference of multiple images
+    streamqueue = asyncio.Queue()
+    # queue size defines concurrency level
+    streamqueue_size = 3
+
+    for _ in range(streamqueue_size):
+        streamqueue.put_nowait(torch.cuda.Stream(device=device))
+
+    # test a single image and show the results
+    img = 'test.jpg'  # or img = mmcv.imread(img), which will only load it once
+
+    async with concurrent(streamqueue):
+        result = await async_inference_detector(model, img)
+
+    # visualize the results in a new window
+    model.show_result(img, result)
+    # or save the visualization results to image files
+    model.show_result(img, result, out_file='result.jpg')
+
+
+asyncio.run(main())
+
+```
+
+### Demos
+
+We also provide four demo scripts, implemented with high-level APIs and supporting functionality code.
+Source codes are available [here](https://github.com/open-mmlab/mmdetection/tree/master/demo).
+
+#### Image demo
+
+This script performs inference on a single image.
+
+```shell
+python demo/image_demo.py \
+    ${IMAGE_FILE} \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    [--device ${GPU_ID}] \
+    [--score-thr ${SCORE_THR}]
+```
+
+Examples:
+
+```shell
+python demo/image_demo.py demo/demo.jpg \
+    configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+    checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \
+    --device cpu
+```
+
+#### Webcam demo
+
+This is a live demo from a webcam.
+
+```shell
+python demo/webcam_demo.py \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    [--device ${GPU_ID}] \
+    [--camera-id ${CAMERA-ID}] \
+    [--score-thr ${SCORE_THR}]
+```
+
+Examples:
+
+```shell
+python demo/webcam_demo.py \
+    configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+    checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth
+```
+
+#### Video demo
+
+This script performs inference on a video.
+
+```shell
+python demo/video_demo.py \
+    ${VIDEO_FILE} \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    [--device ${GPU_ID}] \
+    [--score-thr ${SCORE_THR}] \
+    [--out ${OUT_FILE}] \
+    [--show] \
+    [--wait-time ${WAIT_TIME}]
+```
+
+Examples:
+
+```shell
+python demo/video_demo.py demo/demo.mp4 \
+    configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+    checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \
+    --out result.mp4
+```
+
+#### Video demo with GPU acceleration
+
+This script performs inference on a video with GPU acceleration.
+
+```shell
+python demo/video_gpuaccel_demo.py \
+    ${VIDEO_FILE} \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    [--device ${GPU_ID}] \
+    [--score-thr ${SCORE_THR}] \
+    [--nvdecode] \
+    [--out ${OUT_FILE}] \
+    [--show] \
+    [--wait-time ${WAIT_TIME}]
+```
+
+Examples:
+
+```shell
+python demo/video_gpuaccel_demo.py demo/demo.mp4 \
+    configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+    checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \
+    --nvdecode --out result.mp4
+```
+
+## Test existing models on standard datasets
+
+To evaluate a model's accuracy, one usually tests the model on some standard datasets.
+MMDetection supports multiple public datasets including COCO, Pascal VOC, CityScapes, and [more](https://github.com/open-mmlab/mmdetection/tree/master/configs/_base_/datasets).
+This section will show how to test existing models on supported datasets.
+
+### Prepare datasets
+
+Public datasets like [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/index.html) and [COCO](https://cocodataset.org/#download) are available from official websites or mirrors. Note: In the detection task, Pascal VOC 2012 is an extension of Pascal VOC 2007 without overlap, and we usually use them together.
+It is recommended to download and extract the dataset somewhere outside the project directory and symlink the dataset root to `$MMDETECTION/data` as below.
+If your folder structure is different, you may need to change the corresponding paths in config files.
+
+We provide a script to download datasets such as COCO; you can run `python tools/misc/download_dataset.py --dataset-name coco2017` to download the COCO dataset.
+
+For more usage please refer to [dataset-download](https://github.com/open-mmlab/mmdetection/tree/master/docs/en/useful_tools.md#dataset-download)
+
+```text
+mmdetection
+├── mmdet
+├── tools
+├── configs
+├── data
+│   ├── coco
+│   │   ├── annotations
+│   │   ├── train2017
+│   │   ├── val2017
+│   │   ├── test2017
+│   ├── cityscapes
+│   │   ├── annotations
+│   │   ├── leftImg8bit
+│   │   │   ├── train
+│   │   │   ├── val
+│   │   ├── gtFine
+│   │   │   ├── train
+│   │   │   ├── val
+│   ├── VOCdevkit
+│   │   ├── VOC2007
+│   │   ├── VOC2012
+```
+
+Some models, such as HTC, DetectoRS and SCNet, require the additional [COCO-stuff](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) dataset. You can download and unzip it, then move it to the coco folder. The directory should be like this.
+
+```text
+mmdetection
+├── data
+│   ├── coco
+│   │   ├── annotations
+│   │   ├── train2017
+│   │   ├── val2017
+│   │   ├── test2017
+│   │   ├── stuffthingmaps
+```
+
+Panoptic segmentation models like PanopticFPN require the additional [COCO Panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) dataset. You can download and unzip it, then move it to the coco annotation folder. The directory should be like this.
+
+```text
+mmdetection
+├── data
+│   ├── coco
+│   │   ├── annotations
+│   │   │   ├── panoptic_train2017.json
+│   │   │   ├── panoptic_train2017
+│   │   │   ├── panoptic_val2017.json
+│   │   │   ├── panoptic_val2017
+│   │   ├── train2017
+│   │   ├── val2017
+│   │   ├── test2017
+```
+
+The [cityscapes](https://www.cityscapes-dataset.com/) annotations need to be converted into the coco format using `tools/dataset_converters/cityscapes.py`:
+
+```shell
+pip install cityscapesscripts
+
+python tools/dataset_converters/cityscapes.py \
+    ./data/cityscapes \
+    --nproc 8 \
+    --out-dir ./data/cityscapes/annotations
+```
+
+TODO: CHANGE TO THE NEW PATH
+
+### Test existing models
+
+We provide testing scripts for evaluating an existing model on the whole dataset (COCO, PASCAL VOC, Cityscapes, etc.).
+The following testing environments are supported:
+
+- single GPU
+- CPU
+- single node multiple GPUs
+- multiple nodes
+
+Choose the proper script to perform testing depending on the testing environment.
+
+```shell
+# single-gpu testing
+python tools/test.py \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    [--out ${RESULT_FILE}] \
+    [--eval ${EVAL_METRICS}] \
+    [--show]
+
+# CPU: disable GPUs and run single-gpu testing script
+export CUDA_VISIBLE_DEVICES=-1
+python tools/test.py \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    [--out ${RESULT_FILE}] \
+    [--eval ${EVAL_METRICS}] \
+    [--show]
+
+# multi-gpu testing
+bash tools/dist_test.sh \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    ${GPU_NUM} \
+    [--out ${RESULT_FILE}] \
+    [--eval ${EVAL_METRICS}]
+```
+
+`tools/dist_test.sh` also supports multi-node testing, but relies on PyTorch's [launch utility](https://pytorch.org/docs/stable/distributed.html#launch-utility).
+
+Optional arguments:
+
+- `RESULT_FILE`: Filename of the output results in pickle format. If not specified, the results will not be saved to a file.
+- `EVAL_METRICS`: Items to be evaluated on the results. Allowed values depend on the dataset, e.g., `proposal_fast`, `proposal`, `bbox`, `segm` are available for COCO, `mAP`, `recall` for PASCAL VOC. Cityscapes could be evaluated by `cityscapes` as well as all COCO metrics.
+- `--show`: If specified, detection results will be plotted on the images and shown in a new window. It is only applicable to single GPU testing and used for debugging and visualization. Please make sure that GUI is available in your environment. Otherwise, you may encounter an error like `cannot connect to X server`.
+- `--show-dir`: If specified, detection results will be plotted on the images and saved to the specified directory. It is only applicable to single GPU testing and used for debugging and visualization. You do NOT need a GUI available in your environment for using this option.
+- `--show-score-thr`: If specified, detections with scores below this threshold will be removed.
+- `--cfg-options`:  if specified, the key-value pair optional cfg will be merged into config file
+- `--eval-options`: if specified, the key-value pair optional eval cfg will be kwargs for dataset.evaluate() function, it's only for evaluation
+
+### Examples
+
+Assuming that you have already downloaded the checkpoints to the directory `checkpoints/`.
+
+1. Test Faster R-CNN and visualize the results. Press any key for the next image.
+   Config and checkpoint files are available [here](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn).
+
+   ```shell
+   python tools/test.py \
+       configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+       checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \
+       --show
+   ```
+
+2. Test Faster R-CNN and save the painted images for future visualization.
+   Config and checkpoint files are available [here](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn).
+
+   ```shell
+   python tools/test.py \
+       configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+       checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \
+       --show-dir faster_rcnn_r50_fpn_1x_results
+   ```
+
+3. Test Faster R-CNN on PASCAL VOC (without saving the test results) and evaluate the mAP.
+   Config and checkpoint files are available [here](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc).
+
+   ```shell
+   python tools/test.py \
+       configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc.py \
+       checkpoints/faster_rcnn_r50_fpn_1x_voc0712_20200624-c9895d40.pth \
+       --eval mAP
+   ```
+
+4. Test Mask R-CNN with 8 GPUs, and evaluate the bbox and mask AP.
+   Config and checkpoint files are available [here](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn).
+
+   ```shell
+   ./tools/dist_test.sh \
+       configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py \
+       checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \
+       8 \
+       --out results.pkl \
+       --eval bbox segm
+   ```
+
+5. Test Mask R-CNN with 8 GPUs, and evaluate the **classwise** bbox and mask AP.
+   Config and checkpoint files are available [here](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn).
+
+   ```shell
+   ./tools/dist_test.sh \
+       configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py \
+       checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \
+       8 \
+       --out results.pkl \
+       --eval bbox segm \
+       --eval-options "classwise=True"
+   ```
+
+6. Test Mask R-CNN on COCO test-dev with 8 GPUs, and generate JSON files for submitting to the official evaluation server.
+   Config and checkpoint files are available [here](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn).
+
+   ```shell
+   ./tools/dist_test.sh \
+       configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py \
+       checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \
+       8 \
+       --format-only \
+       --eval-options "jsonfile_prefix=./mask_rcnn_test-dev_results"
+   ```
+
+   This command generates two JSON files `mask_rcnn_test-dev_results.bbox.json` and `mask_rcnn_test-dev_results.segm.json`.
+
+7. Test Mask R-CNN on Cityscapes test with 8 GPUs, and generate txt and png files for submitting to the official evaluation server.
+   Config and checkpoint files are available [here](https://github.com/open-mmlab/mmdetection/tree/master/configs/cityscapes).
+
+   ```shell
+   ./tools/dist_test.sh \
+       configs/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes.py \
+       checkpoints/mask_rcnn_r50_fpn_1x_cityscapes_20200227-afe51d5a.pth \
+       8 \
+       --format-only \
+       --eval-options "txtfile_prefix=./mask_rcnn_cityscapes_test_results"
+   ```
+
+   The generated png and txt would be under `./mask_rcnn_cityscapes_test_results` directory.
+
+### Test without Ground Truth Annotations
+
+MMDetection supports testing models without ground-truth annotations using `CocoDataset`. If your dataset is not in COCO format, please convert it to COCO format. For example, if your dataset format is VOC, you can directly convert it to COCO format with the [script in tools](https://github.com/open-mmlab/mmdetection/tree/master/tools/dataset_converters/pascal_voc.py). If your dataset format is Cityscapes, you can directly convert it to COCO format with the [script in tools](https://github.com/open-mmlab/mmdetection/tree/master/tools/dataset_converters/cityscapes.py). The rest of the formats can be converted using [this script](https://github.com/open-mmlab/mmdetection/tree/master/tools/dataset_converters/images2coco.py).
+
+```shell
+python tools/dataset_converters/images2coco.py \
+    ${IMG_PATH} \
+    ${CLASSES} \
+    ${OUT} \
+    [--exclude-extensions]
+```
+
+Arguments:
+
+- `IMG_PATH`: The root path of images.
+- `CLASSES`: The text file with a list of categories.
+- `OUT`: The output annotation json file name. The save dir is in the same directory as `IMG_PATH`.
+- `exclude-extensions`: The suffix of images to be excluded, such as 'png' and 'bmp'.
+
+After the conversion is complete, you can use the following command to test
+
+```shell
+# single-gpu testing
+python tools/test.py \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    --format-only \
+    --eval-options ${JSONFILE_PREFIX} \
+    [--show]
+
+# CPU: disable GPUs and run single-gpu testing script
+export CUDA_VISIBLE_DEVICES=-1
+python tools/test.py \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    [--out ${RESULT_FILE}] \
+    [--eval ${EVAL_METRICS}] \
+    [--show]
+
+# multi-gpu testing
+bash tools/dist_test.sh \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    ${GPU_NUM} \
+    --format-only \
+    --eval-options ${JSONFILE_PREFIX} \
+    [--show]
+```
+
+Assuming that the checkpoints in the [model zoo](https://mmdetection.readthedocs.io/en/latest/modelzoo_statistics.html) have been downloaded to the directory `checkpoints/`, we can test Mask R-CNN on COCO test-dev with 8 GPUs, and generate JSON files using the following command.
+
+```sh
+./tools/dist_test.sh \
+    configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py \
+    checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \
+    8 \
+    --format-only \
+    --eval-options "jsonfile_prefix=./mask_rcnn_test-dev_results"
+```
+
+This command generates two JSON files `mask_rcnn_test-dev_results.bbox.json` and `mask_rcnn_test-dev_results.segm.json`.
+
+### Batch Inference
+
+MMDetection supports inference with a single image or batched images in test mode. By default, we use single-image inference and you can use batch inference by modifying `samples_per_gpu` in the config of test data. You can do that either by modifying the config as below.
+
+```python
+data = dict(train=dict(...), val=dict(...), test=dict(samples_per_gpu=2, ...))
+```
+
+Or you can set it through `--cfg-options` as `--cfg-options data.test.samples_per_gpu=2`
+
+### Deprecated ImageToTensor
+
+In test mode, the `ImageToTensor` pipeline is deprecated and replaced by `DefaultFormatBundle`. It is recommended to manually replace it in the test data pipeline in your config file. Examples:
+
+```python
+# use ImageToTensor (deprecated)
+pipelines = [
+   dict(type='LoadImageFromFile'),
+   dict(
+       type='MultiScaleFlipAug',
+       img_scale=(1333, 800),
+       flip=False,
+       transforms=[
+           dict(type='Resize', keep_ratio=True),
+           dict(type='RandomFlip'),
+           dict(type='Normalize', mean=[0, 0, 0], std=[1, 1, 1]),
+           dict(type='Pad', size_divisor=32),
+           dict(type='ImageToTensor', keys=['img']),
+           dict(type='Collect', keys=['img']),
+       ])
+   ]
+
+# manually replace ImageToTensor to DefaultFormatBundle (recommended)
+pipelines = [
+   dict(type='LoadImageFromFile'),
+   dict(
+       type='MultiScaleFlipAug',
+       img_scale=(1333, 800),
+       flip=False,
+       transforms=[
+           dict(type='Resize', keep_ratio=True),
+           dict(type='RandomFlip'),
+           dict(type='Normalize', mean=[0, 0, 0], std=[1, 1, 1]),
+           dict(type='Pad', size_divisor=32),
+           dict(type='DefaultFormatBundle'),
+           dict(type='Collect', keys=['img']),
+       ])
+   ]
+```
+
+## Train predefined models on standard datasets
+
+MMDetection also provides out-of-the-box tools for training detection models.
+This section will show how to train _predefined_ models (under [configs](https://github.com/open-mmlab/mmdetection/tree/master/configs)) on standard datasets i.e. COCO.
+
+### Prepare datasets
+
+Training requires preparing datasets too. See section [Prepare datasets](#prepare-datasets) above for details.
+
+**Note**:
+Currently, the config files under `configs/cityscapes` use COCO pretrained weights to initialize.
+You could download the existing models in advance if the network connection is unavailable or slow. Otherwise, it would cause errors at the beginning of training.
+
+### Learning rate automatically scale
+
+**Important**: The default learning rate in config files is for 8 GPUs and 2 samples per GPU (batch size = 8 * 2 = 16). This batch size has been set as `auto_scale_lr.base_batch_size` in `configs/_base_/default_runtime.py`. When enabled, the learning rate is automatically scaled relative to this base batch size of `16`. Meanwhile, in order not to affect other codebases based on mmdet, the flag `auto_scale_lr.enable` is set to `False` by default.
+
+If you want to enable this feature, you need to add argument `--auto-scale-lr`. And you need to check the config name which you want to use before you process the command, because the config name indicates the default batch size.
+By default, it is `8 x 2 = 16 batch size`, like `faster_rcnn_r50_caffe_fpn_90k_coco.py` or `pisa_faster_rcnn_x101_32x4d_fpn_1x_coco.py`. In other cases, the config file name contains `_NxM_` to indicate the batch size, like `cornernet_hourglass104_mstest_32x3_210e_coco.py` whose batch size is `32 x 3 = 96`, or `scnet_x101_64x4d_fpn_8x1_20e_coco.py` whose batch size is `8 x 1 = 8`.
+
+**Please remember to check the bottom of the specific config file you want to use; it will have `auto_scale_lr.base_batch_size` if the batch size is not `16`. If you can't find this value there, check the config files listed in `_base_=[xxx]` and you will find it. Please do not modify its value if you want to automatically scale the LR.**
+
+The basic usage of automatic learning rate scaling is as follows.
+
+```shell
+python tools/train.py \
+    ${CONFIG_FILE} \
+    --auto-scale-lr \
+    [optional arguments]
+```
+
+If you enable this feature, the learning rate will be automatically scaled according to the number of GPUs of the machine and the batch size of training. See [linear scaling rule](https://arxiv.org/abs/1706.02677) for details. For example, if there are 4 GPUs and 2 pictures on each GPU, `lr = 0.01`, then if there are 16 GPUs and 4 pictures on each GPU, it will automatically scale to `lr = 0.08`.
+
+If you don't want to use it, you need to calculate the learning rate according to the [linear scaling rule](https://arxiv.org/abs/1706.02677) manually then change `optimizer.lr` in specific config file.
+
+### Training on a single GPU
+
+We provide `tools/train.py` to launch training jobs on a single GPU.
+The basic usage is as follows.
+
+```shell
+python tools/train.py \
+    ${CONFIG_FILE} \
+    [optional arguments]
+```
+
+During training, log files and checkpoints will be saved to the working directory, which is specified by `work_dir` in the config file or via CLI argument `--work-dir`.
+
+By default, the model is evaluated on the validation set every epoch, the evaluation interval can be specified in the config file as shown below.
+
+```python
+# evaluate the model every 12 epoch.
+evaluation = dict(interval=12)
+```
+
+This tool accepts several optional arguments, including:
+
+- `--no-validate` (**not suggested**): Disable evaluation during training.
+- `--work-dir ${WORK_DIR}`: Override the working directory.
+- `--resume-from ${CHECKPOINT_FILE}`: Resume from a previous checkpoint file.
+- `--options 'Key=value'`: Overrides other settings in the used config.
+
+**Note**:
+
+Difference between `resume-from` and `load-from`:
+
+`resume-from` loads both the model weights and optimizer status, and the epoch is also inherited from the specified checkpoint. It is usually used for resuming the training process that is interrupted accidentally.
+`load-from` only loads the model weights and the training epoch starts from 0. It is usually used for finetuning.
+
+### Training on CPU
+
+The process of training on the CPU is consistent with single GPU training. We just need to disable GPUs before the training process.
+
+```shell
+export CUDA_VISIBLE_DEVICES=-1
+```
+
+And then run the script [above](#training-on-a-single-gpu).
+
+**Note**:
+
+We do not recommend users to use CPU for training because it is too slow. We support this feature to allow users to debug on machines without GPU for convenience.
+
+### Training on multiple GPUs
+
+We provide `tools/dist_train.sh` to launch training on multiple GPUs.
+The basic usage is as follows.
+
+```shell
+bash ./tools/dist_train.sh \
+    ${CONFIG_FILE} \
+    ${GPU_NUM} \
+    [optional arguments]
+```
+
+Optional arguments remain the same as stated [above](#training-on-a-single-gpu).
+
+#### Launch multiple jobs simultaneously
+
+If you would like to launch multiple jobs on a single machine, e.g., 2 jobs of 4-GPU training on a machine with 8 GPUs,
+you need to specify different ports (29500 by default) for each job to avoid communication conflict.
+
+If you use `dist_train.sh` to launch training jobs, you can set the port in commands.
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4
+CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4
+```
+
+### Train with multiple machines
+
+If you launch with multiple machines simply connected with ethernet, you can simply run the following commands:
+
+On the first machine:
+
+```shell
+NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS
+```
+
+On the second machine:
+
+```shell
+NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS
+```
+
+Usually it is slow if you do not have high speed networking like InfiniBand.
+
+### Manage jobs with Slurm
+
+[Slurm](https://slurm.schedmd.com/) is a good job scheduling system for computing clusters.
+On a cluster managed by Slurm, you can use `slurm_train.sh` to spawn training jobs. It supports both single-node and multi-node training.
+
+The basic usage is as follows.
+
+```shell
+[GPUS=${GPUS}] ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${WORK_DIR}
+```
+
+Below is an example of using 16 GPUs to train Mask R-CNN on a Slurm partition named _dev_, and set the work-dir to some shared file systems.
+
+```shell
+GPUS=16 ./tools/slurm_train.sh dev mask_r50_1x configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py /nfs/xxxx/mask_rcnn_r50_fpn_1x
+```
+
+You can check [the source code](https://github.com/open-mmlab/mmdetection/blob/master/tools/slurm_train.sh) to review full arguments and environment variables.
+
+When using Slurm, the port option needs to be set in one of the following ways:
+
+1. Set the port through `--options`. This is more recommended since it does not change the original configs.
+
+   ```shell
+   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR} --options 'dist_params.port=29500'
+   CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR} --options 'dist_params.port=29501'
+   ```
+
+2. Modify the config files to set different communication ports.
+
+   In `config1.py`, set
+
+   ```python
+   dist_params = dict(backend='nccl', port=29500)
+   ```
+
+   In `config2.py`, set
+
+   ```python
+   dist_params = dict(backend='nccl', port=29501)
+   ```
+
+   Then you can launch two jobs with `config1.py` and `config2.py`.
+
+   ```shell
+   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR}
+   CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR}
+   ```
diff --git a/docs/en/2_new_data_model.md b/docs/en/2_new_data_model.md
new file mode 100755
index 0000000..408e8d1
--- /dev/null
+++ b/docs/en/2_new_data_model.md
@@ -0,0 +1,266 @@
+# 2: Train with customized datasets
+
+In this note, you will know how to inference, test, and train predefined models with customized datasets. We use the [balloon dataset](https://github.com/matterport/Mask_RCNN/tree/master/samples/balloon) as an example to describe the whole process.
+
+The basic steps are as below:
+
+1. Prepare the customized dataset
+2. Prepare a config
+3. Train, test, inference models on the customized dataset.
+
+## Prepare the customized dataset
+
+There are three ways to support a new dataset in MMDetection:
+
+1. reorganize the dataset into COCO format.
+2. reorganize the dataset into a middle format.
+3. implement a new dataset.
+
+Usually we recommend to use the first two methods which are usually easier than the third.
+
+In this note, we give an example for converting the data into COCO format.
+
+**Note**: MMDetection only supports evaluating mask AP of dataset in COCO format for now.
+So for instance segmentation task users should convert the data into coco format.
+
+### COCO annotation format
+
+The necessary keys of COCO format for instance segmentation are as below. For the complete details, please refer [here](https://cocodataset.org/#format-data).
+
+```json
+{
+    "images": [image],
+    "annotations": [annotation],
+    "categories": [category]
+}
+
+
+image = {
+    "id": int,
+    "width": int,
+    "height": int,
+    "file_name": str,
+}
+
+annotation = {
+    "id": int,
+    "image_id": int,
+    "category_id": int,
+    "segmentation": RLE or [polygon],
+    "area": float,
+    "bbox": [x,y,width,height],
+    "iscrowd": 0 or 1,
+}
+
+categories = [{
+    "id": int,
+    "name": str,
+    "supercategory": str,
+}]
+```
+
+Assume we use the balloon dataset.
+After downloading the data, we need to implement a function to convert the annotation format into the COCO format. Then we can use the implemented `CocoDataset` to load the data and perform training and evaluation.
+
+If you take a look at the dataset, you will find the dataset format is as below:
+
+```json
+{'base64_img_data': '',
+ 'file_attributes': {},
+ 'filename': '34020010494_e5cb88e1c4_k.jpg',
+ 'fileref': '',
+ 'regions': {'0': {'region_attributes': {},
+   'shape_attributes': {'all_points_x': [1020,
+     1000,
+     994,
+     1003,
+     1023,
+     1050,
+     1089,
+     1134,
+     1190,
+     1265,
+     1321,
+     1361,
+     1403,
+     1428,
+     1442,
+     1445,
+     1441,
+     1427,
+     1400,
+     1361,
+     1316,
+     1269,
+     1228,
+     1198,
+     1207,
+     1210,
+     1190,
+     1177,
+     1172,
+     1174,
+     1170,
+     1153,
+     1127,
+     1104,
+     1061,
+     1032,
+     1020],
+    'all_points_y': [963,
+     899,
+     841,
+     787,
+     738,
+     700,
+     663,
+     638,
+     621,
+     619,
+     643,
+     672,
+     720,
+     765,
+     800,
+     860,
+     896,
+     942,
+     990,
+     1035,
+     1079,
+     1112,
+     1129,
+     1134,
+     1144,
+     1153,
+     1166,
+     1166,
+     1150,
+     1136,
+     1129,
+     1122,
+     1112,
+     1084,
+     1037,
+     989,
+     963],
+    'name': 'polygon'}}},
+ 'size': 1115004}
+```
+
+The annotation is a JSON file where each key corresponds to all annotations of one image.
+The code to convert the balloon dataset into coco format is as below.
+
+```python
+import os.path as osp
+import mmcv
+
+def convert_balloon_to_coco(ann_file, out_file, image_prefix):
+    data_infos = mmcv.load(ann_file)
+
+    annotations = []
+    images = []
+    obj_count = 0
+    for idx, v in enumerate(mmcv.track_iter_progress(data_infos.values())):
+        filename = v['filename']
+        img_path = osp.join(image_prefix, filename)
+        height, width = mmcv.imread(img_path).shape[:2]
+
+        images.append(dict(
+            id=idx,
+            file_name=filename,
+            height=height,
+            width=width))
+
+        bboxes = []
+        labels = []
+        masks = []
+        for _, obj in v['regions'].items():
+            assert not obj['region_attributes']
+            obj = obj['shape_attributes']
+            px = obj['all_points_x']
+            py = obj['all_points_y']
+            poly = [(x + 0.5, y + 0.5) for x, y in zip(px, py)]
+            poly = [p for x in poly for p in x]
+
+            x_min, y_min, x_max, y_max = (
+                min(px), min(py), max(px), max(py))
+
+
+            data_anno = dict(
+                image_id=idx,
+                id=obj_count,
+                category_id=0,
+                bbox=[x_min, y_min, x_max - x_min, y_max - y_min],
+                area=(x_max - x_min) * (y_max - y_min),
+                segmentation=[poly],
+                iscrowd=0)
+            annotations.append(data_anno)
+            obj_count += 1
+
+    coco_format_json = dict(
+        images=images,
+        annotations=annotations,
+        categories=[{'id':0, 'name': 'balloon'}])
+    mmcv.dump(coco_format_json, out_file)
+
+```
+
+Using the function above, users can successfully convert the annotation file into COCO JSON format, then we can use `CocoDataset` to train and evaluate the model.
+
+## Prepare a config
+
+The second step is to prepare a config so that the dataset can be successfully loaded. Assume that we want to use Mask R-CNN with FPN and that the config is under directory `configs/balloon/` and named `mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_balloon.py`. The config to train the detector on the balloon dataset is as below.
+
+```python
+# The new config inherits a base config to highlight the necessary modification
+_base_ = '../mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py'
+
+# We also need to change the num_classes in head to match the dataset's annotation
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(num_classes=1),
+        mask_head=dict(num_classes=1)))
+
+# Modify dataset related settings
+dataset_type = 'CocoDataset'
+classes = ('balloon',)
+data = dict(
+    train=dict(
+        img_prefix='balloon/train/',
+        classes=classes,
+        ann_file='balloon/train/annotation_coco.json'),
+    val=dict(
+        img_prefix='balloon/val/',
+        classes=classes,
+        ann_file='balloon/val/annotation_coco.json'),
+    test=dict(
+        img_prefix='balloon/val/',
+        classes=classes,
+        ann_file='balloon/val/annotation_coco.json'))
+
+# We can use the pre-trained Mask RCNN model to obtain higher performance
+load_from = 'checkpoints/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth'
+```
+
+This checkpoint file can be downloaded [here](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth)
+
+## Train a new model
+
+To train a model with the new config, you can simply run
+
+```shell
+python tools/train.py configs/balloon/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_balloon.py
+```
+
+For more detailed usages, please refer to the [Case 1](1_exist_data_model.md).
+
+## Test and inference
+
+To test the trained model, you can simply run
+
+```shell
+python tools/test.py configs/balloon/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_balloon.py work_dirs/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_balloon/latest.pth --eval bbox segm
+```
+
+For more detailed usages, please refer to the [Case 1](1_exist_data_model.md).
diff --git a/docs/en/3_exist_data_new_model.md b/docs/en/3_exist_data_new_model.md
new file mode 100755
index 0000000..b34c133
--- /dev/null
+++ b/docs/en/3_exist_data_new_model.md
@@ -0,0 +1,283 @@
+# 3: Train with customized models and standard datasets
+
+In this note, you will know how to train, test and inference your own customized models under standard datasets. We use the cityscapes dataset to train a customized Cascade Mask R-CNN R50 model as an example to demonstrate the whole process, which uses [`AugFPN`](https://github.com/Gus-Guo/AugFPN) to replace the default `FPN` as neck, and adds `Rotate` or `Translate` as training-time auto augmentation.
+
+The basic steps are as below:
+
+1. Prepare the standard dataset
+2. Prepare your own customized model
+3. Prepare a config
+4. Train, test, and inference models on the standard dataset.
+
+## Prepare the standard dataset
+
+In this note, we use the standard cityscapes dataset as an example.
+
+It is recommended to symlink the dataset root to `$MMDETECTION/data`.
+If your folder structure is different, you may need to change the corresponding paths in config files.
+
+```none
+mmdetection
+├── mmdet
+├── tools
+├── configs
+├── data
+│   ├── coco
+│   │   ├── annotations
+│   │   ├── train2017
+│   │   ├── val2017
+│   │   ├── test2017
+│   ├── cityscapes
+│   │   ├── annotations
+│   │   ├── leftImg8bit
+│   │   │   ├── train
+│   │   │   ├── val
+│   │   ├── gtFine
+│   │   │   ├── train
+│   │   │   ├── val
+│   ├── VOCdevkit
+│   │   ├── VOC2007
+│   │   ├── VOC2012
+
+```
+
+Or you can set your dataset root through
+
+```bash
+export MMDET_DATASETS=$data_root
+```
+
+We will replace dataset root with `$MMDET_DATASETS`, so you don't have to modify the corresponding path in config files.
+
+The cityscapes annotations have to be converted into the coco format using `tools/dataset_converters/cityscapes.py`:
+
+```shell
+pip install cityscapesscripts
+python tools/dataset_converters/cityscapes.py ./data/cityscapes --nproc 8 --out-dir ./data/cityscapes/annotations
+```
+
+Currently the config files in `cityscapes` use COCO pre-trained weights to initialize.
+You could download the pre-trained models in advance if network is unavailable or slow, otherwise it would cause errors at the beginning of training.
+
+## Prepare your own customized model
+
+The second step is to use your own module or training setting. Assume that we want to implement a new neck called `AugFPN` to replace the default `FPN` under the existing detector Cascade Mask R-CNN R50. The following implements `AugFPN` under MMDetection.
+
+### 1. Define a new neck (e.g. AugFPN)
+
+Firstly create a new file `mmdet/models/necks/augfpn.py`.
+
+```python
+from ..builder import NECKS
+
+@NECKS.register_module()
+class AugFPN(nn.Module):
+
+    def __init__(self,
+                in_channels,
+                out_channels,
+                num_outs,
+                start_level=0,
+                end_level=-1,
+                add_extra_convs=False):
+        pass
+
+    def forward(self, inputs):
+        # implementation is ignored
+        pass
+```
+
+### 2. Import the module
+
+You can either add the following line to `mmdet/models/necks/__init__.py`,
+
+```python
+from .augfpn import AugFPN
+```
+
+or alternatively add
+
+```python
+custom_imports = dict(
+    imports=['mmdet.models.necks.augfpn'],
+    allow_failed_imports=False)
+```
+
+to the config file and avoid modifying the original code.
+
+### 3. Modify the config file
+
+```python
+neck=dict(
+    type='AugFPN',
+    in_channels=[256, 512, 1024, 2048],
+    out_channels=256,
+    num_outs=5)
+```
+
+For more detailed usages about customizing your own models (e.g. implement a new backbone, head, loss, etc) and runtime training settings (e.g. define a new optimizer, use gradient clip, customize training schedules and hooks, etc), please refer to the guideline [Customize Models](tutorials/customize_models.md) and [Customize Runtime Settings](tutorials/customize_runtime.md) respectively.
+
+## Prepare a config
+
+The third step is to prepare a config for your own training setting. Assume that we want to add `AugFPN` and `Rotate` or `Translate` augmentation to existing Cascade Mask R-CNN R50 to train the cityscapes dataset, and assume the config is under directory `configs/cityscapes/` and named as `cascade_mask_rcnn_r50_augfpn_autoaug_10e_cityscapes.py`, the config is as below.
+
+```python
+# The new config inherits the base configs to highlight the necessary modification
+_base_ = [
+    '../_base_/models/cascade_mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/cityscapes_instance.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    # set None to avoid loading ImageNet pretrained backbone,
+    # instead here we set `load_from` to load from COCO pretrained detectors.
+    backbone=dict(init_cfg=None),
+    # replace neck from defaultly `FPN` to our new implemented module `AugFPN`
+    neck=dict(
+        type='AugFPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    # We also need to change the num_classes in head from 80 to 8, to match the
+    # cityscapes dataset's annotation. This modification involves `bbox_head` and `mask_head`.
+    roi_head=dict(
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                # change the number of classes from defaultly COCO to cityscapes
+                num_classes=8,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                # change the number of classes from defaultly COCO to cityscapes
+                num_classes=8,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                # change the number of classes from defaultly COCO to cityscapes
+                num_classes=8,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            # change the number of classes from defaultly COCO to cityscapes
+            num_classes=8,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))))
+
+# over-write `train_pipeline` for new added `AutoAugment` training setting
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='AutoAugment',
+        policies=[
+            [dict(
+                 type='Rotate',
+                 level=5,
+                 img_fill_val=(124, 116, 104),
+                 prob=0.5,
+                 scale=1)
+            ],
+            [dict(type='Rotate', level=7, img_fill_val=(124, 116, 104)),
+             dict(
+                 type='Translate',
+                 level=5,
+                 prob=0.5,
+                 img_fill_val=(124, 116, 104))
+            ],
+        ]),
+    dict(
+        type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+
+# set batch_size per gpu, and set new training pipeline
+data = dict(
+    samples_per_gpu=1,
+    workers_per_gpu=3,
+    # over-write `pipeline` with new training pipeline setting
+    train=dict(dataset=dict(pipeline=train_pipeline)))
+
+# Set optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# Set customized learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[8])
+runner = dict(type='EpochBasedRunner', max_epochs=10)
+
+# We can use the COCO pretrained Cascade Mask R-CNN R50 model for more stable performance initialization
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203-9d4dcb24.pth'
+```
+
+## Train a new model
+
+To train a model with the new config, you can simply run
+
+```shell
+python tools/train.py configs/cityscapes/cascade_mask_rcnn_r50_augfpn_autoaug_10e_cityscapes.py
+```
+
+For more detailed usages, please refer to the [Case 1](1_exist_data_model.md).
+
+## Test and inference
+
+To test the trained model, you can simply run
+
+```shell
+python tools/test.py configs/cityscapes/cascade_mask_rcnn_r50_augfpn_autoaug_10e_cityscapes.py work_dirs/cascade_mask_rcnn_r50_augfpn_autoaug_10e_cityscapes/latest.pth --eval bbox segm
+```
+
+For more detailed usages, please refer to the [Case 1](1_exist_data_model.md).
diff --git a/docs/en/Makefile b/docs/en/Makefile
new file mode 100755
index 0000000..d4bb2cb
--- /dev/null
+++ b/docs/en/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/en/_static/css/readthedocs.css b/docs/en/_static/css/readthedocs.css
new file mode 100755
index 0000000..57ed0ad
--- /dev/null
+++ b/docs/en/_static/css/readthedocs.css
@@ -0,0 +1,6 @@
+.header-logo {
+    background-image: url("../image/mmdet-logo.png");
+    background-size: 156px 40px;
+    height: 40px;
+    width: 156px;
+}
diff --git a/docs/en/_static/image/mmdet-logo.png b/docs/en/_static/image/mmdet-logo.png
new file mode 100755
index 0000000..58e2b5e
Binary files /dev/null and b/docs/en/_static/image/mmdet-logo.png differ
diff --git a/docs/en/api.rst b/docs/en/api.rst
new file mode 100755
index 0000000..e61c663
--- /dev/null
+++ b/docs/en/api.rst
@@ -0,0 +1,108 @@
+mmdet.apis
+--------------
+.. automodule:: mmdet.apis
+    :members:
+
+mmdet.core
+--------------
+
+anchor
+^^^^^^^^^^
+.. automodule:: mmdet.core.anchor
+    :members:
+
+bbox
+^^^^^^^^^^
+.. automodule:: mmdet.core.bbox
+    :members:
+
+export
+^^^^^^^^^^
+.. automodule:: mmdet.core.export
+    :members:
+
+mask
+^^^^^^^^^^
+.. automodule:: mmdet.core.mask
+    :members:
+
+evaluation
+^^^^^^^^^^
+.. automodule:: mmdet.core.evaluation
+    :members:
+
+post_processing
+^^^^^^^^^^^^^^^
+.. automodule:: mmdet.core.post_processing
+    :members:
+
+utils
+^^^^^^^^^^
+.. automodule:: mmdet.core.utils
+    :members:
+
+mmdet.datasets
+--------------
+
+datasets
+^^^^^^^^^^
+.. automodule:: mmdet.datasets
+    :members:
+
+pipelines
+^^^^^^^^^^
+.. automodule:: mmdet.datasets.pipelines
+    :members:
+
+samplers
+^^^^^^^^^^
+.. automodule:: mmdet.datasets.samplers
+    :members:
+
+api_wrappers
+^^^^^^^^^^^^
+.. automodule:: mmdet.datasets.api_wrappers
+    :members:
+
+mmdet.models
+--------------
+
+detectors
+^^^^^^^^^^
+.. automodule:: mmdet.models.detectors
+    :members:
+
+backbones
+^^^^^^^^^^
+.. automodule:: mmdet.models.backbones
+    :members:
+
+necks
+^^^^^^^^^^^^
+.. automodule:: mmdet.models.necks
+    :members:
+
+dense_heads
+^^^^^^^^^^^^
+.. automodule:: mmdet.models.dense_heads
+    :members:
+
+roi_heads
+^^^^^^^^^^
+.. automodule:: mmdet.models.roi_heads
+    :members:
+
+losses
+^^^^^^^^^^
+.. automodule:: mmdet.models.losses
+    :members:
+
+utils
+^^^^^^^^^^
+.. automodule:: mmdet.models.utils
+    :members:
+
+mmdet.utils
+--------------
+.. automodule:: mmdet.utils
+    :members:
diff --git a/docs/en/changelog.md b/docs/en/changelog.md
new file mode 100755
index 0000000..473aec3
--- /dev/null
+++ b/docs/en/changelog.md
@@ -0,0 +1,1897 @@
+## Changelog
+
+### v2.28.2 (24/2/2023)
+
+#### New Features and Improvements
+
+- Add Twitter, Discord, Medium and YouTube link (#9774)
+- Update `customize_runtime.md` (#9797)
+
+#### Bug Fixes
+
+- Fix `WIDERFace SSD` loss for Nan problem (#9734)
+- Fix missing API documentation in Readthedoc (#9729)
+- Fix the configuration file and log path of CenterNet (#9791)
+
+#### Contributors
+
+A total of 4 developers contributed to this release.
+Thanks @co63oc, @Ginray, @vansin, @RangiLyu
+
+### v2.28.1 (1/2/2023)
+
+#### Bug Fixes
+
+- Enable to set float mlp_ratio in SwinTransformer (#8670)
+- Fix import error that causes training failure (#9694)
+- Fix isort version in lint (#9685)
+- Fix init_cfg of YOLOF (#8243)
+
+#### Contributors
+
+A total of 4 developers contributed to this release.
+Thanks @triple-Mu, @i-aki-y, @twmht, @RangiLyu
+
+### v2.28.0 (28/1/2023)
+
+#### Highlights
+
+- Support Objects365 Dataset and Separated and Occluded COCO metric
+- Support acceleration of RetinaNet and SSD on Ascend
+- Deprecate the support of Python 3.6
+
+#### New Features and Improvements
+
+- Support Objects365 Dataset (#7525)
+- Support [Separated and Occluded COCO metric](https://arxiv.org/abs/2210.10046) (#9574)
+- Support acceleration of RetinaNet and SSD on Ascend with documentation (#9648, #9614)
+- Added missing `-` to `--format-only` in documentation.
+
+#### Deprecations
+
+- Upgrade the minimum Python version to 3.7, the support of Python 3.6 is no longer guaranteed (#9604)
+
+#### Bug Fixes
+
+- Fix validation loss logging (#9663)
+- Fix inconsistent float precision between mmdet and mmcv (#9570)
+- Fix argument name for fp32 in `DeformableDETRHead` (#9607)
+- Fix typo of all config file path in Metafile.yml (#9627)
+
+#### Contributors
+
+A total of 11 developers contributed to this release.
+Thanks @eantono, @akstt, @lpizzinidev, @RangiLyu, @kbumsik, @tianleiSHI, @nijkah, @BIGWangYuDong, @wangjiangben-hw, @jamiechoi1995, @ZwwWayne
+
+#### New Contributors
+
+- @kbumsik made their first contribution in https://github.com/open-mmlab/mmdetection/pull/9627
+- @akstt made their first contribution in https://github.com/open-mmlab/mmdetection/pull/9614
+- @lpizzinidev made their first contribution in https://github.com/open-mmlab/mmdetection/pull/9649
+- @eantono made their first contribution in https://github.com/open-mmlab/mmdetection/pull/9663
+
+### v2.27.0 (5/1/2023)
+
+#### Highlights
+
+- Support receptive field search of CNN models([TPAMI 2022: RF-Next](http://mftp.mmcheng.net/Papers/22TPAMI-ActionSeg.pdf)) (#8191)
+
+#### Bug Fixes
+
+- Fix deadlock issue related with MMDetWandbHook (#9476)
+
+#### Improvements
+
+- Add minimum GitHub token permissions for workflows (#8928)
+- Delete compatible code for parrots in roi extractor (#9503)
+- Deprecate np.bool Type Alias (#9498)
+- Replace numpy transpose with torch permute to speed-up data pre-processing (#9533)
+
+#### Documents
+
+- Fix typo in docs/zh_cn/tutorials/config.md (#9416)
+- Fix Faster RCNN FP16 config link in README (#9366)
+
+#### Contributors
+
+A total of 12 developers contributed to this release.
+Thanks @Min-Sheng, @gasvn, @lzyhha, @jbwang1997, @zachcoleman, @chenyuwang814, @MilkClouds, @Fizzez, @boahc077, @apatsekin, @zytx121, @DonggeunYu
+
+### v2.26.0 (23/11/2022)
+
+#### Highlights
+
+- Support training on [NPU](docs/en/device/npu.md) (#9267)
+
+#### Bug Fixes
+
+- Fix RPN visualization (#9151)
+- Fix readthedocs by freezing the dependency versions (#9154)
+- Fix device argument error in MMDet_Tutorial.ipynb (#9112)
+- Fix solov2 failing to deal with empty gt image (#9185)
+- Fix random flipping ratio comparison of mixup image (#9336)
+
+#### Improvements
+
+- Complement necessary argument of seg_suffix of cityscapes (#9330)
+- Support copy paste based on bbox when there is no gt mask (#8905)
+- Make scipy as a default dependency in runtime (#9186)
+
+#### Documents
+
+- Delete redundant Chinese characters in docs (#9175)
+- Add MMEval in README (#9217)
+
+#### Contributors
+
+A total of 11 developers contributed to this release.
+Thanks @wangjiangben-hw, @motokimura, @AdorableJiang, @BainOuO, @JarvisKevin, @wanghonglie, @zytx121, @BIGWangYuDong, @hhaAndroid, @RangiLyu, @ZwwWayne
+
+### v2.25.3 (25/10/2022)
+
+#### Bug Fixes
+
+- Skip remote sync when wandb is offline (#8755)
+- Fix jpg to png bug when using seg maps (#9078)
+
+#### Improvements
+
+- Fix typo in warning (#8844)
+- Fix CI for timm, pycocotools, onnx (#9034)
+- Upgrade pre-commit hooks (#8964)
+
+#### Documents
+
+- Update BoundedIoULoss config in readme (#8808)
+- Fix Faster R-CNN Readme (#8803)
+- Update location of test_cfg and train_cfg (#8792)
+- Fix issue template (#8966)
+- Update random sampler docstring (#9033)
+- Fix wrong image link (#9054)
+- Fix FPG readme (#9041)
+
+#### Contributors
+
+A total of 13 developers contributed to this release.
+Thanks @Zheng-LinXiao, @i-aki-y, @fbagci, @sudoAimer, @Czm369, @DrRyanHuang, @RangiLyu, @wanghonglie, @shinya7y, @Ryoo72, @akshaygulabrao, @gy-7, @Neesky
+
+### v2.25.2 (15/9/2022)
+
+#### Bug Fixes
+
+- Fix DyDCNv2 RuntimeError (#8485)
+- Fix repeated import of CascadeRPNHead (#8578)
+- Fix absolute positional embedding of swin backbone (#8127)
+- Fix get train_pipeline method of val workflow (#8575)
+
+#### Improvements
+
+- Upgrade onnxsim to at least 0.4.0 (#8383)
+- Support tuple format in analyze_results script (#8549)
+- Fix floordiv warning (#8648)
+
+#### Documents
+
+- Fix typo in HTC link (#8487)
+- Fix docstring of `BboxOverlaps2D` (#8512)
+- Added missed Chinese tutorial link (#8564)
+- Fix mistakes in gaussian radius formula (#8607)
+- Update config documentation about how to Add WandB Hook (#8663)
+- Add mmengine link in readme (#8799)
+- Update issue template (#8802)
+
+#### Contributors
+
+A total of 16 developers contributed to this release.
+Thanks @daquexian, @lyq10085, @ZwwWayne, @fbagci, @BubblyYi, @fathomson, @ShunchiZhang, @ceasona, @Happylkx, @normster, @chhluo, @Lehsuby, @JiayuXu0, @Nourollah, @hewanru-bit, @RangiLyu
+
+### v2.25.1 (29/7/2022)
+
+#### Bug Fixes
+
+- Fix single GPU distributed training of cuda device specifying (#8176)
+- Fix PolygonMask bug in FilterAnnotations (#8136)
+- Fix mdformat version to support python3.6 (#8195)
+- Fix GPG key error in Dockerfile (#8215)
+- Fix `WandbLoggerHook` error (#8273)
+- Fix Pytorch 1.10 incompatibility issues (#8439)
+
+#### Improvements
+
+- Add `mim` to `extras_require` in setup.py (#8194)
+- Support get image shape on macOS (#8434)
+- Add test commands of `mim` in CI (#8230 & #8240)
+- Update `maskformer` to be compatible when cfg is a dictionary (#8263)
+- Clean `Pillow` version check in CI (#8229)
+
+#### Documents
+
+- Change example hook name in tutorials (#8118)
+- Update projects (#8120)
+- Update metafile and release new models (#8294)
+- Add download link in tutorials (#8391)
+
+#### Contributors
+
+A total of 15 developers contributed to this release.
+Thanks @ZwwWayne, @ayulockin, @Mxbonn, @p-mishra1, @Youth-Got, @MiXaiLL76, @chhluo, @jbwang1997, @atinfinity, @shinya7y, @duanzhihua, @STLAND-admin, @BIGWangYuDong, @grimoire, @xiaoyuan0203
+
+### v2.25.0 (31/5/2022)
+
+#### Highlights
+
+- Support dedicated `WandbLogger` hook
+- Support [ConvNeXt](configs/convnext), [DDOD](configs/ddod), [SOLOv2](configs/solov2)
+- Support [Mask2Former](configs/mask2former) for instance segmentation
+- Rename [config files of Mask2Former](configs/mask2former)
+
+#### Backwards incompatible changes
+
+- Rename [config files of Mask2Former](configs/mask2former) (#7571)
+
+  <table align="center">
+    <thead>
+        <tr align='center'>
+            <td>before v2.25.0</td>
+            <td>after v2.25.0</td>
+        </tr>
+    </thead>
+    <tbody><tr valign='top'>
+    <th>
+
+  - `mask2former_xxx_coco.py` represents config files for **panoptic segmentation**.
+
+  </th>
+    <th>
+
+  - `mask2former_xxx_coco.py` represents config files for **instance segmentation**.
+  - `mask2former_xxx_coco-panoptic.py` represents config files for **panoptic segmentation**.
+
+  </th></tr>
+  </tbody></table>
+
+#### New Features
+
+- Support [ConvNeXt](https://arxiv.org/abs/2201.03545) (#7281)
+- Support [DDOD](https://arxiv.org/abs/2107.02963) (#7279)
+- Support [SOLOv2](https://arxiv.org/abs/2003.10152) (#7441)
+- Support [Mask2Former](https://arxiv.org/abs/2112.01527) for instance segmentation (#7571, #8032)
+
+#### Bug Fixes
+
+- Enable YOLOX training on different devices (#7912)
+- Fix the log plot error when evaluation with `interval != 1` (#7784)
+- Fix RuntimeError of HTC (#8083)
+
+#### Improvements
+
+- Support dedicated `WandbLogger` hook (#7459)
+
+  Users can set
+
+  ```python
+  cfg.log_config.hooks = [
+    dict(type='MMDetWandbHook',
+         init_kwargs={'project': 'MMDetection-tutorial'},
+         interval=10,
+         log_checkpoint=True,
+         log_checkpoint_metadata=True,
+         num_eval_images=10)]
+  ```
+
+  in the config to use `MMDetWandbHook`. Example can be found in this [colab tutorial](https://colab.research.google.com/drive/1RCSXHZwDZvakFh3eo9RuNrJbCGqD0dru?usp=sharing#scrollTo=WTEdPDRaBz2C)
+
+- Add `AvoidOOM` to avoid OOM (#7434, #8091)
+
+  Try to use `AvoidCUDAOOM` to avoid GPU out of memory. It will first retry after calling `torch.cuda.empty_cache()`. If it still fails, it will then retry by converting the type of inputs to FP16 format. If it still fails, it will try to copy inputs from GPUs to CPUs to continue computing. Try AvoidOOM in code to make the code continue to run when GPU memory runs out:
+
+  ```python
+  from mmdet.utils import AvoidCUDAOOM
+
+  output = AvoidCUDAOOM.retry_if_cuda_oom(some_function)(input1, input2)
+  ```
+
+  Users can also try `AvoidCUDAOOM` as a decorator to make the code continue to run when GPU memory runs out:
+
+  ```python
+  from mmdet.utils import AvoidCUDAOOM
+
+  @AvoidCUDAOOM.retry_if_cuda_oom
+  def function(*args, **kwargs):
+      ...
+      return xxx
+  ```
+
+- Support reading `gpu_collect` from `cfg.evaluation.gpu_collect` (#7672)
+
+- Speedup the Video Inference by Accelerating data-loading Stage (#7832)
+
+- Support replacing the `${key}` with the value of `cfg.key` (#7492)
+
+- Accelerate result analysis in `analyze_result.py`. The evaluation is sped up by 10 ~ 15 times and only takes 10 ~ 15 minutes now. (#7891)
+
+- Support to set `block_dilations` in `DilatedEncoder` (#7812)
+
+- Support panoptic segmentation result analysis (#7922)
+
+- Release DyHead with Swin-Large backbone (#7733)
+
+- Documentations updating and adding
+
+  - Fix wrong default type of `act_cfg` in `SwinTransformer` (#7794)
+  - Fix text errors in the tutorials (#7959)
+  - Rewrite the [installation guide](docs/en/get_started.md) (#7897)
+  - [Useful hooks](docs/en/tutorials/useful_hooks.md) (#7810)
+  - Fix heading anchor in documentation  (#8006)
+  - Replace `markdownlint` with `mdformat` for avoiding installing ruby (#8009)
+
+#### Contributors
+
+A total of 20 developers contributed to this release.
+
+Thanks @ZwwWayne, @DarthThomas, @solyaH, @LutingWang, @chenxinfeng4, @Czm369, @Chenastron, @chhluo, @austinmw, @Shanyaliux, @hellock, @Y-M-Y, @jbwang1997, @hhaAndroid, @Irvingao, @zhanggefan, @BIGWangYuDong, @Keiku, @PeterVennerstrom, @ayulockin
+
+### v2.24.0 (26/4/2022)
+
+#### Highlights
+
+- Support [Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation](https://arxiv.org/abs/2012.07177)
+- Support automatically scaling LR according to GPU number and samples per GPU
+- Support Class Aware Sampler that improves performance on OpenImages Dataset
+
+#### New Features
+
+- Support [Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation](https://arxiv.org/abs/2012.07177), see [example configs](configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco.py) (#7501)
+
+- Support Class Aware Sampler, users can set
+
+  ```python
+  data=dict(train_dataloader=dict(class_aware_sampler=dict(num_sample_class=1)))
+  ```
+
+  in the config to use `ClassAwareSampler`. Examples can be found in [the configs of OpenImages Dataset](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages.py).  (#7436)
+
+- Support automatically scaling LR according to GPU number and samples per GPU. (#7482)
+  In each config, there is a corresponding config of auto-scaling LR as below,
+
+  ```python
+  auto_scale_lr = dict(enable=True, base_batch_size=N)
+  ```
+
+  where `N` is the batch size used for the current learning rate in the config (also equal to `samples_per_gpu` * gpu number to train this config).
+  By default, we set `enable=False` so that the original usages will not be affected. Users can set `enable=True` in each config or add `--auto-scale-lr` after the command line to enable this feature and should check the correctness of `base_batch_size` in customized configs.
+
+- Support setting dataloader arguments in config and add functions to handle config compatibility. (#7668)
+  The comparison between the old and new usages is as below.
+
+  <table align="center">
+    <thead>
+        <tr align='center'>
+            <td>v2.23.0</td>
+            <td>v2.24.0</td>
+        </tr>
+    </thead>
+    <tbody><tr valign='top'>
+    <th>
+
+  ```python
+  data = dict(
+      samples_per_gpu=64, workers_per_gpu=4,
+      train=dict(type='xxx', ...),
+      val=dict(type='xxx', samples_per_gpu=4, ...),
+      test=dict(type='xxx', ...),
+  )
+  ```
+
+  </th>
+    <th>
+
+  ```python
+  # A recommended config that is clear
+  data = dict(
+      train=dict(type='xxx', ...),
+      val=dict(type='xxx', ...),
+      test=dict(type='xxx', ...),
+      # Use different batch size during inference.
+      train_dataloader=dict(samples_per_gpu=64, workers_per_gpu=4),
+      val_dataloader=dict(samples_per_gpu=8, workers_per_gpu=2),
+      test_dataloader=dict(samples_per_gpu=8, workers_per_gpu=2),
+  )
+
+  # Old style still works but allows to set more arguments about data loaders
+  data = dict(
+      samples_per_gpu=64,  # only works for train_dataloader
+      workers_per_gpu=4,  # only works for train_dataloader
+      train=dict(type='xxx', ...),
+      val=dict(type='xxx', ...),
+      test=dict(type='xxx', ...),
+      # Use different batch size during inference.
+      val_dataloader=dict(samples_per_gpu=8, workers_per_gpu=2),
+      test_dataloader=dict(samples_per_gpu=8, workers_per_gpu=2),
+  )
+  ```
+
+  </th></tr>
+  </tbody></table>
+
+- Support memory profile hook. Users can use it to monitor the memory usages during training as below (#7560)
+
+  ```python
+  custom_hooks = [
+      dict(type='MemoryProfilerHook', interval=50)
+  ]
+  ```
+
+- Support to run on PyTorch with MLU chip (#7578)
+
+- Support re-splitting data batch with tag (#7641)
+
+- Support the `DiceCost` used by [K-Net](https://arxiv.org/abs/2106.14855) in `MaskHungarianAssigner` (#7716)
+
+- Support splitting COCO data for Semi-supervised object detection (#7431)
+
+- Support Pathlib for Config.fromfile (#7685)
+
+- Support to use file client in OpenImages dataset (#7433)
+
+- Add a probability parameter to Mosaic transformation (#7371)
+
+- Support specifying interpolation mode in `Resize` pipeline (#7585)
+
+#### Bug Fixes
+
+- Avoid invalid bbox after deform_sampling (#7567)
+- Fix the issue that argument color_theme does not take effect when exporting confusion matrix (#7701)
+- Fix the `end_level` in Necks, which should be the index of the end input backbone level (#7502)
+- Fix the bug that `mix_results` may be None in `MultiImageMixDataset` (#7530)
+- Fix the bug in ResNet plugin when two plugins are used (#7797)
+
+#### Improvements
+
+- Enhance `load_json_logs` of analyze_logs.py for resumed training logs (#7732)
+- Add argument `out_file` in image_demo.py (#7676)
+- Allow mixed precision training with `SimOTAAssigner` (#7516)
+- Updated INF to 100000.0 to be the same as that in the official YOLOX (#7778)
+- Add documentations of:
+  - how to get channels of a new backbone (#7642)
+  - how to unfreeze the backbone network (#7570)
+  - how to train fast_rcnn model (#7549)
+  - proposals in Deformable DETR (#7690)
+  - from-scratch install script in get_started.md (#7575)
+- Release pre-trained models of
+  - [Mask2Former](configs/mask2former) (#7595, #7709)
+  - RetinaNet with ResNet-18 and release models (#7387)
+  - RetinaNet with EfficientNet backbone (#7646)
+
+#### Contributors
+
+A total of 27 developers contributed to this release.
+Thanks @jovialio, @zhangsanfeng2022, @HarryZJ, @jamiechoi1995, @nestiank, @PeterH0323, @RangeKing, @Y-M-Y, @mattcasey02, @weiji14, @Yulv-git, @xiefeifeihu, @FANG-MING, @meng976537406, @nijkah, @sudz123, @CCODING04, @SheffieldCao, @Czm369, @BIGWangYuDong, @zytx121, @jbwang1997, @chhluo, @jshilong, @RangiLyu, @hhaAndroid, @ZwwWayne
+
+### v2.23.0 (28/3/2022)
+
+#### Highlights
+
+- Support Mask2Former: [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527)
+- Support EfficientNet: [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946)
+- Support setting data root through environment variable `MMDET_DATASETS`, users don't have to modify the corresponding path in config files anymore.
+- Find a good recipe for fine-tuning high precision ResNet backbone pre-trained by Torchvision.
+
+#### New Features
+
+- Support [Mask2Former](configs/mask2former)(#6938)(#7466)(#7471)
+- Support [EfficientNet](configs/efficientnet) (#7514)
+- Support setting data root through environment variable `MMDET_DATASETS`, users don't have to modify the corresponding path in config files anymore. (#7386)
+- Support setting different seeds to different ranks (#7432)
+- Update the `dist_train.sh` so that the script can be used to support launching multi-node training on machines without slurm (#7415)
+- Find a good recipe for fine-tuning high precision ResNet backbone pre-trained by Torchvision (#7489)
+
+#### Bug Fixes
+
+- Fix bug in VOC unit test which removes the data directory (#7270)
+- Adjust the order of `get_classes` and `FileClient` (#7276)
+- Force the inputs of `get_bboxes` in yolox_head to float32 (#7324)
+- Fix misplaced arguments in LoadPanopticAnnotations (#7388)
+- Fix reduction=mean in CELoss. (#7449)
+- Update unit test of CrossEntropyCost (#7537)
+- Fix memory leaking in panoptic segmentation evaluation (#7538)
+- Fix the bug of shape broadcast in YOLOv3 (#7551)
+
+#### Improvements
+
+- Add Chinese version of onnx2tensorrt.md (#7219)
+- Update colab tutorials (#7310)
+- Update information about Localization Distillation (#7350)
+- Add Chinese version of `finetune.md` (#7178)
+- Update YOLOX log for non square input (#7235)
+- Add `nproc` in `coco_panoptic.py` for panoptic quality computing (#7315)
+- Allow to set channel_order in LoadImageFromFile (#7258)
+- Take point sample related functions out of mask_point_head (#7353)
+- Add instance evaluation for coco_panoptic (#7313)
+- Enhance the robustness of analyze_logs.py (#7407)
+- Supplementary notes of sync_random_seed (#7440)
+- Update docstring of cross entropy loss (#7472)
+- Update pascal voc result (#7503)
+- We create How-to documentation to record any questions about How to xxx. In this version, we added
+  - How to use Mosaic augmentation (#7507)
+  - How to use backbone in mmcls (#7438)
+  - How to produce and submit the prediction results of panoptic segmentation models on COCO test-dev set (#7430)
+
+#### Contributors
+
+A total of 27 developers contributed to this release.
+Thanks @ZwwWayne, @haofanwang, @shinya7y, @chhluo, @yangrisheng, @triple-Mu, @jbwang1997, @HikariTJU, @imflash217, @274869388, @zytx121, @matrixgame2018, @jamiechoi1995, @BIGWangYuDong, @JingweiZhang12, @Xiangxu-0103, @hhaAndroid, @jshilong, @osbm, @ceroytres, @bunge-bedstraw-herb, @Youth-Got, @daavoo, @jiangyitong, @RangiLyu, @CCODING04, @yarkable
+
+### v2.22.0 (24/2/2022)
+
+#### Highlights
+
+- Support MaskFormer: [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) (#7212)
+- Support DyHead: [Dynamic Head: Unifying Object Detection Heads with Attentions](https://arxiv.org/abs/2106.08322) (#6823)
+- Release a good recipe of using ResNet in object detectors pre-trained by [ResNet Strikes Back](https://arxiv.org/abs/2110.00476), which consistently brings about 3~4 mAP improvements over RetinaNet, Faster/Mask/Cascade Mask R-CNN (#7001)
+- Support [Open Images Dataset](https://storage.googleapis.com/openimages/web/index.html) (#6331)
+- Support TIMM backbone: [PyTorch Image Models](https://github.com/rwightman/pytorch-image-models) (#7020)
+
+#### New Features
+
+- Support [MaskFormer](configs/maskformer) (#7212)
+- Support [DyHead](configs/dyhead) (#6823)
+- Support [ResNet Strikes Back](configs/resnet_strikes_back) (#7001)
+- Support [OpenImages Dataset](configs/openimages) (#6331)
+- Support [TIMM backbone](configs/timm_example) (#7020)
+- Support visualization for Panoptic Segmentation (#7041)
+
+#### Breaking Changes
+
+In order to support the visualization for Panoptic Segmentation, the `num_classes` can not be `None` when using the `get_palette` function to determine whether to use the panoptic palette.
+
+#### Bug Fixes
+
+- Fix bug for the best checkpoints can not be saved when the `key_score` is None (#7101)
+- Fix MixUp transform filter boxes failing case (#7080)
+- Add missing properties in SABLHead (#7091)
+- Fix bug when NaNs exist in confusion matrix (#7147)
+- Fix PALETTE AttributeError in downstream task (#7230)
+
+#### Improvements
+
+- Speed up SimOTA matching (#7098)
+- Add Chinese translation of `docs_zh-CN/tutorials/init_cfg.md` (#7188)
+
+#### Contributors
+
+A total of 20 developers contributed to this release.
+Thanks @ZwwWayne, @hhaAndroid, @RangiLyu, @AronLin, @BIGWangYuDong, @jbwang1997, @zytx121, @chhluo, @shinya7y, @LuooChen, @dvansa, @siatwangmin, @del-zhenwu, @vikashranjan26, @haofanwang, @jamiechoi1995, @HJoonKwon, @yarkable, @zhijian-liu, @RangeKing
+
+### v2.21.0 (8/2/2022)
+
+#### Breaking Changes
+
+To standardize the contents in config READMEs and meta files of OpenMMLab projects, the READMEs and meta files in each config directory have been significantly changed. The template will be released in the future, for now, you can refer to the examples of README for [algorithm](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/README.md), [dataset](https://github.com/open-mmlab/mmdetection/blob/master/configs/deepfashion/README.md) and [backbone](https://github.com/open-mmlab/mmdetection/blob/master/configs/regnet/README.md). To align with the standard, the configs in dcn are put into two directories named `dcn` and `dcnv2`.
+
+#### New Features
+
+- Allow to customize colors of different classes during visualization (#6716)
+- Support CPU training (#7016)
+- Add download script of COCO, LVIS, and VOC dataset (#7015)
+
+#### Bug Fixes
+
+- Fix weight conversion issue of RetinaNet with Swin-S (#6973)
+- Update `__repr__` of `Compose` (#6951)
+- Fix BadZipFile Error when build docker (#6966)
+- Fix bug in non-distributed multi-gpu training/testing (#7019)
+- Fix bbox clamp in PyTorch 1.10 (#7074)
+- Relax the requirement of PALETTE in dataset wrappers (#7085)
+- Keep the same weights before reassign in the PAA head (#7032)
+- Update code demo in doc (#7092)
+
+#### Improvements
+
+- Speed up training by allowing users to set variables of multi-processing (#6974, #7036)
+- Add links of Chinese tutorials in readme (#6897)
+- Disable cv2 multiprocessing by default for acceleration (#6867)
+- Deprecate the support for "python setup.py test" (#6998)
+- Re-organize metafiles and config readmes (#7051)
+- Fix None grad problem during training TOOD by adding `SigmoidGeometricMean` (#7090)
+
+#### Contributors
+
+A total of 26 developers contributed to this release.
+Thanks @del-zhenwu, @zimoqingfeng, @srishilesh, @imyhxy, @jenhaoyang, @jliu-ac, @kimnamu, @ShengliLiu, @garvan2021, @ciusji, @DIYer22, @kimnamu, @q3394101, @zhouzaida, @gaotongxiao, @topsy404, @AntoAndGar, @jbwang1997, @nijkah, @ZwwWayne, @Czm369, @jshilong, @RangiLyu, @BIGWangYuDong, @hhaAndroid, @AronLin
+
+### v2.20.0 (27/12/2021)
+
+#### New Features
+
+- Support [TOOD](configs/tood/README.md): Task-aligned One-stage Object Detection (ICCV 2021 Oral) (#6746)
+- Support resuming from the latest checkpoint automatically (#6727)
+
+#### Bug Fixes
+
+- Fix wrong bbox `loss_weight` of the PAA head (#6744)
+- Fix the padding value of `gt_semantic_seg` in batch collating (#6837)
+- Fix test error of lvis when using `classwise` (#6845)
+- Avoid BC-breaking of `get_local_path`  (#6719)
+- Fix bug in `sync_norm_hook` when the BN layer does not exist (#6852)
+- Use pycocotools directly no matter what platform it is (#6838)
+
+#### Improvements
+
+- Add unit test for SimOTA with no valid bbox (#6770)
+- Use precommit to check readme (#6802)
+- Support selecting GPU-ids in non-distributed testing time (#6781)
+
+#### Contributors
+
+A total of 16 developers contributed to this release.
+Thanks @ZwwWayne, @Czm369, @jshilong, @RangiLyu, @BIGWangYuDong, @hhaAndroid, @jamiechoi1995, @AronLin, @Keiku, @gkagkos, @fcakyon, @www516717402, @vansin, @zactodd, @kimnamu, @jenhaoyang
+
+### v2.19.1 (14/12/2021)
+
+#### New Features
+
+- Release [YOLOX](configs/yolox/README.md) COCO pretrained models (#6698)
+
+#### Bug Fixes
+
+- Fix DCN initialization in DenseHead (#6625)
+- Fix initialization of ConvFCHead (#6624)
+- Fix PseudoSampler in RCNN (#6622)
+- Fix weight initialization in Swin and PVT (#6663)
+- Fix dtype bug in BaseDenseHead (#6767)
+- Fix SimOTA with no valid bbox (#6733)
+
+#### Improvements
+
+- Add an example of combining swin and one-stage models (#6621)
+- Add `get_ann_info` to dataset_wrappers (#6526)
+- Support keeping image ratio in the multi-scale training of YOLOX (#6732)
+- Support `bbox_clip_border` for the augmentations of YOLOX (#6730)
+
+#### Documents
+
+- Update metafile (#6717)
+- Add mmhuman3d in readme (#6699)
+- Update FAQ docs (#6587)
+- Add doc for `detect_anomalous_params` (#6697)
+
+#### Contributors
+
+A total of 11 developers contributed to this release.
+Thanks @ZwwWayne, @LJoson, @Czm369, @jshilong, @ZCMax, @RangiLyu, @BIGWangYuDong, @hhaAndroid, @zhaoxin111, @GT9505, @shinya7y
+
+### v2.19.0 (29/11/2021)
+
+#### Highlights
+
+- Support [Label Assignment Distillation](https://arxiv.org/abs/2108.10520)
+- Support `persistent_workers` for Pytorch >= 1.7
+- Align accuracy to the updated official YOLOX
+
+#### New Features
+
+- Support [Label Assignment Distillation](https://arxiv.org/abs/2108.10520) (#6342)
+- Support `persistent_workers` for Pytorch >= 1.7 (#6435)
+
+#### Bug Fixes
+
+- Fix repeatedly output warning message (#6584)
+- Avoid infinite GPU waiting in dist training (#6501)
+- Fix SSD512 config error (#6574)
+- Fix MMDetection model to ONNX command (#6558)
+
+#### Improvements
+
+- Refactor configs of FP16 models (#6592)
+- Align accuracy to the updated official YOLOX (#6443)
+- Speed up training and reduce memory cost when using PhotoMetricDistortion. (#6442)
+- Make OHEM work with seesaw loss (#6514)
+
+#### Documents
+
+- Update README.md (#6567)
+
+#### Contributors
+
+A total of 11 developers contributed to this release.
+Thanks @FloydHsiu, @RangiLyu, @ZwwWayne, @AndreaPi, @st9007a, @hachreak, @BIGWangYuDong, @hhaAndroid, @AronLin, @chhluo, @vealocia, @HarborYuan, @st9007a, @jshilong
+
+### v2.18.1 (15/11/2021)
+
+#### Highlights
+
+- Release [QueryInst](http://arxiv.org/abs/2105.01928) pre-trained weights (#6460)
+- Support plot confusion matrix (#6344)
+
+#### New Features
+
+- Release [QueryInst](http://arxiv.org/abs/2105.01928) pre-trained weights (#6460)
+- Support plot confusion matrix (#6344)
+
+#### Bug Fixes
+
+- Fix aug test error when the number of prediction bboxes is 0 (#6398)
+- Fix SpatialReductionAttention in PVT (#6488)
+- Fix wrong use of `trunc_normal_init` in PVT and Swin-Transformer (#6432)
+
+#### Improvements
+
+- Save the printed AP information of COCO API to logger (#6505)
+- Always map location to cpu when load checkpoint (#6405)
+- Set a random seed when the user does not set a seed (#6457)
+
+#### Documents
+
+- Chinese version of [Corruption Benchmarking](robustness_benchmarking.md) (#6375)
+- Fix config path in docs (#6396)
+- Update GRoIE readme (#6401)
+
+#### Contributors
+
+A total of 11 developers contributed to this release.
+Thanks @st9007a, @hachreak, @HarborYuan, @vealocia, @chhluo, @AndreaPi, @AronLin, @BIGWangYuDong, @hhaAndroid, @RangiLyu, @ZwwWayne
+
+### v2.18.0 (27/10/2021)
+
+#### Highlights
+
+- Support [QueryInst](http://arxiv.org/abs/2105.01928) (#6050)
+- Refactor dense heads to decouple onnx export logics from `get_bboxes` and speed up inference (#5317, #6003, #6369, #6268, #6315)
+
+#### New Features
+
+- Support [QueryInst](http://arxiv.org/abs/2105.01928) (#6050)
+- Support infinite sampler (#5996)
+
+#### Bug Fixes
+
+- Fix init_weight in fcn_mask_head (#6378)
+- Fix type error in imshow_bboxes of RPN (#6386)
+- Fix broken colab link in MMDetection Tutorial (#6382)
+- Make sure the device and dtype of scale_factor are the same as bboxes (#6374)
+- Remove sampling hardcode (#6317)
+- Fix RandomAffine bbox coordinate recorrection (#6293)
+- Fix init bug of final cls/reg layer in convfc head (#6279)
+- Fix img_shape broken in auto_augment (#6259)
+- Fix kwargs parameter missing error in two_stage (#6256)
+
+#### Improvements
+
+- Unify the interface of stuff head and panoptic head (#6308)
+- Polish readme (#6243)
+- Add code-spell pre-commit hook and fix a typo (#6306)
+- Fix typo (#6245, #6190)
+- Fix sampler unit test (#6284)
+- Fix `forward_dummy` of YOLACT to enable `get_flops` (#6079)
+- Fix link error in the config documentation (#6252)
+- Adjust the order to beautify the document (#6195)
+
+#### Refactors
+
+- Refactor one-stage get_bboxes logic (#5317)
+- Refactor ONNX export of One-Stage models (#6003, #6369)
+- Refactor dense_head and speedup (#6268)
+- Migrate to use prior_generator in training of dense heads (#6315)
+
+#### Contributors
+
+A total of 18 developers contributed to this release.
+Thanks @Boyden, @onnkeat, @st9007a, @vealocia, @yhcao6, @DapangpangX, @yellowdolphin, @cclauss, @kennymckormick,
+@pingguokiller, @collinzrj, @AndreaPi, @AronLin, @BIGWangYuDong, @hhaAndroid, @jshilong, @RangiLyu, @ZwwWayne
+
+### v2.17.0 (28/9/2021)
+
+#### Highlights
+
+- Support [PVT](https://arxiv.org/abs/2102.12122) and [PVTv2](https://arxiv.org/abs/2106.13797)
+- Support [SOLO](https://arxiv.org/abs/1912.04488)
+- Support large scale jittering and New Mask R-CNN baselines
+- Speed up `YOLOv3` inference
+
+#### New Features
+
+- Support [PVT](https://arxiv.org/abs/2102.12122) and [PVTv2](https://arxiv.org/abs/2106.13797) (#5780)
+- Support [SOLO](https://arxiv.org/abs/1912.04488) (#5832)
+- Support large scale jittering and New Mask R-CNN baselines (#6132)
+- Add a general data structure for the results of models (#5508)
+- Added a base class for one-stage instance segmentation (#5904)
+- Speed up `YOLOv3` inference (#5991)
+- Release Swin Transformer pre-trained models (#6100)
+- Support mixed precision training in `YOLOX` (#5983)
+- Support `val` workflow in `YOLACT` (#5986)
+- Add script to test `torchserve` (#5936)
+- Support `onnxsim` with dynamic input shape (#6117)
+
+#### Bug Fixes
+
+- Fix the function naming errors in `model_wrappers` (#5975)
+- Fix regression loss bug when the input is an empty tensor (#5976)
+- Fix scores not contiguous error in `centernet_head` (#6016)
+- Fix missing parameters bug in `imshow_bboxes` (#6034)
+- Fix bug in `aug_test` of `HTC` when the length of `det_bboxes` is 0 (#6088)
+- Fix empty proposal errors in the training of some two-stage models (#5941)
+- Fix `dynamic_axes` parameter error in `ONNX` dynamic shape export (#6104)
+- Fix `dynamic_shape` bug of `SyncRandomSizeHook` (#6144)
+- Fix the Swin Transformer config link error in the configuration (#6172)
+
+#### Improvements
+
+- Add filter rules in `Mosaic` transform (#5897)
+- Add size divisor in get flops to avoid some potential bugs (#6076)
+- Add Chinese translation of `docs_zh-CN/tutorials/customize_dataset.md` (#5915)
+- Add Chinese translation of `conventions.md` (#5825)
+- Add description of the output of data pipeline (#5886)
+- Add dataset information in the README file for `PanopticFPN` (#5996)
+- Add `extra_repr` for `DropBlock` layer to get details in the model printing (#6140)
+- Fix CI out of memory and add PyTorch1.9 Python3.9 unit tests (#5862)
+- Fix download links error of some model (#6069)
+- Improve the generalization of XML dataset (#5943)
+- Polish assertion error messages (#6017)
+- Remove `opencv-python-headless` dependency by `albumentations` (#5868)
+- Check dtype in transform unit tests (#5969)
+- Replace the default theme of documentation with PyTorch Sphinx Theme (#6146)
+- Update the paper and code fields in the metafile (#6043)
+- Support to customize padding value of segmentation map (#6152)
+- Support to resize multiple segmentation maps (#5747)
+
+#### Contributors
+
+A total of 24 developers contributed to this release.
+Thanks @morkovka1337, @HarborYuan, @guillaumefrd, @guigarfr, @www516717402, @gaotongxiao, @ypwhs, @MartaYang, @shinya7y, @justiceeem, @zhaojinjian0000, @VVsssssk, @aravind-anantha, @wangbo-zhao, @czczup, @whai362, @czczup, @marijnl, @AronLin, @BIGWangYuDong, @hhaAndroid, @jshilong, @RangiLyu, @ZwwWayne
+
+### v2.16.0 (30/8/2021)
+
+#### Highlights
+
+- Support [Panoptic FPN](https://arxiv.org/abs/1901.02446) and [Swin Transformer](https://arxiv.org/abs/2103.14030)
+
+#### New Features
+
+- Support [Panoptic FPN](https://arxiv.org/abs/1901.02446) and release models (#5577, #5902)
+- Support Swin Transformer backbone (#5748)
+- Release RetinaNet models pre-trained with multi-scale 3x schedule (#5636)
+- Add script to convert unlabeled image list to coco format (#5643)
+- Add hook to check whether the loss value is valid (#5674)
+- Add YOLO anchor optimizing tool (#5644)
+- Support export onnx models without post process. (#5851)
+- Support classwise evaluation in CocoPanopticDataset (#5896)
+- Adapt browse_dataset for concatenated datasets. (#5935)
+- Add `PatchEmbed` and `PatchMerging` with `AdaptivePadding` (#5952)
+
+#### Bug Fixes
+
+- Fix unit tests of YOLOX (#5859)
+- Fix loss of randomness in `imshow_det_bboxes` (#5845)
+- Make output result of `ImageToTensor` contiguous (#5756)
+- Fix inference bug when calling `regress_by_class` in RoIHead in some cases (#5884)
+- Fix bug in CIoU loss where alpha should not have gradient. (#5835)
+- Fix the bug that `multiscale_output` is defined but not used in HRNet (#5887)
+- Set the priority of EvalHook to LOW. (#5882)
+- Fix a YOLOX bug when applying bbox rescaling in test mode (#5899)
+- Fix mosaic coordinate error (#5947)
+- Fix dtype of bbox in RandomAffine. (#5930)
+
+#### Improvements
+
+- Add Chinese version of `data_pipeline` and  (#5662)
+- Support to remove state dicts of EMA when publishing models. (#5858)
+- Refactor the loss function in HTC and SCNet (#5881)
+- Use warnings instead of logger.warning (#5540)
+- Use legacy coordinate in metric of VOC (#5627)
+- Add Chinese version of customize_losses (#5826)
+- Add Chinese version of model_zoo (#5827)
+
+#### Contributors
+
+A total of 19 developers contributed to this release.
+Thanks @ypwhs, @zywvvd, @collinzrj, @OceanPang, @ddonatien, @haotian-liu, @viibridges, @Muyun99, @guigarfr, @zhaojinjian0000, @jbwang1997, @wangbo-zhao, @xvjiarui, @RangiLyu, @jshilong, @AronLin, @BIGWangYuDong, @hhaAndroid, @ZwwWayne
+
+### v2.15.1 (11/8/2021)
+
+#### Highlights
+
+- Support [YOLOX](https://arxiv.org/abs/2107.08430)
+
+#### New Features
+
+- Support [YOLOX](https://arxiv.org/abs/2107.08430) (#5756, #5758, #5760, #5767, #5770, #5774, #5777, #5808, #5828, #5848)
+
+#### Bug Fixes
+
+- Update correct SSD models. (#5789)
+- Fix casting error in mask structure (#5820)
+- Fix MMCV deployment documentation links. (#5790)
+
+#### Improvements
+
+- Use dynamic MMCV download link in TorchServe dockerfile (#5779)
+- Rename the function `upsample_like` to `interpolate_as` for more general usage (#5788)
+
+#### Contributors
+
+A total of 14 developers contributed to this release.
+Thanks @HAOCHENYE, @xiaohu2015, @HsLOL, @zhiqwang, @Adamdad, @shinya7y, @Johnson-Wang, @RangiLyu, @jshilong, @mmeendez8, @AronLin, @BIGWangYuDong, @hhaAndroid, @ZwwWayne
+
+### v2.15.0 (02/8/2021)
+
+#### Highlights
+
+- Support adding [MIM](https://github.com/open-mmlab/mim) dependencies during pip installation
+- Support MobileNetV2 for SSD-Lite and YOLOv3
+- Support Chinese Documentation
+
+#### New Features
+
+- Add function `upsample_like` (#5732)
+- Support to output pdf and epub format documentation (#5738)
+- Support and release Cascade Mask R-CNN 3x pre-trained models (#5645)
+- Add `ignore_index` to CrossEntropyLoss (#5646)
+- Support adding [MIM](https://github.com/open-mmlab/mim) dependencies during pip installation (#5676)
+- Add MobileNetV2 config and models for YOLOv3 (#5510)
+- Support COCO Panoptic Dataset (#5231)
+- Support ONNX export of cascade models (#5486)
+- Support DropBlock with RetinaNet (#5544)
+- Support MobileNetV2 SSD-Lite (#5526)
+
+#### Bug Fixes
+
+- Fix the device of label in multiclass_nms (#5673)
+- Fix error of backbone initialization from pre-trained checkpoint in config file (#5603, #5550)
+- Fix download links of RegNet pretrained weights (#5655)
+- Fix two-stage runtime error given empty proposal (#5559)
+- Fix flops count error in DETR (#5654)
+- Fix unittest for `NumClassCheckHook` when it is not used. (#5626)
+- Fix description bug of using custom dataset (#5546)
+- Fix bug of `multiclass_nms` that returns the global indices (#5592)
+- Fix `valid_mask` logic error in RPNHead (#5562)
+- Fix unit test error of pretrained configs (#5561)
+- Fix typo error in anchor_head.py (#5555)
+- Fix bug when using dataset wrappers (#5552)
+- Fix a typo error in demo/MMDet_Tutorial.ipynb (#5511)
+- Fixing crash in `get_root_logger` when `cfg.log_level` is not None (#5521)
+- Fix docker version (#5502)
+- Fix optimizer parameter error when using `IterBasedRunner` (#5490)
+
+#### Improvements
+
+- Add unit tests for MMTracking (#5620)
+- Add Chinese translation of documentation (#5718, #5618, #5558, #5423, #5593, #5421, #5408, #5369, #5419, #5530, #5531)
+- Update resource limit (#5697)
+- Update docstring for InstaBoost (#5640)
+- Support key `reduction_override` in all loss functions (#5515)
+- Use repeatdataset to accelerate CenterNet training (#5509)
+- Remove unnecessary code in autoassign (#5519)
+- Add documentation about `init_cfg` (#5273)
+
+#### Contributors
+
+A total of 18 developers contributed to this release.
+Thanks @OceanPang, @AronLin, @hellock, @Outsider565, @RangiLyu, @ElectronicElephant, @likyoo, @BIGWangYuDong, @hhaAndroid, @noobying, @yyz561, @likyoo,
+@zeakey, @ZwwWayne, @ChenyangLiu, @johnson-magic, @qingswu, @BuxianChen
+
+### v2.14.0 (29/6/2021)
+
+#### Highlights
+
+- Add `simple_test` to dense heads to improve the consistency of single-stage and two-stage detectors
+- Revert the `test_mixins` to single image test to improve efficiency and readability
+- Add Faster R-CNN and Mask R-CNN config using multi-scale training with 3x schedule
+
+#### New Features
+
+- Support pretrained models from MoCo v2 and SwAV (#5286)
+- Add Faster R-CNN and Mask R-CNN config using multi-scale training with 3x schedule (#5179, #5233)
+- Add `reduction_override` in MSELoss (#5437)
+- Stable support of exporting DETR to ONNX with dynamic shapes and batch inference (#5168)
+- Stable support of exporting PointRend to ONNX with dynamic shapes and batch inference (#5440)
+
+#### Bug Fixes
+
+- Fix size mismatch bug in `multiclass_nms` (#4980)
+- Fix the import path of `MultiScaleDeformableAttention` (#5338)
+- Fix errors in config of GCNet ResNext101 models (#5360)
+- Fix Grid-RCNN error when there is no bbox result (#5357)
+- Fix errors in `onnx_export` of bbox_head when setting reg_class_agnostic (#5468)
+- Fix type error of AutoAssign in the document (#5478)
+- Fix web links ending with `.md` (#5315)
+
+#### Improvements
+
+- Add `simple_test` to dense heads to improve the consistency of single-stage and two-stage detectors (#5264)
+- Add support for mask diagonal flip in TTA (#5403)
+- Revert the `test_mixins` to single image test to improve efficiency and readability (#5249)
+- Make YOLOv3 Neck more flexible (#5218)
+- Refactor SSD to make it more general (#5291)
+- Refactor `anchor_generator` and `point_generator` (#5349)
+- Allow to configure out the `mask_head` of the HTC algorithm (#5389)
+- Delete deprecated warning in FPN (#5311)
+- Move `model.pretrained` to `model.backbone.init_cfg` (#5370)
+- Make deployment tools more friendly to use (#5280)
+- Clarify installation documentation (#5316)
+- Add ImageNet Pretrained Models docs (#5268)
+- Add FAQ about solutions for training loss=nan and COCO AP or AR = -1 (#5312, #5313)
+- Change all weight links of http to https (#5328)
+
+### v2.13.0 (01/6/2021)
+
+#### Highlights
+
+- Support new methods: [CenterNet](https://arxiv.org/abs/1904.07850), [Seesaw Loss](https://arxiv.org/abs/2008.10032), [MobileNetV2](https://arxiv.org/abs/1801.04381)
+
+#### New Features
+
+- Support paper [Objects as Points](https://arxiv.org/abs/1904.07850) (#4602)
+- Support paper [Seesaw Loss for Long-Tailed Instance Segmentation (CVPR 2021)](https://arxiv.org/abs/2008.10032) (#5128)
+- Support [MobileNetV2](https://arxiv.org/abs/1801.04381) backbone and inverted residual block (#5122)
+- Support [MIM](https://github.com/open-mmlab/mim) (#5143)
+- ONNX exportation with dynamic shapes of CornerNet (#5136)
+- Add `mask_soft` config option to allow non-binary masks (#4615)
+- Add PWC metafile (#5135)
+
+#### Bug Fixes
+
+- Fix YOLOv3 FP16 training error (#5172)
+- Fix Cascade R-CNN TTA test error when `det_bboxes` length is 0 (#5221)
+- Fix `iou_thr` variable naming errors in VOC recall calculation function (#5195)
+- Fix Faster R-CNN performance dropped in ONNX Runtime (#5197)
+- Fix DETR "dict changed during iteration" error when using Python 3.8 (#5226)
+
+#### Improvements
+
+- Refactor ONNX export of two stage detector (#5205)
+- Replace MMDetection's EvalHook with MMCV's EvalHook for consistency  (#4806)
+- Update RoI extractor for ONNX (#5194)
+- Use better parameter initialization in YOLOv3 head for higher performance (#5181)
+- Release new DCN models of Mask R-CNN by mixed-precision training (#5201)
+- Update YOLOv3 model weights (#5229)
+- Add DetectoRS ResNet-101 model weights (#4960)
+- Discard bboxes with sizes equals to `min_bbox_size` (#5011)
+- Remove duplicated code in DETR head (#5129)
+- Remove unnecessary object in class definition (#5180)
+- Fix doc link (#5192)
+
+### v2.12.0 (01/5/2021)
+
+#### Highlights
+
+- Support new methods: [AutoAssign](https://arxiv.org/abs/2007.03496), [YOLOF](https://arxiv.org/abs/2103.09460), and [Deformable DETR](https://arxiv.org/abs/2010.04159)
+- Stable support of exporting models to ONNX with batched images and dynamic shape (#5039)
+
+#### Backwards Incompatible Changes
+
+MMDetection is going through big refactoring for more general and convenient usages during the releases from v2.12.0 to v2.15.0 (maybe longer).
+In v2.12.0 MMDetection inevitably brings some BC-breakings, including the MMCV dependency, model initialization, model registry, and mask AP evaluation.
+
+- MMCV version. MMDetection v2.12.0 relies on the newest features in MMCV 1.3.3, including `BaseModule` for unified parameter initialization, model registry, and the CUDA operator `MultiScaleDeformableAttn` for [Deformable DETR](https://arxiv.org/abs/2010.04159). Note that MMCV 1.3.2 already contains all the features used by MMDet but has known issues. Therefore, we recommend users skip MMCV v1.3.2 and use v1.3.3, though v1.3.2 might work for most cases.
+- Unified model initialization (#4750). To unify the parameter initialization in OpenMMLab projects, MMCV supports `BaseModule` that accepts `init_cfg` to allow the modules' parameters to be initialized in a flexible and unified manner. Now the users need to explicitly call `model.init_weights()` in the training script to initialize the model (as in [here](https://github.com/open-mmlab/mmdetection/blob/master/tools/train.py#L162)); previously this was handled by the detector. The models in MMDetection have been re-benchmarked to ensure accuracy based on PR #4750. __The downstream projects should update their code accordingly to use MMDetection v2.12.0__.
+- Unified model registry (#5059). To easily use backbones implemented in other OpenMMLab projects, MMDetection migrates to inherit the model registry created in MMCV (#760). In this way, as long as the backbone is supported in an OpenMMLab project and that project also uses the registry in MMCV, users can use that backbone in MMDetection by simply modifying the config without copying the code of that backbone into MMDetection.
+- Mask AP evaluation (#4898). Previous versions calculate the areas of masks through the bounding boxes when calculating the mask AP of small, medium, and large instances. To indeed use the areas of masks, we pop the key `bbox` during mask AP calculation. This change does not affect the overall mask AP evaluation and aligns the mask AP of similar models in other projects like Detectron2.
+
+#### New Features
+
+- Support paper [AutoAssign: Differentiable Label Assignment for Dense Object Detection](https://arxiv.org/abs/2007.03496) (#4295)
+- Support paper [You Only Look One-level Feature](https://arxiv.org/abs/2103.09460) (#4295)
+- Support paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) (#4778)
+- Support calculating IoU with FP16 tensor in `bbox_overlaps` to save memory and keep speed (#4889)
+- Add `__repr__` in custom dataset to count the number of instances (#4756)
+- Add windows support by updating requirements.txt (#5052)
+- Stable support of exporting models to ONNX with batched images and dynamic shape, including SSD, FSAF, FCOS, YOLOv3, RetinaNet, Faster R-CNN, and Mask R-CNN (#5039)
+
+#### Improvements
+
+- Use MMCV `MODEL_REGISTRY` (#5059)
+- Unified parameter initialization for more flexible usage (#4750)
+- Rename variable names and fix docstring in anchor head (#4883)
+- Support training with empty GT in Cascade RPN (#4928)
+- Add more details of usage of `test_robustness` in documentation (#4917)
+- Changing to use `pycocotools` instead of `mmpycocotools` to fully support Detectron2 and MMDetection in one environment (#4939)
+- Update torch serve dockerfile to support dockers of more versions (#4954)
+- Add check for training with single class dataset (#4973)
+- Refactor transformer and DETR Head (#4763)
+- Update FPG model zoo (#5079)
+- More accurate mask AP of small/medium/large instances (#4898)
+
+#### Bug Fixes
+
+- Fix bug in mean_ap.py when calculating mAP by 11 points (#4875)
+- Fix error when key `meta` is not in old checkpoints (#4936)
+- Fix hanging bug when training with empty GT in VFNet, GFL, and FCOS by changing the place of `reduce_mean` (#4923, #4978, #5058)
+- Fix asynchronous inference error and provide related demo (#4941)
+- Fix IoU losses dimensionality mismatch error (#4982)
+- Fix torch.randperm when using PyTorch 1.8 (#5014)
+- Fix empty bbox error in `mask_head` when using CARAFE (#5062)
+- Fix `supplement_mask` bug when there are zero-size RoIs (#5065)
+- Fix testing with empty rois in RoI Heads (#5081)
+
+### v2.11.0 (01/4/2021)
+
+__Highlights__
+
+- Support new method: [Localization Distillation for Object Detection](https://arxiv.org/pdf/2102.12252.pdf)
+- Support Pytorch2ONNX with batch inference and dynamic shape
+
+__New Features__
+
+- Support [Localization Distillation for Object Detection](https://arxiv.org/pdf/2102.12252.pdf) (#4758)
+- Support Pytorch2ONNX with batch inference and dynamic shape for Faster-RCNN and mainstream one-stage detectors (#4796)
+
+__Improvements__
+
+- Support batch inference in head of RetinaNet (#4699)
+- Add batch dimension in second stage of Faster-RCNN (#4785)
+- Support batch inference in bbox coder (#4721)
+- Add check for `ann_ids` in `COCODataset` to ensure it is unique (#4789)
+- support for showing the FPN results (#4716)
+- support dynamic shape for grid_anchor (#4684)
+- Move pycocotools version check to when it is used (#4880)
+
+__Bug Fixes__
+
+- Fix a bug of TridentNet when doing the batch inference (#4717)
+- Fix a bug of Pytorch2ONNX in FSAF (#4735)
+- Fix a bug when showing the image with float type (#4732)
+
+### v2.10.0 (01/03/2021)
+
+#### Highlights
+
+- Support new methods: [FPG](https://arxiv.org/abs/2004.03580)
+- Support ONNX2TensorRT for SSD, FSAF, FCOS, YOLOv3, and Faster R-CNN.
+
+#### New Features
+
+- Support ONNX2TensorRT for SSD, FSAF, FCOS, YOLOv3, and Faster R-CNN (#4569)
+- Support [Feature Pyramid Grids (FPG)](https://arxiv.org/abs/2004.03580) (#4645)
+- Support video demo (#4420)
+- Add seed option for sampler (#4665)
+- Support to customize type of runner (#4570, #4669)
+- Support synchronizing BN buffer in `EvalHook` (#4582)
+- Add script for GIF demo (#4573)
+
+#### Bug Fixes
+
+- Fix ConfigDict AttributeError and add Colab link (#4643)
+- Avoid crash in empty gt training of GFL head (#4631)
+- Fix `iou_thrs` bug in RPN evaluation (#4581)
+- Fix syntax error of config when upgrading model version (#4584)
+
+#### Improvements
+
+- Refactor unit test file structures (#4600)
+- Refactor nms config (#4636)
+- Get loading pipeline by checking the class directly rather than through config strings (#4619)
+- Add doctests for mask target generation and mask structures (#4614)
+- Use deep copy when copying pipeline arguments (#4621)
+- Update documentations (#4642, #4650, #4620, #4630)
+- Remove redundant code calling `import_modules_from_strings` (#4601)
+- Clean deprecated FP16 API (#4571)
+- Check whether `CLASSES` is correctly initialized in the initialization of `XMLDataset` (#4555)
+- Support batch inference in the inference API (#4462, #4526)
+- Clean deprecated warning and fix 'meta' error (#4695)
+
+### v2.9.0 (01/02/2021)
+
+#### Highlights
+
+- Support new methods: [SCNet](https://arxiv.org/abs/2012.10150), [Sparse R-CNN](https://arxiv.org/abs/2011.12450)
+- Move `train_cfg` and `test_cfg` into model in configs
+- Support to visualize results based on prediction quality
+
+#### New Features
+
+- Support [SCNet](https://arxiv.org/abs/2012.10150) (#4356)
+- Support [Sparse R-CNN](https://arxiv.org/abs/2011.12450) (#4219)
+- Support evaluate mAP by multiple IoUs (#4398)
+- Support concatenate dataset for testing (#4452)
+- Support to visualize results based on prediction quality (#4441)
+- Add ONNX simplify option to Pytorch2ONNX script (#4468)
+- Add hook for checking compatibility of class numbers in heads and datasets (#4508)
+
+#### Bug Fixes
+
+- Fix CPU inference bug of Cascade RPN (#4410)
+- Fix NMS error of CornerNet when there is no prediction box (#4409)
+- Fix TypeError in CornerNet inference (#4411)
+- Fix bug of PAA when training with background images (#4391)
+- Fix the error that the window data is not destroyed when `out_file is not None` and `show==False` (#4442)
+- Fix order of NMS `score_factor` that will decrease the performance of YOLOv3 (#4473)
+- Fix bug in HTC TTA when the number of detection boxes is 0 (#4516)
+- Fix resize error in mask data structures (#4520)
+
+#### Improvements
+
+- Allow to customize classes in LVIS dataset (#4382)
+- Add tutorials for building new models with existing datasets (#4396)
+- Add CPU compatibility information in documentation (#4405)
+- Add documentation of deprecated `ImageToTensor` for batch inference (#4408)
+- Add more details in documentation for customizing dataset (#4430)
+- Switch `imshow_det_bboxes` visualization backend from OpenCV to Matplotlib (#4389)
+- Deprecate `ImageToTensor` in `image_demo.py` (#4400)
+- Move train_cfg/test_cfg into model (#4347, #4489)
+- Update docstring for `reg_decoded_bbox` option in bbox heads (#4467)
+- Update dataset information in documentation (#4525)
+- Release pre-trained R50 and R101 PAA detectors with multi-scale 3x training schedules (#4495)
+- Add guidance for speed benchmark (#4537)
+
+### v2.8.0 (04/01/2021)
+
+#### Highlights
+
+- Support new methods: [Cascade RPN](https://arxiv.org/abs/1909.06720), [TridentNet](https://arxiv.org/abs/1901.01892)
+
+#### New Features
+
+- Support [Cascade RPN](https://arxiv.org/abs/1909.06720) (#1900)
+- Support [TridentNet](https://arxiv.org/abs/1901.01892) (#3313)
+
+#### Bug Fixes
+
+- Fix bug of show result in async_benchmark (#4367)
+- Fix scale factor in MaskTestMixin (#4366)
+- Fix bug when returning indices in `multiclass_nms` (#4362)
+- Fix bug of empirical attention in resnext backbone error (#4300)
+- Fix bug of `img_norm_cfg` in FCOS-HRNet models with updated performance and models (#4250)
+- Fix invalid checkpoint and log in Mask R-CNN models on Cityscapes dataset (#4287)
+- Fix bug in distributed sampler when dataset is too small (#4257)
+- Fix bug of 'PAFPN has no attribute extra_convs_on_inputs' (#4235)
+
+#### Improvements
+
+- Update model url from aws to aliyun (#4349)
+- Update ATSS for PyTorch 1.6+ (#4359)
+- Update script to install ruby in pre-commit installation (#4360)
+- Delete deprecated `mmdet.ops` (#4325)
+- Refactor hungarian assigner for more general usage in Sparse R-CNN (#4259)
+- Handle scipy import in DETR to reduce package dependencies (#4339)
+- Update documentation of usages for config options after MMCV (1.2.3) supports overriding list in config (#4326)
+- Update pre-train models of faster rcnn trained on COCO subsets (#4307)
+- Avoid zero or too small value for beta in Dynamic R-CNN (#4303)
+- Add documentation for Pytorch2ONNX (#4271)
+- Add deprecated warning FPN arguments (#4264)
+- Support returning indices of kept bboxes when using nms (#4251)
+- Update type and device requirements when creating tensors `GFLHead` (#4210)
+- Update device requirements when creating tensors in `CrossEntropyLoss` (#4224)
+
+### v2.7.0 (30/11/2020)
+
+- Support new method: [DETR](https://arxiv.org/abs/2005.12872), [ResNeSt](https://arxiv.org/abs/2004.08955), Faster R-CNN DC5.
+- Support YOLO, Mask R-CNN, and Cascade R-CNN models exportable to ONNX.
+
+#### New Features
+
+- Support [DETR](https://arxiv.org/abs/2005.12872) (#4201, #4206)
+- Support to link the best checkpoint in training (#3773)
+- Support to override config through options in inference.py (#4175)
+- Support YOLO, Mask R-CNN, and Cascade R-CNN models exportable to ONNX (#4087, #4083)
+- Support [ResNeSt](https://arxiv.org/abs/2004.08955) backbone (#2959)
+- Support unclip border bbox regression (#4076)
+- Add tpfp func in evaluating AP (#4069)
+- Support mixed precision training of SSD detector with other backbones (#4081)
+- Add Faster R-CNN DC5 models (#4043)
+
+#### Bug Fixes
+
+- Fix bug of `gpu_id` in distributed training mode (#4163)
+- Support Albumentations with version higher than 0.5 (#4032)
+- Fix num_classes bug in faster rcnn config (#4088)
+- Update code in docs/2_new_data_model.md (#4041)
+
+#### Improvements
+
+- Ensure DCN offset to have similar type as features in VFNet (#4198)
+- Add config links in README files of models (#4190)
+- Add tutorials for loss conventions (#3818)
+- Add solution to installation issues in 30-series GPUs (#4176)
+- Update docker version in get_started.md (#4145)
+- Add model statistics and polish some titles in configs README (#4140)
+- Clamp neg probability in FreeAnchor (#4082)
+- Speed up expanding large images (#4089)
+- Fix Pytorch 1.7 incompatibility issues (#4103)
+- Update trouble shooting page to resolve segmentation fault (#4055)
+- Update aLRP-Loss in project page (#4078)
+- Clean duplicated `reduce_mean` function (#4056)
+- Refactor Q&A (#4045)
+
+### v2.6.0 (1/11/2020)
+
+- Support new method: [VarifocalNet](https://arxiv.org/abs/2008.13367).
+- Refactored documentation with more tutorials.
+
+#### New Features
+
+- Support GIoU calculation in `BboxOverlaps2D`, and re-implement `giou_loss` using `bbox_overlaps` (#3936)
+- Support random sampling in CPU mode (#3948)
+- Support VarifocalNet (#3666, #4024)
+
+#### Bug Fixes
+
+- Fix SABL validating bug in Cascade R-CNN (#3913)
+- Avoid division by zero in PAA head when num_pos=0 (#3938)
+- Fix temporary directory bug of multi-node testing error (#4034, #4017)
+- Fix `--show-dir` option in test script (#4025)
+- Fix GA-RetinaNet r50 model url (#3983)
+- Update code in docs and fix broken urls (#3947)
+
+#### Improvements
+
+- Refactor pytorch2onnx API into `mmdet.core.export` and use `generate_inputs_and_wrap_model` for pytorch2onnx (#3857, #3912)
+- Update RPN upgrade scripts for v2.5.0 compatibility (#3986)
+- Use mmcv `tensor2imgs` (#4010)
+- Update test robustness (#4000)
+- Update trouble shooting page (#3994)
+- Accelerate PAA training speed (#3985)
+- Support batch_size > 1 in validation (#3966)
+- Use RoIAlign implemented in MMCV for inference in CPU mode (#3930)
+- Documentation refactoring (#4031)
+
+### v2.5.0 (5/10/2020)
+
+#### Highlights
+
+- Support new methods: [YOLACT](https://arxiv.org/abs/1904.02689), [CentripetalNet](https://arxiv.org/abs/2003.09119).
+- Add more documentations for easier and more clear usage.
+
+#### Backwards Incompatible Changes
+
+__FP16 related methods are imported from mmcv instead of mmdet. (#3766, #3822)__
+Mixed precision training utils in `mmdet.core.fp16` are moved to `mmcv.runner`, including `force_fp32`, `auto_fp16`, `wrap_fp16_model`, and `Fp16OptimizerHook`. A deprecation warning will be raised if users attempt to import those methods from `mmdet.core.fp16`, and will be finally removed in V2.10.0.
+
+__\[0, N-1\] represents foreground classes and N indicates background classes for all models. (#3221)__
+Before v2.5.0, the background label for RPN is 0, and N for other heads. Now the behavior is consistent for all models. Thus `self.background_labels` in `dense_heads` is removed and all heads use `self.num_classes` to indicate the class index of background labels.
+This change has no effect on the pre-trained models in the v2.x model zoo, but will affect the training of all models with RPN heads. Two-stage detectors whose RPN head uses softmax will be affected because the order of categories is changed.
+
+**Only call `get_subset_by_classes` when `test_mode=True` and `self.filter_empty_gt=True` (#3695)**
+Function `get_subset_by_classes` in dataset is refactored and only filters out images when `test_mode=True` and `self.filter_empty_gt=True`.
+In the original implementation, `get_subset_by_classes` is not related to the flag `self.filter_empty_gt` and will only be called when the classes are set during initialization no matter whether `test_mode` is `True` or `False`. This brings ambiguous behavior and potential bugs in many cases. After v2.5.0, if `filter_empty_gt=False`, no matter whether the classes are specified in a dataset, the dataset will use all the images in the annotations. If `filter_empty_gt=True` and `test_mode=True`, no matter whether the classes are specified, the dataset will call `get_subset_by_classes` to check the images and filter out images containing no GT boxes. Therefore, the users should be responsible for the data filtering/cleaning process for the test dataset.
+
+#### New Features
+
+- Test time augmentation for single stage detectors (#3844, #3638)
+- Support to show the name of experiments during training (#3764)
+- Add `Shear`, `Rotate`, `Translate` Augmentation (#3656, #3619, #3687)
+- Add image-only transformations including `Contrast`, `Equalize`, `Color`, and `Brightness`. (#3643)
+- Support [YOLACT](https://arxiv.org/abs/1904.02689) (#3456)
+- Support [CentripetalNet](https://arxiv.org/abs/2003.09119) (#3390)
+- Support PyTorch 1.6 in docker (#3905)
+
+#### Bug Fixes
+
+- Fix the bug of training ATSS when there is no ground truth boxes (#3702)
+- Fix the bug of using Focal Loss when `num_pos` is 0 (#3702)
+- Fix the label index mapping in dataset browser (#3708)
+- Fix Mask R-CNN training stuck problem when there are no positive rois (#3713)
+- Fix the bug of `self.rpn_head.test_cfg` in `RPNTestMixin` by using `self.rpn_head` in rpn head (#3808)
+- Fix deprecated `Conv2d` from mmcv.ops (#3791)
+- Fix device bug in RepPoints (#3836)
+- Fix SABL validating bug (#3849)
+- Use `https://download.openmmlab.com/mmcv/dist/index.html` for installing MMCV (#3840)
+- Fix nonzero in NMS for PyTorch 1.6.0 (#3867)
+- Fix the API change bug of PAA (#3883)
+- Fix typo in bbox_flip (#3886)
+- Fix cv2 import error of libGL.so.1 in Dockerfile (#3891)
+
+#### Improvements
+
+- Change to use `mmcv.utils.collect_env` for collecting environment information to avoid duplicate codes (#3779)
+- Update checkpoint file names to v2.0 models in documentation (#3795)
+- Update tutorials for changing runtime settings (#3778), modifying loss (#3777)
+- Improve the function of `simple_test_bboxes` in SABL (#3853)
+- Convert mask to bool before using it as img's index for robustness and speedup (#3870)
+- Improve documentation of modules and dataset customization (#3821)
+
+### v2.4.0 (5/9/2020)
+
+__Highlights__
+
+- Fix lots of issues/bugs and reorganize the trouble shooting page
+- Support new methods [SABL](https://arxiv.org/abs/1912.04260), [YOLOv3](https://arxiv.org/abs/1804.02767), and [PAA Assign](https://arxiv.org/abs/2007.08103)
+- Support Batch Inference
+- Start to publish `mmdet` package to PyPI since v2.3.0
+- Switch model zoo to download.openmmlab.com
+
+__Backwards Incompatible Changes__
+
+- Support Batch Inference (#3564, #3686, #3705): Since v2.4.0, MMDetection could inference model with multiple images in a single GPU.
+  This change influences all the test APIs in MMDetection and downstream codebases. To help the users migrate their code, we use `replace_ImageToTensor` (#3686) to convert legacy test data pipelines during dataset initialization.
+- Support RandomFlip with horizontal/vertical/diagonal direction (#3608): Since v2.4.0, MMDetection supports horizontal/vertical/diagonal flip in the data augmentation. This influences bounding box, mask, and image transformations in data augmentation process and the process that will map those data back to the original format.
+- Migrate to use `mmlvis` and `mmpycocotools` for COCO and LVIS dataset (#3727). The APIs are fully compatible with the original `lvis` and `pycocotools`. Users need to uninstall the existing pycocotools and lvis packages in their environment first and install `mmlvis` & `mmpycocotools`.
+
+__Bug Fixes__
+
+- Fix default mean/std for onnx (#3491)
+- Fix coco evaluation and add metric items (#3497)
+- Fix typo for install.md (#3516)
+- Fix atss when samples per gpu is 1 (#3528)
+- Fix import of fuse_conv_bn (#3529)
+- Fix bug of gaussian_target, update unittest of heatmap (#3543)
+- Fixed VOC2012 evaluate (#3553)
+- Fix scale factor bug of rescale (#3566)
+- Fix with_xxx_attributes in base detector (#3567)
+- Fix boxes scaling when number is 0 (#3575)
+- Fix rfp check when neck config is a list (#3591)
+- Fix import of fuse conv bn in benchmark.py (#3606)
+- Fix webcam demo (#3634)
+- Fix typo and itemize issues in tutorial (#3658)
+- Fix error in distributed training when some levels of FPN are not assigned with bounding boxes (#3670)
+- Fix the width and height orders of stride in valid flag generation (#3685)
+- Fix weight initialization bug in Res2Net DCN (#3714)
+- Fix bug in OHEMSampler (#3677)
+
+__New Features__
+
+- Support Cutout augmentation (#3521)
+- Support evaluation on multiple datasets through ConcatDataset (#3522)
+- Support [PAA assign](https://arxiv.org/abs/2007.08103) (#3547)
+- Support eval metric with pickle results (#3607)
+- Support [YOLOv3](https://arxiv.org/abs/1804.02767) (#3083)
+- Support [SABL](https://arxiv.org/abs/1912.04260) (#3603)
+- Support to publish to Pypi in github-action (#3510)
+- Support custom imports (#3641)
+
+__Improvements__
+
+- Refactor common issues in documentation (#3530)
+- Add pytorch 1.6 to CI config (#3532)
+- Add config to runner meta (#3534)
+- Add eval-option flag for testing (#3537)
+- Add init_eval to evaluation hook (#3550)
+- Add include_bkg in ClassBalancedDataset (#3577)
+- Using config's loading in inference_detector (#3611)
+- Add ATSS ResNet-101 models in model zoo (#3639)
+- Update urls to download.openmmlab.com (#3665)
+- Support non-mask training for CocoDataset (#3711)
+
+### v2.3.0 (5/8/2020)
+
+__Highlights__
+
+- The CUDA/C++ operators have been moved to `mmcv.ops`. For backward compatibility, `mmdet.ops` is kept as wrappers of `mmcv.ops`.
+- Support new methods [CornerNet](https://arxiv.org/abs/1808.01244), [DIOU](https://arxiv.org/abs/1911.08287)/[CIOU](https://arxiv.org/abs/2005.03572) loss, and new dataset: [LVIS V1](https://arxiv.org/abs/1908.03195)
+- Provide more detailed colab training tutorials and more complete documentation.
+- Support to convert RetinaNet from Pytorch to ONNX.
+
+__Bug Fixes__
+
+- Fix the model initialization bug of DetectoRS (#3187)
+- Fix the bug of module names in NASFCOSHead (#3205)
+- Fix the filename bug in publish_model.py (#3237)
+- Fix the dimensionality bug when `inside_flags.any()` is `False` in dense heads (#3242)
+- Fix the bug of forgetting to pass flip directions in `MultiScaleFlipAug` (#3262)
+- Fixed the bug caused by default value of `stem_channels` (#3333)
+- Fix the bug of model checkpoint loading for CPU inference (#3318, #3316)
+- Fix topk bug when box number is smaller than the expected topk number in ATSSAssigner (#3361)
+- Fix the gt priority bug in center_region_assigner.py (#3208)
+- Fix NaN issue of iou calculation in iou_loss.py (#3394)
+- Fix the bug that `iou_thrs` is not actually used during evaluation in coco.py (#3407)
+- Fix test-time augmentation of RepPoints (#3435)
+- Fix runtimeError caused by incontiguous tensor in Res2Net+DCN (#3412)
+
+__New Features__
+
+- Support [CornerNet](https://arxiv.org/abs/1808.01244) (#3036)
+- Support [DIOU](https://arxiv.org/abs/1911.08287)/[CIOU](https://arxiv.org/abs/2005.03572) loss (#3151)
+- Support [LVIS V1](https://arxiv.org/abs/1908.03195) dataset (#)
+- Support customized hooks in training (#3395)
+- Support fp16 training of generalized focal loss (#3410)
+- Support to convert RetinaNet from Pytorch to ONNX (#3075)
+
+__Improvements__
+
+- Support to process ignore boxes in ATSS assigner (#3082)
+- Allow to crop images without ground truth in `RandomCrop` (#3153)
+- Enable the `Accuracy` module to set threshold (#3155)
+- Refactoring unit tests (#3206)
+- Unify the training settings of `to_float32` and `norm_cfg` in RegNets configs (#3210)
+- Add colab training tutorials for beginners (#3213, #3273)
+- Move CUDA/C++ operators into `mmcv.ops` and keep `mmdet.ops` as wrappers for backward compatibility (#3232, #3457)
+- Update installation scripts in documentation (#3290) and dockerfile (#3320)
+- Support to set image resize backend (#3392)
+- Remove git hash in version file (#3466)
+- Check mmcv version to force version compatibility (#3460)
+
+### v2.2.0 (1/7/2020)
+
+__Highlights__
+
+- Support new methods: [DetectoRS](https://arxiv.org/abs/2006.02334), [PointRend](https://arxiv.org/abs/1912.08193), [Generalized Focal Loss](https://arxiv.org/abs/2006.04388), [Dynamic R-CNN](https://arxiv.org/abs/2004.06002)
+
+__Bug Fixes__
+
+- Fix FreeAnchor when no gt in image (#3176)
+- Clean up deprecated usage of `register_module()` (#3092, #3161)
+- Fix pretrain bug in NAS FCOS (#3145)
+- Fix `num_classes` in SSD (#3142)
+- Fix FCOS warmup (#3119)
+- Fix `rstrip` in `tools/publish_model.py`
+- Fix `flip_ratio` default value in RandomFlip pipeline (#3106)
+- Fix cityscapes eval with ms_rcnn (#3112)
+- Fix RPN softmax (#3056)
+- Fix filename of LVIS@v0.5 (#2998)
+- Fix nan loss by filtering out-of-frame gt_bboxes in COCO (#2999)
+- Fix bug in FSAF (#3018)
+- Add FocalLoss `num_classes` check (#2964)
+- Fix PISA Loss when there are no gts (#2992)
+- Avoid nan in `iou_calculator` (#2975)
+- Prevent possible bugs in loading and transforms caused by shallow copy (#2967)
+
+__New Features__
+
+- Add DetectoRS (#3064)
+- Support Generalized Focal Loss (#3097)
+- Support PointRend (#2752)
+- Support Dynamic R-CNN (#3040)
+- Add DeepFashion dataset (#2968)
+- Implement FCOS training tricks (#2935)
+- Use BaseDenseHead as base class for anchor-base heads (#2963)
+- Add `with_cp` for BasicBlock (#2891)
+- Add `stem_channels` argument for ResNet (#2954)
+
+__Improvements__
+
+- Add anchor free base head (#2867)
+- Migrate to github action (#3137)
+- Add docstring for datasets, pipelines, core modules and methods (#3130, #3125, #3120)
+- Add VOC benchmark (#3060)
+- Add `concat` mode in GRoI (#3098)
+- Remove cmd arg `autorescale-lr` (#3080)
+- Use `len(data['img_metas'])` to indicate `num_samples` (#3073, #3053)
+- Switch to EpochBasedRunner (#2976)
+
+### v2.1.0 (8/6/2020)
+
+__Highlights__
+
+- Support new backbones: [RegNetX](https://arxiv.org/abs/2003.13678), [Res2Net](https://arxiv.org/abs/1904.01169)
+- Support new methods: [NASFCOS](https://arxiv.org/abs/1906.04423), [PISA](https://arxiv.org/abs/1904.04821), [GRoIE](https://arxiv.org/abs/2004.13665)
+- Support new dataset: [LVIS](https://arxiv.org/abs/1908.03195)
+
+__Bug Fixes__
+
+- Change the CLI argument `--validate` to `--no-validate` to enable validation after training epochs by default. (#2651)
+- Add missing cython to docker file (#2713)
+- Fix bug in nms cpu implementation (#2754)
+- Fix bug when showing mask results (#2763)
+- Fix gcc requirement (#2806)
+- Fix bug in async test (#2820)
+- Fix mask encoding-decoding bugs in test API (#2824)
+- Fix bug in test time augmentation (#2858, #2921, #2944)
+- Fix a typo in comment of apis/train (#2877)
+- Fix the bug of returning None when no gt bboxes are in the original image in `RandomCrop`. Fix the bug that fails to handle `gt_bboxes_ignore`, `gt_label_ignore`, and `gt_masks_ignore` in `RandomCrop`, `MinIoURandomCrop` and `Expand` modules. (#2810)
+- Fix bug of `base_channels` of regnet (#2917)
+- Fix the bug of logger when loading pre-trained weights in base detector (#2936)
+
+__New Features__
+
+- Add IoU models (#2666)
+- Add colab demo for inference
+- Support class agnostic nms (#2553)
+- Add benchmark gathering scripts for development only (#2676)
+- Add mmdet-based project links (#2736, #2767, #2895)
+- Add config dump in training (#2779)
+- Add ClassBalancedDataset (#2721)
+- Add res2net backbone (#2237)
+- Support RegNetX models (#2710)
+- Use `mmcv.FileClient` to support different storage backends (#2712)
+- Add ClassBalancedDataset (#2721)
+- Code Release: Prime Sample Attention in Object Detection (CVPR 2020) (#2626)
+- Implement NASFCOS (#2682)
+- Add class weight in CrossEntropyLoss (#2797)
+- Support LVIS dataset (#2088)
+- Support GRoIE (#2584)
+
+__Improvements__
+
+- Allow different x and y strides in anchor heads. (#2629)
+- Make FSAF loss more robust to no gt (#2680)
+- Compute pure inference time instead (#2657) and update inference speed (#2730)
+- Avoided the possibility that a patch with 0 area is cropped. (#2704)
+- Add warnings when deprecated `imgs_per_gpu` is used. (#2700)
+- Add a mask rcnn example for config (#2645)
+- Update model zoo (#2762, #2866, #2876, #2879, #2831)
+- Add `ori_filename` to img_metas and use it in test show-dir (#2612)
+- Use `img_fields` to handle multiple images during image transform (#2800)
+- Add upsample_cfg support in FPN (#2787)
+- Add `['img']` as default `img_fields` for back compatibility (#2809)
+- Rename the pretrained model from `open-mmlab://resnet50_caffe` and `open-mmlab://resnet50_caffe_bgr` to `open-mmlab://detectron/resnet50_caffe` and `open-mmlab://detectron2/resnet50_caffe`. (#2832)
+- Added sleep(2) in test.py to reduce hanging problem (#2847)
+- Support `c10::half` in CARAFE (#2890)
+- Improve documentations (#2918, #2714)
+- Use optimizer constructor in mmcv and clean the original implementation in `mmdet.core.optimizer` (#2947)
+
+### v2.0.0 (6/5/2020)
+
+In this release, we made lots of major refactoring and modifications.
+
+1. __Faster speed__. We optimize the training and inference speed for common models, achieving up to 30% speedup for training and 25% for inference. Please refer to [model zoo](model_zoo.md#comparison-with-detectron2) for details.
+
+2. __Higher performance__. We change some default hyperparameters with no additional cost, which leads to a gain of performance for most models. Please refer to [compatibility](compatibility.md#training-hyperparameters) for details.
+
+3. __More documentation and tutorials__. We add a bunch of documentation and tutorials to help users get started more smoothly. Read it [here](https://mmdetection.readthedocs.io/en/latest/).
+
+4. __Support PyTorch 1.5__. The support for 1.1 and 1.2 is dropped, and we switch to some new APIs.
+
+5. __Better configuration system__. Inheritance is supported to reduce the redundancy of configs.
+
+6. __Better modular design__. Towards the goal of simplicity and flexibility, we simplify some encapsulation while adding more configurable modules such as BBoxCoder, IoUCalculator, OptimizerConstructor, and RoIHead. Target computation is also included in heads and the call hierarchy is simpler.
+
+7. Support new methods: [FSAF](https://arxiv.org/abs/1903.00621) and PAFPN (part of [PAFPN](https://arxiv.org/abs/1803.01534)).
+
+__Breaking Changes__
+Models training with MMDetection 1.x are not fully compatible with 2.0, please refer to the [compatibility doc](compatibility.md) for the details and how to migrate to the new version.
+
+__Improvements__
+
+- Unify cuda and cpp API for custom ops. (#2277)
+- New config files with inheritance. (#2216)
+- Encapsulate the second stage into RoI heads. (#1999)
+- Refactor GCNet/EmpiricalAttention into plugins. (#2345)
+- Set low quality match as an option in IoU-based bbox assigners. (#2375)
+- Change the codebase's coordinate system. (#2380)
+- Refactor the category order in heads. 0 means the first positive class instead of background now. (#2374)
+- Add bbox sampler and assigner registry. (#2419)
+- Speed up the inference of RPN. (#2420)
+- Add `train_cfg` and `test_cfg` as class members in all anchor heads. (#2422)
+- Merge target computation methods into heads. (#2429)
+- Add bbox coder to support different bbox encoding and losses. (#2480)
+- Unify the API for regression loss. (#2156)
+- Refactor Anchor Generator. (#2474)
+- Make `lr` an optional argument for optimizers. (#2509)
+- Migrate to modules and methods in MMCV. (#2502, #2511, #2569, #2572)
+- Support PyTorch 1.5. (#2524)
+- Drop the support for Python 3.5 and use F-string in the codebase. (#2531)
+
+__Bug Fixes__
+
+- Fix the scale factors for resized images without keeping the aspect ratio. (#2039)
+- Check if max_num > 0 before slicing in NMS. (#2486)
+- Fix Deformable RoIPool when there is no instance. (#2490)
+- Fix the default value of assigned labels. (#2536)
+- Fix the evaluation of Cityscapes. (#2578)
+
+__New Features__
+
+- Add deep_stem and avg_down option to ResNet, i.e., support ResNetV1d. (#2252)
+- Add L1 loss. (#2376)
+- Support both polygon and bitmap for instance masks. (#2353, #2540)
+- Support CPU mode for inference. (#2385)
+- Add optimizer constructor for complicated configuration of optimizers. (#2397, #2488)
+- Implement PAFPN. (#2392)
+- Support empty tensor input for some modules. (#2280)
+- Support for custom dataset classes without overriding it. (#2408, #2443)
+- Support to train subsets of coco dataset. (#2340)
+- Add iou_calculator to potentially support more IoU calculation methods. (#2405)
+- Support class wise mean AP (was removed in the last version). (#2459)
+- Add option to save the testing result images. (#2414)
+- Support MomentumUpdaterHook. (#2571)
+- Add a demo to inference a single image. (#2605)
+
+### v1.1.0 (24/2/2020)
+
+__Highlights__
+
+- Dataset evaluation is rewritten with a unified api, which is used by both evaluation hooks and test scripts.
+- Support new methods: [CARAFE](https://arxiv.org/abs/1905.02188).
+
+__Breaking Changes__
+
+- The new MMDDP inherits from the official DDP, thus the `__init__` api is changed to be the same as official DDP.
+- The `mask_head` field in HTC config files is modified.
+- The evaluation and testing script is updated.
+- In all transforms, instance masks are stored as a numpy array shaped (n, h, w) instead of a list of (h, w) arrays, where n is the number of instances.
+
+__Bug Fixes__
+
+- Fix IOU assigners when ignore_iof_thr > 0 and there is no pred boxes. (#2135)
+- Fix mAP evaluation when there are no ignored boxes. (#2116)
+- Fix the empty RoI input for Deformable RoI Pooling. (#2099)
+- Fix the dataset settings for multiple workflows. (#2103)
+- Fix the warning related to `torch.uint8` in PyTorch 1.4. (#2105)
+- Fix the inference demo on devices other than gpu:0. (#2098)
+- Fix Dockerfile. (#2097)
+- Fix the bug that `pad_val` is unused in Pad transform. (#2093)
+- Fix the albumentation transform when there is no ground truth bbox. (#2032)
+
+__Improvements__
+
+- Use torch instead of numpy for random sampling. (#2094)
+- Migrate to the new MMDDP implementation in MMCV v0.3. (#2090)
+- Add meta information in logs. (#2086)
+- Rewrite Soft NMS with pytorch extension and remove cython as a dependency. (#2056)
+- Rewrite dataset evaluation. (#2042, #2087, #2114, #2128)
+- Use numpy array for masks in transforms. (#2030)
+
+__New Features__
+
+- Implement "CARAFE: Content-Aware ReAssembly of FEatures". (#1583)
+- Add `worker_init_fn()` in data_loader when seed is set. (#2066, #2111)
+- Add logging utils. (#2035)
+
+### v1.0.0 (30/1/2020)
+
+This release mainly improves the code quality and adds more docstrings.
+
+__Highlights__
+
+- Documentation is online now: <https://mmdetection.readthedocs.io>.
+- Support new models: [ATSS](https://arxiv.org/abs/1912.02424).
+- DCN is now available with the api `build_conv_layer` and `ConvModule` like the normal conv layer.
+- A tool to collect environment information is available for trouble shooting.
+
+__Bug Fixes__
+
+- Fix the incompatibility of the latest numpy and pycocotools. (#2024)
+- Fix the case when distributed package is unavailable, e.g., on Windows. (#1985)
+- Fix the dimension issue for `refine_bboxes()`. (#1962)
+- Fix the typo when `seg_prefix` is a list. (#1906)
+- Add segmentation map cropping to RandomCrop. (#1880)
+- Fix the return value of `ga_shape_target_single()`. (#1853)
+- Fix the loaded shape of empty proposals. (#1819)
+- Fix the mask data type when using albumentation. (#1818)
+
+__Improvements__
+
+- Enhance AssignResult and SamplingResult. (#1995)
+- Add ability to overwrite existing module in Registry. (#1982)
+- Reorganize requirements and make albumentations and imagecorruptions optional. (#1969)
+- Check NaN in `SSDHead`. (#1935)
+- Encapsulate the DCN in ResNe(X)t into a ConvModule & Conv_layers. (#1894)
+- Refactoring for mAP evaluation and support multiprocessing and logging. (#1889)
+- Init the root logger before constructing Runner to log more information. (#1865)
+- Split `SegResizeFlipPadRescale` into different existing transforms. (#1852)
+- Move `init_dist()` to MMCV. (#1851)
+- Documentation and docstring improvements. (#1971, #1938, #1869, #1838)
+- Fix the color of the same class for mask visualization. (#1834)
+- Remove the option `keep_all_stages` in HTC and Cascade R-CNN. (#1806)
+
+__New Features__
+
+- Add two test-time options `crop_mask` and `rle_mask_encode` for mask heads. (#2013)
+- Support loading grayscale images as single channel. (#1975)
+- Implement "Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection". (#1872)
+- Add sphinx generated docs. (#1859, #1864)
+- Add GN support for flops computation. (#1850)
+- Collect env info for trouble shooting. (#1812)
+
+### v1.0rc1 (13/12/2019)
+
+The RC1 release mainly focuses on improving the user experience, and fixing bugs.
+
+__Highlights__
+
+- Support new models: [FoveaBox](https://arxiv.org/abs/1904.03797), [RepPoints](https://arxiv.org/abs/1904.11490) and [FreeAnchor](https://arxiv.org/abs/1909.02466).
+- Add a Dockerfile.
+- Add a jupyter notebook demo and a webcam demo.
+- Setup the code style and CI.
+- Add lots of docstrings and unit tests.
+- Fix lots of bugs.
+
+__Breaking Changes__
+
+- There was a bug for computing COCO-style mAP w.r.t different scales (AP_s, AP_m, AP_l), introduced by #621. (#1679)
+
+__Bug Fixes__
+
+- Fix a sampling interval bug in Libra R-CNN. (#1800)
+- Fix the learning rate in SSD300 WIDER FACE. (#1781)
+- Fix the scaling issue when `keep_ratio=False`. (#1730)
+- Fix typos. (#1721, #1492, #1242, #1108, #1107)
+- Fix the shuffle argument in `build_dataloader`. (#1693)
+- Clip the proposal when computing mask targets. (#1688)
+- Fix the "index out of range" bug for samplers in some corner cases. (#1610, #1404)
+- Fix the NMS issue on devices other than GPU:0. (#1603)
+- Fix SSD Head and GHM Loss on CPU. (#1578)
+- Fix the OOM error when there are too many gt bboxes. (#1575)
+- Fix the wrong keyword argument `nms_cfg` in HTC. (#1573)
+- Process masks and semantic segmentation in Expand and MinIoUCrop transforms. (#1550, #1361)
+- Fix a scale bug in the Non Local op. (#1528)
+- Fix a bug in transforms when `gt_bboxes_ignore` is None. (#1498)
+- Fix a bug when `img_prefix` is None. (#1497)
+- Pass the device argument to `grid_anchors` and `valid_flags`. (#1478)
+- Fix the data pipeline for test_robustness. (#1476)
+- Fix the argument type of deformable pooling. (#1390)
+- Fix the coco_eval when there are only two classes. (#1376)
+- Fix a bug in Modulated DeformableConv when deformable_group>1. (#1359)
+- Fix the mask cropping in RandomCrop. (#1333)
+- Fix zero outputs in DeformConv when not running on cuda:0. (#1326)
+- Fix the type issue in Expand. (#1288)
+- Fix the inference API. (#1255)
+- Fix the inplace operation in Expand. (#1249)
+- Fix the from-scratch training config. (#1196)
+- Fix inplace add in RoIExtractor which causes an error in PyTorch 1.2. (#1160)
+- Fix FCOS when input images have no positive sample. (#1136)
+- Fix recursive imports. (#1099)
+
+__Improvements__
+
+- Print the config file and mmdet version in the log. (#1721)
+- Lint the code before compiling in travis CI. (#1715)
+- Add a probability argument for the `Expand` transform. (#1651)
+- Update the PyTorch and CUDA version in the docker file. (#1615)
+- Raise a warning when specifying `--validate` in non-distributed training. (#1624, #1651)
+- Beautify the mAP printing. (#1614)
+- Add pre-commit hook. (#1536)
+- Add the argument `in_channels` to backbones. (#1475)
+- Add lots of docstrings and unit tests, thanks to [@Erotemic](https://github.com/Erotemic). (#1603, #1517, #1506, #1505, #1491, #1479, #1477, #1475, #1474)
+- Add support for multi-node distributed test when there is no shared storage. (#1399)
+- Optimize Dockerfile to reduce the image size. (#1306)
+- Update new results of HRNet. (#1284, #1182)
+- Add an argument `no_norm_on_lateral` in FPN. (#1240)
+- Test the compiling in CI. (#1235)
+- Move docs to a separate folder. (#1233)
+- Add a jupyter notebook demo. (#1158)
+- Support different type of dataset for training. (#1133)
+- Use int64_t instead of long in cuda kernels. (#1131)
+- Support unsquare RoIs for bbox and mask heads. (#1128)
+- Manually add type promotion to make compatible to PyTorch 1.2. (#1114)
+- Allowing validation dataset for computing validation loss. (#1093)
+- Use `.scalar_type()` instead of `.type()` to suppress some warnings. (#1070)
+
+__New Features__
+
+- Add an option `--with_ap` to compute the AP for each class. (#1549)
+- Implement "FreeAnchor: Learning to Match Anchors for Visual Object Detection". (#1391)
+- Support [Albumentations](https://github.com/albumentations-team/albumentations) for augmentations in the data pipeline. (#1354)
+- Implement "FoveaBox: Beyond Anchor-based Object Detector". (#1339)
+- Support horizontal and vertical flipping. (#1273, #1115)
+- Implement "RepPoints: Point Set Representation for Object Detection". (#1265)
+- Add test-time augmentation to HTC and Cascade R-CNN. (#1251)
+- Add a COCO result analysis tool. (#1228)
+- Add Dockerfile. (#1168)
+- Add a webcam demo. (#1155, #1150)
+- Add FLOPs counter. (#1127)
+- Allow arbitrary layer order for ConvModule. (#1078)
+
+### v1.0rc0 (27/07/2019)
+
+- Implement lots of new methods and components (Mixed Precision Training, HTC, Libra R-CNN, Guided Anchoring, Empirical Attention, Mask Scoring R-CNN, Grid R-CNN (Plus), GHM, GCNet, FCOS, HRNet, Weight Standardization, etc.). Thank all collaborators!
+- Support two additional datasets: WIDER FACE and Cityscapes.
+- Refactoring for loss APIs and make it more flexible to adopt different losses and related hyper-parameters.
+- Speed up multi-gpu testing.
+- Integrate all compiling and installing in a single script.
+
+### v0.6.0 (14/04/2019)
+
+- Up to 30% speedup compared to the model zoo.
+- Support both PyTorch stable and nightly version.
+- Replace NMS and SigmoidFocalLoss with Pytorch CUDA extensions.
+
+### v0.6rc0 (06/02/2019)
+
+- Migrate to PyTorch 1.0.
+
+### v0.5.7 (06/02/2019)
+
+- Add support for Deformable ConvNet v2. (Many thanks to the authors and [@chengdazhi](https://github.com/chengdazhi))
+- This is the last release based on PyTorch 0.4.1.
+
+### v0.5.6 (17/01/2019)
+
+- Add support for Group Normalization.
+- Unify RPNHead and single stage heads (RetinaHead, SSDHead) with AnchorHead.
+
+### v0.5.5 (22/12/2018)
+
+- Add SSD for COCO and PASCAL VOC.
+- Add ResNeXt backbones and detection models.
+- Refactoring for Samplers/Assigners and add OHEM.
+- Add VOC dataset and evaluation scripts.
+
+### v0.5.4 (27/11/2018)
+
+- Add SingleStageDetector and RetinaNet.
+
+### v0.5.3 (26/11/2018)
+
+- Add Cascade R-CNN and Cascade Mask R-CNN.
+- Add support for Soft-NMS in config files.
+
+### v0.5.2 (21/10/2018)
+
+- Add support for custom datasets.
+- Add a script to convert PASCAL VOC annotations to the expected format.
+
+### v0.5.1 (20/10/2018)
+
+- Add BBoxAssigner and BBoxSampler, the `train_cfg` field in config files is restructured.
+- `ConvFCRoIHead` / `SharedFCRoIHead` are renamed to `ConvFCBBoxHead` / `SharedFCBBoxHead` for consistency.
diff --git a/docs/en/compatibility.md b/docs/en/compatibility.md
new file mode 100755
index 0000000..a545a49
--- /dev/null
+++ b/docs/en/compatibility.md
@@ -0,0 +1,178 @@
+# Compatibility of MMDetection 2.x
+
+## MMDetection 2.25.0
+
+In order to support Mask2Former for instance segmentation, the original config files of Mask2Former for panoptic segmentation need to be renamed (see [PR #7571](https://github.com/open-mmlab/mmdetection/pull/7571)).
+
+<table align="center">
+    <thead>
+        <tr align='center'>
+            <td>before v2.25.0</td>
+            <td>after v2.25.0</td>
+        </tr>
+    </thead>
+    <tbody><tr valign='top'>
+    <th>
+
+```
+'mask2former_xxx_coco.py' represents config files for **panoptic segmentation**.
+```
+
+</th>
+    <th>
+
+```
+'mask2former_xxx_coco.py' represents config files for **instance segmentation**.
+'mask2former_xxx_coco-panoptic.py' represents config files for **panoptic segmentation**.
+```
+
+</th></tr>
+  </tbody></table>
+
+## MMDetection 2.21.0
+
+In order to support CPU training, the logic of scatter in batch collating has been changed. We recommend to use
+MMCV v1.4.4 or higher. For more details, please refer to [MMCV PR #1621](https://github.com/open-mmlab/mmcv/pull/1621).
+
+## MMDetection 2.18.1
+
+### MMCV compatibility
+
+In order to fix the wrong weight reference bug in BaseTransformerLayer, the logic in batch first mode of MultiheadAttention has been changed.
+We recommend to use MMCV v1.3.17 or higher. For more details, please refer to [MMCV PR #1418](https://github.com/open-mmlab/mmcv/pull/1418).
+
+## MMDetection 2.18.0
+
+### DIIHead compatibility
+
+In order to support QueryInst, attn_feats is added into the returned tuple of DIIHead.
+
+## MMDetection 2.14.0
+
+### MMCV Version
+
+In order to fix the problem that the priority of EvalHook is too low, all hook priorities have been re-adjusted in 1.3.8, so MMDetection 2.14.0 needs to rely on the latest MMCV 1.3.8 version. For related information, please refer to [#1120](https://github.com/open-mmlab/mmcv/pull/1120), for related issues, please refer to [#5343](https://github.com/open-mmlab/mmdetection/issues/5343).
+
+### SSD compatibility
+
+In v2.14.0, to make SSD more flexible to use, [PR5291](https://github.com/open-mmlab/mmdetection/pull/5291) refactored its backbone, neck and head. The users can use the script `tools/model_converters/upgrade_ssd_version.py` to convert their models.
+
+```bash
+python tools/model_converters/upgrade_ssd_version.py ${OLD_MODEL_PATH} ${NEW_MODEL_PATH}
+```
+
+- OLD_MODEL_PATH: the path to load the old version SSD model.
+- NEW_MODEL_PATH: the path to save the converted model weights.
+
+## MMDetection 2.12.0
+
+MMDetection is going through big refactoring for more general and convenient usages during the releases from v2.12.0 to v2.18.0 (maybe longer).
+In v2.12.0 MMDetection inevitably brings some BC-breakings, including the MMCV dependency, model initialization, model registry, and mask AP evaluation.
+
+### MMCV Version
+
+MMDetection v2.12.0 relies on the newest features in MMCV 1.3.3, including `BaseModule` for unified parameter initialization, model registry, and the CUDA operator `MultiScaleDeformableAttn` for [Deformable DETR](https://arxiv.org/abs/2010.04159). Note that MMCV 1.3.2 already contains all the features used by MMDet but has known issues. Therefore, we recommend users to skip MMCV v1.3.2 and use v1.3.3, though v1.3.2 might work for most of the cases.
+
+### Unified model initialization
+
+To unify the parameter initialization in OpenMMLab projects, MMCV supports `BaseModule` that accepts `init_cfg` to allow the modules' parameters initialized in a flexible and unified manner. Now the users need to explicitly call `model.init_weights()` in the training script to initialize the model (as in [here](https://github.com/open-mmlab/mmdetection/blob/master/tools/train.py#L162)); previously this was handled by the detector. **The downstream projects must update their model initialization accordingly to use MMDetection v2.12.0**. Please refer to PR #4750 for details.
+
+### Unified model registry
+
+To easily use backbones implemented in other OpenMMLab projects, MMDetection v2.12.0 inherits the model registry created in MMCV (#760). In this way, as long as the backbone is supported in an OpenMMLab project and that project also uses the registry in MMCV, users can use that backbone in MMDetection by simply modifying the config without copying the code of that backbone into MMDetection. Please refer to PR #5059 for more details.
+
+### Mask AP evaluation
+
+Before [PR 4898](https://github.com/open-mmlab/mmdetection/pull/4898) and V2.12.0, the mask AP of small, medium, and large instances was calculated based on the bounding box area rather than the real mask area. This led to higher `APs` and `APm` but lower `APl`, without affecting the overall mask AP. [PR 4898](https://github.com/open-mmlab/mmdetection/pull/4898) changes it to use mask areas by deleting `bbox` in mask AP calculation.
+The new calculation does not affect the overall mask AP evaluation and is consistent with [Detectron2](https://github.com/facebookresearch/detectron2/).
+
+## Compatibility with MMDetection 1.x
+
+MMDetection 2.0 goes through a big refactoring and addresses many legacy issues. It is not compatible with the 1.x version, i.e., running inference with the same model weights in these two versions will produce different results. Thus, MMDetection 2.0 re-benchmarks all the models and provides their links and logs in the model zoo.
+
+The major differences are in four folds: coordinate system, codebase conventions, training hyperparameters, and modular design.
+
+### Coordinate System
+
+The new coordinate system is consistent with [Detectron2](https://github.com/facebookresearch/detectron2/) and treats the center of the most left-top pixel as (0, 0) rather than the left-top corner of that pixel.
+Accordingly, the system interprets the coordinates in COCO bounding box and segmentation annotations as coordinates in range `[0, width]` or `[0, height]`.
+This modification affects all the computation related to the bbox and pixel selection,
+which is more natural and accurate.
+
+- The height and width of a box with corners (x1, y1) and (x2, y2) in the new coordinate system is computed as `width = x2 - x1` and `height = y2 - y1`.
+  In MMDetection 1.x and previous versions, a "+ 1" was added to both height and width.
+  This modification has three effects:
+
+  1. Box transformation and encoding/decoding in regression.
+  2. IoU calculation. This affects the matching process between ground truth and bounding box and the NMS process. The effect on compatibility is negligible, though.
+  3. The corners of bounding boxes are in float type and no longer quantized. This should provide more accurate bounding box results. It also means that bounding boxes and RoIs are no longer required to have a minimum size of 1, though the effect of this is small.
+
+- The anchors are center-aligned to feature grid points and in float type.
+  In MMDetection 1.x and previous version, the anchors are in `int` type and not center-aligned.
+  This affects the anchor generation in RPN and all the anchor-based methods.
+
+- ROIAlign is better aligned with the image coordinate system. The new implementation is adopted from [Detectron2](https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign).
+  The RoIs are shifted by half a pixel by default when they are used to crop RoI features, compared to MMDetection 1.x.
+  The old behavior is still available by setting `aligned=False` instead of `aligned=True`.
+
+- Mask cropping and pasting are more accurate.
+
+  1. We use the new RoIAlign to crop mask targets. In MMDetection 1.x, the bounding box is quantized before it is used to crop mask target, and the crop process is implemented by numpy. In the new implementation, the bounding box for crop is not quantized and sent to RoIAlign. This implementation accelerates the training speed by a large margin (~0.1s per iter, ~2 hours when training Mask R50 for 1x schedule) and should be more accurate.
+
+  2. In MMDetection 2.0, the "`paste_mask()`" function is different and should be more accurate than those in previous versions. This change follows the modification in [Detectron2](https://github.com/facebookresearch/detectron2/blob/master/detectron2/structures/masks.py) and can improve mask AP on COCO by ~0.5% absolute.
+
+### Codebase Conventions
+
+- MMDetection 2.0 changes the order of class labels to reduce unused parameters in regression and mask branch more naturally (without +1 and -1).
+  This causes all the classification layers of the model to have a different ordering of class labels. The final layers of regression branch and mask head no longer keep K+1 channels for K categories, and their class orders are consistent with the classification branch.
+
+  - In MMDetection 2.0, label "K" means background, and labels \[0, K-1\] correspond to the K = num_categories object categories.
+
+  - In MMDetection 1.x and previous version, label "0" means background, and labels \[1, K\] correspond to the K categories.
+
+  - **Note**: The class order of softmax RPN is still the same as that in 1.x in versions\<=2.4.0 while sigmoid RPN is not affected. The class orders in all heads are unified since MMDetection v2.5.0.
+
+- Low quality matching in R-CNN is not used. In MMDetection 1.x and previous versions, the `max_iou_assigner` will match low quality boxes for each ground truth box in both RPN and R-CNN training. We observe this sometimes does not assign the most perfect GT box to some bounding boxes,
+  thus MMDetection 2.0 does not allow low quality matching by default in R-CNN training in the new system. This sometimes may slightly improve the box AP (~0.1% absolute).
+
+- Separate scale factors for width and height. In MMDetection 1.x and previous versions, the scale factor is a single float in mode `keep_ratio=True`. This is slightly inaccurate because the scale factors for width and height differ slightly. MMDetection 2.0 adopts separate scale factors for width and height, which improves AP by ~0.1% absolute.
+
+- Configs name conventions are changed. MMDetection V2.0 adopts the new name convention to maintain the gradually growing model zoo as the following:
+
+  ```shell
+  [model]_(model setting)_[backbone]_[neck]_(norm setting)_(misc)_(gpu x batch)_[schedule]_[dataset].py,
+  ```
+
+  where the (`misc`) includes DCN and GCBlock, etc. More details are illustrated in the [documentation for config](tutorials/config)
+
+- MMDetection V2.0 uses new ResNet Caffe backbones to reduce warnings when loading pre-trained models. Most of the new backbones' weights are the same as the former ones but do not have `conv.bias`, except that they use a different `img_norm_cfg`. Thus, the new backbone will not cause warning of unexpected keys.
+
+### Training Hyperparameters
+
+The change in training hyperparameters does not affect
+model-level compatibility but slightly improves the performance. The major ones are:
+
+- The number of proposals after nms is changed from 2000 to 1000 by setting `nms_post=1000` and `max_num=1000`.
+  This slightly improves both mask AP and bbox AP by ~0.2% absolute.
+
+- The default box regression losses for Mask R-CNN, Faster R-CNN and RetinaNet are changed from smooth L1 Loss to L1 loss. This leads to an overall improvement in box AP (~0.6% absolute). However, using L1-loss for other methods such as Cascade R-CNN and HTC does not improve the performance, so we keep the original settings for these methods.
+
+- The sample num of RoIAlign layer is set to be 0 for simplicity. This leads to a slight improvement in mask AP (~0.2% absolute).
+
+- The default setting does not use gradient clipping anymore during training for faster training speed. This does not degrade the performance of most models. For some models such as RepPoints we keep using gradient clipping to stabilize the training process and to obtain better performance.
+
+- The default warmup ratio is changed from 1/3 to 0.001 for a smoother warm-up process since the gradient clipping is usually not used. The effect is found negligible during our re-benchmarking, though.
+
+### Upgrade Models from 1.x to 2.0
+
+To convert the models trained by MMDetection V1.x to MMDetection V2.0, the users can use the script `tools/model_converters/upgrade_model_version.py` to convert
+their models. The converted models can be run in MMDetection V2.0 with slightly dropped performance (less than 1% AP absolute).
+Details can be found in `configs/legacy`.
+
+## pycocotools compatibility
+
+`mmpycocotools` is OpenMMLab's fork of the official `pycocotools`, which works for both MMDetection and Detectron2.
+Before [PR 4939](https://github.com/open-mmlab/mmdetection/pull/4939), since `pycocotools` and `mmpycocotools` have the same package name, if users have already installed `pycocotools` (installed Detectron2 first under the same environment), then the setup of MMDetection will skip installing `mmpycocotools`. Thus MMDetection fails due to the missing `mmpycocotools`.
+If MMDetection is installed before Detectron2, they could work under the same environment.
+[PR 4939](https://github.com/open-mmlab/mmdetection/pull/4939) deprecates mmpycocotools in favor of official pycocotools.
+Users may install MMDetection and Detectron2 under the same environment after [PR 4939](https://github.com/open-mmlab/mmdetection/pull/4939), no matter what the installation order is.
diff --git a/docs/en/conf.py b/docs/en/conf.py
new file mode 100755
index 0000000..e902e3f
--- /dev/null
+++ b/docs/en/conf.py
@@ -0,0 +1,116 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import subprocess
+import sys
+
+import pytorch_sphinx_theme
+
+sys.path.insert(0, os.path.abspath('../..'))
+
+# -- Project information -----------------------------------------------------
+
+project = 'MMDetection'
+copyright = '2018-2021, OpenMMLab'
+author = 'MMDetection Authors'
+version_file = '../../mmdet/version.py'
+
+
+def get_version():
+    with open(version_file, 'r') as f:
+        exec(compile(f.read(), version_file, 'exec'))
+    return locals()['__version__']
+
+
+# The full version, including alpha/beta/rc tags
+release = get_version()
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.napoleon',
+    'sphinx.ext.viewcode',
+    'myst_parser',
+    'sphinx_markdown_tables',
+    'sphinx_copybutton',
+]
+
+myst_enable_extensions = ['colon_fence']
+myst_heading_anchors = 3
+
+autodoc_mock_imports = [
+    'matplotlib', 'pycocotools', 'terminaltables', 'mmdet.version', 'mmcv.ops'
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+source_suffix = {
+    '.rst': 'restructuredtext',
+    '.md': 'markdown',
+}
+
+# The master toctree document.
+master_doc = 'index'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+# html_theme = 'sphinx_rtd_theme'
+html_theme = 'pytorch_sphinx_theme'
+html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]
+
+html_theme_options = {
+    'menu': [
+        {
+            'name': 'GitHub',
+            'url': 'https://github.com/open-mmlab/mmdetection'
+        },
+    ],
+    # Specify the language of shared menu
+    'menu_lang':
+    'en'
+}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+html_css_files = ['css/readthedocs.css']
+
+# -- Extension configuration -------------------------------------------------
+# Ignore >>> when copying code
+copybutton_prompt_text = r'>>> |\.\.\. '
+copybutton_prompt_is_regexp = True
+
+
+def builder_inited_handler(app):  # 'builder-inited' callback; app is unused
+    subprocess.run(['./stat.py'])  # relative path; exit status not checked
+
+
+def setup(app):  # registers this file's event handlers on the Sphinx app
+    app.connect('builder-inited', builder_inited_handler)  # runs ./stat.py
diff --git a/docs/en/conventions.md b/docs/en/conventions.md
new file mode 100755
index 0000000..97e5fd0
--- /dev/null
+++ b/docs/en/conventions.md
@@ -0,0 +1,78 @@
+# Conventions
+
+Please check the following conventions if you would like to modify MMDetection as your own project.
+
+## Loss
+
+In MMDetection, a `dict` containing losses and metrics will be returned by `model(**data)`.
+
+For example, in bbox head,
+
+```python
+class BBoxHead(nn.Module):
+    ...
+    def loss(self, ...):
+        losses = dict()
+        # classification loss
+        losses['loss_cls'] = self.loss_cls(...)
+        # classification accuracy
+        losses['acc'] = accuracy(...)
+        # bbox regression loss
+        losses['loss_bbox'] = self.loss_bbox(...)
+        return losses
+```
+
+`bbox_head.loss()` will be called during model forward.
+The returned dict contains `'loss_bbox'`, `'loss_cls'`, `'acc'` .
+Only `'loss_bbox'`, `'loss_cls'` will be used during back propagation,
+`'acc'` will only be used as a metric to monitor training process.
+
+By default, only values whose keys contain `'loss'` will be back propagated.
+This behavior could be changed by modifying `BaseDetector.train_step()`.
+
+## Empty Proposals
+
+In MMDetection, we have added special handling and unit tests for empty proposals in two-stage detectors. We need to handle the case where the entire batch has no proposals as well as the case where a single image has none. For example, in CascadeRoIHead,
+
+```python
+# simple_test method
+...
+# There is no proposal in the whole batch
+if rois.shape[0] == 0:
+    bbox_results = [[
+        np.zeros((0, 5), dtype=np.float32)
+        for _ in range(self.bbox_head[-1].num_classes)
+    ]] * num_imgs
+    if self.with_mask:
+        mask_classes = self.mask_head[-1].num_classes
+        segm_results = [[[] for _ in range(mask_classes)]
+                        for _ in range(num_imgs)]
+        results = list(zip(bbox_results, segm_results))
+    else:
+        results = bbox_results
+    return results
+...
+
+# There is no proposal in the single image
+for i in range(self.num_stages):
+    ...
+    if i < self.num_stages - 1:
+          for j in range(num_imgs):
+                   # Handle empty proposal
+                   if rois[j].shape[0] > 0:
+                       bbox_label = cls_score[j][:, :-1].argmax(dim=1)
+                       refine_roi = self.bbox_head[i].regress_by_class(
+                            rois[j], bbox_label, bbox_pred[j], img_metas[j])
+                       refine_roi_list.append(refine_roi)
+```
+
+If you have customized `RoIHead`, you can refer to the above method to deal with empty proposals.
+
+## Coco Panoptic Dataset
+
+In MMDetection, we have supported COCO Panoptic dataset. We clarify a few conventions about the implementation of `CocoPanopticDataset` here.
+
+1. For mmdet\<=2.16.0, the range of foreground and background labels in semantic segmentation are different from the default setting of MMDetection. The label `0` stands for `VOID` label and the category labels start from `1`.
+   Since mmdet=2.17.0, the category labels of semantic segmentation start from `0` and label `255` stands for `VOID` for consistency with labels of bounding boxes.
+   To achieve that, the `Pad` pipeline supports setting the padding value for `seg`.
+2. In the evaluation, the panoptic result is a map with the same shape as the original image. Each value in the result map has the format of `instance_id * INSTANCE_OFFSET + category_id`.
diff --git a/docs/en/device/npu.md b/docs/en/device/npu.md
new file mode 100755
index 0000000..b0129b7
--- /dev/null
+++ b/docs/en/device/npu.md
@@ -0,0 +1,55 @@
+# NPU (HUAWEI Ascend)
+
+## Usage
+
+Please refer to the [building documentation of MMCV](https://mmcv.readthedocs.io/en/latest/get_started/build.html#build-mmcv-full-on-ascend-npu-machine) to install MMCV on NPU devices
+
+Here we use 8 NPUs on your computer to train the model with the following command:
+
+```shell
+bash tools/dist_train.sh configs/ssd/ssd300_coco.py 8
+```
+
+Also, you can use only one NPU to train the model with the following command:
+
+```shell
+python tools/train.py configs/ssd/ssd300_coco.py
+```
+
+## Models Results
+
+|        Model         | box AP | mask AP | Config                                                                                                                        | Download                                                                                                     |
+| :------------------: | :----: | :-----: | :---------------------------------------------------------------------------------------------------------------------------- | :----------------------------------------------------------------------------------------------------------- |
+|     [ssd300](<>)     |  25.6  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ssd/ssd300_fp16_coco.py)                               | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/ssd300_coco.log.json)                              |
+|     [ssd512](<>)     |  29.4  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ssd/ssd512_fp16_coco.py)                               | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/ssd512_coco.log.json)                              |
+| [ssdlite-mbv2\*](<>) |  20.2  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py)          | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/ssdlite_mobilenetv2_scratch_600e_coco.log.json)    |
+| [retinanet-r18](<>)  |  31.8  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/retinanet/retinanet_r18_fpn_1x8_1x_coco.py)            | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/retinanet_r18_fpn_1x8_1x_coco.log.json)            |
+| [retinanet-r50](<>)  |  36.6  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/retinanet/retinanet_r50_fpn_fp16_1x_coco.py)           | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/retinanet_r50_fpn_1x_coco.log.json)                |
+|   [yolov3-608](<>)   |  34.7  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolo/yolov3_d53_fp16_mstrain-608_273e_coco.py)         | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/yolov3_d53_fp16_mstrain-608_273e_coco.log.json)    |
+|  [yolox-s\*\*](<>)   |  39.9  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolox/yolox_s_8x8_300e_coco.py)                        | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/yolox_s_8x8_300e_coco.log.json)                    |
+| [centernet-r18](<>)  |  26.1  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/centernet/centernet_resnet18_140e_coco.py)             | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/centernet_resnet18_140e_coco.log.json)             |
+|   [fcos-r50\*](<>)   |  36.1  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/fcos/fcos_r50_caffe_fpn_gn-head_fp16_1x_bs8x8_coco.py) | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/fcos_r50_caffe_fpn_gn-head_1x_coco_bs8x8.log.json) |
+|   [solov2-r50](<>)   |  ---   |  34.7   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/solov2/solov2_r50_fpn_1x_coco.py)                      | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/solov2_r50_fpn_1x_coco.log.json)                   |
+
+**Notes:**
+
+- If not specially marked, the results on NPU are the same as those on the GPU with FP32.
+- (\*) The results on the NPU of these models are aligned with the results of the mixed-precision training on the GPU,
+  but are lower than the results of the FP32. This situation is mainly related to the phase of the model itself in
+  mixed-precision training, users may need to adjust the hyperparameters to achieve better results.
+- (\*\*) The accuracy of yolox-s on the GPU in mixed precision is 40.1, with `persistent_workers=True` in the data loader config by default.
+  There are currently some bugs on NPUs that prevent the last few epochs from running, but the accuracy is less affected and the difference can be ignored.
+
+## High-performance Model Result on Ascend Device
+
+Introduction to optimization:
+
+1. Modify the loop calculation as a whole batch calculation to reduce the number of instructions issued.
+2. Modify the index calculation to mask calculation, because the SIMD architecture is good at processing continuous data calculation.
+
+|           Model            |                                                          Config                                                           | v100 iter time |       910A iter time       |
+| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------: | :------------: | :------------------------: |
+|    [ascend-ssd300](<>)     |          [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ssd/ascend_ssd300_fp16_coco.py)           |  0.165s/iter   | 0.383s/iter -> 0.13s/iter  |
+| [ascend-retinanet-r18](<>) | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/retinanet/ascend_retinanet_r18_fpn_1x8_1x_coco.py) |  0.567s/iter   | 0.780s/iter -> 0.420s/iter |
+
+**All above models are provided by Huawei Ascend group.**
diff --git a/docs/en/faq.md b/docs/en/faq.md
new file mode 100755
index 0000000..b8afefc
--- /dev/null
+++ b/docs/en/faq.md
@@ -0,0 +1,240 @@
+# Frequently Asked Questions
+
+We list some common troubles faced by many users and their corresponding solutions here. Feel free to enrich the list if you find any frequent issues and have ways to help others to solve them. If the contents here do not cover your issue, please create an issue using the [provided templates](https://github.com/open-mmlab/mmdetection/blob/master/.github/ISSUE_TEMPLATE/error-report.md/) and make sure you fill in all required information in the template.
+
+## Installation
+
+- Compatibility issue between MMCV and MMDetection; "ConvWS is already registered in conv layer"; "AssertionError: MMCV==xxx is used but incompatible. Please install mmcv>=xxx, \<=xxx."
+
+  Compatible MMDetection and MMCV versions are shown as below. Please choose the correct version of MMCV to avoid installation issues.
+
+| MMDetection version |        MMCV version        |
+| :-----------------: | :------------------------: |
+|       master        | mmcv-full>=1.3.17, \<1.8.0 |
+|       2.28.2        | mmcv-full>=1.3.17, \<1.8.0 |
+|       2.28.1        | mmcv-full>=1.3.17, \<1.8.0 |
+|       2.28.0        | mmcv-full>=1.3.17, \<1.8.0 |
+|       2.27.0        | mmcv-full>=1.3.17, \<1.8.0 |
+|       2.26.0        | mmcv-full>=1.3.17, \<1.8.0 |
+|       2.25.3        | mmcv-full>=1.3.17, \<1.7.0 |
+|       2.25.2        | mmcv-full>=1.3.17, \<1.7.0 |
+|       2.25.1        | mmcv-full>=1.3.17, \<1.6.0 |
+|       2.25.0        | mmcv-full>=1.3.17, \<1.6.0 |
+|       2.24.1        | mmcv-full>=1.3.17, \<1.6.0 |
+|       2.24.0        | mmcv-full>=1.3.17, \<1.6.0 |
+|       2.23.0        | mmcv-full>=1.3.17, \<1.5.0 |
+|       2.22.0        | mmcv-full>=1.3.17, \<1.5.0 |
+|       2.21.0        | mmcv-full>=1.3.17, \<1.5.0 |
+|       2.20.0        | mmcv-full>=1.3.17, \<1.5.0 |
+|       2.19.1        | mmcv-full>=1.3.17, \<1.5.0 |
+|       2.19.0        | mmcv-full>=1.3.17, \<1.5.0 |
+|       2.18.0        | mmcv-full>=1.3.17, \<1.4.0 |
+|       2.17.0        | mmcv-full>=1.3.14, \<1.4.0 |
+|       2.16.0        | mmcv-full>=1.3.8, \<1.4.0  |
+|       2.15.1        | mmcv-full>=1.3.8, \<1.4.0  |
+|       2.15.0        | mmcv-full>=1.3.8, \<1.4.0  |
+|       2.14.0        | mmcv-full>=1.3.8, \<1.4.0  |
+|       2.13.0        | mmcv-full>=1.3.3, \<1.4.0  |
+|       2.12.0        | mmcv-full>=1.3.3, \<1.4.0  |
+|       2.11.0        | mmcv-full>=1.2.4, \<1.4.0  |
+|       2.10.0        | mmcv-full>=1.2.4, \<1.4.0  |
+|        2.9.0        | mmcv-full>=1.2.4, \<1.4.0  |
+|        2.8.0        | mmcv-full>=1.2.4, \<1.4.0  |
+|        2.7.0        | mmcv-full>=1.1.5, \<1.4.0  |
+|        2.6.0        | mmcv-full>=1.1.5, \<1.4.0  |
+|        2.5.0        | mmcv-full>=1.1.5, \<1.4.0  |
+|        2.4.0        | mmcv-full>=1.1.1, \<1.4.0  |
+|        2.3.0        |      mmcv-full==1.0.5      |
+|      2.3.0rc0       |      mmcv-full>=1.0.2      |
+|        2.2.1        |        mmcv==0.6.2         |
+|        2.2.0        |        mmcv==0.6.2         |
+|        2.1.0        |   mmcv>=0.5.9, \<=0.6.1    |
+|        2.0.0        |   mmcv>=0.5.1, \<=0.5.8    |
+
+- "No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'".
+
+  1. Uninstall existing mmcv in the environment using `pip uninstall mmcv`.
+  2. Install mmcv-full following the [installation instruction](get_started#best-practices).
+
+- Using albumentations
+
+  If you would like to use `albumentations`, we suggest using `pip install -r requirements/albu.txt` or
+  `pip install -U albumentations --no-binary qudida,albumentations`.
+  If you simply use `pip install albumentations>=0.3.2`, it will install `opencv-python-headless` simultaneously (even though you have already installed `opencv-python`).
+  Please refer to the [official documentation](https://albumentations.ai/docs/getting_started/installation/#note-on-opencv-dependencies) for details.
+
+- ModuleNotFoundError is raised when using some algorithms
+
+  Some extra dependencies are required for Instaboost, Panoptic Segmentation, LVIS dataset, etc. Please note the error message and install corresponding packages, e.g.,
+
+  ```shell
+  # for instaboost
+  pip install instaboostfast
+  # for panoptic segmentation
+  pip install git+https://github.com/cocodataset/panopticapi.git
+  # for LVIS dataset
+  pip install git+https://github.com/lvis-dataset/lvis-api.git
+  ```
+
+## Coding
+
+- Do I need to reinstall mmdet after some code modifications
+
+  If you follow the best practice and install mmdet with `pip install -e .`, any local modifications made to the code will take effect without reinstallation.
+
+- How to develop with multiple MMDetection versions
+
+  You can have multiple folders like mmdet-2.21, mmdet-2.22.
+  When you run the train or test script, it will adopt the mmdet package in the current folder.
+
+  To use the default MMDetection installed in the environment rather than the one you are working with, you can remove the following line in those scripts:
+
+  ```shell
+  PYTHONPATH="$(dirname $0)/..":$PYTHONPATH
+  ```
+
+## PyTorch/CUDA Environment
+
+- "RTX 30 series card fails when building MMCV or MMDet"
+
+  1. Temporary work-around: do `MMCV_WITH_OPS=1 MMCV_CUDA_ARGS='-gencode=arch=compute_80,code=sm_80' pip install -e .`.
+     The common issue is `nvcc fatal : Unsupported gpu architecture 'compute_86'`. This means that the compiler should optimize for sm_86, i.e., nvidia 30 series card, but such optimizations have not been supported by CUDA toolkit 11.0.
+     This work-around modifies the compile flag by adding `MMCV_CUDA_ARGS='-gencode=arch=compute_80,code=sm_80'`, which tells `nvcc` to optimize for **sm_80**, i.e., Nvidia A100. Although A100 is different from the 30 series card, they use similar ampere architecture. This may hurt the performance but it works.
+  2. PyTorch developers have updated that the default compiler flags should be fixed by [pytorch/pytorch#47585](https://github.com/pytorch/pytorch/pull/47585). So using PyTorch-nightly may also be able to solve the problem, though we have not tested it yet.
+
+- "invalid device function" or "no kernel image is available for execution".
+
+  1. Check if your cuda runtime version (under `/usr/local/`), `nvcc --version` and `conda list cudatoolkit` version match.
+  2. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built for the correct GPU architecture.
+     You may need to set `TORCH_CUDA_ARCH_LIST` to reinstall MMCV.
+     The GPU arch table could be found [here](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#gpu-feature-list),
+     e.g., run `TORCH_CUDA_ARCH_LIST=7.0 pip install mmcv-full` to build MMCV for Volta GPUs.
+     The compatibility issue could happen when using old GPUs, e.g., Tesla K80 (3.7) on Colab.
+  3. Check whether the running environment is the same as that when mmcv/mmdet has compiled.
+     For example, you may compile mmcv using CUDA 10.0 but run it on CUDA 9.0 environments.
+
+- "undefined symbol" or "cannot open xxx.so".
+
+  1. If those symbols are CUDA/C++ symbols (e.g., libcudart.so or GLIBCXX), check whether the CUDA/GCC runtimes are the same as those used for compiling mmcv,
+     i.e. run `python mmdet/utils/collect_env.py` to see if `"MMCV Compiler"`/`"MMCV CUDA Compiler"` is the same as `"GCC"`/`"CUDA_HOME"`.
+  2. If those symbols are PyTorch symbols (e.g., symbols containing caffe, aten, and TH), check whether the PyTorch version is the same as that used for compiling mmcv.
+  3. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built by and running on the same environment.
+
+- setuptools.sandbox.UnpickleableException: DistutilsSetupError("each element of 'ext_modules' option must be an Extension instance or 2-tuple")
+
+  1. If you are using miniconda rather than anaconda, check whether Cython is installed as indicated in [#3379](https://github.com/open-mmlab/mmdetection/issues/3379).
+     You need to manually install Cython first and then run command `pip install -r requirements.txt`.
+  2. You may also need to check the compatibility between the `setuptools`, `Cython`, and `PyTorch` in your environment.
+
+- "Segmentation fault".
+
+  1. Check your GCC version and use GCC 5.4. This is usually caused by an incompatibility between PyTorch and the environment (e.g., GCC \< 4.9 for PyTorch). We also recommend that users avoid GCC 5.5, because many users have reported that GCC 5.5 causes "segmentation fault" and that simply changing it to GCC 5.4 solves the problem.
+
+  2. Check whether PyTorch is correctly installed and could use CUDA op, e.g. type the following command in your terminal.
+
+     ```shell
+     python -c 'import torch; print(torch.cuda.is_available())'
+     ```
+
+     And see whether they could correctly output results.
+
+  3. If Pytorch is correctly installed, check whether MMCV is correctly installed.
+
+     ```shell
+     python -c 'import mmcv; import mmcv.ops'
+     ```
+
+     If MMCV is correctly installed, then there will be no issue of the above two commands.
+
+  4. If MMCV and PyTorch are correctly installed, you may use `ipdb` or `pdb` to set breakpoints, or directly add `print` statements in the mmdetection code, to see which part leads to the segmentation fault.
+
+## Training
+
+- "Loss goes Nan"
+
+  1. Check if the dataset annotations are valid: zero-size bounding boxes will cause the regression loss to be Nan due to the commonly used transformation for box regression. Some small size (width or height are smaller than 1) boxes will also cause this problem after data augmentation (e.g., instaboost). So check the data and try to filter out those zero-size boxes and skip some risky augmentations on the small-size boxes when you face the problem.
+  2. Reduce the learning rate: the learning rate might be too large due to some reasons, e.g., change of batch size. You can rescale them to the value that could stably train the model.
+  3. Extend the warmup iterations: some models are sensitive to the learning rate at the start of the training. You can extend the warmup iterations, e.g., change the `warmup_iters` from 500 to 1000 or 2000.
+  4. Add gradient clipping: some models require gradient clipping to stabilize the training process. The default of `grad_clip` is `None`; you can add gradient clipping to avoid gradients that are too large, i.e., set `optimizer_config=dict(_delete_=True, grad_clip=dict(max_norm=35, norm_type=2))` in your config file. If your config does not inherit from any basic config that contains `optimizer_config=dict(grad_clip=None)`, you can simply add `optimizer_config=dict(grad_clip=dict(max_norm=35, norm_type=2))`.
+
+- "GPU out of memory"
+
+  1. There are some scenarios when there are large amount of ground truth boxes, which may cause OOM during target assignment. You can set `gpu_assign_thr=N` in the config of assigner thus the assigner will calculate box overlaps through CPU when there are more than N GT boxes.
+
+  2. Set `with_cp=True` in the backbone. This uses the sublinear strategy in PyTorch to reduce GPU memory cost in the backbone.
+
+  3. Try mixed precision training following the examples in `configs/fp16`. The `loss_scale` might need further tuning for different models.
+
+  4. Try to use `AvoidCUDAOOM` to avoid GPU out of memory. It will first retry after calling `torch.cuda.empty_cache()`. If it still fails, it will then retry by converting the type of inputs to FP16 format. If it still fails, it will try to copy inputs from GPUs to CPUs to continue computing. Use `AvoidCUDAOOM` in your code to make the code continue to run when GPU memory runs out:
+
+     ```python
+     from mmdet.utils import AvoidCUDAOOM
+
+     output = AvoidCUDAOOM.retry_if_cuda_oom(some_function)(input1, input2)
+     ```
+
+     You can also try `AvoidCUDAOOM` as a decorator to make the code continue to run when GPU memory runs out:
+
+     ```python
+     from mmdet.utils import AvoidCUDAOOM
+
+     @AvoidCUDAOOM.retry_if_cuda_oom
+     def function(*args, **kwargs):
+         ...
+         return xxx
+     ```
+
+- "RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one"
+
+  1. This error indicates that your module has parameters that were not used in producing loss. This phenomenon may be caused by running different branches in your code in DDP mode.
+  2. You can set `find_unused_parameters = True` in the config to solve the above problem (but this will slow down training).
+  3. If the version of your MMCV >= 1.4.1, you can get the name of those unused parameters with `detect_anomalous_params=True` in `optimizer_config` of config.
+
+- Save the best model
+
+  It can be turned on by configuring `evaluation = dict(save_best='auto')`. In the case of the `auto` parameter, the first key in the returned evaluation result will be used as the basis for selecting the best model. You can also directly set the key in the evaluation result to manually set it, for example, `evaluation = dict(save_best='mAP')`.
+
+- Resume training with `ExpMomentumEMAHook`
+
+  If you use `ExpMomentumEMAHook` in training, you can't just use the command line parameters `--resume-from` or `--cfg-options resume_from` to restore model parameters during resume, i.e., the command `python tools/train.py configs/yolox/yolox_s_8x8_300e_coco.py --resume-from ./work_dir/yolox_s_8x8_300e_coco/epoch_x.pth` will not work. Since `ExpMomentumEMAHook` needs to reload the weights, taking the `yolox_s` algorithm as an example, you should modify the values of `resume_from` in two places of the config as below:
+
+  ```python
+  # Open configs/yolox/yolox_s_8x8_300e_coco.py directly and modify all resume_from fields
+  resume_from='./work_dir/yolox_s_8x8_300e_coco/epoch_x.pth'
+  custom_hooks=[...
+      dict(
+          type='ExpMomentumEMAHook',
+          resume_from='./work_dir/yolox_s_8x8_300e_coco/epoch_x.pth',
+          momentum=0.0001,
+          priority=49)
+      ]
+  ```
+
+## Evaluation
+
+- COCO Dataset, AP or AR = -1
+  1. According to the definition of COCO dataset, the small and medium areas in an image are less than 1024 (32\*32), 9216 (96\*96), respectively.
+  2. If the corresponding area has no object, the result of AP and AR will be set to -1.
+
+## Model
+
+- `style` in ResNet
+
+  The `style` parameter in ResNet allows either `pytorch` or `caffe` style. It indicates the difference in the Bottleneck module. Bottleneck is a stacking structure of `1x1-3x3-1x1` convolutional layers. In the case of `caffe` mode, the convolution layer with `stride=2` is the first `1x1` convolution, while in `pytorch` mode, it is the second `3x3` convolution that has `stride=2`. A sample code is as below:
+
+  ```python
+  if self.style == 'pytorch':
+        self.conv1_stride = 1
+        self.conv2_stride = stride
+  else:
+        self.conv1_stride = stride
+        self.conv2_stride = 1
+  ```
+
+- ResNeXt parameter description
+
+  ResNeXt comes from the paper [`Aggregated Residual Transformations for Deep Neural Networks`](https://arxiv.org/abs/1611.05431). It introduces groups and uses “cardinality” to control the number of groups to achieve a balance between accuracy and complexity. It controls the basic width and grouping parameters of the internal Bottleneck module through two hyperparameters `baseWidth` and `cardinality`. An example configuration name in MMDetection is `mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py`, where `mask_rcnn` represents the algorithm using Mask R-CNN, `x101` represents the backbone network using ResNeXt-101, and `64x4d` represents that the bottleneck block has 64 groups and each group has a basic width of 4.
+
+- `norm_eval` in backbone
+
+  Since the detection model is usually large and the input image resolution is high, this will result in a small batch size for the detection model, which will make the variance of the statistics calculated by BatchNorm during the training process very large and not as stable as the statistics obtained during the pre-training of the backbone network. Therefore, the `norm_eval=True` mode is generally used in training, and the BatchNorm statistics in the pre-trained backbone network are directly used. The few algorithms that use large batches use the `norm_eval=False` mode, such as NASFPN. For the backbone network without ImageNet pre-training and the batch is relatively small, you can consider using `SyncBN`.
diff --git a/docs/en/get_started.md b/docs/en/get_started.md
new file mode 100755
index 0000000..b7c6066
--- /dev/null
+++ b/docs/en/get_started.md
@@ -0,0 +1,208 @@
+# Prerequisites
+
+In this section we demonstrate how to prepare an environment with PyTorch.
+
+MMDetection works on Linux, Windows and macOS. It requires Python 3.7+, CUDA 9.2+ and PyTorch 1.5+.
+
+```{note}
+If you are experienced with PyTorch and have already installed it, just skip this part and jump to the [next section](#installation). Otherwise, you can follow these steps for the preparation.
+```
+
+**Step 0.** Download and install Miniconda from the [official website](https://docs.conda.io/en/latest/miniconda.html).
+
+**Step 1.** Create a conda environment and activate it.
+
+```shell
+conda create --name openmmlab python=3.8 -y
+conda activate openmmlab
+```
+
+**Step 2.** Install PyTorch following [official instructions](https://pytorch.org/get-started/locally/), e.g.
+
+On GPU platforms:
+
+```shell
+conda install pytorch torchvision -c pytorch
+```
+
+On CPU platforms:
+
+```shell
+conda install pytorch torchvision cpuonly -c pytorch
+```
+
+# Installation
+
+We recommend that users follow our best practices to install MMDetection. However, the whole process is highly customizable. See [Customize Installation](#customize-installation) section for more information.
+
+## Best Practices
+
+**Step 0.** Install [MMCV](https://github.com/open-mmlab/mmcv) using [MIM](https://github.com/open-mmlab/mim).
+
+```shell
+pip install -U openmim
+mim install mmcv-full
+```
+
+**Step 1.** Install MMDetection.
+
+Case a: If you develop and run mmdet directly, install it from source:
+
+```shell
+git clone https://github.com/open-mmlab/mmdetection.git
+cd mmdetection
+pip install -v -e .
+# "-v" means verbose, or more output
+# "-e" means installing a project in editable mode,
+# thus any local modifications made to the code will take effect without reinstallation.
+```
+
+Case b: If you use mmdet as a dependency or third-party package, install it with pip:
+
+```shell
+pip install mmdet
+```
+
+## Verify the installation
+
+To verify whether MMDetection is installed correctly, we provide some sample code to run an inference demo.
+
+**Step 1.** We need to download config and checkpoint files.
+
+```shell
+mim download mmdet --config yolov3_mobilenetv2_320_300e_coco --dest .
+```
+
+The downloading will take several seconds or more, depending on your network environment. When it is done, you will find two files `yolov3_mobilenetv2_320_300e_coco.py` and `yolov3_mobilenetv2_320_300e_coco_20210719_215349-d18dff72.pth` in your current folder.
+
+**Step 2.** Verify the inference demo.
+
+Option (a). If you install mmdetection from source, just run the following command.
+
+```shell
+python demo/image_demo.py demo/demo.jpg yolov3_mobilenetv2_320_300e_coco.py yolov3_mobilenetv2_320_300e_coco_20210719_215349-d18dff72.pth --device cpu --out-file result.jpg
+```
+
+You will see a new image `result.jpg` in your current folder, where bounding boxes are plotted on cars, benches, etc.
+
+Option (b). If you install mmdetection with pip, open your Python interpreter and copy and paste the following code.
+
+```python
+from mmdet.apis import init_detector, inference_detector
+
+config_file = 'yolov3_mobilenetv2_320_300e_coco.py'
+checkpoint_file = 'yolov3_mobilenetv2_320_300e_coco_20210719_215349-d18dff72.pth'
+model = init_detector(config_file, checkpoint_file, device='cpu')  # or device='cuda:0'
+inference_detector(model, 'demo/demo.jpg')
+```
+
+You will see a list of arrays printed, indicating the detected bounding boxes.
+
+## Customize Installation
+
+### CUDA versions
+
+When installing PyTorch, you need to specify the version of CUDA. If you are not clear on which to choose, follow our recommendations:
+
+- For Ampere-based NVIDIA GPUs, such as GeForce 30 series and NVIDIA A100, CUDA 11 is a must.
+- For older NVIDIA GPUs, CUDA 11 is backward compatible, but CUDA 10.2 offers better compatibility and is more lightweight.
+
+Please make sure the GPU driver satisfies the minimum version requirements. See [this table](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions) for more information.
+
+```{note}
+Installing CUDA runtime libraries is enough if you follow our best practices, because no CUDA code will be compiled locally. However, if you hope to compile MMCV from source or develop other CUDA operators, you need to install the complete CUDA toolkit from NVIDIA's [website](https://developer.nvidia.com/cuda-downloads), and its version should match the CUDA version of PyTorch, i.e., the specified version of cudatoolkit in the `conda install` command.
+```
+
+### Install MMCV without MIM
+
+MMCV contains C++ and CUDA extensions, thus depending on PyTorch in a complex way. MIM solves such dependencies automatically and makes the installation easier. However, it is not a must.
+
+To install MMCV with pip instead of MIM, please follow [MMCV installation guides](https://mmcv.readthedocs.io/en/latest/get_started/installation.html). This requires manually specifying a find-links URL (pip's `-f` option) based on the PyTorch version and its CUDA version.
+
+For example, the following command installs mmcv-full built for PyTorch 1.10.x and CUDA 11.3.
+
+```shell
+pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html
+```
+
+### Install on CPU-only platforms
+
+MMDetection can be built for CPU-only environments. In CPU mode you can train (requires MMCV version >= 1.4.4), test, or run inference with a model.
+
+However, some functionalities are unavailable in this mode:
+
+- Deformable Convolution
+- Modulated Deformable Convolution
+- ROI pooling
+- Deformable ROI pooling
+- CARAFE
+- SyncBatchNorm
+- CrissCrossAttention
+- MaskedConv2d
+- Temporal Interlace Shift
+- nms_cuda
+- sigmoid_focal_loss_cuda
+- bbox_overlaps
+
+If you try to train, test, or run inference with a model containing the above ops, an error will be raised.
+The following table lists affected algorithms.
+
+|                        Operator                         |                                          Model                                           |
+| :-----------------------------------------------------: | :--------------------------------------------------------------------------------------: |
+| Deformable Convolution/Modulated Deformable Convolution | DCN、Guided Anchoring、RepPoints、CentripetalNet、VFNet、CascadeRPN、NAS-FCOS、DetectoRS |
+|                      MaskedConv2d                       |                                     Guided Anchoring                                     |
+|                         CARAFE                          |                                          CARAFE                                          |
+|                      SyncBatchNorm                      |                                         ResNeSt                                          |
+
+### Install on Google Colab
+
+[Google Colab](https://colab.research.google.com/) usually has PyTorch installed,
+thus we only need to install MMCV and MMDetection with the following commands.
+
+**Step 1.** Install [MMCV](https://github.com/open-mmlab/mmcv) using [MIM](https://github.com/open-mmlab/mim).
+
+```shell
+!pip3 install openmim
+!mim install mmcv-full
+```
+
+**Step 2.** Install MMDetection from the source.
+
+```shell
+!git clone https://github.com/open-mmlab/mmdetection.git
+%cd mmdetection
+!pip install -e .
+```
+
+**Step 3.** Verification.
+
+```python
+import mmdet
+print(mmdet.__version__)
+# Example output: 2.23.0
+```
+
+```{note}
+Within Jupyter, the exclamation mark `!` is used to call external executables and `%cd` is a [magic command](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-cd) to change the current working directory of Python.
+```
+
+### Using MMDetection with Docker
+
+We provide a [Dockerfile](https://github.com/open-mmlab/mmdetection/blob/master/docker/Dockerfile) to build an image. Ensure that your [docker version](https://docs.docker.com/engine/install/) >=19.03.
+
+```shell
+# build an image with PyTorch 1.6, CUDA 10.1
+# If you prefer other versions, just modify the Dockerfile
+docker build -t mmdetection docker/
+```
+
+Run it with
+
+```shell
+docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmdetection/data mmdetection
+```
+
+## Trouble shooting
+
+If you have some issues during the installation, please first view the [FAQ](faq.md) page.
+You may [open an issue](https://github.com/open-mmlab/mmdetection/issues/new/choose) on GitHub if no solution is found.
diff --git a/docs/en/index.rst b/docs/en/index.rst
new file mode 100755
index 0000000..0089c87
--- /dev/null
+++ b/docs/en/index.rst
@@ -0,0 +1,63 @@
+Welcome to MMDetection's documentation!
+=======================================
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Get Started
+
+   get_started.md
+   modelzoo_statistics.md
+   model_zoo.md
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Quick Run
+
+   1_exist_data_model.md
+   2_new_data_model.md
+   3_exist_data_new_model.md
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Tutorials
+
+   tutorials/index.rst
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Useful Tools and Scripts
+
+   useful_tools.md
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Notes
+
+   conventions.md
+   compatibility.md
+   projects.md
+   changelog.md
+   faq.md
+
+.. toctree::
+   :caption: Switch Language
+
+   switch_language.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: API Reference
+
+   api.rst
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Device Support
+
+   device/npu.md
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`search`
diff --git a/docs/en/make.bat b/docs/en/make.bat
new file mode 100755
index 0000000..922152e
--- /dev/null
+++ b/docs/en/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/en/model_zoo.md b/docs/en/model_zoo.md
new file mode 100755
index 0000000..6e2f3b6
--- /dev/null
+++ b/docs/en/model_zoo.md
@@ -0,0 +1,362 @@
+# Benchmark and Model Zoo
+
+## Mirror sites
+
+We only use aliyun to maintain the model zoo since MMDetection V2.0. The model zoo of V1.x has been deprecated.
+
+## Common settings
+
+- All models were trained on `coco_2017_train`, and tested on the `coco_2017_val`.
+- We use distributed training.
+- All pytorch-style pretrained backbones on ImageNet are from the PyTorch model zoo; caffe-style pretrained backbones are converted from the newly released models from detectron2.
+- For fair comparison with other codebases, we report the GPU memory as the maximum value of `torch.cuda.max_memory_allocated()` for all 8 GPUs. Note that this value is usually less than what `nvidia-smi` shows.
+- We report the inference time as the total time of network forwarding and post-processing, excluding the data loading time. Results are obtained with the script [benchmark.py](https://github.com/open-mmlab/mmdetection/blob/master/tools/analysis_tools/benchmark.py) which computes the average time on 2000 images.
+
+## ImageNet Pretrained Models
+
+It is common to initialize from backbone models pre-trained on the ImageNet classification task. All pre-trained model links can be found at [open_mmlab](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json). According to `img_norm_cfg` and the source of the weights, we can divide all the ImageNet pre-trained model weights into the following cases:
+
+- TorchVision:  Corresponding to torchvision weight, including ResNet50, ResNet101. The `img_norm_cfg` is `dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)`.
+- Pycls:  Corresponding to [pycls](https://github.com/facebookresearch/pycls) weight, including RegNetX. The `img_norm_cfg` is `dict(   mean=[103.530, 116.280, 123.675], std=[57.375, 57.12, 58.395], to_rgb=False)`.
+- MSRA styles: Corresponding to [MSRA](https://github.com/KaimingHe/deep-residual-networks) weights, including ResNet50_Caffe and ResNet101_Caffe. The `img_norm_cfg` is `dict(   mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)`.
+- Caffe2 styles:  Currently only contains ResNext101_32x8d. The `img_norm_cfg` is `dict(mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False)`.
+- Other styles: E.g., SSD, whose `img_norm_cfg` is `dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)`, and YOLOv3, whose `img_norm_cfg` is `dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True)`.
+
+The detailed table of the commonly used backbone models in MMDetection is listed below:
+
+| model            | source      | link                                                                                                                                                                                                   | description                                                                                                                                                                                                                                      |
+| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| ResNet50         | TorchVision | [torchvision's ResNet-50](https://download.pytorch.org/models/resnet50-19c8e357.pth)                                                                                                                   | From [torchvision's ResNet-50](https://download.pytorch.org/models/resnet50-19c8e357.pth).                                                                                                                                                       |
+| ResNet101        | TorchVision | [torchvision's ResNet-101](https://download.pytorch.org/models/resnet101-5d3b4d8f.pth)                                                                                                                 | From [torchvision's ResNet-101](https://download.pytorch.org/models/resnet101-5d3b4d8f.pth).                                                                                                                                                     |
+| RegNetX          | Pycls       | [RegNetX_3.2gf](https://download.openmmlab.com/pretrain/third_party/regnetx_3.2gf-c2599b0f.pth), [RegNetX_800mf](https://download.openmmlab.com/pretrain/third_party/regnetx_800mf-1f4be4c7.pth). etc. | From [pycls](https://github.com/facebookresearch/pycls).                                                                                                                                                                                         |
+| ResNet50_Caffe   | MSRA        | [MSRA's ResNet-50](https://download.openmmlab.com/pretrain/third_party/resnet50_caffe-788b5fa3.pth)                                                                                                    | Converted copy of [Detectron2's R-50.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-50.pkl) model. The original weight comes from [MSRA's original ResNet-50](https://github.com/KaimingHe/deep-residual-networks).    |
+| ResNet101_Caffe  | MSRA        | [MSRA's ResNet-101](https://download.openmmlab.com/pretrain/third_party/resnet101_caffe-3ad79236.pth)                                                                                                  | Converted copy of [Detectron2's R-101.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-101.pkl) model. The original weight comes from [MSRA's original ResNet-101](https://github.com/KaimingHe/deep-residual-networks). |
+| ResNext101_32x8d | Caffe2      | [Caffe2 ResNext101_32x8d](https://download.openmmlab.com/pretrain/third_party/resnext101_32x8d-1516f1aa.pth)                                                                                           | Converted copy of [Detectron2's X-101-32x8d.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/FAIR/X-101-32x8d.pkl) model. The ResNeXt-101-32x8d model trained with Caffe2 at FB.                                                |
+
+## Baselines
+
+### RPN
+
+Please refer to [RPN](https://github.com/open-mmlab/mmdetection/blob/master/configs/rpn) for details.
+
+### Faster R-CNN
+
+Please refer to [Faster R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn) for details.
+
+### Mask R-CNN
+
+Please refer to [Mask R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask_rcnn) for details.
+
+### Fast R-CNN (with pre-computed proposals)
+
+Please refer to [Fast R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/fast_rcnn) for details.
+
+### RetinaNet
+
+Please refer to [RetinaNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/retinanet) for details.
+
+### Cascade R-CNN and Cascade Mask R-CNN
+
+Please refer to [Cascade R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/cascade_rcnn) for details.
+
+### Hybrid Task Cascade (HTC)
+
+Please refer to [HTC](https://github.com/open-mmlab/mmdetection/blob/master/configs/htc) for details.
+
+### SSD
+
+Please refer to [SSD](https://github.com/open-mmlab/mmdetection/blob/master/configs/ssd) for details.
+
+### Group Normalization (GN)
+
+Please refer to [Group Normalization](https://github.com/open-mmlab/mmdetection/blob/master/configs/gn) for details.
+
+### Weight Standardization
+
+Please refer to [Weight Standardization](https://github.com/open-mmlab/mmdetection/blob/master/configs/gn+ws) for details.
+
+### Deformable Convolution v2
+
+Please refer to [Deformable Convolutional Networks](https://github.com/open-mmlab/mmdetection/blob/master/configs/dcn) for details.
+
+### CARAFE: Content-Aware ReAssembly of FEatures
+
+Please refer to [CARAFE](https://github.com/open-mmlab/mmdetection/blob/master/configs/carafe) for details.
+
+### Instaboost
+
+Please refer to [Instaboost](https://github.com/open-mmlab/mmdetection/blob/master/configs/instaboost) for details.
+
+### Libra R-CNN
+
+Please refer to [Libra R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/libra_rcnn) for details.
+
+### Guided Anchoring
+
+Please refer to [Guided Anchoring](https://github.com/open-mmlab/mmdetection/blob/master/configs/guided_anchoring) for details.
+
+### FCOS
+
+Please refer to [FCOS](https://github.com/open-mmlab/mmdetection/blob/master/configs/fcos) for details.
+
+### FoveaBox
+
+Please refer to [FoveaBox](https://github.com/open-mmlab/mmdetection/blob/master/configs/foveabox) for details.
+
+### RepPoints
+
+Please refer to [RepPoints](https://github.com/open-mmlab/mmdetection/blob/master/configs/reppoints) for details.
+
+### FreeAnchor
+
+Please refer to [FreeAnchor](https://github.com/open-mmlab/mmdetection/blob/master/configs/free_anchor) for details.
+
+### Grid R-CNN (plus)
+
+Please refer to [Grid R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/grid_rcnn) for details.
+
+### GHM
+
+Please refer to [GHM](https://github.com/open-mmlab/mmdetection/blob/master/configs/ghm) for details.
+
+### GCNet
+
+Please refer to [GCNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/gcnet) for details.
+
+### HRNet
+
+Please refer to [HRNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/hrnet) for details.
+
+### Mask Scoring R-CNN
+
+Please refer to [Mask Scoring R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/ms_rcnn) for details.
+
+### Train from Scratch
+
+Please refer to [Rethinking ImageNet Pre-training](https://github.com/open-mmlab/mmdetection/blob/master/configs/scratch) for details.
+
+### NAS-FPN
+
+Please refer to [NAS-FPN](https://github.com/open-mmlab/mmdetection/blob/master/configs/nas_fpn) for details.
+
+### ATSS
+
+Please refer to [ATSS](https://github.com/open-mmlab/mmdetection/blob/master/configs/atss) for details.
+
+### FSAF
+
+Please refer to [FSAF](https://github.com/open-mmlab/mmdetection/blob/master/configs/fsaf) for details.
+
+### RegNetX
+
+Please refer to [RegNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/regnet) for details.
+
+### Res2Net
+
+Please refer to [Res2Net](https://github.com/open-mmlab/mmdetection/blob/master/configs/res2net) for details.
+
+### GRoIE
+
+Please refer to [GRoIE](https://github.com/open-mmlab/mmdetection/blob/master/configs/groie) for details.
+
+### Dynamic R-CNN
+
+Please refer to [Dynamic R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/dynamic_rcnn) for details.
+
+### PointRend
+
+Please refer to [PointRend](https://github.com/open-mmlab/mmdetection/blob/master/configs/point_rend) for details.
+
+### DetectoRS
+
+Please refer to [DetectoRS](https://github.com/open-mmlab/mmdetection/blob/master/configs/detectors) for details.
+
+### Generalized Focal Loss
+
+Please refer to [Generalized Focal Loss](https://github.com/open-mmlab/mmdetection/blob/master/configs/gfl) for details.
+
+### CornerNet
+
+Please refer to [CornerNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/cornernet) for details.
+
+### YOLOv3
+
+Please refer to [YOLOv3](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolo) for details.
+
+### PAA
+
+Please refer to [PAA](https://github.com/open-mmlab/mmdetection/blob/master/configs/paa) for details.
+
+### SABL
+
+Please refer to [SABL](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl) for details.
+
+### CentripetalNet
+
+Please refer to [CentripetalNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/centripetalnet) for details.
+
+### ResNeSt
+
+Please refer to [ResNeSt](https://github.com/open-mmlab/mmdetection/blob/master/configs/resnest) for details.
+
+### DETR
+
+Please refer to [DETR](https://github.com/open-mmlab/mmdetection/blob/master/configs/detr) for details.
+
+### Deformable DETR
+
+Please refer to [Deformable DETR](https://github.com/open-mmlab/mmdetection/blob/master/configs/deformable_detr) for details.
+
+### AutoAssign
+
+Please refer to [AutoAssign](https://github.com/open-mmlab/mmdetection/blob/master/configs/autoassign) for details.
+
+### YOLOF
+
+Please refer to [YOLOF](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolof) for details.
+
+### Seesaw Loss
+
+Please refer to [Seesaw Loss](https://github.com/open-mmlab/mmdetection/blob/master/configs/seesaw_loss) for details.
+
+### CenterNet
+
+Please refer to [CenterNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/centernet) for details.
+
+### YOLOX
+
+Please refer to [YOLOX](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolox) for details.
+
+### PVT
+
+Please refer to [PVT](https://github.com/open-mmlab/mmdetection/blob/master/configs/pvt) for details.
+
+### SOLO
+
+Please refer to [SOLO](https://github.com/open-mmlab/mmdetection/blob/master/configs/solo) for details.
+
+### QueryInst
+
+Please refer to [QueryInst](https://github.com/open-mmlab/mmdetection/blob/master/configs/queryinst) for details.
+
+### PanopticFPN
+
+Please refer to [PanopticFPN](https://github.com/open-mmlab/mmdetection/blob/master/configs/panoptic_fpn) for details.
+
+### MaskFormer
+
+Please refer to [MaskFormer](https://github.com/open-mmlab/mmdetection/blob/master/configs/maskformer) for details.
+
+### DyHead
+
+Please refer to [DyHead](https://github.com/open-mmlab/mmdetection/blob/master/configs/dyhead) for details.
+
+### Mask2Former
+
+Please refer to [Mask2Former](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask2former) for details.
+
+### Efficientnet
+
+Please refer to [Efficientnet](https://github.com/open-mmlab/mmdetection/blob/master/configs/efficientnet) for details.
+
+### RF-Next
+
+Please refer to [RF-Next](https://github.com/open-mmlab/mmdetection/blob/master/configs/rfnext) for details.
+
+### Other datasets
+
+We also benchmark some methods on [PASCAL VOC](https://github.com/open-mmlab/mmdetection/blob/master/configs/pascal_voc), [Cityscapes](https://github.com/open-mmlab/mmdetection/blob/master/configs/cityscapes), [OpenImages](https://github.com/open-mmlab/mmdetection/blob/master/configs/openimages) and [WIDER FACE](https://github.com/open-mmlab/mmdetection/blob/master/configs/wider_face).
+
+### Pre-trained Models
+
+We also train [Faster R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn) and [Mask R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask_rcnn) using ResNet-50 and [RegNetX-3.2G](https://github.com/open-mmlab/mmdetection/blob/master/configs/regnet) with multi-scale training and longer schedules. These models serve as strong pre-trained models for downstream tasks for convenience.
+
+## Speed benchmark
+
+### Training Speed benchmark
+
+We provide [analyze_logs.py](https://github.com/open-mmlab/mmdetection/blob/master/tools/analysis_tools/analyze_logs.py) to get average time of iteration in training. You can find examples in [Log Analysis](https://mmdetection.readthedocs.io/en/latest/useful_tools.html#log-analysis).
+
+We compare the training speed of Mask R-CNN with some other popular frameworks (The data is copied from [detectron2](https://github.com/facebookresearch/detectron2/blob/master/docs/notes/benchmarks.md/)).
+For mmdetection, we benchmark with [mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py), which should have the same setting as [mask_rcnn_R_50_FPN_noaug_1x.yaml](https://github.com/facebookresearch/detectron2/blob/master/configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml) of detectron2.
+We also provide the [checkpoint](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug_compare_20200518-10127928.pth) and [training log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug_20200518_105755.log.json) for reference. The throughput is computed as the average throughput in iterations 100-500 to skip GPU warmup time.
+
+| Implementation                                                                         | Throughput (img/s) |
+| -------------------------------------------------------------------------------------- | ------------------ |
+| [Detectron2](https://github.com/facebookresearch/detectron2)                           | 62                 |
+| [MMDetection](https://github.com/open-mmlab/mmdetection)                               | 61                 |
+| [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark/)          | 53                 |
+| [tensorpack](https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN) | 50                 |
+| [simpledet](https://github.com/TuSimple/simpledet/)                                    | 39                 |
+| [Detectron](https://github.com/facebookresearch/Detectron)                             | 19                 |
+| [matterport/Mask_RCNN](https://github.com/matterport/Mask_RCNN/)                       | 14                 |
+
+### Inference Speed Benchmark
+
+We provide [benchmark.py](https://github.com/open-mmlab/mmdetection/blob/master/tools/analysis_tools/benchmark.py) to benchmark the inference latency.
+The script benchmarks the model with 2000 images and calculates the average time, ignoring the first 5 iterations. You can change the output log interval (default: 50) by setting `LOG-INTERVAL`.
+
+```shell
+python tools/analysis_tools/benchmark.py ${CONFIG} ${CHECKPOINT} [--log-interval ${LOG-INTERVAL}] [--fuse-conv-bn]
+```
+
+The latency of all models in our model zoo is benchmarked without setting `fuse-conv-bn`; you can get a lower latency by setting it.
+
+## Comparison with Detectron2
+
+We compare mmdetection with [Detectron2](https://github.com/facebookresearch/detectron2.git) in terms of speed and performance.
+We use the commit id [185c27e](https://github.com/facebookresearch/detectron2/tree/185c27e4b4d2d4c68b5627b3765420c6d7f5a659) (30/4/2020) of Detectron2.
+For fair comparison, we install and run both frameworks on the same machine.
+
+### Hardware
+
+- 8 NVIDIA Tesla V100 (32G) GPUs
+- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+
+### Software environment
+
+- Python 3.7
+- PyTorch 1.4
+- CUDA 10.1
+- CUDNN 7.6.03
+- NCCL 2.4.08
+
+### Performance
+
+| Type                                                                                                                                   | Lr schd | Detectron2                                                                                                                             | mmdetection | Download                                                                                                                                                                                                                                                                                                                                                         |
+| -------------------------------------------------------------------------------------------------------------------------------------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------- | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [Faster R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py) | 1x      | [37.9](https://github.com/facebookresearch/detectron2/blob/master/configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml)                 | 38.0        | [model](https://download.openmmlab.com/mmdetection/v2.0/benchmark/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-5324cff8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco_20200429_234554.log.json)             |
+| [Mask R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py)  | 1x      | [38.6 & 35.2](https://github.com/facebookresearch/detectron2/blob/master/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml) | 38.8 & 35.4 | [model](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco-dbecf295.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco_20200430_054239.log.json) |
+| [Retinanet](https://github.com/open-mmlab/mmdetection/blob/master/configs/retinanet/retinanet_r50_caffe_fpn_mstrain_1x_coco.py)        | 1x      | [36.5](https://github.com/facebookresearch/detectron2/blob/master/configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml)                   | 37.0        | [model](https://download.openmmlab.com/mmdetection/v2.0/benchmark/retinanet_r50_caffe_fpn_mstrain_1x_coco/retinanet_r50_caffe_fpn_mstrain_1x_coco-586977a0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/retinanet_r50_caffe_fpn_mstrain_1x_coco/retinanet_r50_caffe_fpn_mstrain_1x_coco_20200430_014748.log.json)                     |
+
+### Training Speed
+
+The training speed is measured in s/iter. The lower, the better.
+
+| Type         | Detectron2 | mmdetection |
+| ------------ | ---------- | ----------- |
+| Faster R-CNN | 0.210      | 0.216       |
+| Mask R-CNN   | 0.261      | 0.265       |
+| Retinanet    | 0.200      | 0.205       |
+
+### Inference Speed
+
+The inference speed is measured with fps (img/s) on a single GPU, the higher, the better.
+To be consistent with Detectron2, we report the pure inference speed (without the time of data loading).
+For Mask R-CNN, we exclude the time of RLE encoding in post-processing.
+We also include the officially reported speed in the parentheses, which is slightly higher
+than the results tested on our server due to differences in hardware.
+
+| Type         | Detectron2  | mmdetection |
+| ------------ | ----------- | ----------- |
+| Faster R-CNN | 25.6 (26.3) | 22.2        |
+| Mask R-CNN   | 22.5 (23.3) | 19.6        |
+| Retinanet    | 17.8 (18.2) | 20.6        |
+
+### Training memory
+
+| Type         | Detectron2 | mmdetection |
+| ------------ | ---------- | ----------- |
+| Faster R-CNN | 3.0        | 3.8         |
+| Mask R-CNN   | 3.4        | 3.9         |
+| Retinanet    | 3.9        | 3.4         |
diff --git a/docs/en/projects.md b/docs/en/projects.md
new file mode 100755
index 0000000..fa8ecb7
--- /dev/null
+++ b/docs/en/projects.md
@@ -0,0 +1,58 @@
+# Projects based on MMDetection
+
+There are many projects built upon MMDetection.
+We list some of them as examples of how to extend MMDetection for your own projects.
+As this page might not be complete, please feel free to create a PR to update it.
+
+## Projects as an extension
+
+Some projects extend the boundary of MMDetection for deployment or other research fields.
+They reveal the potential of what MMDetection can do. We list several of them as below.
+
+- [OTEDetection](https://github.com/opencv/mmdetection): OpenVINO training extensions for object detection.
+- [MMDetection3d](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
+
+## Projects of papers
+
+There are also projects released with papers.
+Some of the papers are published in top-tier conferences (CVPR, ICCV, and ECCV), the others are also highly influential.
+To make this list also a reference for the community to develop and compare new object detection algorithms, we list them following the time order of top-tier conferences.
+Methods already supported and maintained by MMDetection are not listed.
+
+- Anchor Pruning for Object Detection, CVIU 2022. [\[paper\]](https://doi.org/10.1016/j.cviu.2022.103445)[\[github\]](https://github.com/Mxbonn/anchor_pruning)
+- Involution: Inverting the Inherence of Convolution for Visual Recognition, CVPR21. [\[paper\]](https://arxiv.org/abs/2103.06255)[\[github\]](https://github.com/d-li14/involution)
+- Multiple Instance Active Learning for Object Detection, CVPR 2021. [\[paper\]](https://openaccess.thecvf.com/content/CVPR2021/papers/Yuan_Multiple_Instance_Active_Learning_for_Object_Detection_CVPR_2021_paper.pdf)[\[github\]](https://github.com/yuantn/MI-AOD)
+- Adaptive Class Suppression Loss for Long-Tail Object Detection, CVPR 2021. [\[paper\]](https://arxiv.org/abs/2104.00885)[\[github\]](https://github.com/CASIA-IVA-Lab/ACSL)
+- Generalizable Pedestrian Detection: The Elephant In The Room, CVPR2021. [\[paper\]](https://arxiv.org/abs/2003.08799)[\[github\]](https://github.com/hasanirtiza/Pedestron)
+- Group Fisher Pruning for Practical Network Compression, ICML2021. [\[paper\]](https://github.com/jshilong/FisherPruning/blob/main/resources/paper.pdf)[\[github\]](https://github.com/jshilong/FisherPruning)
+- Overcoming Classifier Imbalance for Long-tail Object Detection with Balanced Group Softmax, CVPR2020. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/papers/Li_Overcoming_Classifier_Imbalance_for_Long-Tail_Object_Detection_With_Balanced_Group_CVPR_2020_paper.pdf)[\[github\]](https://github.com/FishYuLi/BalancedGroupSoftmax)
+- Coherent Reconstruction of Multiple Humans from a Single Image, CVPR2020. [\[paper\]](https://jiangwenpl.github.io/multiperson/)[\[github\]](https://github.com/JiangWenPL/multiperson)
+- Look-into-Object: Self-supervised Structure Modeling for Object Recognition, CVPR 2020. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/papers/Zhou_Look-Into-Object_Self-Supervised_Structure_Modeling_for_Object_Recognition_CVPR_2020_paper.pdf)[\[github\]](https://github.com/JDAI-CV/LIO)
+- Video Panoptic Segmentation, CVPR2020. [\[paper\]](https://arxiv.org/abs/2006.11339)[\[github\]](https://github.com/mcahny/vps)
+- D2Det: Towards High Quality Object Detection and Instance Segmentation, CVPR2020. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/html/Cao_D2Det_Towards_High_Quality_Object_Detection_and_Instance_Segmentation_CVPR_2020_paper.html)[\[github\]](https://github.com/JialeCao001/D2Det)
+- CentripetalNet: Pursuing High-quality Keypoint Pairs for Object Detection, CVPR2020. [\[paper\]](https://arxiv.org/abs/2003.09119)[\[github\]](https://github.com/KiveeDong/CentripetalNet)
+- Learning a Unified Sample Weighting Network for Object Detection, CVPR 2020. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/html/Cai_Learning_a_Unified_Sample_Weighting_Network_for_Object_Detection_CVPR_2020_paper.html)[\[github\]](https://github.com/caiqi/sample-weighting-network)
+- Scale-equalizing Pyramid Convolution for Object Detection, CVPR2020. [\[paper\]](https://arxiv.org/abs/2005.03101) [\[github\]](https://github.com/jshilong/SEPC)
+- Revisiting the Sibling Head in Object Detector, CVPR2020. [\[paper\]](https://arxiv.org/abs/2003.07540)[\[github\]](https://github.com/Sense-X/TSD)
+- PolarMask: Single Shot Instance Segmentation with Polar Representation, CVPR2020. [\[paper\]](https://arxiv.org/abs/1909.13226)[\[github\]](https://github.com/xieenze/PolarMask)
+- Hit-Detector: Hierarchical Trinity Architecture Search for Object Detection, CVPR2020. [\[paper\]](https://arxiv.org/abs/2003.11818)[\[github\]](https://github.com/ggjy/HitDet.pytorch)
+- ZeroQ: A Novel Zero Shot Quantization Framework, CVPR2020. [\[paper\]](https://arxiv.org/abs/2001.00281)[\[github\]](https://github.com/amirgholami/ZeroQ)
+- CBNet: A Novel Composite Backbone Network Architecture for Object Detection, AAAI2020. [\[paper\]](https://aaai.org/Papers/AAAI/2020GB/AAAI-LiuY.1833.pdf)[\[github\]](https://github.com/VDIGPKU/CBNet)
+- RDSNet: A New Deep Architecture for Reciprocal Object Detection and Instance Segmentation, AAAI2020. [\[paper\]](https://arxiv.org/abs/1912.05070)[\[github\]](https://github.com/wangsr126/RDSNet)
+- Training-Time-Friendly Network for Real-Time Object Detection, AAAI2020. [\[paper\]](https://arxiv.org/abs/1909.00700)[\[github\]](https://github.com/ZJULearning/ttfnet)
+- Cascade RPN: Delving into High-Quality Region Proposal Network with Adaptive Convolution, NeurIPS 2019. [\[paper\]](https://arxiv.org/abs/1909.06720)[\[github\]](https://github.com/thangvubk/Cascade-RPN)
+- Reasoning R-CNN: Unifying Adaptive Global Reasoning into Large-scale Object Detection, CVPR2019. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2019/papers/Xu_Reasoning-RCNN_Unifying_Adaptive_Global_Reasoning_Into_Large-Scale_Object_Detection_CVPR_2019_paper.pdf)[\[github\]](https://github.com/chanyn/Reasoning-RCNN)
+- Learning RoI Transformer for Oriented Object Detection in Aerial Images, CVPR2019. [\[paper\]](https://arxiv.org/abs/1812.00155)[\[github\]](https://github.com/dingjiansw101/AerialDetection)
+- SOLO: Segmenting Objects by Locations. [\[paper\]](https://arxiv.org/abs/1912.04488)[\[github\]](https://github.com/WXinlong/SOLO)
+- SOLOv2: Dynamic, Faster and Stronger. [\[paper\]](https://arxiv.org/abs/2003.10152)[\[github\]](https://github.com/WXinlong/SOLO)
+- Dense RepPoints: Representing Visual Objects with Dense Point Sets. [\[paper\]](https://arxiv.org/abs/1912.11473)[\[github\]](https://github.com/justimyhxu/Dense-RepPoints)
+- IterDet: Iterative Scheme for Object Detection in Crowded Environments. [\[paper\]](https://arxiv.org/abs/2005.05708)[\[github\]](https://github.com/saic-vul/iterdet)
+- Cross-Iteration Batch Normalization. [\[paper\]](https://arxiv.org/abs/2002.05712)[\[github\]](https://github.com/Howal/Cross-iterationBatchNorm)
+- A Ranking-based, Balanced Loss Function Unifying Classification and Localisation in Object Detection, NeurIPS2020 [\[paper\]](https://arxiv.org/abs/2009.13592)[\[github\]](https://github.com/kemaloksuz/aLRPLoss)
+- RelationNet++: Bridging Visual Representations for Object Detection via Transformer Decoder, NeurIPS2020 [\[paper\]](https://arxiv.org/abs/2010.15831)[\[github\]](https://github.com/microsoft/RelationNet2)
+- Generalized Focal Loss V2: Learning Reliable Localization Quality Estimation for Dense Object Detection, CVPR2021[\[paper\]](https://arxiv.org/abs/2011.12885)[\[github\]](https://github.com/implus/GFocalV2)
+- Swin Transformer: Hierarchical Vision Transformer using Shifted Windows, ICCV2021[\[paper\]](https://arxiv.org/abs/2103.14030)[\[github\]](https://github.com/SwinTransformer/)
+- Focal Transformer: Focal Self-attention for Local-Global Interactions in Vision Transformers, NeurIPS2021[\[paper\]](https://arxiv.org/abs/2107.00641)[\[github\]](https://github.com/microsoft/Focal-Transformer)
+- End-to-End Semi-Supervised Object Detection with Soft Teacher, ICCV2021[\[paper\]](https://arxiv.org/abs/2106.09018)[\[github\]](https://github.com/microsoft/SoftTeacher)
+- CBNetV2: A Novel Composite Backbone Network Architecture for Object Detection [\[paper\]](http://arxiv.org/abs/2107.00420)[\[github\]](https://github.com/VDIGPKU/CBNetV2)
+- Instances as Queries, ICCV2021 [\[paper\]](https://openaccess.thecvf.com/content/ICCV2021/papers/Fang_Instances_As_Queries_ICCV_2021_paper.pdf)[\[github\]](https://github.com/hustvl/QueryInst)
diff --git a/docs/en/robustness_benchmarking.md b/docs/en/robustness_benchmarking.md
new file mode 100755
index 0000000..bb624ee
--- /dev/null
+++ b/docs/en/robustness_benchmarking.md
@@ -0,0 +1,110 @@
+# Corruption Benchmarking
+
+## Introduction
+
+We provide tools to test object detection and instance segmentation models on the image corruption benchmark defined in [Benchmarking Robustness in Object Detection: Autonomous Driving when Winter is Coming](https://arxiv.org/abs/1907.07484).
+This page provides a basic tutorial on how to use the benchmark.
+
+```latex
+@article{michaelis2019winter,
+  title={Benchmarking Robustness in Object Detection:
+    Autonomous Driving when Winter is Coming},
+  author={Michaelis, Claudio and Mitzkus, Benjamin and
+    Geirhos, Robert and Rusak, Evgenia and
+    Bringmann, Oliver and Ecker, Alexander S. and
+    Bethge, Matthias and Brendel, Wieland},
+  journal={arXiv:1907.07484},
+  year={2019}
+}
+```
+
+![image corruption example](../resources/corruptions_sev_3.png)
+
+## About the benchmark
+
+To submit results to the benchmark please visit the [benchmark homepage](https://github.com/bethgelab/robust-detection-benchmark)
+
+The benchmark is modelled after the [imagenet-c benchmark](https://github.com/hendrycks/robustness) which was originally
+published in [Benchmarking Neural Network Robustness to Common Corruptions and Perturbations](https://arxiv.org/abs/1903.12261) (ICLR 2019) by Dan Hendrycks and Thomas Dietterich.
+
+The image corruption functions are included in this library but can be installed separately using:
+
+```shell
+pip install imagecorruptions
+```
+
+Compared to imagenet-c a few changes had to be made to handle images of arbitrary size and greyscale images.
+We also modified the 'motion blur' and 'snow' corruptions to remove the dependency on a Linux-specific library,
+which would have to be installed separately otherwise. For details please refer to the [imagecorruptions repository](https://github.com/bethgelab/imagecorruptions).
+
+## Inference with pretrained models
+
+We provide a testing script to evaluate a model's performance on any combination of the corruptions provided in the benchmark.
+
+### Test a dataset
+
+- [x] single GPU testing
+- [ ] multiple GPU testing
+- [ ] visualize detection results
+
+You can use the following commands to test a model's performance under the 15 corruptions used in the benchmark.
+
+```shell
+# single-gpu testing
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}]
+```
+
+Alternatively, different groups of corruptions can be selected.
+
+```shell
+# noise
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions noise
+
+# blur
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions blur
+
+# weather
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions weather
+
+# digital
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions digital
+```
+
+Or a custom set of corruptions, e.g.:
+
+```shell
+# gaussian noise, zoom blur and snow
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions gaussian_noise zoom_blur snow
+```
+
+Finally the corruption severities to evaluate can be chosen.
+Severity 0 corresponds to clean data and the effect increases from 1 to 5.
+
+```shell
+# severity 1
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --severities 1
+
+# severities 0,2,4
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --severities 0 2 4
+```
+
+## Results for modelzoo models
+
+The results on COCO 2017val are shown in the table below.
+
+|        Model        |      Backbone       |  Style  | Lr schd | box AP clean | box AP corr. | box % | mask AP clean | mask AP corr. | mask % |
+| :-----------------: | :-----------------: | :-----: | :-----: | :----------: | :----------: | :---: | :-----------: | :-----------: | :----: |
+|    Faster R-CNN     |      R-50-FPN       | pytorch |   1x    |     36.3     |     18.2     | 50.2  |       -       |       -       |   -    |
+|    Faster R-CNN     |      R-101-FPN      | pytorch |   1x    |     38.5     |     20.9     | 54.2  |       -       |       -       |   -    |
+|    Faster R-CNN     |   X-101-32x4d-FPN   | pytorch |   1x    |     40.1     |     22.3     | 55.5  |       -       |       -       |   -    |
+|    Faster R-CNN     |   X-101-64x4d-FPN   | pytorch |   1x    |     41.3     |     23.4     | 56.6  |       -       |       -       |   -    |
+|    Faster R-CNN     |    R-50-FPN-DCN     | pytorch |   1x    |     40.0     |     22.4     | 56.1  |       -       |       -       |   -    |
+|    Faster R-CNN     | X-101-32x4d-FPN-DCN | pytorch |   1x    |     43.4     |     26.7     | 61.6  |       -       |       -       |   -    |
+|     Mask R-CNN      |      R-50-FPN       | pytorch |   1x    |     37.3     |     18.7     | 50.1  |     34.2      |     16.8      |  49.1  |
+|     Mask R-CNN      |    R-50-FPN-DCN     | pytorch |   1x    |     41.1     |     23.3     | 56.7  |     37.2      |     20.7      |  55.7  |
+|    Cascade R-CNN    |      R-50-FPN       | pytorch |   1x    |     40.4     |     20.1     | 49.7  |       -       |       -       |   -    |
+| Cascade Mask R-CNN  |      R-50-FPN       | pytorch |   1x    |     41.2     |     20.7     | 50.2  |     35.7      |     17.6      |  49.3  |
+|      RetinaNet      |      R-50-FPN       | pytorch |   1x    |     35.6     |     17.8     | 50.1  |       -       |       -       |   -    |
+| Hybrid Task Cascade | X-101-64x4d-FPN-DCN | pytorch |   1x    |     50.6     |     32.7     | 64.7  |     43.8      |     28.1      |  64.0  |
+
+Results may vary slightly due to the stochastic application of the corruptions.
diff --git a/docs/en/stat.py b/docs/en/stat.py
new file mode 100755
index 0000000..427c27b
--- /dev/null
+++ b/docs/en/stat.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+import functools as func
+import glob
+import os.path as osp
+import re
+
+import numpy as np
+
+url_prefix = 'https://github.com/open-mmlab/mmdetection/blob/master/configs'
+# Scan each config README and write a summary to modelzoo_statistics.md.
+files = sorted(glob.glob('../../configs/*/README.md'))
+# Accumulators filled while scanning the READMEs (paths are cwd-relative).
+stats = []
+titles = []
+num_ckpts = 0
+
+for f in files:
+    url = osp.dirname(f.replace('../../configs', url_prefix))
+    # url is the README's parent folder, re-rooted under url_prefix.
+    with open(f, 'r') as content_file:
+        content = content_file.read()
+    # Title: first line of the README with every '# ' occurrence removed.
+    title = content.split('\n')[0].replace('# ', '').strip()
+    ckpts = set(x.lower().strip()
+                for x in re.findall(r'\[model\]\((https?.*)\)', content))
+    # Greedy '.*' yields at most one entry per line; skip link-less READMEs.
+    if len(ckpts) == 0:
+        continue
+    # Paper type: the first all-uppercase bracketed tag found in the README.
+    _papertype = [x for x in re.findall(r'\[([A-Z]+)\]', content)]
+    assert len(_papertype) > 0
+    papertype = _papertype[0]
+
+    paper = set([(papertype, title)])
+
+    titles.append(title)
+    num_ckpts += len(ckpts)
+    # One bullet entry per README for the generated page.
+    statsmsg = f"""
+\t* [{papertype}] [{title}]({url}) ({len(ckpts)} ckpts)
+"""
+    stats.append((paper, ckpts, statsmsg))
+# NOTE: reduce() has no initial value, so it raises if stats is empty.
+allpapers = func.reduce(lambda a, b: a.union(b), [p for p, _, _ in stats])
+msglist = '\n'.join(x for _, _, x in stats)
+# Count the distinct (papertype, title) pairs under each paper type.
+papertypes, papercounts = np.unique([t for t, _ in allpapers],
+                                    return_counts=True)
+countstr = '\n'.join(
+    [f'   - {t}: {c}' for t, c in zip(papertypes, papercounts)])
+# Assemble the final Markdown page from the pieces computed above.
+modelzoo = f"""
+# Model Zoo Statistics
+
+* Number of papers: {len(set(titles))}
+{countstr}
+
+* Number of checkpoints: {num_ckpts}
+
+{msglist}
+"""
+# The output file is created relative to the current working directory.
+with open('modelzoo_statistics.md', 'w') as f:
+    f.write(modelzoo)
diff --git a/docs/en/switch_language.md b/docs/en/switch_language.md
new file mode 100755
index 0000000..b2c4ad9
--- /dev/null
+++ b/docs/en/switch_language.md
@@ -0,0 +1,3 @@
+## <a href='https://mmdetection.readthedocs.io/en/latest/'>English</a>
+
+## <a href='https://mmdetection.readthedocs.io/zh_CN/latest/'>简体中文</a>
diff --git a/docs/en/tutorials/config.md b/docs/en/tutorials/config.md
new file mode 100755
index 0000000..8fd37db
--- /dev/null
+++ b/docs/en/tutorials/config.md
@@ -0,0 +1,551 @@
+# Tutorial 1: Learn about Configs
+
+We incorporate modular and inheritance design into our config system, which is convenient to conduct various experiments.
+If you wish to inspect the config file, you may run `python tools/misc/print_config.py /PATH/TO/CONFIG` to see the complete config.
+
+## Modify config through script arguments
+
+When submitting jobs using "tools/train.py" or "tools/test.py", you may specify `--cfg-options` to in-place modify the config.
+
+- Update config keys of dict chains.
+
+  The config options can be specified following the order of the dict keys in the original config.
+  For example, `--cfg-options model.backbone.norm_eval=False` changes all BN modules in the model backbone to `train` mode.
+
+- Update keys inside a list of configs.
+
+  Some config dicts are composed as a list in your config. For example, the training pipeline `data.train.pipeline` is normally a list
+  e.g. `[dict(type='LoadImageFromFile'), ...]`. If you want to change `'LoadImageFromFile'` to `'LoadImageFromWebcam'` in the pipeline,
+  you may specify `--cfg-options data.train.pipeline.0.type=LoadImageFromWebcam`.
+
+- Update values of list/tuples.
+
+  The value to be updated may be a list or a tuple. For example, the config file normally sets `workflow=[('train', 1)]`. If you want to
+  change this key, you may specify `--cfg-options workflow="[(train,1),(val,1)]"`. Note that the quotation mark " is necessary to
+  support list/tuple data types, and that **NO** white space is allowed inside the quotation marks in the specified value.
+
+## Config File Structure
+
+There are 4 basic component types under `configs/_base_`: dataset, model, schedule, default_runtime.
+Many methods could be easily constructed with one of each like Faster R-CNN, Mask R-CNN, Cascade R-CNN, RPN, SSD.
+The configs that are composed by components from `_base_` are called _primitive_.
+
+For all configs under the same folder, it is recommended to have only **one** _primitive_ config. All other configs should inherit from the _primitive_ config. In this way, the maximum of inheritance level is 3.
+
+For easy understanding, we recommend contributors to inherit from existing methods.
+For example, if some modification is made based on Faster R-CNN, users may first inherit the basic Faster R-CNN structure by specifying `_base_ = ../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py`, then modify the necessary fields in the config files.
+
+If you are building an entirely new method that does not share the structure with any of the existing methods, you may create a folder `xxx_rcnn` under `configs`.
+
+Please refer to [mmcv](https://mmcv.readthedocs.io/en/latest/understand_mmcv/config.html) for detailed documentation.
+
+## Config Name Style
+
+We follow the below style to name config files. Contributors are advised to follow the same style.
+
+```
+{model}_[model setting]_{backbone}_{neck}_[norm setting]_[misc]_[gpu x batch_per_gpu]_{schedule}_{dataset}
+```
+
+`{xxx}` is required field and `[yyy]` is optional.
+
+- `{model}`: model type like `faster_rcnn`, `mask_rcnn`, etc.
+- `[model setting]`: specific setting for some model, like `without_semantic` for `htc`, `moment` for `reppoints`, etc.
+- `{backbone}`: backbone type like `r50` (ResNet-50), `x101` (ResNeXt-101).
+- `{neck}`: neck type like `fpn`, `pafpn`, `nasfpn`, `c4`.
+- `[norm setting]`: `bn` (Batch Normalization) is used unless specified, other norm layer type could be `gn` (Group Normalization), `syncbn` (Synchronized Batch Normalization).
+  `gn-head`/`gn-neck` indicates GN is applied in head/neck only, while `gn-all` means GN is applied in the entire model, e.g. backbone, neck, head.
+- `[misc]`: miscellaneous setting/plugins of model, e.g. `dconv`, `gcb`, `attention`, `albu`, `mstrain`.
+- `[gpu x batch_per_gpu]`: GPUs and samples per GPU, `8x2` is used by default.
+- `{schedule}`: training schedule, options are `1x`, `2x`, `20e`, etc.
+  `1x` and `2x` means 12 epochs and 24 epochs respectively.
+  `20e` is adopted in cascade models, which denotes 20 epochs.
+  For `1x`/`2x`, initial learning rate decays by a factor of 10 at the 8th/16th and 11th/22nd epochs.
+  For `20e`, initial learning rate decays by a factor of 10 at the 16th and 19th epochs.
+- `{dataset}`: dataset like `coco`, `cityscapes`, `voc_0712`, `wider_face`.
+
+## Deprecated train_cfg/test_cfg
+
+The `train_cfg` and `test_cfg` are deprecated in config file, please specify them in the model config. The original config structure is as below.
+
+```python
+# deprecated
+model = dict(
+    type=...,
+    ...
+)
+train_cfg=dict(...)
+test_cfg=dict(...)
+```
+
+The migration example is as below.
+
+```python
+# recommended
+model = dict(
+    type=...,
+    ...
+    train_cfg=dict(...),
+    test_cfg=dict(...),
+)
+```
+
+## An Example of Mask R-CNN
+
+To help the users have a basic idea of a complete config and the modules in a modern detection system,
+we make brief comments on the config of Mask R-CNN using ResNet50 and FPN as follows.
+For more detailed usage and the corresponding alternatives for each module, please refer to the API documentation.
+
+```python
+model = dict(
+    type='MaskRCNN',  # The name of detector
+    backbone=dict(  # The config of backbone
+        type='ResNet',  # The type of the backbone, refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/backbones/resnet.py#L308 for more details.
+        depth=50,  # The depth of backbone, usually it is 50 or 101 for ResNet and ResNext backbones.
+        num_stages=4,  # Number of stages of the backbone.
+        out_indices=(0, 1, 2, 3),  # The index of output feature maps produced in each stages
+        frozen_stages=1,  # The weights in the first 1 stage are frozen
+        norm_cfg=dict(  # The config of normalization layers.
+            type='BN',  # Type of norm layer, usually it is BN or GN
+            requires_grad=True),  # Whether to train the gamma and beta in BN
+        norm_eval=True,  # Whether to freeze the statistics in BN
+        style='pytorch', # The style of backbone, 'pytorch' means that stride 2 layers are in 3x3 conv, 'caffe' means stride 2 layers are in 1x1 convs.
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),  # The ImageNet pretrained backbone to be loaded
+    neck=dict(
+        type='FPN',  # The neck of detector is FPN. We also support 'NASFPN', 'PAFPN', etc. Refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/necks/fpn.py#L10 for more details.
+        in_channels=[256, 512, 1024, 2048],  # The input channels, this is consistent with the output channels of backbone
+        out_channels=256,  # The output channels of each level of the pyramid feature map
+        num_outs=5),  # The number of output scales
+    rpn_head=dict(
+        type='RPNHead',  # The type of RPN head is 'RPNHead', we also support 'GARPNHead', etc. Refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/rpn_head.py#L12 for more details.
+        in_channels=256,  # The input channels of each input feature map, this is consistent with the output channels of neck
+        feat_channels=256,  # Feature channels of convolutional layers in the head.
+        anchor_generator=dict(  # The config of anchor generator
+            type='AnchorGenerator',  # Most of methods use AnchorGenerator, SSD Detectors uses `SSDAnchorGenerator`. Refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/anchor/anchor_generator.py#L10 for more details
+            scales=[8],  # Basic scale of the anchor, the area of the anchor in one position of a feature map will be scale * base_sizes
+            ratios=[0.5, 1.0, 2.0],  # The ratio between height and width.
+            strides=[4, 8, 16, 32, 64]),  # The strides of the anchor generator. This is consistent with the FPN feature strides. The strides will be taken as base_sizes if base_sizes is not set.
+        bbox_coder=dict(  # Config of box coder to encode and decode the boxes during training and testing
+            type='DeltaXYWHBBoxCoder',  # Type of box coder. 'DeltaXYWHBBoxCoder' is applied for most of methods. Refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py#L9 for more details.
+            target_means=[0.0, 0.0, 0.0, 0.0],  # The target means used to encode and decode boxes
+            target_stds=[1.0, 1.0, 1.0, 1.0]),  # The standard deviations used to encode and decode boxes
+        loss_cls=dict(  # Config of loss function for the classification branch
+            type='CrossEntropyLoss',  # Type of loss for classification branch, we also support FocalLoss etc.
+            use_sigmoid=True,  # RPN usually perform two-class classification, so it usually uses sigmoid function.
+            loss_weight=1.0),  # Loss weight of the classification branch.
+        loss_bbox=dict(  # Config of loss function for the regression branch.
+            type='L1Loss',  # Type of loss, we also support many IoU Losses and smooth L1-loss, etc. Refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/smooth_l1_loss.py#L56 for implementation.
+            loss_weight=1.0)),  # Loss weight of the regression branch.
+    roi_head=dict(  # RoIHead encapsulates the second stage of two-stage/cascade detectors.
+        type='StandardRoIHead',  # Type of the RoI head. Refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/roi_heads/standard_roi_head.py#L10 for implementation.
+        bbox_roi_extractor=dict(  # RoI feature extractor for bbox regression.
+            type='SingleRoIExtractor',  # Type of the RoI feature extractor, most of methods uses SingleRoIExtractor. Refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/roi_heads/roi_extractors/single_level.py#L10 for details.
+            roi_layer=dict(  # Config of RoI Layer
+                type='RoIAlign',  # Type of RoI Layer, DeformRoIPoolingPack and ModulatedDeformRoIPoolingPack are also supported. Refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/roi_align/roi_align.py#L79 for details.
+                output_size=7,  # The output size of feature maps.
+                sampling_ratio=0),  # Sampling ratio when extracting the RoI features. 0 means adaptive ratio.
+            out_channels=256,  # output channels of the extracted feature.
+            featmap_strides=[4, 8, 16, 32]),  # Strides of multi-scale feature maps. It should be consistent to the architecture of the backbone.
+        bbox_head=dict(  # Config of box head in the RoIHead.
+            type='Shared2FCBBoxHead',  # Type of the bbox head, Refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py#L177 for implementation details.
+            in_channels=256,  # Input channels for bbox head. This is consistent with the out_channels in roi_extractor
+            fc_out_channels=1024,  # Output feature channels of FC layers.
+            roi_feat_size=7,  # Size of RoI features
+            num_classes=80,  # Number of classes for classification
+            bbox_coder=dict(  # Box coder used in the second stage.
+                type='DeltaXYWHBBoxCoder',  # Type of box coder. 'DeltaXYWHBBoxCoder' is applied for most of methods.
+                target_means=[0.0, 0.0, 0.0, 0.0],  # Means used to encode and decode box
+                target_stds=[0.1, 0.1, 0.2, 0.2]),  # Standard deviations for encoding and decoding. It is smaller since the boxes are more accurate. [0.1, 0.1, 0.2, 0.2] is a conventional setting.
+            reg_class_agnostic=False,  # Whether the regression is class agnostic.
+            loss_cls=dict(  # Config of loss function for the classification branch
+                type='CrossEntropyLoss',  # Type of loss for classification branch, we also support FocalLoss etc.
+                use_sigmoid=False,  # Whether to use sigmoid.
+                loss_weight=1.0),  # Loss weight of the classification branch.
+            loss_bbox=dict(  # Config of loss function for the regression branch.
+                type='L1Loss',  # Type of loss, we also support many IoU Losses and smooth L1-loss, etc.
+                loss_weight=1.0)),  # Loss weight of the regression branch.
+        mask_roi_extractor=dict(  # RoI feature extractor for mask generation.
+            type='SingleRoIExtractor',  # Type of the RoI feature extractor, most of methods uses SingleRoIExtractor.
+            roi_layer=dict(  # Config of RoI Layer that extracts features for instance segmentation
+                type='RoIAlign',  # Type of RoI Layer, DeformRoIPoolingPack and ModulatedDeformRoIPoolingPack are also supported
+                output_size=14,  # The output size of feature maps.
+                sampling_ratio=0),  # Sampling ratio when extracting the RoI features.
+            out_channels=256,  # Output channels of the extracted feature.
+            featmap_strides=[4, 8, 16, 32]),  # Strides of multi-scale feature maps.
+        mask_head=dict(  # Mask prediction head
+            type='FCNMaskHead',  # Type of mask head, refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py#L21 for implementation details.
+            num_convs=4,  # Number of convolutional layers in mask head.
+            in_channels=256,  # Input channels, should be consistent with the output channels of mask roi extractor.
+            conv_out_channels=256,  # Output channels of the convolutional layer.
+            num_classes=80,  # Number of class to be segmented.
+            loss_mask=dict(  # Config of loss function for the mask branch.
+                type='CrossEntropyLoss',  # Type of loss used for segmentation
+                use_mask=True,  # Whether to only train the mask in the correct class.
+                loss_weight=1.0))),  # Loss weight of mask branch.
+    train_cfg = dict(  # Config of training hyperparameters for rpn and rcnn
+        rpn=dict(  # Training config of rpn
+            assigner=dict(  # Config of assigner
+                type='MaxIoUAssigner',  # Type of assigner, MaxIoUAssigner is used for many common detectors. Refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/max_iou_assigner.py#L10 for more details.
+                pos_iou_thr=0.7,  # IoU >= threshold 0.7 will be taken as positive samples
+                neg_iou_thr=0.3,  # IoU < threshold 0.3 will be taken as negative samples
+                min_pos_iou=0.3,  # The minimal IoU threshold to take boxes as positive samples
+                match_low_quality=True,  # Whether to match the boxes under low quality (see API doc for more details).
+                ignore_iof_thr=-1),  # IoF threshold for ignoring bboxes
+            sampler=dict(  # Config of positive/negative sampler
+                type='RandomSampler',  # Type of sampler, PseudoSampler and other samplers are also supported. Refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/samplers/random_sampler.py#L8 for implementation details.
+                num=256,  # Number of samples
+                pos_fraction=0.5,  # The ratio of positive samples in the total samples.
+                neg_pos_ub=-1,  # The upper bound of negative samples based on the number of positive samples.
+                add_gt_as_proposals=False),  # Whether to add GT as proposals after sampling.
+            allowed_border=-1,  # The border allowed after padding for valid anchors.
+            pos_weight=-1,  # The weight of positive samples during training.
+            debug=False),  # Whether to set the debug mode
+        rpn_proposal=dict(  # The config to generate proposals during training
+            nms_across_levels=False,  # Whether to do NMS for boxes across levels. Only works in `GARPNHead`; the naive RPN does not support NMS across levels.
+            nms_pre=2000,  # The number of boxes before NMS
+            nms_post=1000,  # The number of boxes to be kept by NMS. Only works in `GARPNHead`.
+            max_per_img=1000,  # The number of boxes to be kept after NMS.
+            nms=dict( # Config of NMS
+                type='nms',  # Type of NMS
+                iou_threshold=0.7 # NMS threshold
+                ),
+            min_bbox_size=0),  # The allowed minimal box size
+        rcnn=dict(  # The config for the roi heads.
+            assigner=dict(  # Config of assigner for second stage, this is different from that in rpn
+                type='MaxIoUAssigner',  # Type of assigner, MaxIoUAssigner is used for all roi_heads for now. Refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/max_iou_assigner.py#L10 for more details.
+                pos_iou_thr=0.5,  # IoU >= threshold 0.5 will be taken as positive samples
+                neg_iou_thr=0.5,  # IoU < threshold 0.5 will be taken as negative samples
+                min_pos_iou=0.5,  # The minimal IoU threshold to take boxes as positive samples
+                match_low_quality=False,  # Whether to match the boxes under low quality (see API doc for more details).
+                ignore_iof_thr=-1),  # IoF threshold for ignoring bboxes
+            sampler=dict(
+                type='RandomSampler',  # Type of sampler, PseudoSampler and other samplers are also supported. Refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/samplers/random_sampler.py#L8 for implementation details.
+                num=512,  # Number of samples
+                pos_fraction=0.25,  # The ratio of positive samples in the total samples.
+                neg_pos_ub=-1,  # The upper bound of negative samples based on the number of positive samples.
+                add_gt_as_proposals=True
+            ),  # Whether add GT as proposals after sampling.
+            mask_size=28,  # Size of mask
+            pos_weight=-1,  # The weight of positive samples during training.
+            debug=False)),  # Whether to set the debug mode
+    test_cfg = dict(  # Config for testing hyperparameters for rpn and rcnn
+        rpn=dict(  # The config to generate proposals during testing
+            nms_across_levels=False,  # Whether to do NMS for boxes across levels. Only works in `GARPNHead`; the naive RPN does not support NMS across levels.
+            nms_pre=1000,  # The number of boxes before NMS
+            nms_post=1000,  # The number of boxes to be kept by NMS. Only works in `GARPNHead`.
+            max_per_img=1000,  # The number of boxes to be kept after NMS.
+            nms=dict( # Config of NMS
+                type='nms',  #Type of NMS
+                iou_threshold=0.7 # NMS threshold
+                ),
+            min_bbox_size=0),  # The allowed minimal box size
+        rcnn=dict(  # The config for the roi heads.
+            score_thr=0.05,  # Threshold to filter out boxes
+            nms=dict(  # Config of NMS in the second stage
+                type='nms',  # Type of NMS
+                iou_thr=0.5),  # NMS threshold
+            max_per_img=100,  # Max number of detections of each image
+            mask_thr_binary=0.5)))  # Threshold of mask prediction
+
+dataset_type = 'CocoDataset'  # Dataset type, this will be used to define the dataset
+data_root = 'data/coco/'  # Root path of data
+img_norm_cfg = dict(  # Image normalization config to normalize the input images
+    mean=[123.675, 116.28, 103.53],  # Mean values used when pre-training the backbone models
+    std=[58.395, 57.12, 57.375],  # Standard deviation values used when pre-training the backbone models
+    to_rgb=True
+)  # The channel order of the images used when pre-training the backbone models
+train_pipeline = [  # Training pipeline
+    dict(type='LoadImageFromFile'),  # First pipeline to load images from file path
+    dict(
+        type='LoadAnnotations',  # Second pipeline to load annotations for current image
+        with_bbox=True,  # Whether to use bounding box, True for detection
+        with_mask=True,  # Whether to use instance mask, True for instance segmentation
+        poly2mask=False),  # Whether to convert the polygon mask to instance mask, set False for acceleration and to save memory
+    dict(
+        type='Resize',  # Augmentation pipeline that resize the images and their annotations
+        img_scale=(1333, 800),  # The largest scale of image
+        keep_ratio=True
+    ),  # whether to keep the ratio between height and width.
+    dict(
+        type='RandomFlip',  # Augmentation pipeline that flip the images and their annotations
+        flip_ratio=0.5),  # The ratio or probability to flip
+    dict(
+        type='Normalize',  # Augmentation pipeline that normalize the input images
+        mean=[123.675, 116.28, 103.53],  # These keys are the same as those of img_norm_cfg since the
+        std=[58.395, 57.12, 57.375],  # keys of img_norm_cfg are used here as arguments
+        to_rgb=True),
+    dict(
+        type='Pad',  # Padding config
+        size_divisor=32),  # The number that the padded image size should be divisible by
+    dict(type='DefaultFormatBundle'),  # Default format bundle to gather data in the pipeline
+    dict(
+        type='Collect',  # Pipeline that decides which keys in the data should be passed to the detector
+        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),  # First pipeline to load images from file path
+    dict(
+        type='MultiScaleFlipAug',  # An encapsulation that encapsulates the testing augmentations
+        img_scale=(1333, 800),  # Decides the largest scale for testing, used for the Resize pipeline
+        flip=False,  # Whether to flip images during testing
+        transforms=[
+            dict(type='Resize',  # Use resize augmentation
+                 keep_ratio=True),  # Whether to keep the ratio between height and width. Any img_scale set here will be overridden by the img_scale set above.
+            dict(type='RandomFlip'),  # Though RandomFlip is added in the pipeline, it is not used because flip=False
+            dict(
+                type='Normalize',  # Normalization config, the values are from img_norm_cfg
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375],
+                to_rgb=True),
+            dict(
+                type='Pad',  # Padding config to pad images divisible by 32.
+                size_divisor=32),
+            dict(
+                type='ImageToTensor',  # convert image to tensor
+                keys=['img']),
+            dict(
+                type='Collect',  # Collect pipeline that collect necessary keys for testing.
+                keys=['img'])
+        ])
+]
+data = dict(
+    samples_per_gpu=2,  # Batch size of a single GPU
+    workers_per_gpu=2,  # Worker to pre-fetch data for each single GPU
+    train=dict(  # Train dataset config
+        type='CocoDataset',  # Type of dataset, refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/coco.py#L19 for details.
+        ann_file='data/coco/annotations/instances_train2017.json',  # Path of annotation file
+        img_prefix='data/coco/train2017/',  # Prefix of image path
+        pipeline=[  # pipeline, this is passed by the train_pipeline created before.
+            dict(type='LoadImageFromFile'),
+            dict(
+                type='LoadAnnotations',
+                with_bbox=True,
+                with_mask=True,
+                poly2mask=False),
+            dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+            dict(type='RandomFlip', flip_ratio=0.5),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375],
+                to_rgb=True),
+            dict(type='Pad', size_divisor=32),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='Collect',
+                keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
+        ]),
+    val=dict(  # Validation dataset config
+        type='CocoDataset',
+        ann_file='data/coco/annotations/instances_val2017.json',
+        img_prefix='data/coco/val2017/',
+        pipeline=[  # Pipeline is passed by test_pipeline created before
+            dict(type='LoadImageFromFile'),
+            dict(
+                type='MultiScaleFlipAug',
+                img_scale=(1333, 800),
+                flip=False,
+                transforms=[
+                    dict(type='Resize', keep_ratio=True),
+                    dict(type='RandomFlip'),
+                    dict(
+                        type='Normalize',
+                        mean=[123.675, 116.28, 103.53],
+                        std=[58.395, 57.12, 57.375],
+                        to_rgb=True),
+                    dict(type='Pad', size_divisor=32),
+                    dict(type='ImageToTensor', keys=['img']),
+                    dict(type='Collect', keys=['img'])
+                ])
+        ]),
+    test=dict(  # Test dataset config, modify the ann_file for test-dev/test submission
+        type='CocoDataset',
+        ann_file='data/coco/annotations/instances_val2017.json',
+        img_prefix='data/coco/val2017/',
+        pipeline=[  # Pipeline is passed by test_pipeline created before
+            dict(type='LoadImageFromFile'),
+            dict(
+                type='MultiScaleFlipAug',
+                img_scale=(1333, 800),
+                flip=False,
+                transforms=[
+                    dict(type='Resize', keep_ratio=True),
+                    dict(type='RandomFlip'),
+                    dict(
+                        type='Normalize',
+                        mean=[123.675, 116.28, 103.53],
+                        std=[58.395, 57.12, 57.375],
+                        to_rgb=True),
+                    dict(type='Pad', size_divisor=32),
+                    dict(type='ImageToTensor', keys=['img']),
+                    dict(type='Collect', keys=['img'])
+                ])
+        ],
+        samples_per_gpu=2  # Batch size of a single GPU used in testing
+    ))
+evaluation = dict(  # The config to build the evaluation hook, refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/evaluation/eval_hooks.py#L7 for more details.
+    interval=1,  # Evaluation interval
+    metric=['bbox', 'segm'])  # Metrics used during evaluation
+optimizer = dict(  # Config used to build optimizer, support all the optimizers in PyTorch whose arguments are also the same as those in PyTorch
+    type='SGD',  # Type of optimizers, refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/optimizer/default_constructor.py#L13 for more details
+    lr=0.02,  # Learning rate of optimizers, see detail usages of the parameters in the documentation of PyTorch
+    momentum=0.9,  # Momentum
+    weight_decay=0.0001)  # Weight decay of SGD
+optimizer_config = dict(  # Config used to build the optimizer hook, refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/optimizer.py#L8 for implementation details.
+    grad_clip=None)  # Most of the methods do not use gradient clip
+lr_config = dict(  # Learning rate scheduler config used to register LrUpdater hook
+    policy='step',  # The policy of scheduler, also support CosineAnnealing, Cyclic, etc. Refer to details of supported LrUpdater from https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py#L9.
+    warmup='linear',  # The warmup policy, also support `exp` and `constant`.
+    warmup_iters=500,  # The number of iterations for warmup
+    warmup_ratio=
+    0.001,  # The ratio of the starting learning rate used for warmup
+    step=[8, 11])  # Steps to decay the learning rate
+runner = dict(
+    type='EpochBasedRunner', # Type of runner to use (i.e. IterBasedRunner or EpochBasedRunner)
+    max_epochs=12) # Runner that runs the workflow in total max_epochs. For IterBasedRunner use `max_iters`
+checkpoint_config = dict(  # Config to set the checkpoint hook, Refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/checkpoint.py for implementation.
+    interval=1)  # The save interval is 1
+log_config = dict(  # config to register logger hook
+    interval=50,  # Interval to print the log
+    hooks=[
+        dict(type='TextLoggerHook', by_epoch=False),
+        dict(type='TensorboardLoggerHook', by_epoch=False),
+        dict(type='MMDetWandbHook', by_epoch=False, # The Wandb logger is also supported, It requires `wandb` to be installed.
+             init_kwargs={'entity': "OpenMMLab", # The entity used to log on Wandb
+                          'project': "MMDet", # Project name in WandB
+                          'config': cfg_dict}), # Check https://docs.wandb.ai/ref/python/init for more init arguments.
+        # MMDetWandbHook is mmdet implementation of WandbLoggerHook. ClearMLLoggerHook, DvcliveLoggerHook, MlflowLoggerHook, NeptuneLoggerHook, PaviLoggerHook, SegmindLoggerHook are also supported based on MMCV implementation.
+    ])  # The logger used to record the training process.
+
+dist_params = dict(backend='nccl')  # Parameters to setup distributed training, the port can also be set.
+log_level = 'INFO'  # The level of logging.
+load_from = None  # load models as a pre-trained model from a given path. This will not resume training.
+resume_from = None  # Resume from a checkpoint at a given path; training will be resumed from the epoch at which the checkpoint was saved.
+workflow = [('train', 1)]  # Workflow for runner. [('train', 1)] means there is only one workflow and the workflow named 'train' is executed once. The workflow trains the model for 12 epochs according to `max_epochs` in `runner`.
+work_dir = 'work_dir'  # Directory to save the model checkpoints and logs for the current experiments.
+```
+
+## FAQ
+
+### Ignore some fields in the base configs
+
+Sometimes, you may set `_delete_=True` to ignore some of the fields in base configs.
+You may refer to [mmcv](https://mmcv.readthedocs.io/en/latest/understand_mmcv/config.html#inherit-from-base-config-with-ignored-fields) for simple illustration.
+
+In MMDetection, for example, suppose you want to change the backbone of Mask R-CNN, which is defined by the following config.
+
+```python
+model = dict(
+    type='MaskRCNN',
+    pretrained='torchvision://resnet50',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch'),
+    neck=dict(...),
+    rpn_head=dict(...),
+    roi_head=dict(...))
+```
+
+`ResNet` and `HRNet` use different keywords for construction, so the inherited `backbone` keys must be discarded rather than merged with the new ones.
+
+```python
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    pretrained='open-mmlab://msra/hrnetv2_w32',
+    backbone=dict(
+        _delete_=True,
+        type='HRNet',
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256)))),
+    neck=dict(...))
+```
+
+The `_delete_=True` would replace all old keys in `backbone` field with new keys.
+
+### Use intermediate variables in configs
+
+Some intermediate variables are used in the config files, like `train_pipeline`/`test_pipeline` in datasets.
+It's worth noting that when modifying intermediate variables in the children configs, users need to pass the intermediate variables into the corresponding fields again.
+For example, we would like to use a multi-scale strategy to train a Mask R-CNN. `train_pipeline`/`test_pipeline` are the intermediate variables we would like to modify.
+
+```python
+_base_ = './mask_rcnn_r50_fpn_1x_coco.py'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode="value",
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+```
+
+We first define the new `train_pipeline`/`test_pipeline` and pass them into `data`.
+
+Similarly, if we would like to switch from `SyncBN` to `BN` or `MMSyncBN`, we need to substitute every `norm_cfg` in the config.
+
+```python
+_base_ = './mask_rcnn_r50_fpn_1x_coco.py'
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    backbone=dict(norm_cfg=norm_cfg),
+    neck=dict(norm_cfg=norm_cfg),
+    ...)
+```
diff --git a/docs/en/tutorials/customize_dataset.md b/docs/en/tutorials/customize_dataset.md
new file mode 100755
index 0000000..3237f16
--- /dev/null
+++ b/docs/en/tutorials/customize_dataset.md
@@ -0,0 +1,542 @@
+# Tutorial 2: Customize Datasets
+
+## Support new data format
+
+To support a new data format, you can either convert your data to an existing format (COCO format or PASCAL format) or directly convert it to the middle format. You can also choose to convert the data offline (before training, by a script) or online (implement a new dataset and do the conversion at training time). In MMDetection, we recommend converting the data into COCO format and doing the conversion offline, so that you only need to modify the config's data annotation paths and classes after the conversion of your data.
+
+### Reorganize new data formats to existing format
+
+The simplest way is to convert your dataset to existing dataset formats (COCO or PASCAL VOC).
+
+The annotation json files in COCO format have the following necessary keys:
+
+```python
+'images': [
+    {
+        'file_name': 'COCO_val2014_000000001268.jpg',
+        'height': 427,
+        'width': 640,
+        'id': 1268
+    },
+    ...
+],
+
+'annotations': [
+    {
+        'segmentation': [[192.81,
+            247.09,
+            ...
+            219.03,
+            249.06]],  # if you have mask labels
+        'area': 1035.749,
+        'iscrowd': 0,
+        'image_id': 1268,
+        'bbox': [192.81, 224.8, 74.73, 33.43],
+        'category_id': 16,
+        'id': 42986
+    },
+    ...
+],
+
+'categories': [
+    {'id': 0, 'name': 'car'},
+ ]
+```
+
+There are three necessary keys in the json file:
+
+- `images`: contains a list of images with their information like `file_name`, `height`, `width`, and `id`.
+- `annotations`: contains the list of instance annotations.
+- `categories`: contains the list of categories names and their ID.
+
+After the data pre-processing, there are two steps for users to train the customized new dataset with existing format (e.g. COCO format):
+
+1. Modify the config file for using the customized dataset.
+2. Check the annotations of the customized dataset.
+
+Here we give an example to show the above two steps, which uses a customized dataset of 5 classes with COCO format to train an existing Cascade Mask R-CNN R50-FPN detector.
+
+#### 1. Modify the config file for using the customized dataset
+
+There are two aspects involved in the modification of config file:
+
+1. The `data` field. Specifically, you need to explicitly add the `classes` fields in `data.train`, `data.val` and `data.test`.
+2. The `num_classes` field in the `model` part. Explicitly over-write all the `num_classes` from the default value (e.g. 80 in COCO) to the number of your classes.
+
+In `configs/my_custom_config.py`:
+
+```python
+
+# the new config inherits the base configs to highlight the necessary modification
+_base_ = './cascade_mask_rcnn_r50_fpn_1x_coco.py'
+
+# 1. dataset settings
+dataset_type = 'CocoDataset'
+classes = ('a', 'b', 'c', 'd', 'e')
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        # explicitly add your class names to the field `classes`
+        classes=classes,
+        ann_file='path/to/your/train/annotation_data',
+        img_prefix='path/to/your/train/image_data'),
+    val=dict(
+        type=dataset_type,
+        # explicitly add your class names to the field `classes`
+        classes=classes,
+        ann_file='path/to/your/val/annotation_data',
+        img_prefix='path/to/your/val/image_data'),
+    test=dict(
+        type=dataset_type,
+        # explicitly add your class names to the field `classes`
+        classes=classes,
+        ann_file='path/to/your/test/annotation_data',
+        img_prefix='path/to/your/test/image_data'))
+
+# 2. model settings
+
+# explicitly over-write all the `num_classes` field from default 80 to 5.
+model = dict(
+    roi_head=dict(
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                # explicitly over-write all the `num_classes` field from default 80 to 5.
+                num_classes=5),
+            dict(
+                type='Shared2FCBBoxHead',
+                # explicitly over-write all the `num_classes` field from default 80 to 5.
+                num_classes=5),
+            dict(
+                type='Shared2FCBBoxHead',
+                # explicitly over-write all the `num_classes` field from default 80 to 5.
+                num_classes=5)],
+    # explicitly over-write all the `num_classes` field from default 80 to 5.
+    mask_head=dict(num_classes=5)))
+```
+
+#### 2. Check the annotations of the customized dataset
+
+Assuming your customized dataset is COCO format, make sure you have the correct annotations in the customized dataset:
+
+1. The length for `categories` field in annotations should exactly equal the tuple length of `classes` fields in your config, meaning the number of classes (e.g. 5 in this example).
+2. The `classes` fields in your config file should have exactly the same elements and the same order as the `name` in `categories` of annotations. MMDetection automatically maps the non-contiguous `id` in `categories` to contiguous label indices, so the string order of `name` in `categories` field affects the order of label indices. Meanwhile, the string order of `classes` in config affects the label text during visualization of predicted bounding boxes.
+3. The `category_id` in `annotations` field should be valid, i.e., all values in `category_id` should belong to `id` in `categories`.
+
+Here is a valid example of annotations:
+
+```python
+
+'annotations': [
+    {
+        'segmentation': [[192.81,
+            247.09,
+            ...
+            219.03,
+            249.06]],  # if you have mask labels
+        'area': 1035.749,
+        'iscrowd': 0,
+        'image_id': 1268,
+        'bbox': [192.81, 224.8, 74.73, 33.43],
+        'category_id': 16,
+        'id': 42986
+    },
+    ...
+],
+
+# MMDetection automatically maps the non-contiguous `id` to contiguous label indices.
+'categories': [
+    {'id': 1, 'name': 'a'}, {'id': 3, 'name': 'b'}, {'id': 4, 'name': 'c'}, {'id': 16, 'name': 'd'}, {'id': 17, 'name': 'e'},
+ ]
+```
+
+We use this way to support CityScapes dataset. The script is in [cityscapes.py](https://github.com/open-mmlab/mmdetection/blob/master/tools/dataset_converters/cityscapes.py) and we also provide the finetuning [configs](https://github.com/open-mmlab/mmdetection/blob/master/configs/cityscapes).
+
+**Note**
+
+1. For instance segmentation datasets, **MMDetection only supports evaluating mask AP of dataset in COCO format for now**.
+2. It is recommended to convert the data offline before training, thus you can still use `CocoDataset` and only need to modify the path of annotations and the training classes.
+
+### Reorganize new data format to middle format
+
+It is also fine if you do not want to convert the annotation format to COCO or PASCAL format.
+Actually, we define a simple annotation format and all existing datasets are
+processed to be compatible with it, either online or offline.
+
+The annotation of a dataset is a list of dict, each dict corresponds to an image.
+There are 3 fields, `filename` (relative path), `width`, and `height`, for testing,
+and an additional field `ann` for training. `ann` is also a dict containing at least 2 fields:
+`bboxes` and `labels`, both of which are numpy arrays. Some datasets may provide
+annotations like crowd/difficult/ignored bboxes, we use `bboxes_ignore` and `labels_ignore`
+to cover them.
+
+Here is an example.
+
+```python
+
+[
+    {
+        'filename': 'a.jpg',
+        'width': 1280,
+        'height': 720,
+        'ann': {
+            'bboxes': <np.ndarray, float32> (n, 4),
+            'labels': <np.ndarray, int64> (n, ),
+            'bboxes_ignore': <np.ndarray, float32> (k, 4),
+            'labels_ignore': <np.ndarray, int64> (k, ) (optional field)
+        }
+    },
+    ...
+]
+```
+
+There are two ways to work with custom datasets.
+
+- online conversion
+
+  You can write a new Dataset class inherited from `CustomDataset`, and overwrite two methods
+  `load_annotations(self, ann_file)` and `get_ann_info(self, idx)`,
+  like [CocoDataset](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/coco.py) and [VOCDataset](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/voc.py).
+
+- offline conversion
+
+  You can convert the annotation format to the expected format above and save it to
+  a pickle or json file, like [pascal_voc.py](https://github.com/open-mmlab/mmdetection/blob/master/tools/dataset_converters/pascal_voc.py).
+  Then you can simply use `CustomDataset`.
+
+### An example of customized dataset
+
+Assume the annotation is in a new format in text files.
+The bounding boxes annotations are stored in text file `annotation.txt` as the following
+
+```
+#
+000001.jpg
+1280 720
+2
+10 20 40 60 1
+20 40 50 60 2
+#
+000002.jpg
+1280 720
+3
+50 20 40 60 2
+20 40 30 45 2
+30 40 50 60 3
+```
+
+We can create a new dataset in `mmdet/datasets/my_dataset.py` to load the data.
+
+```python
+import mmcv
+import numpy as np
+
+from .builder import DATASETS
+from .custom import CustomDataset
+
+
+@DATASETS.register_module()
+class MyDataset(CustomDataset):
+
+    CLASSES = ('person', 'bicycle', 'car', 'motorcycle')
+
+    def load_annotations(self, ann_file):
+        ann_list = mmcv.list_from_file(ann_file)
+
+        data_infos = []
+        for i, ann_line in enumerate(ann_list):
+            if ann_line != '#':
+                continue
+
+            img_shape = ann_list[i + 2].split(' ')
+            width = int(img_shape[0])
+            height = int(img_shape[1])
+            bbox_number = int(ann_list[i + 3])
+
+            bboxes = []
+            labels = []
+            for bbox_line in ann_list[i + 4:i + 4 + bbox_number]:
+                anns = bbox_line.split(' ')
+                bboxes.append([float(ann) for ann in anns[:4]])
+                labels.append(int(anns[4]))
+
+            data_infos.append(
+                dict(
+                    filename=ann_list[i + 1],
+                    width=width,
+                    height=height,
+                    ann=dict(
+                        bboxes=np.array(bboxes).astype(np.float32),
+                        labels=np.array(labels).astype(np.int64))
+                ))
+
+        return data_infos
+
+    def get_ann_info(self, idx):
+        return self.data_infos[idx]['ann']
+
+```
+
+Then in the config, to use `MyDataset` you can modify the config as the following
+
+```python
+dataset_A_train = dict(
+    type='MyDataset',
+    ann_file = 'image_list.txt',
+    pipeline=train_pipeline
+)
+```
+
+## Customize datasets by dataset wrappers
+
+MMDetection also supports many dataset wrappers to mix the dataset or modify the dataset distribution for training.
+Currently it supports three dataset wrappers as below:
+
+- `RepeatDataset`: simply repeat the whole dataset.
+- `ClassBalancedDataset`: repeat dataset in a class balanced manner.
+- `ConcatDataset`: concat datasets.
+
+### Repeat dataset
+
+We use `RepeatDataset` as wrapper to repeat the dataset. For example, suppose the original dataset is `Dataset_A`, to repeat it, the config looks like the following
+
+```python
+dataset_A_train = dict(
+        type='RepeatDataset',
+        times=N,
+        dataset=dict(  # This is the original config of Dataset_A
+            type='Dataset_A',
+            ...
+            pipeline=train_pipeline
+        )
+    )
+```
+
+### Class balanced dataset
+
+We use `ClassBalancedDataset` as a wrapper to repeat the dataset based on category
+frequency. The dataset to repeat needs to implement the method `self.get_cat_ids(idx)`
+to support `ClassBalancedDataset`.
+For example, to repeat `Dataset_A` with `oversample_thr=1e-3`, the config looks like the following
+
+```python
+dataset_A_train = dict(
+        type='ClassBalancedDataset',
+        oversample_thr=1e-3,
+        dataset=dict(  # This is the original config of Dataset_A
+            type='Dataset_A',
+            ...
+            pipeline=train_pipeline
+        )
+    )
+```
+
+You may refer to [source code](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/dataset_wrappers.py#L211) for details.
+
+### Concatenate dataset
+
+There are three ways to concatenate the dataset.
+
+1. If the datasets you want to concatenate are of the same type with different annotation files, you can concatenate the dataset configs like the following.
+
+   ```python
+   dataset_A_train = dict(
+       type='Dataset_A',
+       ann_file = ['anno_file_1', 'anno_file_2'],
+       pipeline=train_pipeline
+   )
+   ```
+
+   If the concatenated dataset is used for test or evaluation, this manner supports evaluating each dataset separately. To test the concatenated datasets as a whole, you can set `separate_eval=False` as below.
+
+   ```python
+   dataset_A_train = dict(
+       type='Dataset_A',
+       ann_file = ['anno_file_1', 'anno_file_2'],
+       separate_eval=False,
+       pipeline=train_pipeline
+   )
+   ```
+
+2. In case the datasets you want to concatenate are of different types, you can concatenate the dataset configs like the following.
+
+   ```python
+   dataset_A_train = dict()
+   dataset_B_train = dict()
+
+   data = dict(
+       imgs_per_gpu=2,
+       workers_per_gpu=2,
+       train = [
+           dataset_A_train,
+           dataset_B_train
+       ],
+       val = dataset_A_val,
+       test = dataset_A_test
+       )
+   ```
+
+   If the concatenated dataset is used for test or evaluation, this manner also supports evaluating each dataset separately.
+
+3. We also support defining `ConcatDataset` explicitly as the following.
+
+   ```python
+   dataset_A_val = dict()
+   dataset_B_val = dict()
+
+   data = dict(
+       imgs_per_gpu=2,
+       workers_per_gpu=2,
+       train=dataset_A_train,
+       val=dict(
+           type='ConcatDataset',
+           datasets=[dataset_A_val, dataset_B_val],
+           separate_eval=False))
+   ```
+
+   This manner allows users to evaluate all the datasets as a single one by setting `separate_eval=False`.
+
+**Note:**
+
+1. The option `separate_eval=False` assumes the datasets use `self.data_infos` during evaluation. Therefore, COCO datasets do not support this behavior since COCO datasets do not fully rely on `self.data_infos` for evaluation. Combining different types of datasets and evaluating them as a whole is not tested thus is not suggested.
+2. Evaluating `ClassBalancedDataset` and `RepeatDataset` is not supported thus evaluating concatenated datasets of these types is also not supported.
+
+A more complex example that repeats `Dataset_A` and `Dataset_B` by N and M times, respectively, and then concatenates the repeated datasets is as the following.
+
+```python
+dataset_A_train = dict(
+    type='RepeatDataset',
+    times=N,
+    dataset=dict(
+        type='Dataset_A',
+        ...
+        pipeline=train_pipeline
+    )
+)
+dataset_A_val = dict(
+    ...
+    pipeline=test_pipeline
+)
+dataset_A_test = dict(
+    ...
+    pipeline=test_pipeline
+)
+dataset_B_train = dict(
+    type='RepeatDataset',
+    times=M,
+    dataset=dict(
+        type='Dataset_B',
+        ...
+        pipeline=train_pipeline
+    )
+)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train = [
+        dataset_A_train,
+        dataset_B_train
+    ],
+    val = dataset_A_val,
+    test = dataset_A_test
+)
+
+```
+
+## Modify Dataset Classes
+
+With existing dataset types, we can modify the class names of them to train on a subset of the annotations.
+For example, if you want to train only three classes of the current dataset,
+you can modify the classes of dataset.
+The dataset will filter out the ground truth boxes of other classes automatically.
+
+```python
+classes = ('person', 'bicycle', 'car')
+data = dict(
+    train=dict(classes=classes),
+    val=dict(classes=classes),
+    test=dict(classes=classes))
+```
+
+MMDetection V2.0 also supports reading the classes from a file, which is common in real applications.
+For example, assume the `classes.txt` contains the name of classes as the following.
+
+```
+person
+bicycle
+car
+```
+
+Users can set the classes as a file path, the dataset will load it and convert it to a list automatically.
+
+```python
+classes = 'path/to/classes.txt'
+data = dict(
+    train=dict(classes=classes),
+    val=dict(classes=classes),
+    test=dict(classes=classes))
+```
+
+**Note**:
+
+- Before MMDetection v2.5.0, the dataset will filter out the empty GT images automatically if the classes are set and there is no way to disable that through config. This is an undesirable behavior and introduces confusion because if the classes are not set, the dataset only filters the empty GT images when `filter_empty_gt=True` and `test_mode=False`. After MMDetection v2.5.0, we decouple the image filtering process and the classes modification, i.e., the dataset will only filter empty GT images when `filter_empty_gt=True` and `test_mode=False`, no matter whether the classes are set. Thus, setting the classes only influences the annotations of classes used for training and users could decide whether to filter empty GT images by themselves.
+- Since the middle format only has box labels and does not contain the class names, when using `CustomDataset`, users cannot filter out the empty GT images through configs but only do this offline.
+- Please remember to modify the `num_classes` in the head when specifying `classes` in dataset. We implemented [NumClassCheckHook](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/utils.py) to check whether the numbers are consistent since v2.9.0(after PR#4508).
+- The features for setting dataset classes and dataset filtering will be refactored to be more user-friendly in the future (depends on the progress).
+
+## COCO Panoptic Dataset
+
+Now we support the COCO Panoptic Dataset. The format of panoptic annotations is different from the COCO format.
+Both the foreground and the background will exist in the annotation file.
+The annotation json files in COCO Panoptic format have the following necessary keys:
+
+```python
+'images': [
+    {
+        'file_name': '000000001268.jpg',
+        'height': 427,
+        'width': 640,
+        'id': 1268
+    },
+    ...
+]
+
+'annotations': [
+    {
+        'filename': '000000001268.jpg',
+        'image_id': 1268,
+        'segments_info': [
+            {
+                'id':8345037,  # One-to-one correspondence with the id in the annotation map.
+                'category_id': 51,
+                'iscrowd': 0,
+                'bbox': (x1, y1, w, h),  # The bbox of the background is the outer rectangle of its mask.
+                'area': 24315
+            },
+            ...
+        ]
+    },
+    ...
+]
+
+'categories': [  # including both foreground categories and background categories
+    {'id': 0, 'name': 'person'},
+    ...
+ ]
+```
+
+Moreover, the `seg_prefix` must be set to the path of the panoptic annotation images.
+
+```python
+data = dict(
+    type='CocoPanopticDataset',
+    train=dict(
+        seg_prefix = 'path/to/your/train/panoptic/image_annotation_data'
+    ),
+    val=dict(
+        seg_prefix = 'path/to/your/val/panoptic/image_annotation_data'
+    )
+)
+```
diff --git a/docs/en/tutorials/customize_losses.md b/docs/en/tutorials/customize_losses.md
new file mode 100755
index 0000000..5c00368
--- /dev/null
+++ b/docs/en/tutorials/customize_losses.md
@@ -0,0 +1,126 @@
+# Tutorial 6: Customize Losses
+
+MMDetection provides users with different loss functions. But the default configuration may not be applicable for different datasets or models, so users may want to modify a specific loss to adapt to the new situation.
+
+This tutorial first elaborates the computation pipeline of losses, then gives some instructions about how to modify each step. The modification can be categorized as tweaking and weighting.
+
+## Computation pipeline of a loss
+
+Given the input prediction and target, as well as the weights, a loss function maps the input tensor to the final loss scalar. The mapping can be divided into five steps:
+
+1. Set the sampling method to sample positive and negative samples.
+
+2. Get **element-wise** or **sample-wise** loss by the loss kernel function.
+
+3. Weighting the loss with a weight tensor **element-wisely**.
+
+4. Reduce the loss tensor to a **scalar**.
+
+5. Weighting the loss with a **scalar**.
+
+## Set sampling method (step 1)
+
+For some loss functions, sampling strategies are needed to avoid imbalance between positive and negative samples.
+
+For example, when using `CrossEntropyLoss` in RPN head, we need to set `RandomSampler` in `train_cfg`
+
+```python
+train_cfg=dict(
+    rpn=dict(
+        sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False))
+```
+
+For some other losses which have positive and negative sample balance mechanism such as Focal Loss, GHMC, and QualityFocalLoss, the sampler is no longer necessary.
+
+## Tweaking loss
+
+Tweaking a loss is more related to steps 2, 4, and 5, and most modifications can be specified in the config.
+Here we take [Focal Loss (FL)](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/focal_loss.py) as an example.
+The following code snippets are the construction method and config of FL respectively; they are actually in one-to-one correspondence.
+
+```python
+@LOSSES.register_module()
+class FocalLoss(nn.Module):
+
+    def __init__(self,
+                 use_sigmoid=True,
+                 gamma=2.0,
+                 alpha=0.25,
+                 reduction='mean',
+                 loss_weight=1.0):
+```
+
+```python
+loss_cls=dict(
+    type='FocalLoss',
+    use_sigmoid=True,
+    gamma=2.0,
+    alpha=0.25,
+    loss_weight=1.0)
+```
+
+### Tweaking hyper-parameters (step 2)
+
+`gamma` and `alpha` are two hyper-parameters in the Focal Loss. Say if we want to change the value of `gamma` to be 1.5 and `alpha` to be 0.5, then we can specify them in the config as follows:
+
+```python
+loss_cls=dict(
+    type='FocalLoss',
+    use_sigmoid=True,
+    gamma=1.5,
+    alpha=0.5,
+    loss_weight=1.0)
+```
+
+### Tweaking the way of reduction (step 4)
+
+The default way of reduction is `mean` for FL. Say if we want to change the reduction from `mean` to `sum`, we can specify it in the config as follows:
+
+```python
+loss_cls=dict(
+    type='FocalLoss',
+    use_sigmoid=True,
+    gamma=2.0,
+    alpha=0.25,
+    loss_weight=1.0,
+    reduction='sum')
+```
+
+### Tweaking loss weight (step 5)
+
+The loss weight here is a scalar which controls the weight of different losses in multi-task learning, e.g. classification loss and regression loss. Say if we want to change the loss weight of classification loss to be 0.5, we can specify it in the config as follows:
+
+```python
+loss_cls=dict(
+    type='FocalLoss',
+    use_sigmoid=True,
+    gamma=2.0,
+    alpha=0.25,
+    loss_weight=0.5)
+```
+
+## Weighting loss (step 3)
+
+Weighting loss means we re-weight the loss element-wisely. To be more specific, we multiply the loss tensor with a weight tensor which has the same shape. As a result, different entries of the loss can be scaled differently, and so called element-wisely.
+The loss weight varies across different models and is highly context related, but overall there are two kinds of loss weights: `label_weights` for classification loss and `bbox_weights` for bbox regression loss. You can find them in the `get_targets` method of the corresponding head. Here we take [ATSSHead](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/atss_head.py#L530) as an example, which inherits [AnchorHead](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/anchor_head.py) but overrides its `get_targets` method, yielding different `label_weights` and `bbox_weights`.
+
+```
+class ATSSHead(AnchorHead):
+
+    ...
+
+    def get_targets(self,
+                    anchor_list,
+                    valid_flag_list,
+                    gt_bboxes_list,
+                    img_metas,
+                    gt_bboxes_ignore_list=None,
+                    gt_labels_list=None,
+                    label_channels=1,
+                    unmap_outputs=True):
+```
diff --git a/docs/en/tutorials/customize_models.md b/docs/en/tutorials/customize_models.md
new file mode 100755
index 0000000..81c3912
--- /dev/null
+++ b/docs/en/tutorials/customize_models.md
@@ -0,0 +1,363 @@
+# Tutorial 4: Customize Models
+
+We basically categorize model components into 5 types.
+
+- backbone: usually an FCN network to extract feature maps, e.g., ResNet, MobileNet.
+- neck: the component between backbones and heads, e.g., FPN, PAFPN.
+- head: the component for specific tasks, e.g., bbox prediction and mask prediction.
+- roi extractor: the part for extracting RoI features from feature maps, e.g., RoI Align.
+- loss: the component in head for calculating losses, e.g., FocalLoss, L1Loss, and GHMLoss.
+
+## Develop new components
+
+### Add a new backbone
+
+Here we show how to develop new components with an example of MobileNet.
+
+#### 1. Define a new backbone (e.g. MobileNet)
+
+Create a new file `mmdet/models/backbones/mobilenet.py`.
+
+```python
+import torch.nn as nn
+
+from ..builder import BACKBONES
+
+
+@BACKBONES.register_module()
+class MobileNet(nn.Module):
+
+    def __init__(self, arg1, arg2):
+        pass
+
+    def forward(self, x):  # should return a tuple
+        pass
+```
+
+#### 2. Import the module
+
+You can either add the following line to `mmdet/models/backbones/__init__.py`
+
+```python
+from .mobilenet import MobileNet
+```
+
+or alternatively add
+
+```python
+custom_imports = dict(
+    imports=['mmdet.models.backbones.mobilenet'],
+    allow_failed_imports=False)
+```
+
+to the config file to avoid modifying the original code.
+
+#### 3. Use the backbone in your config file
+
+```python
+model = dict(
+    ...
+    backbone=dict(
+        type='MobileNet',
+        arg1=xxx,
+        arg2=xxx),
+    ...
+```
+
+### Add new necks
+
+#### 1. Define a neck (e.g. PAFPN)
+
+Create a new file `mmdet/models/necks/pafpn.py`.
+
+```python
+from ..builder import NECKS
+
+@NECKS.register_module()
+class PAFPN(nn.Module):
+
+    def __init__(self,
+                in_channels,
+                out_channels,
+                num_outs,
+                start_level=0,
+                end_level=-1,
+                add_extra_convs=False):
+        pass
+
+    def forward(self, inputs):
+        # implementation is ignored
+        pass
+```
+
+#### 2. Import the module
+
+You can either add the following line to `mmdet/models/necks/__init__.py`,
+
+```python
+from .pafpn import PAFPN
+```
+
+or alternatively add
+
+```python
+custom_imports = dict(
+    imports=['mmdet.models.necks.pafpn'],
+    allow_failed_imports=False)
+```
+
+to the config file and avoid modifying the original code.
+
+#### 3. Modify the config file
+
+```python
+neck=dict(
+    type='PAFPN',
+    in_channels=[256, 512, 1024, 2048],
+    out_channels=256,
+    num_outs=5)
+```
+
+### Add new heads
+
+Here we show how to develop a new head with the example of [Double Head R-CNN](https://arxiv.org/abs/1904.06493) as the following.
+
+First, add a new bbox head in `mmdet/models/roi_heads/bbox_heads/double_bbox_head.py`.
+Double Head R-CNN implements a new bbox head for object detection.
+To implement a bbox head, basically we need to implement three functions of the new module as the following.
+
+```python
+from mmdet.models.builder import HEADS
+from .bbox_head import BBoxHead
+
+@HEADS.register_module()
+class DoubleConvFCBBoxHead(BBoxHead):
+    r"""Bbox head used in Double-Head R-CNN
+
+                                      /-> cls
+                  /-> shared convs ->
+                                      \-> reg
+    roi features
+                                      /-> cls
+                  \-> shared fc    ->
+                                      \-> reg
+    """  # noqa: W605
+
+    def __init__(self,
+                 num_convs=0,
+                 num_fcs=0,
+                 conv_out_channels=1024,
+                 fc_out_channels=1024,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 **kwargs):
+        kwargs.setdefault('with_avg_pool', True)
+        super(DoubleConvFCBBoxHead, self).__init__(**kwargs)
+
+
+    def forward(self, x_cls, x_reg):
+
+```
+
+Second, implement a new RoI Head if it is necessary. We plan to inherit the new `DoubleHeadRoIHead` from `StandardRoIHead`. We can find that a `StandardRoIHead` already implements the following functions.
+
+```python
+import torch
+
+from mmdet.core import bbox2result, bbox2roi, build_assigner, build_sampler
+from ..builder import HEADS, build_head, build_roi_extractor
+from .base_roi_head import BaseRoIHead
+from .test_mixins import BBoxTestMixin, MaskTestMixin
+
+
+@HEADS.register_module()
+class StandardRoIHead(BaseRoIHead, BBoxTestMixin, MaskTestMixin):
+    """Simplest base roi head including one bbox head and one mask head.
+    """
+
+    def init_assigner_sampler(self):
+
+    def init_bbox_head(self, bbox_roi_extractor, bbox_head):
+
+    def init_mask_head(self, mask_roi_extractor, mask_head):
+
+
+    def forward_dummy(self, x, proposals):
+
+
+    def forward_train(self,
+                      x,
+                      img_metas,
+                      proposal_list,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None,
+                      gt_masks=None):
+
+    def _bbox_forward(self, x, rois):
+
+    def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels,
+                            img_metas):
+
+    def _mask_forward_train(self, x, sampling_results, bbox_feats, gt_masks,
+                            img_metas):
+
+    def _mask_forward(self, x, rois=None, pos_inds=None, bbox_feats=None):
+
+
+    def simple_test(self,
+                    x,
+                    proposal_list,
+                    img_metas,
+                    proposals=None,
+                    rescale=False):
+        """Test without augmentation."""
+
+```
+
+Double Head's modification is mainly in the bbox_forward logic, and it inherits other logics from the `StandardRoIHead`.
+In the `mmdet/models/roi_heads/double_roi_head.py`, we implement the new RoI Head as the following:
+
+```python
+from ..builder import HEADS
+from .standard_roi_head import StandardRoIHead
+
+
+@HEADS.register_module()
+class DoubleHeadRoIHead(StandardRoIHead):
+    """RoI head for Double Head RCNN
+
+    https://arxiv.org/abs/1904.06493
+    """
+
+    def __init__(self, reg_roi_scale_factor, **kwargs):
+        super(DoubleHeadRoIHead, self).__init__(**kwargs)
+        self.reg_roi_scale_factor = reg_roi_scale_factor
+
+    def _bbox_forward(self, x, rois):
+        bbox_cls_feats = self.bbox_roi_extractor(
+            x[:self.bbox_roi_extractor.num_inputs], rois)
+        bbox_reg_feats = self.bbox_roi_extractor(
+            x[:self.bbox_roi_extractor.num_inputs],
+            rois,
+            roi_scale_factor=self.reg_roi_scale_factor)
+        if self.with_shared_head:
+            bbox_cls_feats = self.shared_head(bbox_cls_feats)
+            bbox_reg_feats = self.shared_head(bbox_reg_feats)
+        cls_score, bbox_pred = self.bbox_head(bbox_cls_feats, bbox_reg_feats)
+
+        bbox_results = dict(
+            cls_score=cls_score,
+            bbox_pred=bbox_pred,
+            bbox_feats=bbox_cls_feats)
+        return bbox_results
+```
+
+Last, the users need to add the module in
+`mmdet/models/roi_heads/bbox_heads/__init__.py` and `mmdet/models/roi_heads/__init__.py` so that the corresponding registry can find and load them.
+
+Alternatively, the users can add
+
+```python
+custom_imports=dict(
+    imports=['mmdet.models.roi_heads.double_roi_head', 'mmdet.models.roi_heads.bbox_heads.double_bbox_head'])
+```
+
+to the config file and achieve the same goal.
+
+The config file of Double Head R-CNN is as the following
+
+```python
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        type='DoubleHeadRoIHead',
+        reg_roi_scale_factor=1.3,
+        bbox_head=dict(
+            _delete_=True,
+            type='DoubleConvFCBBoxHead',
+            num_convs=4,
+            num_fcs=2,
+            in_channels=256,
+            conv_out_channels=1024,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=2.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=2.0))))
+
+```
+
+Since MMDetection 2.0, the config system supports inheriting configs so that the users can focus on the modification.
+The Double Head R-CNN mainly uses a new `DoubleHeadRoIHead` and a new
+`DoubleConvFCBBoxHead`; the arguments are set according to the `__init__` function of each module.
+
+### Add new loss
+
+Assume you want to add a new loss as `MyLoss`, for bounding box regression.
+To add a new loss function, the users need to implement it in `mmdet/models/losses/my_loss.py`.
+The decorator `weighted_loss` enables the loss to be weighted for each element.
+
+```python
+import torch
+import torch.nn as nn
+
+from ..builder import LOSSES
+from .utils import weighted_loss
+
+@weighted_loss
+def my_loss(pred, target):
+    assert pred.size() == target.size() and target.numel() > 0
+    loss = torch.abs(pred - target)
+    return loss
+
+@LOSSES.register_module()
+class MyLoss(nn.Module):
+
+    def __init__(self, reduction='mean', loss_weight=1.0):
+        super(MyLoss, self).__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss_bbox = self.loss_weight * my_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+        return loss_bbox
+```
+
+Then the users need to add it in the `mmdet/models/losses/__init__.py`.
+
+```python
+from .my_loss import MyLoss, my_loss
+
+```
+
+Alternatively, you can add
+
+```python
+custom_imports=dict(
+    imports=['mmdet.models.losses.my_loss'])
+```
+
+to the config file and achieve the same goal.
+
+To use it, modify the `loss_xxx` field.
+Since MyLoss is for regression, you need to modify the `loss_bbox` field in the head.
+
+```python
+loss_bbox=dict(type='MyLoss', loss_weight=1.0)
+```
diff --git a/docs/en/tutorials/customize_runtime.md b/docs/en/tutorials/customize_runtime.md
new file mode 100755
index 0000000..f08d90b
--- /dev/null
+++ b/docs/en/tutorials/customize_runtime.md
@@ -0,0 +1,323 @@
+# Tutorial 5: Customize Runtime Settings
+
+## Customize optimization settings
+
+### Customize optimizer supported by Pytorch
+
+We already support using all the optimizers implemented by PyTorch, and the only modification is to change the `optimizer` field of config files.
+For example, if you want to use `ADAM` (note that the performance could drop a lot), the modification could be as the following.
+
+```python
+optimizer = dict(type='Adam', lr=0.0003, weight_decay=0.0001)
+```
+
+To modify the learning rate of the model, the users only need to modify the `lr` in the config of optimizer. The users can directly set arguments following the [API doc](https://pytorch.org/docs/stable/optim.html?highlight=optim#module-torch.optim) of PyTorch.
+
+### Customize self-implemented optimizer
+
+#### 1. Define a new optimizer
+
+A customized optimizer could be defined as follows.
+
+Assume you want to add an optimizer named `MyOptimizer`, which has arguments `a`, `b`, and `c`.
+You need to create a new directory named `mmdet/core/optimizer`.
+And then implement the new optimizer in a file, e.g., in `mmdet/core/optimizer/my_optimizer.py`:
+
+```python
+from .registry import OPTIMIZERS
+from torch.optim import Optimizer
+
+
+@OPTIMIZERS.register_module()
+class MyOptimizer(Optimizer):
+
+    def __init__(self, a, b, c):
+
+```
+
+#### 2. Add the optimizer to registry
+
+For the module defined above to be found, it should first be imported into the main namespace. There are two options to achieve it.
+
+- Modify `mmdet/core/optimizer/__init__.py` to import it.
+
+  The newly defined module should be imported in `mmdet/core/optimizer/__init__.py` so that the registry will
+  find the new module and add it:
+
+```python
+from .my_optimizer import MyOptimizer
+```
+
+- Use `custom_imports` in the config to manually import it
+
+```python
+custom_imports = dict(imports=['mmdet.core.optimizer.my_optimizer'], allow_failed_imports=False)
+```
+
+The module `mmdet.core.optimizer.my_optimizer` will be imported at the beginning of the program and the class `MyOptimizer` is then automatically registered.
+Note that only the package containing the class `MyOptimizer` should be imported.
+`mmdet.core.optimizer.my_optimizer.MyOptimizer` **cannot** be imported directly.
+
+Actually users can use a totally different file directory structure using this importing method, as long as the module root can be located in `PYTHONPATH`.
+
+#### 3. Specify the optimizer in the config file
+
+Then you can use `MyOptimizer` in `optimizer` field of config files.
+In the configs, the optimizers are defined by the field `optimizer` like the following:
+
+```python
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+```
+
+To use your own optimizer, the field can be changed to
+
+```python
+optimizer = dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value)
+```
+
+### Customize optimizer constructor
+
+Some models may have some parameter-specific settings for optimization, e.g. weight decay for BatchNorm layers.
+The users can do such fine-grained parameter tuning by customizing the optimizer constructor.
+
+```python
+from mmcv.utils import build_from_cfg
+
+from mmcv.runner.optimizer import OPTIMIZER_BUILDERS, OPTIMIZERS
+from mmdet.utils import get_root_logger
+from .my_optimizer import MyOptimizer
+
+
+@OPTIMIZER_BUILDERS.register_module()
+class MyOptimizerConstructor(object):
+
+    def __init__(self, optimizer_cfg, paramwise_cfg=None):
+
+    def __call__(self, model):
+
+        return my_optimizer
+
+```
+
+The default optimizer constructor is implemented [here](https://github.com/open-mmlab/mmcv/blob/9ecd6b0d5ff9d2172c49a182eaa669e9f27bb8e7/mmcv/runner/optimizer/default_constructor.py#L11), which could also serve as a template for new optimizer constructor.
+
+### Additional settings
+
+Tricks not implemented by the optimizer should be implemented through optimizer constructor (e.g., set parameter-wise learning rates) or hooks. We list some common settings that could stabilize the training or accelerate the training. Feel free to create a PR or an issue for more settings.
+
+- __Use gradient clip to stabilize training__:
+  Some models need gradient clip to clip the gradients to stabilize the training process. An example is as below:
+
+  ```python
+  optimizer_config = dict(
+      _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
+  ```
+
+  If your config inherits the base config which already sets the `optimizer_config`, you might need `_delete_=True` to override the unnecessary settings. See the [config documentation](https://mmdetection.readthedocs.io/en/latest/tutorials/config.html) for more details.
+
+- __Use momentum schedule to accelerate model convergence__:
+  We support momentum scheduler to modify model's momentum according to learning rate, which could make the model converge in a faster way.
+  Momentum scheduler is usually used with LR scheduler, for example, the following config is used in 3D detection to accelerate convergence.
+  For more details, please refer to the implementation of [CyclicLrUpdater](https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327) and [CyclicMomentumUpdater](https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130).
+
+  ```python
+  lr_config = dict(
+      policy='cyclic',
+      target_ratio=(10, 1e-4),
+      cyclic_times=1,
+      step_ratio_up=0.4,
+  )
+  momentum_config = dict(
+      policy='cyclic',
+      target_ratio=(0.85 / 0.95, 1),
+      cyclic_times=1,
+      step_ratio_up=0.4,
+  )
+  ```
+
+## Customize training schedules
+
+By default we use the step learning rate with the 1x schedule, which calls [`StepLRHook`](https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L153) in MMCV.
+We support many other learning rate schedules [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py), such as the `CosineAnnealing` and `Poly` schedules. Here are some examples:
+
+- Poly schedule:
+
+  ```python
+  lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+  ```
+
+- CosineAnnealing schedule:
+
+  ```python
+  lr_config = dict(
+      policy='CosineAnnealing',
+      warmup='linear',
+      warmup_iters=1000,
+      warmup_ratio=1.0 / 10,
+      min_lr_ratio=1e-5)
+  ```
+
+## Customize workflow
+
+Workflow is a list of (phase, epochs) to specify the running order and epochs.
+By default it is set to be
+
+```python
+workflow = [('train', 1)]
+```
+
+which means running 1 epoch for training.
+Sometimes users may want to check some metrics (e.g. loss, accuracy) about the model on the validation set.
+In such case, we can set the workflow as
+
+```python
+[('train', 1), ('val', 1)]
+```
+
+so that 1 epoch for training and 1 epoch for validation will be run iteratively.
+
+**Note**:
+
+1. The parameters of model will not be updated during val epoch.
+2. Keyword `total_epochs` in the config only controls the number of training epochs and will not affect the validation workflow.
+3. Workflows `[('train', 1), ('val', 1)]` and `[('train', 1)]` will not change the behavior of `EvalHook` because `EvalHook` is called by `after_train_epoch` and the validation workflow only affects hooks that are called through `after_val_epoch`. Therefore, the only difference between `[('train', 1), ('val', 1)]` and `[('train', 1)]` is that the runner will calculate losses on the validation set after each training epoch.
+
+## Customize hooks
+
+### Customize self-implemented hooks
+
+#### 1. Implement a new hook
+
+There are some occasions when the users might need to implement a new hook. MMDetection supports customized hooks in training (#3395) since v2.3.0. Thus the users could implement a hook directly in mmdet or their mmdet-based codebases and use the hook by only modifying the config in training.
+Before v2.3.0, the users need to modify the code to get the hook registered before training starts.
+Here we give an example of creating a new hook in mmdet and using it in training.
+
+```python
+from mmcv.runner import HOOKS, Hook
+
+
+@HOOKS.register_module()
+class MyHook(Hook):
+
+    def __init__(self, a, b):
+        pass
+
+    def before_run(self, runner):
+        pass
+
+    def after_run(self, runner):
+        pass
+
+    def before_epoch(self, runner):
+        pass
+
+    def after_epoch(self, runner):
+        pass
+
+    def before_iter(self, runner):
+        pass
+
+    def after_iter(self, runner):
+        pass
+```
+
+Depending on the functionality of the hook, the users need to specify what the hook will do at each stage of the training in `before_run`, `after_run`, `before_epoch`, `after_epoch`, `before_iter`, and `after_iter`.
+
+#### 2. Register the new hook
+
+Then we need to make `MyHook` imported. Assuming the file is in `mmdet/core/utils/my_hook.py`, there are two ways to do that:
+
+- Modify `mmdet/core/utils/__init__.py` to import it.
+
+  The newly defined module should be imported in `mmdet/core/utils/__init__.py` so that the registry will
+  find the new module and add it:
+
+```python
+from .my_hook import MyHook
+```
+
+- Use `custom_imports` in the config to manually import it
+
+```python
+custom_imports = dict(imports=['mmdet.core.utils.my_hook'], allow_failed_imports=False)
+```
+
+#### 3. Modify the config
+
+```python
+custom_hooks = [
+    dict(type='MyHook', a=a_value, b=b_value)
+]
+```
+
+You can also set the priority of the hook by setting the key `priority` to `'NORMAL'` or `'HIGHEST'` as below
+
+```python
+custom_hooks = [
+    dict(type='MyHook', a=a_value, b=b_value, priority='NORMAL')
+]
+```
+
+By default the hook's priority is set as `NORMAL` during registration.
+
+### Use hooks implemented in MMCV
+
+If the hook is already implemented in MMCV, you can directly modify the config to use the hook as below
+
+#### 4. Example: `NumClassCheckHook`
+
+We implement a customized hook named  [NumClassCheckHook](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/utils.py) to check whether the `num_classes` in head matches the length of `CLASSES` in `dataset`.
+
+We set it in [default_runtime.py](https://github.com/open-mmlab/mmdetection/blob/master/configs/_base_/default_runtime.py).
+
+```python
+custom_hooks = [dict(type='NumClassCheckHook')]
+```
+
+### Modify default runtime hooks
+
+There are some common hooks that are not registered through `custom_hooks`, they are
+
+- log_config
+- checkpoint_config
+- evaluation
+- lr_config
+- optimizer_config
+- momentum_config
+
+Among those hooks, only the logger hook has the `VERY_LOW` priority; the others have `NORMAL` priority.
+The above-mentioned tutorials already cover how to modify `optimizer_config`, `momentum_config`, and `lr_config`.
+Here we reveal what we can do with `log_config`, `checkpoint_config`, and `evaluation`.
+
+#### Checkpoint config
+
+The MMCV runner will use `checkpoint_config` to initialize [`CheckpointHook`](https://github.com/open-mmlab/mmcv/blob/9ecd6b0d5ff9d2172c49a182eaa669e9f27bb8e7/mmcv/runner/hooks/checkpoint.py#L9).
+
+```python
+checkpoint_config = dict(interval=1)
+```
+
+The users could set `max_keep_ckpts` to save only a small number of checkpoints or decide whether to store the state dict of the optimizer by `save_optimizer`. More details of the arguments are [here](https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.CheckpointHook).
+
+#### Log config
+
+The `log_config` wraps multiple logger hooks and enables setting intervals. Now MMCV supports `WandbLoggerHook`, `MlflowLoggerHook`, and `TensorboardLoggerHook`.
+The detailed usage can be found in the [doc](https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook).
+
+```python
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+```
+
+#### Evaluation config
+
+The config of `evaluation` will be used to initialize the [`EvalHook`](https://github.com/open-mmlab/mmdetection/blob/7a404a2c000620d52156774a5025070d9e00d918/mmdet/core/evaluation/eval_hooks.py#L8).
+Except for the key `interval`, other arguments such as `metric` will be passed to `dataset.evaluate()`.
+
+```python
+evaluation = dict(interval=1, metric='bbox')
+```
diff --git a/docs/en/tutorials/data_pipeline.md b/docs/en/tutorials/data_pipeline.md
new file mode 100755
index 0000000..57a6db4
--- /dev/null
+++ b/docs/en/tutorials/data_pipeline.md
@@ -0,0 +1,199 @@
+# Tutorial 3: Customize Data Pipelines
+
+## Design of Data pipelines
+
+Following typical conventions, we use `Dataset` and `DataLoader` for data loading
+with multiple workers. `Dataset` returns a dict of data items corresponding to
+the arguments of models' forward method.
+Since the data in object detection may not be the same size (image size, gt bbox size, etc.),
+we introduce a new `DataContainer` type in MMCV to help collect and distribute
+data of different size.
+See [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py) for more details.
+
+The data preparation pipeline and the dataset are decoupled. Usually a dataset
+defines how to process the annotations and a data pipeline defines all the steps to prepare a data dict.
+A pipeline consists of a sequence of operations. Each operation takes a dict as input and outputs a dict for the next transform.
+
+We present a classical pipeline in the following figure. The blue blocks are pipeline operations. With the pipeline going on, each operator can add new keys (marked as green) to the result dict or update the existing keys (marked as orange).
+![pipeline figure](../../../resources/data_pipeline.png)
+
+The operations are categorized into data loading, pre-processing, formatting and test-time augmentation.
+
+Here is a pipeline example for Faster R-CNN.
+
+```python
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+```
+
+For each operation, we list the related dict fields that are added/updated/removed.
+
+### Data loading
+
+`LoadImageFromFile`
+
+- add: img, img_shape, ori_shape
+
+`LoadAnnotations`
+
+- add: gt_bboxes, gt_bboxes_ignore, gt_labels, gt_masks, gt_semantic_seg, bbox_fields, mask_fields
+
+`LoadProposals`
+
+- add: proposals
+
+### Pre-processing
+
+`Resize`
+
+- add: scale, scale_idx, pad_shape, scale_factor, keep_ratio
+- update: img, img_shape, \*bbox_fields, \*mask_fields, \*seg_fields
+
+`RandomFlip`
+
+- add: flip
+- update: img, \*bbox_fields, \*mask_fields, \*seg_fields
+
+`Pad`
+
+- add: pad_fixed_size, pad_size_divisor
+- update: img, pad_shape, \*mask_fields, \*seg_fields
+
+`RandomCrop`
+
+- update: img, pad_shape, gt_bboxes, gt_labels, gt_masks, \*bbox_fields
+
+`Normalize`
+
+- add: img_norm_cfg
+- update: img
+
+`SegRescale`
+
+- update: gt_semantic_seg
+
+`PhotoMetricDistortion`
+
+- update: img
+
+`Expand`
+
+- update: img, gt_bboxes
+
+`MinIoURandomCrop`
+
+- update: img, gt_bboxes, gt_labels
+
+`Corrupt`
+
+- update: img
+
+### Formatting
+
+`ToTensor`
+
+- update: specified by `keys`.
+
+`ImageToTensor`
+
+- update: specified by `keys`.
+
+`Transpose`
+
+- update: specified by `keys`.
+
+`ToDataContainer`
+
+- update: specified by `fields`.
+
+`DefaultFormatBundle`
+
+- update: img, proposals, gt_bboxes, gt_bboxes_ignore, gt_labels, gt_masks, gt_semantic_seg
+
+`Collect`
+
+- add: img_meta (the keys of img_meta are specified by `meta_keys`)
+- remove: all other keys except for those specified by `keys`
+
+### Test time augmentation
+
+`MultiScaleFlipAug`
+
+## Extend and use custom pipelines
+
+1. Write a new pipeline in a file, e.g., in `my_pipeline.py`. It takes a dict as input and returns a dict.
+
+   ```python
+   import random
+   from mmdet.datasets import PIPELINES
+
+
+   @PIPELINES.register_module()
+   class MyTransform:
+       """Add your transform
+
+       Args:
+           p (float): Probability of shifts. Default 0.5.
+       """
+
+       def __init__(self, p=0.5):
+           self.p = p
+
+       def __call__(self, results):
+           if random.random() > self.p:
+               results['dummy'] = True
+           return results
+   ```
+
+2. Import and use the pipeline in your config file.
+   Make sure the import is relative to where your train script is located.
+
+   ```python
+   custom_imports = dict(imports=['path.to.my_pipeline'], allow_failed_imports=False)
+
+   img_norm_cfg = dict(
+       mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+   train_pipeline = [
+       dict(type='LoadImageFromFile'),
+       dict(type='LoadAnnotations', with_bbox=True),
+       dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+       dict(type='RandomFlip', flip_ratio=0.5),
+       dict(type='Normalize', **img_norm_cfg),
+       dict(type='Pad', size_divisor=32),
+       dict(type='MyTransform', p=0.2),
+       dict(type='DefaultFormatBundle'),
+       dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+   ]
+   ```
+
+3. Visualize the output of your augmentation pipeline
+
+   To visualize the output of your augmentation pipeline, `tools/misc/browse_dataset.py`
+   can help the user to browse a detection dataset (both images and bounding box annotations)
+   visually, or save the image to a designated directory. For more details, refer to
+   [useful_tools](../useful_tools.md).
diff --git a/docs/en/tutorials/finetune.md b/docs/en/tutorials/finetune.md
new file mode 100755
index 0000000..afa5021
--- /dev/null
+++ b/docs/en/tutorials/finetune.md
@@ -0,0 +1,89 @@
+# Tutorial 7: Finetuning Models
+
+Detectors pre-trained on the COCO dataset can serve as a good pre-trained model for other datasets, e.g., CityScapes and KITTI Dataset.
+This tutorial provides instruction for users to use the models provided in the [Model Zoo](../model_zoo.md) for other datasets to obtain better performance.
+
+There are two steps to finetune a model on a new dataset.
+
+- Add support for the new dataset following [Tutorial 2: Customize Datasets](customize_dataset.md).
+- Modify the configs as will be discussed in this tutorial.
+
+Taking the finetuning process on the Cityscapes Dataset as an example, users need to modify five parts in the config.
+
+## Inherit base configs
+
+To reduce the burden and the chance of bugs when writing whole configs, MMDetection V2.0 supports inheriting configs from multiple existing configs. To finetune a Mask RCNN model, the new config needs to inherit
+`_base_/models/mask_rcnn_r50_fpn.py` to build the basic structure of the model. To use the Cityscapes Dataset, the new config can also simply inherit `_base_/datasets/cityscapes_instance.py`. For runtime settings such as training schedules, the new config needs to inherit `_base_/default_runtime.py`. These configs are in the `configs` directory, and users can also choose to write the whole contents rather than use inheritance.
+
+```python
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/cityscapes_instance.py', '../_base_/default_runtime.py'
+]
+```
+
+## Modify head
+
+Then the new config needs to modify the head according to the class numbers of the new datasets. By only changing `num_classes` in the roi_head, the weights of the pre-trained models are mostly reused except the final prediction head.
+
+```python
+model = dict(
+    pretrained=None,
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=8,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=8,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))))
+```
+
+## Modify dataset
+
+Users may also need to prepare the dataset and write the dataset-related configs. MMDetection V2.0 already supports the VOC, WIDER FACE, COCO and Cityscapes datasets.
+
+## Modify training schedule
+
+The finetuning hyperparameters vary from the default schedule. Finetuning usually requires a smaller learning rate and fewer training epochs.
+
+```python
+# optimizer
+# lr is set for a batch size of 8
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[7])
+# the max_epochs and step in lr_config need specifically tuned for the customized dataset
+runner = dict(max_epochs=8)
+log_config = dict(interval=100)
+```
+
+## Use pre-trained model
+
+To use the pre-trained model, the new config adds the link to the pre-trained model in `load_from`. Users might need to download the model weights before training to avoid the download time during training.
+
+```python
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth'  # noqa
+
+```
diff --git a/docs/en/tutorials/how_to.md b/docs/en/tutorials/how_to.md
new file mode 100755
index 0000000..c5184dc
--- /dev/null
+++ b/docs/en/tutorials/how_to.md
@@ -0,0 +1,204 @@
+# Tutorial 11: How to xxx
+
+This tutorial collects answers to any `How to xxx with MMDetection`. Feel free to update this doc if you meet new questions about `How to` and find the answers!
+
+## Use backbone network through MMClassification
+
+The model registry in MMDet, MMCls, MMSeg all inherit from the root registry in MMCV. This allows these repositories to directly use the modules already implemented by each other. Therefore, users can use backbone networks from MMClassification in MMDetection without implementing a network that already exists in MMClassification.
+
+### Use backbone network implemented in MMClassification
+
+Suppose you want to use `MobileNetV3-small` as the backbone network of `RetinaNet`; the example config is as follows.
+
+```python
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# please install mmcls>=0.20.0
+# import mmcls.models to trigger register_module in mmcls
+custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False)
+pretrained = 'https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth'
+model = dict(
+    backbone=dict(
+        _delete_=True, # Delete the backbone field in _base_
+        type='mmcls.MobileNetV3', # Using MobileNetV3 from mmcls
+        arch='small',
+        out_indices=(3, 8, 11), # Modify out_indices
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint=pretrained,
+            prefix='backbone.')), # The pre-trained weights of backbone network in MMCls have prefix='backbone.'. The prefix in the keys will be removed so that these weights can be normally loaded.
+    # Modify in_channels
+    neck=dict(in_channels=[24, 48, 96], start_level=0))
+```
+
+### Use backbone network in TIMM through MMClassification
+
+MMClassification also provides a wrapper for the PyTorch Image Models (timm) backbone network, so users can directly use the backbone network in timm through MMClassification. Suppose you want to use EfficientNet-B1 as the backbone network of RetinaNet; the example config is as follows.
+
+```python
+# https://github.com/open-mmlab/mmdetection/blob/master/configs/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py
+
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# please install mmcls>=0.20.0
+# import mmcls.models to trigger register_module in mmcls
+custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False)
+model = dict(
+    backbone=dict(
+        _delete_=True, # Delete the backbone field in _base_
+        type='mmcls.TIMMBackbone', # Using timm from mmcls
+        model_name='efficientnet_b1',
+        features_only=True,
+        pretrained=True,
+        out_indices=(1, 2, 3, 4)), # Modify out_indices
+    neck=dict(in_channels=[24, 40, 112, 320])) # Modify in_channels
+
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+```
+
+`type='mmcls.TIMMBackbone'` means use the `TIMMBackbone` class from MMClassification in MMDetection, and the model used is `EfficientNet-B1`, where `mmcls` means the MMClassification repo and `TIMMBackbone` means the TIMMBackbone wrapper implemented in MMClassification.
+
+For the principle of the Hierarchy Registry, please refer to the [MMCV document](https://github.com/open-mmlab/mmcv/blob/master/docs/en/understand_mmcv/registry.md#hierarchy-registry). For how to use other backbones in MMClassification, you can refer to the [MMClassification document](https://github.com/open-mmlab/mmclassification/blob/master/docs/en/tutorials/config.md).
+
+## Use Mosaic augmentation
+
+If you want to use `Mosaic` in training, please make sure that you use `MultiImageMixDataset` at the same time. Taking the 'Faster R-CNN' algorithm as an example, you should modify the values of `train_pipeline` and `train_dataset` in the config as below:
+
+```python
+# Open configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py directly and add the following fields
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+img_scale=(1333, 800)
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+train_pipeline = [
+    dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
+    dict(
+        type='RandomAffine',
+        scaling_ratio_range=(0.1, 2),
+        border=(-img_scale[0] // 2, -img_scale[1] // 2)), # The image will be enlarged by 4 times after Mosaic processing,so we use affine transformation to restore the image size.
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+
+train_dataset = dict(
+    _delete_ = True, # remove unnecessary Settings
+    type='MultiImageMixDataset',
+    dataset=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=[
+            dict(type='LoadImageFromFile'),
+            dict(type='LoadAnnotations', with_bbox=True)
+        ],
+        filter_empty_gt=False,
+    ),
+    pipeline=train_pipeline
+    )
+
+data = dict(
+    train=train_dataset
+    )
+```
+
+## Unfreeze backbone network after freezing the backbone in the config
+
+If you have frozen the backbone network in the config and want to unfreeze it after some epochs, you can write a hook to do it. Taking Faster R-CNN with the ResNet backbone as an example, you can freeze one stage of the backbone network and add a `custom_hooks` entry in the config as below:
+
+```python
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    # freeze one stage of the backbone network.
+    backbone=dict(frozen_stages=1),
+)
+custom_hooks = [dict(type="UnfreezeBackboneEpochBasedHook", unfreeze_epoch=1)]
+```
+
+Meanwhile write the hook class `UnfreezeBackboneEpochBasedHook` in `mmdet/core/hook/unfreeze_backbone_epoch_based_hook.py`
+
+```python
+from mmcv.parallel import is_module_wrapper
+from mmcv.runner.hooks import HOOKS, Hook
+
+
+@HOOKS.register_module()
+class UnfreezeBackboneEpochBasedHook(Hook):
+    """Unfreeze backbone network Hook.
+
+    Args:
+        unfreeze_epoch (int): The epoch unfreezing the backbone network.
+    """
+
+    def __init__(self, unfreeze_epoch=1):
+        self.unfreeze_epoch = unfreeze_epoch
+
+    def before_train_epoch(self, runner):
+        # Unfreeze the backbone network.
+        # Only valid for resnet.
+        if runner.epoch == self.unfreeze_epoch:
+            model = runner.model
+            if is_module_wrapper(model):
+                model = model.module
+            backbone = model.backbone
+            if backbone.frozen_stages >= 0:
+                if backbone.deep_stem:
+                    backbone.stem.train()
+                    for param in backbone.stem.parameters():
+                        param.requires_grad = True
+                else:
+                    backbone.norm1.train()
+                    for m in [backbone.conv1, backbone.norm1]:
+                        for param in m.parameters():
+                            param.requires_grad = True
+
+            for i in range(1, backbone.frozen_stages + 1):
+                m = getattr(backbone, f'layer{i}')
+                m.train()
+                for param in m.parameters():
+                    param.requires_grad = True
+```
+
+## Get the channels of a new backbone
+
+If you want to get the channels of a new backbone, you can build this backbone alone and input a pseudo image to get each stage output.
+
+Take `ResNet` as an example:
+
+```python
+from mmdet.models import ResNet
+import torch
+self = ResNet(depth=18)
+self.eval()
+inputs = torch.rand(1, 3, 32, 32)
+level_outputs = self.forward(inputs)
+for level_out in level_outputs:
+    print(tuple(level_out.shape))
+
+```
+
+Output of the above script is as below:
+
+```python
+(1, 64, 8, 8)
+(1, 128, 4, 4)
+(1, 256, 2, 2)
+(1, 512, 1, 1)
+```
+
+Users can get the channels of the new backbone by replacing the `ResNet(depth=18)` in this script with their customized backbone.
diff --git a/docs/en/tutorials/index.rst b/docs/en/tutorials/index.rst
new file mode 100755
index 0000000..5513611
--- /dev/null
+++ b/docs/en/tutorials/index.rst
@@ -0,0 +1,17 @@
+.. toctree::
+   :maxdepth: 2
+
+   config.md
+   customize_dataset.md
+   data_pipeline.md
+   customize_models.md
+   customize_runtime.md
+   customize_losses.md
+   finetune.md
+   robustness_benchmarking.md
+   pytorch2onnx.md
+   onnx2tensorrt.md
+   init_cfg.md
+   how_to.md
+   test_results_submission.md
+   useful_hooks.md
diff --git a/docs/en/tutorials/init_cfg.md b/docs/en/tutorials/init_cfg.md
new file mode 100755
index 0000000..b46b494
--- /dev/null
+++ b/docs/en/tutorials/init_cfg.md
@@ -0,0 +1,161 @@
+# Tutorial 10: Weight initialization
+
+During training, a proper initialization strategy is beneficial for speeding up training or obtaining higher performance. [MMCV](https://github.com/open-mmlab/mmcv/blob/master/mmcv/cnn/utils/weight_init.py) provides some commonly used methods for initializing modules like `nn.Conv2d`. Model initialization in MMDetection mainly uses `init_cfg`. Users can initialize models with the following two steps:
+
+1. Define `init_cfg` for a model or its components in `model_cfg`. The `init_cfg` of child components has higher priority and will override the `init_cfg` of parent modules.
+2. Build the model as usual, but call the `model.init_weights()` method explicitly; the model parameters will then be initialized as configured.
+
+The high-level workflow of initialization in MMDetection is:
+
+model_cfg(init_cfg) -> build_from_cfg -> model -> init_weights() -> initialize(self, self.init_cfg) -> children's init_weights()
+
+### Description
+
+`init_cfg` is a dict or list\[dict\], and contains the following keys and values:
+
+- `type` (str), containing the initializer name in `INITIALIZERS`, and followed by arguments of the initializer.
+- `layer` (str or list\[str\]), containing the names of basic layers in PyTorch or MMCV with learnable parameters that will be initialized, e.g. `'Conv2d'`, `'DeformConv2d'`.
+- `override` (dict or list\[dict\]), containing the sub-modules that do not inherit from `BaseModule` and whose initialization configuration differs from that of the other layers in the `'layer'` key. The initializer defined in `type` applies to all layers defined in `layer`, so if sub-modules are not derived classes of `BaseModule` but can be initialized in the same way as the layers in `layer`, there is no need to use `override`. `override` contains:
+  - `type` followed by arguments of initializer;
+  - `name` to indicate sub-module which will be initialized.
+
+### Initialize parameters
+
+Inherit a new model from `mmcv.runner.BaseModule` or `mmdet.models`. Here we show an example of FooModel.
+
+```python
+import torch.nn as nn
+from mmcv.runner import BaseModule
+
+class FooModel(BaseModule)
+	def __init__(self,
+                 arg1,
+                 arg2,
+                 init_cfg=None):
+    	super(FooModel, self).__init__(init_cfg)
+		...
+```
+
+- Initialize model by using `init_cfg` directly in code
+
+  ```python
+  import torch.nn as nn
+  from mmcv.runner import BaseModule
+  # or directly inherit mmdet models
+
+  class FooModel(BaseModule)
+  	def __init__(self,
+                  arg1,
+                  arg2,
+                  init_cfg=XXX):
+  		super(FooModel, self).__init__(init_cfg)
+  	    ...
+  ```
+
+- Initialize model by using `init_cfg` directly in `mmcv.Sequential` or `mmcv.ModuleList` code
+
+  ```python
+  from mmcv.runner import BaseModule, ModuleList
+
+  class FooModel(BaseModule)
+  	def __init__(self,
+              	arg1,
+              	arg2,
+              	init_cfg=None):
+  		super(FooModel, self).__init__(init_cfg)
+      	...
+      	self.conv1 = ModuleList(init_cfg=XXX)
+  ```
+
+- Initialize model by using `init_cfg` in config file
+
+  ```python
+  model = dict(
+  	...
+  	model = dict(
+      	type='FooModel',
+      	arg1=XXX,
+      	arg2=XXX,
+      	init_cfg=XXX),
+          ...
+  ```
+
+### Usage of init_cfg
+
+1. Initialize model by `layer` key
+
+   If we only define `layer`, only the layers listed in the `layer` key are initialized.
+
+   NOTE: The value of the `layer` key is the class name of a PyTorch layer that has `weight` and `bias` attributes (so, for example, the `MultiheadAttention` layer is not supported).
+
+- Define `layer` key for initializing module with same configuration.
+
+  ```python
+  init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d', 'Linear'], val=1)
+  # initialize whole module with same configuration
+  ```
+
+- Define `layer` key for initializing layer with different configurations.
+
+```python
+init_cfg = [dict(type='Constant', layer='Conv1d', val=1),
+            dict(type='Constant', layer='Conv2d', val=2),
+            dict(type='Constant', layer='Linear', val=3)]
+# nn.Conv1d will be initialized with dict(type='Constant', val=1)
+# nn.Conv2d will be initialized with dict(type='Constant', val=2)
+# nn.Linear will be initialized with dict(type='Constant', val=3)
+```
+
+2. Initialize model by `override` key
+
+- When initializing some specific part by its attribute name, we can use the `override` key, and the value in `override` takes precedence over the value in `init_cfg`.
+
+  ```python
+  # layers：
+  # self.feat = nn.Conv1d(3, 1, 3)
+  # self.reg = nn.Conv2d(3, 3, 3)
+  # self.cls = nn.Linear(1,2)
+
+  init_cfg = dict(type='Constant',
+                  layer=['Conv1d','Conv2d'], val=1, bias=2,
+                  override=dict(type='Constant', name='reg', val=3, bias=4))
+  # self.feat and self.cls will be initialized with 	dict(type='Constant', val=1, bias=2)
+  # The module called 'reg' will be initialized with dict(type='Constant', val=3, bias=4)
+  ```
+
+- If `layer` is None in `init_cfg`, only the sub-module with the name given in `override` will be initialized, and `type` and the other args in `override` can be omitted.
+
+  ```python
+  # layers：
+  # self.feat = nn.Conv1d(3, 1, 3)
+  # self.reg = nn.Conv2d(3, 3, 3)
+  # self.cls = nn.Linear(1,2)
+
+  init_cfg = dict(type='Constant', val=1, bias=2, 	override=dict(name='reg'))
+
+  # self.feat and self.cls will be initialized by Pytorch
+  # The module called 'reg' will be initialized with dict(type='Constant', val=1, bias=2)
+  ```
+
+- If neither the `layer` key nor the `override` key is defined, nothing will be initialized.
+
+- Invalid usage
+
+  ```python
+  # It is invalid that override don't have name key
+  init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2,
+              	override=dict(type='Constant', val=3, bias=4))
+
+  # It is also invalid that override has name and other args except type
+  init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2,
+                  override=dict(name='reg', val=3, bias=4))
+  ```
+
+3. Initialize model with the pretrained model
+
+   ```python
+   init_cfg = dict(type='Pretrained',
+               checkpoint='torchvision://resnet50')
+   ```
+
+For more details, refer to the documentation in [MMCV](https://mmcv.readthedocs.io/en/latest/cnn.html#weight-initialization) and MMCV [PR #780](https://github.com/open-mmlab/mmcv/pull/780).
diff --git a/docs/en/tutorials/onnx2tensorrt.md b/docs/en/tutorials/onnx2tensorrt.md
new file mode 100755
index 0000000..3848bb7
--- /dev/null
+++ b/docs/en/tutorials/onnx2tensorrt.md
@@ -0,0 +1,106 @@
+# Tutorial 9: ONNX to TensorRT (Experimental)
+
+> ## [Try the new MMDeploy to deploy your model](https://mmdeploy.readthedocs.io/)
+
+<!-- TOC -->
+
+- [Tutorial 9: ONNX to TensorRT (Experimental)](#tutorial-9-onnx-to-tensorrt-experimental)
+  - [How to convert models from ONNX to TensorRT](#how-to-convert-models-from-onnx-to-tensorrt)
+    - [Prerequisite](#prerequisite)
+    - [Usage](#usage)
+  - [How to evaluate the exported models](#how-to-evaluate-the-exported-models)
+  - [List of supported models convertible to TensorRT](#list-of-supported-models-convertible-to-tensorrt)
+  - [Reminders](#reminders)
+  - [FAQs](#faqs)
+
+<!-- TOC -->
+
+## How to convert models from ONNX to TensorRT
+
+### Prerequisite
+
+1. Please refer to [get_started.md](https://mmdetection.readthedocs.io/en/latest/get_started.html) for installation of MMCV and MMDetection from source.
+2. Please refer to [ONNXRuntime in mmcv](https://mmcv.readthedocs.io/en/latest/deployment/onnxruntime_op.html) and [TensorRT plugin in mmcv](https://github.com/open-mmlab/mmcv/blob/master/docs/en/deployment/tensorrt_plugin.md/) to install `mmcv-full` with ONNXRuntime custom ops and TensorRT plugins.
+3. Use our tool [pytorch2onnx](https://mmdetection.readthedocs.io/en/latest/tutorials/pytorch2onnx.html) to convert the model from PyTorch to ONNX.
+
+### Usage
+
+```bash
+python tools/deployment/onnx2tensorrt.py \
+    ${CONFIG} \
+    ${MODEL} \
+    --trt-file ${TRT_FILE} \
+    --input-img ${INPUT_IMAGE_PATH} \
+    --shape ${INPUT_IMAGE_SHAPE} \
+    --min-shape ${MIN_IMAGE_SHAPE} \
+    --max-shape ${MAX_IMAGE_SHAPE} \
+    --workspace-size ${WORKSPACE_SIZE} \
+    --show \
+    --verify \
+```
+
+Description of all arguments:
+
+- `config` : The path of a model config file.
+- `model` : The path of an ONNX model file.
+- `--trt-file`: The path of the output TensorRT engine file. If not specified, it will be set to `tmp.trt`.
+- `--input-img` : The path of an input image for tracing and conversion. By default, it will be set to `demo/demo.jpg`.
+- `--shape`: The height and width of model input. If not specified, it will be set to `400 600`.
+- `--min-shape`: The minimum height and width of model input. If not specified, it will be set to the same as `--shape`.
+- `--max-shape`: The maximum height and width of model input. If not specified, it will be set to the same as `--shape`.
+- `--workspace-size` : The required GPU workspace size in GiB to build TensorRT engine. If not specified, it will be set to `1` GiB.
+- `--show`: Determines whether to show the outputs of the model. If not specified, it will be set to `False`.
+- `--verify`: Determines whether to verify the correctness of models between ONNXRuntime and TensorRT. If not specified, it will be set to `False`.
+- `--verbose`: Determines whether to print logging messages. It's useful for debugging. If not specified, it will be set to `False`.
+
+Example:
+
+```bash
+python tools/deployment/onnx2tensorrt.py \
+    configs/retinanet/retinanet_r50_fpn_1x_coco.py \
+    checkpoints/retinanet_r50_fpn_1x_coco.onnx \
+    --trt-file checkpoints/retinanet_r50_fpn_1x_coco.trt \
+    --input-img demo/demo.jpg \
+    --shape 400 600 \
+    --show \
+    --verify \
+```
+
+## How to evaluate the exported models
+
+We prepare a tool `tools/deployment/test.py` to evaluate TensorRT models.
+
+Please refer to the following links for more information.
+
+- [how-to-evaluate-the-exported-models](pytorch2onnx.md#how-to-evaluate-the-exported-models)
+- [results-and-models](pytorch2onnx.md#results-and-models)
+
+## List of supported models convertible to TensorRT
+
+The table below lists the models that are guaranteed to be convertible to TensorRT.
+
+|       Model        |                              Config                              | Dynamic Shape | Batch Inference | Note |
+| :----------------: | :--------------------------------------------------------------: | :-----------: | :-------------: | :--: |
+|        SSD         |                   `configs/ssd/ssd300_coco.py`                   |       Y       |        Y        |      |
+|        FSAF        |              `configs/fsaf/fsaf_r50_fpn_1x_coco.py`              |       Y       |        Y        |      |
+|        FCOS        |         `configs/fcos/fcos_r50_caffe_fpn_4x4_1x_coco.py`         |       Y       |        Y        |      |
+|       YOLOv3       |        `configs/yolo/yolov3_d53_mstrain-608_273e_coco.py`        |       Y       |        Y        |      |
+|     RetinaNet      |         `configs/retinanet/retinanet_r50_fpn_1x_coco.py`         |       Y       |        Y        |      |
+|    Faster R-CNN    |       `configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py`       |       Y       |        Y        |      |
+|   Cascade R-CNN    |      `configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py`      |       Y       |        Y        |      |
+|     Mask R-CNN     |         `configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py`         |       Y       |        Y        |      |
+| Cascade Mask R-CNN |   `configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py`    |       Y       |        Y        |      |
+|     PointRend      | `configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py` |       Y       |        Y        |      |
+
+Notes:
+
+- *All models above are tested with Pytorch==1.6.0, onnx==1.7.0 and TensorRT-7.2.1.6.Ubuntu-16.04.x86_64-gnu.cuda-10.2.cudnn8.0*
+
+## Reminders
+
+- If you meet any problem with the listed models above, please create an issue and it would be taken care of soon. For models not included in the list, we may not provide much help here due to the limited resources. Please try to dig a little deeper and debug by yourself.
+- Because this feature is experimental and may change fast, please always try with the latest `mmcv` and `mmdetection`.
+
+## FAQs
+
+- None
diff --git a/docs/en/tutorials/pytorch2onnx.md b/docs/en/tutorials/pytorch2onnx.md
new file mode 100755
index 0000000..3561178
--- /dev/null
+++ b/docs/en/tutorials/pytorch2onnx.md
@@ -0,0 +1,334 @@
+# Tutorial 8: Pytorch to ONNX (Experimental)
+
+> ## [Try the new MMDeploy to deploy your model](https://mmdeploy.readthedocs.io/)
+
+<!-- TOC -->
+
+- [Tutorial 8: Pytorch to ONNX (Experimental)](#tutorial-8-pytorch-to-onnx-experimental)
+  - [How to convert models from Pytorch to ONNX](#how-to-convert-models-from-pytorch-to-onnx)
+    - [Prerequisite](#prerequisite)
+    - [Usage](#usage)
+    - [Description of all arguments](#description-of-all-arguments)
+  - [How to evaluate the exported models](#how-to-evaluate-the-exported-models)
+    - [Prerequisite](#prerequisite-1)
+    - [Usage](#usage-1)
+    - [Description of all arguments](#description-of-all-arguments-1)
+    - [Results and Models](#results-and-models)
+  - [List of supported models exportable to ONNX](#list-of-supported-models-exportable-to-onnx)
+  - [The Parameters of Non-Maximum Suppression in ONNX Export](#the-parameters-of-non-maximum-suppression-in-onnx-export)
+  - [Reminders](#reminders)
+  - [FAQs](#faqs)
+
+<!-- TOC -->
+
+## How to convert models from Pytorch to ONNX
+
+### Prerequisite
+
+1. Install the prerequisites following [get_started.md/Prepare environment](../get_started.md).
+2. Build custom operators for ONNX Runtime and install MMCV manually following [How to build custom operators for ONNX Runtime](https://github.com/open-mmlab/mmcv/blob/master/docs/en/deployment/onnxruntime_op.md/#how-to-build-custom-operators-for-onnx-runtime)
+3. Install MMdetection manually following steps 2-3 in [get_started.md/Install MMdetection](../get_started.md).
+
+### Usage
+
+```bash
+python tools/deployment/pytorch2onnx.py \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    --output-file ${OUTPUT_FILE} \
+    --input-img ${INPUT_IMAGE_PATH} \
+    --shape ${IMAGE_SHAPE} \
+    --test-img ${TEST_IMAGE_PATH} \
+    --opset-version ${OPSET_VERSION} \
+    --cfg-options ${CFG_OPTIONS} \
+    --dynamic-export \
+    --show \
+    --verify \
+    --simplify
+```
+
+### Description of all arguments
+
+- `config` : The path of a model config file.
+- `checkpoint` : The path of a model checkpoint file.
+- `--output-file`: The path of output ONNX model. If not specified, it will be set to `tmp.onnx`.
+- `--input-img`: The path of an input image for tracing and conversion. By default, it will be set to `tests/data/color.jpg`.
+- `--shape`: The height and width of input tensor to the model. If not specified, it will be set to `800 1216`.
+- `--test-img` : The path of an image to verify the exported ONNX model. By default, it will be set to `None`, meaning it will use `--input-img` for verification.
+- `--opset-version` : The opset version of ONNX. If not specified, it will be set to `11`.
+- `--dynamic-export`: Determines whether to export ONNX model with dynamic input and output shapes. If not specified, it will be set to `False`.
+- `--show`: Determines whether to print the architecture of the exported model and whether to show detection outputs when `--verify` is set to `True`. If not specified, it will be set to `False`.
+- `--verify`: Determines whether to verify the correctness of an exported model. If not specified, it will be set to `False`.
+- `--simplify`: Determines whether to simplify the exported ONNX model. If not specified, it will be set to `False`.
+- `--cfg-options`: Override some settings in the used config file, the key-value pair in `xxx=yyy` format will be merged into config file.
+- `--skip-postprocess`: Determines whether to export the model without post-processing. If not specified, it will be set to `False`. Notice: This is an experimental option. It only works for some single-stage models. Users need to implement the post-processing by themselves. We do not guarantee the correctness of the exported model.
+
+Example:
+
+```bash
+python tools/deployment/pytorch2onnx.py \
+    configs/yolo/yolov3_d53_mstrain-608_273e_coco.py \
+    checkpoints/yolo/yolov3_d53_mstrain-608_273e_coco.pth \
+    --output-file checkpoints/yolo/yolov3_d53_mstrain-608_273e_coco.onnx \
+    --input-img demo/demo.jpg \
+    --test-img tests/data/color.jpg \
+    --shape 608 608 \
+    --show \
+    --verify \
+    --dynamic-export \
+    --cfg-options \
+      model.test_cfg.deploy_nms_pre=-1
+```
+
+## How to evaluate the exported models
+
+We prepare a tool `tools/deployment/test.py` to evaluate ONNX models with ONNXRuntime and TensorRT.
+
+### Prerequisite
+
+- Install onnx and onnxruntime (CPU version)
+
+  ```shell
+  pip install onnx onnxruntime==1.5.1
+  ```
+
+- If you want to run the model on GPU, please remove the CPU version before using the GPU version.
+
+  ```shell
+  pip uninstall onnxruntime
+  pip install onnxruntime-gpu
+  ```
+
+  Note: onnxruntime-gpu is version-dependent on CUDA and CUDNN, please ensure that your
+  environment meets the requirements.
+
+- Build custom operators for ONNX Runtime following [How to build custom operators for ONNX Runtime](https://github.com/open-mmlab/mmcv/blob/master/docs/en/deployment/onnxruntime_op.md/#how-to-build-custom-operators-for-onnx-runtime)
+
+- Install TensorRT by referring to [How to build TensorRT plugins in MMCV](https://mmcv.readthedocs.io/en/latest/deployment/tensorrt_plugin.html#how-to-build-tensorrt-plugins-in-mmcv) (optional)
+
+### Usage
+
+```bash
+python tools/deployment/test.py \
+    ${CONFIG_FILE} \
+    ${MODEL_FILE} \
+    --out ${OUTPUT_FILE} \
+    --backend ${BACKEND} \
+    --format-only ${FORMAT_ONLY} \
+    --eval ${EVALUATION_METRICS} \
+    --show-dir ${SHOW_DIRECTORY} \
+    --show-score-thr ${SHOW_SCORE_THRESHOLD} \
+    --cfg-options ${CFG_OPTIONS} \
+    --eval-options ${EVALUATION_OPTIONS}
+```
+
+### Description of all arguments
+
+- `config`: The path of a model config file.
+- `model`: The path of an input model file.
+- `--out`: The path of output result file in pickle format.
+- `--backend`: Backend for input model to run and should be `onnxruntime` or `tensorrt`.
+- `--format-only` : Format the output results without performing evaluation. It is useful when you want to format the result to a specific format and submit it to the test server. If not specified, it will be set to `False`.
+- `--eval`: Evaluation metrics, which depends on the dataset, e.g., "bbox", "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC.
+- `--show-dir`: Directory where painted images will be saved
+- `--show-score-thr`: Score threshold. Default is set to `0.3`.
+- `--cfg-options`: Override some settings in the used config file, the key-value pair in `xxx=yyy` format will be merged into config file.
+- `--eval-options`: Custom options for evaluation, the key-value pair in `xxx=yyy` format will be kwargs for `dataset.evaluate()` function
+
+Notes:
+
+- If the deployed backend platform is TensorRT, please add environment variables before running the file:
+
+  ```bash
+  export ONNX_BACKEND=MMCVTensorRT
+  ```
+
+- If you want to use the `--dynamic-export` parameter in the TensorRT backend to export ONNX, please remove the `--simplify` parameter, and vice versa.
+
+### Results and Models
+
+<table border="1" class="docutils">
+	<tr>
+	    <th align="center">Model</th>
+	    <th align="center">Config</th>
+	    <th align="center">Metric</th>
+	    <th align="center">PyTorch</th>
+	    <th align="center">ONNX Runtime</th>
+	    <th align="center">TensorRT</th>
+	</tr >
+  <tr >
+	    <td align="center">FCOS</td>
+	    <td align="center"><code>configs/fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py</code></td>
+	    <td align="center">Box AP</td>
+	    <td align="center">36.6</td>
+	    <td align="center">36.5</td>
+	    <td align="center">36.3</td>
+	</tr>
+  <tr >
+	    <td align="center">FSAF</td>
+	    <td align="center"><code>configs/fsaf/fsaf_r50_fpn_1x_coco.py</code></td>
+	    <td align="center">Box AP</td>
+	    <td align="center">36.0</td>
+	    <td align="center">36.0</td>
+	    <td align="center">35.9</td>
+	</tr>
+  <tr >
+	    <td align="center">RetinaNet</td>
+	    <td align="center"><code>configs/retinanet/retinanet_r50_fpn_1x_coco.py</code></td>
+	    <td align="center">Box AP</td>
+	    <td align="center">36.5</td>
+	    <td align="center">36.4</td>
+	    <td align="center">36.3</td>
+	</tr>
+	<tr >
+	    <td align="center">SSD</td>
+	    <td align="center"><code>configs/ssd/ssd300_coco.py</code></td>
+	    <td align="center">Box AP</td>
+	    <td align="center">25.6</td>
+	    <td align="center">25.6</td>
+	    <td align="center">25.6</td>
+	</tr>
+  <tr >
+	    <td align="center">YOLOv3</td>
+	    <td align="center"><code>configs/yolo/yolov3_d53_mstrain-608_273e_coco.py</code></td>
+	    <td align="center">Box AP</td>
+	    <td align="center">33.5</td>
+	    <td align="center">33.5</td>
+	    <td align="center">33.5</td>
+	</tr>
+  <tr >
+	    <td align="center">Faster R-CNN</td>
+	    <td align="center"><code>configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py</code></td>
+	    <td align="center">Box AP</td>
+	    <td align="center">37.4</td>
+	    <td align="center">37.4</td>
+	    <td align="center">37.0</td>
+	</tr>
+  <tr >
+	    <td align="center">Cascade R-CNN</td>
+	    <td align="center"><code>configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py</code></td>
+	    <td align="center">Box AP</td>
+	    <td align="center">40.3</td>
+	    <td align="center">40.3</td>
+	    <td align="center">40.1</td>
+	</tr>
+
+<tr >
+	    <td align="center" rowspan="2">Mask R-CNN</td>
+	    <td align="center" rowspan="2"><code>configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py</code></td>
+	    <td align="center">Box AP</td>
+	    <td align="center">38.2</td>
+	    <td align="center">38.1</td>
+	    <td align="center">37.7</td>
+	</tr>
+	<tr>
+	    <td align="center">Mask AP</td>
+	    <td align="center">34.7</td>
+	    <td align="center">33.7</td>
+	    <td align="center">33.3</td>
+	</tr>
+  <tr >
+	    <td align="center" rowspan="2">Cascade Mask R-CNN</td>
+	    <td align="center" rowspan="2"><code>configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py</code></td>
+	    <td align="center">Box AP</td>
+	    <td align="center">41.2</td>
+	    <td align="center">41.2</td>
+	    <td align="center">40.9</td>
+	</tr>
+	<tr>
+	    <td align="center">Mask AP</td>
+	    <td align="center">35.9</td>
+	    <td align="center">34.8</td>
+	    <td align="center">34.5</td>
+	</tr>
+
+<tr >
+	    <td align="center">CornerNet</td>
+	    <td align="center"><code>configs/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco.py</code></td>
+	    <td align="center">Box AP</td>
+	    <td align="center">40.6</td>
+	    <td align="center">40.4</td>
+		<td align="center">-</td>
+	</tr>
+  <tr >
+	    <td align="center">DETR</td>
+	    <td align="center"><code>configs/detr/detr_r50_8x2_150e_coco.py</code></td>
+	    <td align="center">Box AP</td>
+	    <td align="center">40.1</td>
+	    <td align="center">40.1</td>
+		<td align="center">-</td>
+  </tr>
+  <tr >
+	    <td align="center" rowspan="2">PointRend</td>
+	    <td align="center" rowspan="2"><code>configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py</code></td>
+	    <td align="center">Box AP</td>
+	    <td align="center">38.4</td>
+	    <td align="center">38.4</td>
+	    <td align="center">-</td>
+  </tr>
+  <tr>
+	    <td align="center">Mask AP</td>
+	    <td align="center">36.3</td>
+	    <td align="center">35.2</td>
+	    <td align="center">-</td>
+  </tr>
+</table>
+
+Notes:
+
+- All ONNX models are evaluated with dynamic shape on coco dataset and images are preprocessed according to the original config file. Note that CornerNet is evaluated without test-time flip, since currently only single-scale evaluation is supported with ONNX Runtime.
+
+- Mask AP of Mask R-CNN drops by 1% for ONNXRuntime. The main reason is that the predicted masks are directly interpolated to the original image in PyTorch, while they are at first interpolated to the preprocessed input image of the model and then to the original image in other backends.
+
+## List of supported models exportable to ONNX
+
+The table below lists the models that are guaranteed to be exportable to ONNX and runnable in ONNX Runtime.
+
+|       Model        |                               Config                                | Dynamic Shape | Batch Inference |                                     Note                                      |
+| :----------------: | :-----------------------------------------------------------------: | :-----------: | :-------------: | :---------------------------------------------------------------------------: |
+|        FCOS        |      `configs/fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py`       |       Y       |        Y        |                                                                               |
+|        FSAF        |               `configs/fsaf/fsaf_r50_fpn_1x_coco.py`                |       Y       |        Y        |                                                                               |
+|     RetinaNet      |          `configs/retinanet/retinanet_r50_fpn_1x_coco.py`           |       Y       |        Y        |                                                                               |
+|        SSD         |                    `configs/ssd/ssd300_coco.py`                     |       Y       |        Y        |                                                                               |
+|       YOLOv3       |         `configs/yolo/yolov3_d53_mstrain-608_273e_coco.py`          |       Y       |        Y        |                                                                               |
+|    Faster R-CNN    |        `configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py`         |       Y       |        Y        |                                                                               |
+|   Cascade R-CNN    |       `configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py`        |       Y       |        Y        |                                                                               |
+|     Mask R-CNN     |          `configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py`           |       Y       |        Y        |                                                                               |
+| Cascade Mask R-CNN |     `configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py`     |       Y       |        Y        |                                                                               |
+|     CornerNet      | `configs/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco.py` |       Y       |        N        | no flip, no batch inference, tested with torch==1.7.0 and onnxruntime==1.5.1. |
+|        DETR        |              `configs/detr/detr_r50_8x2_150e_coco.py`               |       Y       |        Y        |                     batch inference is *not recommended*                      |
+|     PointRend      |  `configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py`   |       Y       |        Y        |                                                                               |
+
+Notes:
+
+- Minimum required version of MMCV is `1.3.5`
+
+- *All models above are tested with Pytorch==1.6.0 and onnxruntime==1.5.1*, except for CornerNet. For more details about the
+  torch version when exporting CornerNet to ONNX, which involves `mmcv::cummax`, please refer to the [Known Issues](https://github.com/open-mmlab/mmcv/blob/master/docs/en/deployment/onnxruntime_op.md#known-issues) in mmcv.
+
+- Though supported, it is *not recommended* to use batch inference in onnxruntime for `DETR`, because there is a huge performance gap between the ONNX and torch models (e.g. 33.5 vs 39.9 mAP on COCO for onnxruntime and torch respectively, with a batch size of 2). The main reason for the gap is that there is a non-negligible effect on the predicted regressions during batch inference for ONNX, since the predicted coordinates are normalized by `img_shape` (without padding) and should be converted to absolute format, but `img_shape` is not dynamically traceable, thus the padded `img_shape_for_onnx` is used.
+
+- Currently only single-scale evaluation is supported with ONNX Runtime, also `mmcv::SoftNonMaxSuppression` is only supported for single image by now.
+
+## The Parameters of Non-Maximum Suppression in ONNX Export
+
+In the process of exporting the ONNX model, we set some parameters for the NMS op to control the number of output bounding boxes. The following will introduce the parameter setting of the NMS op in the supported models. You can set these parameters through `--cfg-options`.
+
+- `nms_pre`: The number of boxes before NMS. The default setting is `1000`.
+
+- `deploy_nms_pre`: The number of boxes before NMS when exporting to ONNX model. The default setting is `0`.
+
+- `max_per_img`: The number of boxes to be kept after NMS. The default setting is `100`.
+
+- `max_output_boxes_per_class`: Maximum number of output boxes per class of NMS. The default setting is `200`.
+
+## Reminders
+
+- When the input model has custom op such as `RoIAlign` and if you want to verify the exported ONNX model, you may have to build `mmcv` with [ONNXRuntime](https://mmcv.readthedocs.io/en/latest/deployment/onnxruntime_op.html) from source.
+- `mmcv.onnx.simplify` feature is based on [onnx-simplifier](https://github.com/daquexian/onnx-simplifier). If you want to try it, please refer to [onnx in `mmcv`](https://mmcv.readthedocs.io/en/latest/deployment/onnx.html) and [onnxruntime op in `mmcv`](https://mmcv.readthedocs.io/en/latest/deployment/onnxruntime_op.html) for more information.
+- If you meet any problem with the listed models above, please create an issue and it would be taken care of soon. For models not included in the list, please try to dig a little deeper and debug a little bit more and hopefully solve them by yourself.
+- Because this feature is experimental and may change fast, please always try with the latest `mmcv` and `mmdetection`.
+
+## FAQs
+
+- None
diff --git a/docs/en/tutorials/test_results_submission.md b/docs/en/tutorials/test_results_submission.md
new file mode 100755
index 0000000..aed595c
--- /dev/null
+++ b/docs/en/tutorials/test_results_submission.md
@@ -0,0 +1,112 @@
+# Tutorial 12: Test Results Submission
+
+## Panoptic segmentation test results submission
+
+The following sections introduce how to produce the prediction results of panoptic segmentation models on the COCO test-dev set and submit the predictions to [COCO evaluation server](https://competitions.codalab.org/competitions/19507).
+
+### Prerequisites
+
+- Download [COCO test dataset images](http://images.cocodataset.org/zips/test2017.zip), [testing image info](http://images.cocodataset.org/annotations/image_info_test2017.zip), and [panoptic train/val annotations](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip), then unzip them, put 'test2017' to `data/coco/`, put json files and annotation files to `data/coco/annotations/`.
+
+```shell
+# suppose data/coco/ does not exist
+mkdir -pv data/coco/
+
+# download test2017
+wget -P data/coco/ http://images.cocodataset.org/zips/test2017.zip
+wget -P data/coco/ http://images.cocodataset.org/annotations/image_info_test2017.zip
+wget -P data/coco/ http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip
+
+# unzip them
+unzip data/coco/test2017.zip -d data/coco/
+unzip data/coco/image_info_test2017.zip -d data/coco/
+unzip data/coco/panoptic_annotations_trainval2017.zip -d data/coco/
+
+# remove zip files (optional)
+rm -rf data/coco/test2017.zip data/coco/image_info_test2017.zip data/coco/panoptic_annotations_trainval2017.zip
+```
+
+- Run the following code to update category information in testing image info. Since the attribute `isthing` is missing in category information of 'image_info_test-dev2017.json', we need to update it with the category information in 'panoptic_val2017.json'.
+
+```shell
+python tools/misc/gen_coco_panoptic_test_info.py data/coco/annotations
+```
+
+After completing the above preparations, your directory structure of `data` should be like this:
+
+```text
+data
+`-- coco
+    |-- annotations
+    |   |-- image_info_test-dev2017.json
+    |   |-- image_info_test2017.json
+    |   |-- panoptic_image_info_test-dev2017.json
+    |   |-- panoptic_train2017.json
+    |   |-- panoptic_train2017.zip
+    |   |-- panoptic_val2017.json
+    |   `-- panoptic_val2017.zip
+    `-- test2017
+```
+
+### Inference on coco test-dev
+
+The commands to perform inference on test2017 are as below:
+
+```shell
+# test with single gpu
+CUDA_VISIBLE_DEVICES=0 python tools/test.py \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    --format-only \
+    --cfg-options data.test.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json data.test.img_prefix=data/coco/test2017 \
+    --eval-options jsonfile_prefix=${WORK_DIR}/results
+
+# test with four gpus
+CUDA_VISIBLE_DEVICES=0,1,3,4 bash tools/dist_test.sh \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    4 \
+    --format-only \
+    --cfg-options data.test.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json data.test.img_prefix=data/coco/test2017 \
+    --eval-options jsonfile_prefix=${WORK_DIR}/results
+
+# test with slurm
+GPUS=8 tools/slurm_test.sh \
+    ${Partition} \
+    ${JOB_NAME} \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    --format-only \
+    --cfg-options data.test.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json data.test.img_prefix=data/coco/test2017 \
+    --eval-options jsonfile_prefix=${WORK_DIR}/results
+```
+
+Example
+
+Suppose we perform inference on `test2017` using pretrained MaskFormer with ResNet-50 backbone.
+
+```shell
+# test with single gpu
+CUDA_VISIBLE_DEVICES=0 python tools/test.py \
+    configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py \
+    checkpoints/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth \
+    --format-only \
+    --cfg-options data.test.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json data.test.img_prefix=data/coco/test2017 \
+    --eval-options jsonfile_prefix=work_dirs/maskformer/results
+```
+
+### Rename files and zip results
+
+After inference, the panoptic segmentation results (a json file and a directory where the masks are stored) will be in `WORK_DIR`. We should rename them according to the naming convention described on [COCO's Website](https://cocodataset.org/#upload). Finally, we need to compress the json and the directory where the masks are stored into a zip file, and rename the zip file according to the naming convention. Note that the zip file should **directly** contain the above two files.
+
+The commands to rename files and zip results:
+
+```shell
+# In WORK_DIR, we have panoptic segmentation results: 'panoptic' and 'results.panoptic.json'.
+cd ${WORK_DIR}
+
+# replace '[algorithm_name]' with the name of algorithm you used.
+mv ./panoptic ./panoptic_test-dev2017_[algorithm_name]_results
+mv ./results.panoptic.json ./panoptic_test-dev2017_[algorithm_name]_results.json
+zip panoptic_test-dev2017_[algorithm_name]_results.zip -ur panoptic_test-dev2017_[algorithm_name]_results panoptic_test-dev2017_[algorithm_name]_results.json
+```
diff --git a/docs/en/tutorials/useful_hooks.md b/docs/en/tutorials/useful_hooks.md
new file mode 100755
index 0000000..2c8bd55
--- /dev/null
+++ b/docs/en/tutorials/useful_hooks.md
@@ -0,0 +1,83 @@
+# Tutorial 13: Useful Hooks
+
+MMDetection and MMCV provide users with various useful hooks including log hooks, evaluation hooks, NumClassCheckHook, etc. This tutorial introduces the functionalities and usages of hooks implemented in MMDetection. For using hooks in MMCV, please read the [API documentation in MMCV](https://github.com/open-mmlab/mmcv/blob/master/docs/en/understand_mmcv/runner.md).
+
+## CheckInvalidLossHook
+
+## EvalHook and DistEvalHook
+
+## ExpMomentumEMAHook and LinearMomentumEMAHook
+
+## NumClassCheckHook
+
+## [MemoryProfilerHook](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/hook/memory_profiler_hook.py)
+
+Memory profiler hook records memory information including virtual memory, swap memory, and the memory of the current process. This hook helps grasp the memory usage of the system and discover potential memory leak bugs. To use this hook, users should install `memory_profiler` and `psutil` by `pip install memory_profiler psutil` first.
+
+### Usage
+
+To use this hook, users should add the following code to the config file.
+
+```python
+custom_hooks = [
+    dict(type='MemoryProfilerHook', interval=50)
+]
+```
+
+### Result
+
+During training, you can see the messages in the log recorded by `MemoryProfilerHook` as below. The system has 250 GB (246360 MB + 9407 MB) of memory and 8 GB (5740 MB + 2452 MB) of swap memory in total. Currently 9407 MB (4.4%) of memory and 2452 MB (29.9%) of swap memory were consumed. And the current training process consumed 5434 MB of memory.
+
+```text
+2022-04-21 08:49:56,881 - mmdet - INFO - Memory information available_memory: 246360 MB, used_memory: 9407 MB, memory_utilization: 4.4 %, available_swap_memory: 5740 MB, used_swap_memory: 2452 MB, swap_memory_utilization: 29.9 %, current_process_memory: 5434 MB
+```
+
+## SetEpochInfoHook
+
+## SyncNormHook
+
+## SyncRandomSizeHook
+
+## YOLOXLrUpdaterHook
+
+## YOLOXModeSwitchHook
+
+## How to implement a custom hook
+
+In general, there are 10 points where hooks can be inserted from the beginning to the end of model training. The users can implement custom hooks and insert them at different points in the process of training to do what they want.
+
+- global points: `before_run`, `after_run`
+- points in training: `before_train_epoch`, `before_train_iter`, `after_train_iter`, `after_train_epoch`
+- points in validation: `before_val_epoch`, `before_val_iter`, `after_val_iter`, `after_val_epoch`
+
+For example, users can implement a hook to check loss and terminate training when loss goes NaN. To achieve that, there are three steps to go:
+
+1. Implement a new hook that inherits the `Hook` class in MMCV, and implement `after_train_iter` method which checks whether loss goes NaN after every `n` training iterations.
+2. The implemented hook should be registered in `HOOKS` by `@HOOKS.register_module()` as shown in the code below.
+3. Add `custom_hooks = [dict(type='CheckInvalidLossHook', interval=50)]` in the config file.
+
+```python
+import torch
+from mmcv.runner.hooks import HOOKS, Hook
+
+
+@HOOKS.register_module()
+class CheckInvalidLossHook(Hook):
+    """Check invalid loss hook.
+    This hook will regularly check whether the loss is valid
+    during training.
+    Args:
+        interval (int): Checking interval (every k iterations).
+            Default: 50.
+    """
+
+    def __init__(self, interval=50):
+        self.interval = interval
+
+    def after_train_iter(self, runner):
+        if self.every_n_iters(runner, self.interval):
+            assert torch.isfinite(runner.outputs['loss']), \
+                runner.logger.info('loss become infinite or NaN!')
+```
+
+Please read [customize_runtime](https://mmdetection.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-self-implemented-hooks) for more about implementing a custom hook.
diff --git a/docs/en/useful_tools.md b/docs/en/useful_tools.md
new file mode 100755
index 0000000..8eacd72
--- /dev/null
+++ b/docs/en/useful_tools.md
@@ -0,0 +1,589 @@
+Apart from training/testing scripts, we provide lots of useful tools under the
+`tools/` directory.
+
+## Log Analysis
+
+`tools/analysis_tools/analyze_logs.py` plots loss/mAP curves given a training
+log file. Run `pip install seaborn` first to install the dependency.
+
+```shell
+python tools/analysis_tools/analyze_logs.py plot_curve [--keys ${KEYS}] [--eval-interval ${EVALUATION_INTERVAL}] [--title ${TITLE}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}]
+```
+
+![loss curve image](../../resources/loss_curve.png)
+
+Examples:
+
+- Plot the classification loss of some run.
+
+  ```shell
+  python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys loss_cls --legend loss_cls
+  ```
+
+- Plot the classification and regression loss of some run, and save the figure to a pdf.
+
+  ```shell
+  python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys loss_cls loss_bbox --out losses.pdf
+  ```
+
+- Compare the bbox mAP of two runs in the same figure.
+
+  ```shell
+  python tools/analysis_tools/analyze_logs.py plot_curve log1.json log2.json --keys bbox_mAP --legend run1 run2
+  ```
+
+- Compute the average training speed.
+
+  ```shell
+  python tools/analysis_tools/analyze_logs.py cal_train_time log.json [--include-outliers]
+  ```
+
+  The output is expected to be like the following.
+
+  ```text
+  -----Analyze train time of work_dirs/some_exp/20190611_192040.log.json-----
+  slowest epoch 11, average time is 1.2024
+  fastest epoch 1, average time is 1.1909
+  time std over epochs is 0.0028
+  average iter time: 1.1959 s/iter
+  ```
+
+## Result Analysis
+
+`tools/analysis_tools/analyze_results.py` calculates single image mAP and saves or shows the topk images with the highest and lowest scores based on prediction results.
+
+**Usage**
+
+```shell
+python tools/analysis_tools/analyze_results.py \
+      ${CONFIG} \
+      ${PREDICTION_PATH} \
+      ${SHOW_DIR} \
+      [--show] \
+      [--wait-time ${WAIT_TIME}] \
+      [--topk ${TOPK}] \
+      [--show-score-thr ${SHOW_SCORE_THR}] \
+      [--cfg-options ${CFG_OPTIONS}]
+```
+
+Description of all arguments:
+
+- `config` : The path of a model config file.
+- `prediction_path`:  Output result file in pickle format from `tools/test.py`
+- `show_dir`: Directory where painted GT and detection images will be saved
+- `--show`: Determines whether to show painted images. If not specified, it will be set to `False`
+- `--wait-time`: The display interval in seconds; 0 means blocking
+- `--topk`: The number of saved images that have the highest and lowest `topk` scores after sorting. If not specified, it will be set to `20`.
+- `--show-score-thr`:  Show score threshold. If not specified, it will be set to `0`.
+- `--cfg-options`: If specified, the key-value pair optional cfg will be merged into config file
+
+**Examples**:
+
+Assume that you have got result file in pickle format from `tools/test.py`  in the path './result.pkl'.
+
+1. Test Faster R-CNN and visualize the results, save images to the directory `results/`
+
+```shell
+python tools/analysis_tools/analyze_results.py \
+       configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+       result.pkl \
+       results \
+       --show
+```
+
+2. Test Faster R-CNN and set topk to 50, save images to the directory `results/`
+
+```shell
+python tools/analysis_tools/analyze_results.py \
+       configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+       result.pkl \
+       results \
+       --topk 50
+```
+
+3. If you want to filter the low score prediction results, you can specify the `show-score-thr` parameter
+
+```shell
+python tools/analysis_tools/analyze_results.py \
+       configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+       result.pkl \
+       results \
+       --show-score-thr 0.3
+```
+
+## Visualization
+
+### Visualize Datasets
+
+`tools/misc/browse_dataset.py` helps the user to browse a detection dataset (both
+images and bounding box annotations) visually, or save the image to a
+designated directory.
+
+```shell
+python tools/misc/browse_dataset.py ${CONFIG} [-h] [--skip-type ${SKIP_TYPE[SKIP_TYPE...]}] [--output-dir ${OUTPUT_DIR}] [--not-show] [--show-interval ${SHOW_INTERVAL}]
+```
+
+### Visualize Models
+
+First, convert the model to ONNX as described
+[here](#mmdetection-model-to-onnx-experimental).
+Note that currently only RetinaNet is supported, support for other models
+will be coming in later versions.
+The converted model could be visualized by tools like [Netron](https://github.com/lutzroeder/netron).
+
+### Visualize Predictions
+
+If you need a lightweight GUI for visualizing the detection results, you can refer to the [DetVisGUI project](https://github.com/Chien-Hung/DetVisGUI/tree/mmdetection).
+
+## Error Analysis
+
+`tools/analysis_tools/coco_error_analysis.py` analyzes COCO results per category and by
+different criteria. It can also make a plot to provide useful information.
+
+```shell
+python tools/analysis_tools/coco_error_analysis.py ${RESULT} ${OUT_DIR} [-h] [--ann ${ANN}] [--types ${TYPES[TYPES...]}]
+```
+
+Example:
+
+Assume that you have got [Mask R-CNN checkpoint file](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth) in the path 'checkpoint'. For other checkpoints, please refer to our [model zoo](./model_zoo.md). You can use the following command to get the bbox and segmentation result json files.
+
+```shell
+# out: results.bbox.json and results.segm.json
+python tools/test.py \
+       configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py \
+       checkpoint/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \
+       --format-only \
+       --options "jsonfile_prefix=./results"
+```
+
+1. Get COCO bbox error results per category, and save the analysis result images to the directory `results/`
+
+```shell
+python tools/analysis_tools/coco_error_analysis.py \
+       results.bbox.json \
+       results \
+       --ann=data/coco/annotations/instances_val2017.json
+```
+
+2. Get COCO segmentation error results per category, and save the analysis result images to the directory `results/`
+
+```shell
+python tools/analysis_tools/coco_error_analysis.py \
+       results.segm.json \
+       results \
+       --ann=data/coco/annotations/instances_val2017.json \
+       --types='segm'
+```
+
+## Model Serving
+
+In order to serve an `MMDetection` model with [`TorchServe`](https://pytorch.org/serve/), you can follow the steps:
+
+### 1. Convert model from MMDetection to TorchServe
+
+```shell
+python tools/deployment/mmdet2torchserve.py ${CONFIG_FILE} ${CHECKPOINT_FILE} \
+--output-folder ${MODEL_STORE} \
+--model-name ${MODEL_NAME}
+```
+
+**Note**: ${MODEL_STORE} needs to be an absolute path to a folder.
+
+### 2. Build `mmdet-serve` docker image
+
+```shell
+docker build -t mmdet-serve:latest docker/serve/
+```
+
+### 3. Run `mmdet-serve`
+
+Check the official docs for [running TorchServe with docker](https://github.com/pytorch/serve/blob/master/docker/README.md#running-torchserve-in-a-production-docker-environment).
+
+In order to run on GPU, you need to install [nvidia-docker](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). You can omit the `--gpus` argument in order to run on CPU.
+
+Example:
+
+```shell
+docker run --rm \
+--cpus 8 \
+--gpus device=0 \
+-p8080:8080 -p8081:8081 -p8082:8082 \
+--mount type=bind,source=$MODEL_STORE,target=/home/model-server/model-store \
+mmdet-serve:latest
+```
+
+[Read the docs](https://github.com/pytorch/serve/blob/072f5d088cce9bb64b2a18af065886c9b01b317b/docs/rest_api.md/) about the Inference (8080), Management (8081) and Metrics (8082) APIs.
+
+### 4. Test deployment
+
+```shell
+curl -O https://raw.githubusercontent.com/pytorch/serve/master/docs/images/3dogs.jpg
+curl http://127.0.0.1:8080/predictions/${MODEL_NAME} -T 3dogs.jpg
+```
+
+You should obtain a response similar to:
+
+```json
+[
+  {
+    "class_name": "dog",
+    "bbox": [
+      294.63409423828125,
+      203.99111938476562,
+      417.048583984375,
+      281.62744140625
+    ],
+    "score": 0.9987992644309998
+  },
+  {
+    "class_name": "dog",
+    "bbox": [
+      404.26019287109375,
+      126.0080795288086,
+      574.5091552734375,
+      293.6662292480469
+    ],
+    "score": 0.9979367256164551
+  },
+  {
+    "class_name": "dog",
+    "bbox": [
+      197.2144775390625,
+      93.3067855834961,
+      307.8505554199219,
+      276.7560119628906
+    ],
+    "score": 0.993338406085968
+  }
+]
+```
+
+And you can use `test_torchserver.py` to compare the results of TorchServe and PyTorch, and visualize them.
+
+```shell
+python tools/deployment/test_torchserver.py ${IMAGE_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} ${MODEL_NAME}
+[--inference-addr ${INFERENCE_ADDR}] [--device ${DEVICE}] [--score-thr ${SCORE_THR}]
+```
+
+Example:
+
+```shell
+python tools/deployment/test_torchserver.py \
+demo/demo.jpg \
+configs/yolo/yolov3_d53_320_273e_coco.py \
+checkpoint/yolov3_d53_320_273e_coco-421362b6.pth \
+yolov3
+```
+
+## Model Complexity
+
+`tools/analysis_tools/get_flops.py` is a script adapted from [flops-counter.pytorch](https://github.com/sovrasov/flops-counter.pytorch) to compute the FLOPs and params of a given model.
+
+```shell
+python tools/analysis_tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}]
+```
+
+You will get the results like this.
+
+```text
+==============================
+Input shape: (3, 1280, 800)
+Flops: 239.32 GFLOPs
+Params: 37.74 M
+==============================
+```
+
+**Note**: This tool is still experimental and we do not guarantee that the
+number is absolutely correct. You may well use the result for simple
+comparisons, but double check it before you adopt it in technical reports or papers.
+
+1. FLOPs are related to the input shape while parameters are not. The default
+   input shape is (1, 3, 1280, 800).
+2. Some operators are not counted into FLOPs like GN and custom operators. Refer to [`mmcv.cnn.get_model_complexity_info()`](https://github.com/open-mmlab/mmcv/blob/master/mmcv/cnn/utils/flops_counter.py) for details.
+3. The FLOPs of two-stage detectors is dependent on the number of proposals.
+
+## Model conversion
+
+### MMDetection model to ONNX (experimental)
+
+We provide a script to convert model to [ONNX](https://github.com/onnx/onnx) format. We also support comparing the output results between Pytorch and ONNX model for verification.
+
+```shell
+python tools/deployment/pytorch2onnx.py ${CONFIG_FILE} ${CHECKPOINT_FILE} --output-file ${ONNX_FILE} [--shape ${INPUT_SHAPE} --verify]
+```
+
+**Note**: This tool is still experimental. Some customized operators are not supported for now. For a detailed description of the usage and the list of supported models, please refer to [pytorch2onnx](tutorials/pytorch2onnx.md).
+
+### MMDetection 1.x model to MMDetection 2.x
+
+`tools/model_converters/upgrade_model_version.py` upgrades a previous MMDetection checkpoint
+to the new version. Note that this script is not guaranteed to work as some
+breaking changes are introduced in the new version. It is recommended to
+directly use the new checkpoints.
+
+```shell
+python tools/model_converters/upgrade_model_version.py ${IN_FILE} ${OUT_FILE} [-h] [--num-classes NUM_CLASSES]
+```
+
+### RegNet model to MMDetection
+
+`tools/model_converters/regnet2mmdet.py` converts keys in pycls pretrained RegNet models to
+MMDetection style.
+
+```shell
+python tools/model_converters/regnet2mmdet.py ${SRC} ${DST} [-h]
+```
+
+### Detectron ResNet to Pytorch
+
+`tools/model_converters/detectron2pytorch.py` converts keys in the original detectron pretrained
+ResNet models to PyTorch style.
+
+```shell
+python tools/model_converters/detectron2pytorch.py ${SRC} ${DST} ${DEPTH} [-h]
+```
+
+### Prepare a model for publishing
+
+`tools/model_converters/publish_model.py` helps users to prepare their model for publishing.
+
+Before you upload a model to AWS, you may want to
+
+1. convert model weights to CPU tensors
+2. delete the optimizer states and
+3. compute the hash of the checkpoint file and append the hash id to the
+   filename.
+
+```shell
+python tools/model_converters/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME}
+```
+
+E.g.,
+
+```shell
+python tools/model_converters/publish_model.py work_dirs/faster_rcnn/latest.pth faster_rcnn_r50_fpn_1x_20190801.pth
+```
+
+The final output filename will be `faster_rcnn_r50_fpn_1x_20190801-{hash id}.pth`.
+
+## Dataset Conversion
+
+`tools/dataset_converters/` contains tools to convert the Cityscapes dataset
+and Pascal VOC dataset to the COCO format.
+
+```shell
+python tools/dataset_converters/cityscapes.py ${CITYSCAPES_PATH} [-h] [--img-dir ${IMG_DIR}] [--gt-dir ${GT_DIR}] [-o ${OUT_DIR}] [--nproc ${NPROC}]
+python tools/dataset_converters/pascal_voc.py ${DEVKIT_PATH} [-h] [-o ${OUT_DIR}]
+```
+
+## Dataset Download
+
+`tools/misc/download_dataset.py` supports downloading datasets such as COCO, VOC, and LVIS.
+
+```shell
+python tools/misc/download_dataset.py --dataset-name coco2017
+python tools/misc/download_dataset.py --dataset-name voc2007
+python tools/misc/download_dataset.py --dataset-name lvis
+```
+
+## Benchmark
+
+### Robust Detection Benchmark
+
+`tools/analysis_tools/test_robustness.py` and `tools/analysis_tools/robustness_eval.py` help users to evaluate model robustness. The core idea comes from [Benchmarking Robustness in Object Detection: Autonomous Driving when Winter is Coming](https://arxiv.org/abs/1907.07484). For more information on how to evaluate models on corrupted images and results for a set of standard models, please refer to [robustness_benchmarking.md](robustness_benchmarking.md).
+
+### FPS Benchmark
+
+`tools/analysis_tools/benchmark.py` helps users to calculate FPS. The FPS value includes model forward and post-processing. In order to get a more accurate value, it currently only supports single GPU distributed startup mode.
+
+```shell
+python -m torch.distributed.launch --nproc_per_node=1 --master_port=${PORT} tools/analysis_tools/benchmark.py \
+    ${CONFIG} \
+    ${CHECKPOINT} \
+    [--repeat-num ${REPEAT_NUM}] \
+    [--max-iter ${MAX_ITER}] \
+    [--log-interval ${LOG_INTERVAL}] \
+    --launcher pytorch
+```
+
+Examples: Assuming that you have already downloaded the `Faster R-CNN` model checkpoint to the directory `checkpoints/`.
+
+```shell
+python -m torch.distributed.launch --nproc_per_node=1 --master_port=29500 tools/analysis_tools/benchmark.py \
+       configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+       checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \
+       --launcher pytorch
+```
+
+## Miscellaneous
+
+### Evaluating a metric
+
+`tools/analysis_tools/eval_metric.py` evaluates certain metrics of a pkl result file
+according to a config file.
+
+```shell
+python tools/analysis_tools/eval_metric.py ${CONFIG} ${PKL_RESULTS} [-h] [--format-only] [--eval ${EVAL[EVAL ...]}]
+                      [--cfg-options ${CFG_OPTIONS [CFG_OPTIONS ...]}]
+                      [--eval-options ${EVAL_OPTIONS [EVAL_OPTIONS ...]}]
+```
+
+### Print the entire config
+
+`tools/misc/print_config.py` prints the whole config verbatim, expanding all its
+imports.
+
+```shell
+python tools/misc/print_config.py ${CONFIG} [-h] [--options ${OPTIONS [OPTIONS...]}]
+```
+
+## Hyper-parameter Optimization
+
+### YOLO Anchor Optimization
+
+`tools/analysis_tools/optimize_anchors.py` provides two methods to optimize YOLO anchors.
+
+One is k-means anchor clustering, which is adapted from [darknet](https://github.com/AlexeyAB/darknet/blob/master/src/detector.c#L1421).
+
+```shell
+python tools/analysis_tools/optimize_anchors.py ${CONFIG} --algorithm k-means --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} --output-dir ${OUTPUT_DIR}
+```
+
+Another is using differential evolution to optimize anchors.
+
+```shell
+python tools/analysis_tools/optimize_anchors.py ${CONFIG} --algorithm differential_evolution --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} --output-dir ${OUTPUT_DIR}
+```
+
+E.g.,
+
+```shell
+python tools/analysis_tools/optimize_anchors.py configs/yolo/yolov3_d53_320_273e_coco.py --algorithm differential_evolution --input-shape 608 608 --device cuda --output-dir work_dirs
+```
+
+You will get:
+
+```
+loading annotations into memory...
+Done (t=9.70s)
+creating index...
+index created!
+2021-07-19 19:37:20,951 - mmdet - INFO - Collecting bboxes from annotation...
+[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 117266/117266, 15874.5 task/s, elapsed: 7s, ETA:     0s
+
+2021-07-19 19:37:28,753 - mmdet - INFO - Collected 849902 bboxes.
+differential_evolution step 1: f(x)= 0.506055
+differential_evolution step 2: f(x)= 0.506055
+......
+
+differential_evolution step 489: f(x)= 0.386625
+2021-07-19 19:46:40,775 - mmdet - INFO Anchor evolution finish. Average IOU: 0.6133754253387451
+2021-07-19 19:46:40,776 - mmdet - INFO Anchor differential evolution result:[[10, 12], [15, 30], [32, 22], [29, 59], [61, 46], [57, 116], [112, 89], [154, 198], [349, 336]]
+2021-07-19 19:46:40,798 - mmdet - INFO Result saved in work_dirs/anchor_optimize_result.json
+```
+
+## Confusion Matrix
+
+A confusion matrix is a summary of prediction results.
+
+`tools/analysis_tools/confusion_matrix.py` can analyze the prediction results and plot a confusion matrix table.
+
+First, run `tools/test.py` to save the `.pkl` detection results.
+
+Then, run
+
+```
+python tools/analysis_tools/confusion_matrix.py ${CONFIG}  ${DETECTION_RESULTS}  ${SAVE_DIR} --show
+```
+
+And you will get a confusion matrix like this:
+
+![confusion_matrix_example](https://user-images.githubusercontent.com/12907710/140513068-994cdbf4-3a4a-48f0-8fd8-2830d93fd963.png)
+
+## COCO Separated & Occluded Mask Metric
+
+Detecting occluded objects still remains a challenge for state-of-the-art object detectors.
+We implemented the metric presented in paper [A Tri-Layer Plugin to Improve Occluded Detection](https://arxiv.org/abs/2210.10046) to calculate the recall of separated and occluded masks.
+
+There are two ways to use this metric:
+
+### Offline evaluation
+
+We provide a script to calculate the metric with a dumped prediction file.
+
+First, use the `tools/test.py` script to dump the detection results:
+
+```shell
+python tools/test.py ${CONFIG} ${MODEL_PATH} --out results.pkl
+```
+
+Then, run the `tools/analysis_tools/coco_occluded_separated_recall.py` script to get the recall of separated and occluded masks:
+
+```shell
+python tools/analysis_tools/coco_occluded_separated_recall.py results.pkl --out occluded_separated_recall.json
+```
+
+The output should be like this:
+
+```
+loading annotations into memory...
+Done (t=0.51s)
+creating index...
+index created!
+processing detection results...
+[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 5000/5000, 109.3 task/s, elapsed: 46s, ETA:     0s
+computing occluded mask recall...
+[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 5550/5550, 780.5 task/s, elapsed: 7s, ETA:     0s
+COCO occluded mask recall: 58.79%
+COCO occluded mask success num: 3263
+computing separated mask recall...
+[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 3522/3522, 778.3 task/s, elapsed: 5s, ETA:     0s
+COCO separated mask recall: 31.94%
+COCO separated mask success num: 1125
+
++-----------+--------+-------------+
+| mask type | recall | num correct |
++-----------+--------+-------------+
+| occluded  | 58.79% | 3263        |
+| separated | 31.94% | 1125        |
++-----------+--------+-------------+
+Evaluation results have been saved to occluded_separated_recall.json.
+```
+
+### Online evaluation
+
+We implement `OccludedSeparatedCocoDataset`, which inherits from `CocoDataset`.
+To evaluate the recall of separated and occluded masks during training, just replace the validation dataset type with `'OccludedSeparatedCocoDataset'` in your config:
+
+```python
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type='OccludedSeparatedCocoDataset',  # modify this
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type='OccludedSeparatedCocoDataset',  # modify this
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+```
+
+Please cite the paper if you use this metric:
+
+```latex
+@article{zhan2022triocc,
+    title={A Tri-Layer Plugin to Improve Occluded Detection},
+    author={Zhan, Guanqi and Xie, Weidi and Zisserman, Andrew},
+    journal={British Machine Vision Conference},
+    year={2022}
+}
+```
diff --git a/docs/zh_cn/1_exist_data_model.md b/docs/zh_cn/1_exist_data_model.md
new file mode 100755
index 0000000..e349343
--- /dev/null
+++ b/docs/zh_cn/1_exist_data_model.md
@@ -0,0 +1,678 @@
+# 1: 使用已有模型在标准数据集上进行推理
+
+MMDetection 在 [Model Zoo](https://mmdetection.readthedocs.io/en/latest/model_zoo.html) 中提供了数以百计的检测模型，并支持多种标准数据集，包括 Pascal VOC，COCO，Cityscapes，LVIS 等。这份文档将会讲述如何使用这些模型和标准数据集来运行一些常见的任务，包括：
+
+- 使用现有模型在给定图片上进行推理
+- 在标准数据集上测试现有模型
+- 在标准数据集上训练预定义的模型
+
+## 使用现有模型进行推理
+
+推理是指使用训练好的模型来检测图像上的目标。在 MMDetection 中，一个模型被定义为一个配置文件和对应的存储在 checkpoint 文件内的模型参数的集合。
+
+首先，我们建议从 [Faster RCNN](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn) 开始，其 [配置](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py) 文件和 [checkpoint](http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth) 文件在此。
+我们建议将 checkpoint 文件下载到 `checkpoints` 文件夹内。
+
+### 推理的高层编程接口
+
+MMDetection 为在图片上推理提供了 Python 的高层编程接口。下面是建立模型和在图像或视频上进行推理的例子。
+
+```python
+from mmdet.apis import init_detector, inference_detector
+import mmcv
+
+# 指定模型的配置文件和 checkpoint 文件路径
+config_file = 'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+checkpoint_file = 'checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'
+
+# 根据配置文件和 checkpoint 文件构建模型
+model = init_detector(config_file, checkpoint_file, device='cuda:0')
+
+# 测试单张图片并展示结果
+img = 'test.jpg'  # 或者 img = mmcv.imread(img)，这样图片仅会被读一次
+result = inference_detector(model, img)
+# 在一个新的窗口中将结果可视化
+model.show_result(img, result)
+# 或者将可视化结果保存为图片
+model.show_result(img, result, out_file='result.jpg')
+
+# 测试视频并展示结果
+video = mmcv.VideoReader('video.mp4')
+for frame in video:
+    result = inference_detector(model, frame)
+    model.show_result(frame, result, wait_time=1)
+```
+
+jupyter notebook 上的演示样例在 [demo/inference_demo.ipynb](https://github.com/open-mmlab/mmdetection/blob/master/demo/inference_demo.ipynb) 。
+
+### 异步接口-支持 Python 3.7+
+
+对于 Python 3.7+，MMDetection 也有异步接口。利用 CUDA 流，绑定 GPU 的推理代码不会阻塞 CPU，从而使得 CPU/GPU 在单线程应用中能达到更高的利用率。在推理流程中，不同数据样本的推理和不同模型的推理都能并发地运行。
+
+您可以参考 `tests/async_benchmark.py` 来对比同步接口和异步接口的运行速度。
+
+```python
+import asyncio
+import torch
+from mmdet.apis import init_detector, async_inference_detector
+from mmdet.utils.contextmanagers import concurrent
+
+async def main():
+    config_file = 'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+    checkpoint_file = 'checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'
+    device = 'cuda:0'
+    model = init_detector(config_file, checkpoint=checkpoint_file, device=device)
+
+    # 此队列用于并行推理多张图像
+    streamqueue = asyncio.Queue()
+    # 队列大小定义了并行的数量
+    streamqueue_size = 3
+
+    for _ in range(streamqueue_size):
+        streamqueue.put_nowait(torch.cuda.Stream(device=device))
+
+    # 测试单张图片并展示结果
+    img = 'test.jpg'  # or 或者 img = mmcv.imread(img)，这样图片仅会被读一次
+
+    async with concurrent(streamqueue):
+        result = await async_inference_detector(model, img)
+
+    # 在一个新的窗口中将结果可视化
+    model.show_result(img, result)
+    # 或者将可视化结果保存为图片
+    model.show_result(img, result, out_file='result.jpg')
+
+
+asyncio.run(main())
+
+```
+
+### 演示样例
+
+我们还提供了四个演示脚本，它们是使用高层编程接口实现的。 [源码在此](https://github.com/open-mmlab/mmdetection/tree/master/demo) 。
+
+#### 图片样例
+
+这是在单张图片上进行推理的脚本，可以开启 `--async-test` 来进行异步推理。
+
+```shell
+python demo/image_demo.py \
+    ${IMAGE_FILE} \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    [--device ${GPU_ID}] \
+    [--score-thr ${SCORE_THR}] \
+    [--async-test]
+```
+
+运行样例：
+
+```shell
+python demo/image_demo.py demo/demo.jpg \
+    configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+    checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \
+    --device cpu
+```
+
+#### 摄像头样例
+
+这是使用摄像头实时图片的推理脚本。
+
+```shell
+python demo/webcam_demo.py \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    [--device ${GPU_ID}] \
+    [--camera-id ${CAMERA-ID}] \
+    [--score-thr ${SCORE_THR}]
+```
+
+运行样例：
+
+```shell
+python demo/webcam_demo.py \
+    configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+    checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth
+```
+
+#### 视频样例
+
+这是在视频样例上进行推理的脚本。
+
+```shell
+python demo/video_demo.py \
+    ${VIDEO_FILE} \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    [--device ${GPU_ID}] \
+    [--score-thr ${SCORE_THR}] \
+    [--out ${OUT_FILE}] \
+    [--show] \
+    [--wait-time ${WAIT_TIME}]
+```
+
+运行样例：
+
+```shell
+python demo/video_demo.py demo/demo.mp4 \
+    configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+    checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \
+    --out result.mp4
+```
+
+#### 视频样例，显卡加速版本
+
+这是在视频样例上进行推理的脚本，使用显卡加速。
+
+```shell
+python demo/video_gpuaccel_demo.py \
+     ${VIDEO_FILE} \
+     ${CONFIG_FILE} \
+     ${CHECKPOINT_FILE} \
+     [--device ${GPU_ID}] \
+     [--score-thr ${SCORE_THR}] \
+     [--nvdecode] \
+     [--out ${OUT_FILE}] \
+     [--show] \
+     [--wait-time ${WAIT_TIME}]
+
+```
+
+运行样例：
+
+```shell
+python demo/video_gpuaccel_demo.py demo/demo.mp4 \
+    configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+    checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \
+    --nvdecode --out result.mp4
+```
+
+## 在标准数据集上测试现有模型
+
+为了测试一个模型的精度，我们通常会在标准数据集上对其进行测试。MMDetection 支持多个公共数据集，包括 [COCO](https://cocodataset.org/) ，
+[Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC) ，[Cityscapes](https://www.cityscapes-dataset.com/) 等等。
+这一部分将会介绍如何在支持的数据集上测试现有模型。
+
+### 数据集准备
+
+一些公共数据集，比如 Pascal VOC 及其镜像数据集，或者 COCO 等数据集都可以从官方网站或者镜像网站获取。
+注意：在检测任务中，Pascal VOC 2012 是 Pascal VOC 2007 的无交集扩展，我们通常将两者一起使用。
+我们建议将数据集下载，然后解压到项目外部的某个文件夹内，然后通过符号链接的方式，将数据集根目录链接到 `$MMDETECTION/data` 文件夹下，格式如下所示。
+如果你的文件夹结构和下方不同的话，你需要在配置文件中改变对应的路径。
+我们提供了下载 COCO 等数据集的脚本，你可以运行 `python tools/misc/download_dataset.py --dataset-name coco2017` 下载 COCO 数据集。
+
+```plain
+mmdetection
+├── mmdet
+├── tools
+├── configs
+├── data
+│   ├── coco
+│   │   ├── annotations
+│   │   ├── train2017
+│   │   ├── val2017
+│   │   ├── test2017
+│   ├── cityscapes
+│   │   ├── annotations
+│   │   ├── leftImg8bit
+│   │   │   ├── train
+│   │   │   ├── val
+│   │   ├── gtFine
+│   │   │   ├── train
+│   │   │   ├── val
+│   ├── VOCdevkit
+│   │   ├── VOC2007
+│   │   ├── VOC2012
+```
+
+有些模型需要额外的 [COCO-stuff](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) 数据集，比如 HTC，DetectoRS 和 SCNet，你可以下载并解压它们到 `coco` 文件夹下。文件夹会是如下结构：
+
+```plain
+mmdetection
+├── data
+│   ├── coco
+│   │   ├── annotations
+│   │   ├── train2017
+│   │   ├── val2017
+│   │   ├── test2017
+│   │   ├── stuffthingmaps
+```
+
+PanopticFPN 等全景分割模型需要额外的 [COCO Panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) 数据集，你可以下载并解压它们到 `coco/annotations` 文件夹下。文件夹会是如下结构：
+
+```text
+mmdetection
+├── data
+│   ├── coco
+│   │   ├── annotations
+│   │   │   ├── panoptic_train2017.json
+│   │   │   ├── panoptic_train2017
+│   │   │   ├── panoptic_val2017.json
+│   │   │   ├── panoptic_val2017
+│   │   ├── train2017
+│   │   ├── val2017
+│   │   ├── test2017
+```
+
+Cityscapes 数据集的标注格式需要转换，以与 COCO 数据集标注格式保持一致，使用 `tools/dataset_converters/cityscapes.py` 来完成转换：
+
+```shell
+pip install cityscapesscripts
+
+python tools/dataset_converters/cityscapes.py \
+    ./data/cityscapes \
+    --nproc 8 \
+    --out-dir ./data/cityscapes/annotations
+```
+
+### 测试现有模型
+
+我们提供了测试脚本，能够测试一个现有模型在所有数据集（COCO，Pascal VOC，Cityscapes 等）上的性能。我们支持在如下环境下测试：
+
+- 单 GPU 测试
+- CPU 测试
+- 单节点多 GPU 测试
+- 多节点测试
+
+根据以上测试环境，选择合适的脚本来执行测试过程。
+
+```shell
+# 单 GPU 测试
+python tools/test.py \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    [--out ${RESULT_FILE}] \
+    [--eval ${EVAL_METRICS}] \
+    [--show]
+
+# CPU 测试：禁用 GPU 并运行单 GPU 测试脚本
+export CUDA_VISIBLE_DEVICES=-1
+python tools/test.py \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    [--out ${RESULT_FILE}] \
+    [--eval ${EVAL_METRICS}] \
+    [--show]
+
+# 单节点多 GPU 测试
+bash tools/dist_test.sh \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    ${GPU_NUM} \
+    [--out ${RESULT_FILE}] \
+    [--eval ${EVAL_METRICS}]
+```
+
+`tools/dist_test.sh` 也支持多节点测试，不过需要依赖 PyTorch 的 [启动工具](https://pytorch.org/docs/stable/distributed.html#launch-utility) 。
+
+可选参数：
+
+- `RESULT_FILE`: 结果文件名称，需以 .pkl 形式存储。如果没有声明，则不将结果存储到文件。
+- `EVAL_METRICS`: 需要测试的度量指标。可选值是取决于数据集的，比如 `proposal_fast`，`proposal`，`bbox`，`segm` 是 COCO 数据集的可选值，`mAP`，`recall` 是 Pascal VOC 数据集的可选值。Cityscapes 数据集可以测试 `cityscapes` 和所有 COCO 数据集支持的度量指标。
+- `--show`: 如果开启，检测结果将被绘制在图像上，以一个新窗口的形式展示。它只适用于单 GPU 的测试，是用于调试和可视化的。请确保使用此功能时，你的 GUI 可以在环境中打开。否则，你可能会遇到这么一个错误 `cannot connect to X server`。
+- `--show-dir`: 如果指明，检测结果将会被绘制在图像上并保存到指定目录。它只适用于单 GPU 的测试，是用于调试和可视化的。即使你的环境中没有 GUI，这个选项也可使用。
+- `--show-score-thr`: 如果指明，得分低于此阈值的检测结果将会被移除。
+- `--cfg-options`:  如果指明，这里的键值对将会被合并到配置文件中。
+- `--eval-options`: 如果指明，这里的键值对将会作为字典参数被传入 `dataset.evaluate()` 函数中，仅在测试阶段使用。
+
+### 样例
+
+假设你已经下载了 checkpoint 文件到 `checkpoints/` 文件夹下了。
+
+1. 测试 Faster R-CNN 并可视化其结果。按任意键继续下张图片的测试。配置文件和 checkpoint 文件 [在此](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn) 。
+
+   ```shell
+   python tools/test.py \
+       configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+       checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \
+       --show
+   ```
+
+2. 测试 Faster R-CNN，并为了之后的可视化保存绘制的图像。配置文件和 checkpoint 文件 [在此](https://github.com/open-mmlab/mmdetection/tree/master/configs/faster_rcnn) 。
+
+   ```shell
+   python tools/test.py \
+       configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py \
+       checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \
+       --show-dir faster_rcnn_r50_fpn_1x_results
+   ```
+
+3. 在 Pascal VOC 数据集上测试 Faster R-CNN，不保存测试结果，测试 `mAP`。配置文件和 checkpoint 文件 [在此](https://github.com/open-mmlab/mmdetection/tree/master/configs/pascal_voc) 。
+
+   ```shell
+   python tools/test.py \
+       configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc.py \
+       checkpoints/faster_rcnn_r50_fpn_1x_voc0712_20200624-c9895d40.pth \
+       --eval mAP
+   ```
+
+4. 使用 8 块 GPU 测试 Mask R-CNN，测试 `bbox` 和 `mask` AP。配置文件和 checkpoint 文件 [在此](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn) 。
+
+   ```shell
+   ./tools/dist_test.sh \
+       configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py \
+       checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \
+       8 \
+       --out results.pkl \
+       --eval bbox segm
+   ```
+
+5. 使用 8 块 GPU 测试 Mask R-CNN，测试**每类**的 `bbox` 和 `mask` AP。配置文件和 checkpoint 文件 [在此](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn) 。
+
+   ```shell
+   ./tools/dist_test.sh \
+       configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py \
+       checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \
+       8 \
+       --out results.pkl \
+       --eval bbox segm \
+       --options "classwise=True"
+   ```
+
+6. 在 COCO test-dev 数据集上，使用 8 块 GPU 测试 Mask R-CNN，并生成 JSON 文件提交到官方评测服务器。配置文件和 checkpoint 文件 [在此](https://github.com/open-mmlab/mmdetection/tree/master/configs/mask_rcnn) 。
+
+   ```shell
+   ./tools/dist_test.sh \
+       configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py \
+       checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \
+       8 \
+       --format-only \
+       --options "jsonfile_prefix=./mask_rcnn_test-dev_results"
+   ```
+
+这行命令生成两个 JSON 文件 `mask_rcnn_test-dev_results.bbox.json` 和 `mask_rcnn_test-dev_results.segm.json`。
+
+7. 在 Cityscapes 数据集上，使用 8 块 GPU 测试 Mask R-CNN，生成 txt 和 png 文件，并上传到官方评测服务器。配置文件和 checkpoint 文件 [在此](https://github.com/open-mmlab/mmdetection/tree/master/configs/cityscapes) 。
+
+   ```shell
+   ./tools/dist_test.sh \
+       configs/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes.py \
+       checkpoints/mask_rcnn_r50_fpn_1x_cityscapes_20200227-afe51d5a.pth \
+       8 \
+       --format-only \
+       --options "txtfile_prefix=./mask_rcnn_cityscapes_test_results"
+   ```
+
+生成的 png 和 txt 文件在 `./mask_rcnn_cityscapes_test_results` 文件夹下。
+
+### 不使用 Ground Truth 标注进行测试
+
+MMDetection 支持在不使用 ground-truth 标注的情况下对模型进行测试，这需要用到 `CocoDataset`。如果你的数据集格式不是 COCO 格式的，请将其转化成 COCO 格式。如果你的数据集格式是 VOC 或者 Cityscapes，你可以使用 [tools/dataset_converters](https://github.com/open-mmlab/mmdetection/tree/master/tools/dataset_converters) 内的脚本直接将其转化成 COCO 格式。如果是其他格式，可以使用 [images2coco 脚本](https://github.com/open-mmlab/mmdetection/tree/master/tools/dataset_converters/images2coco.py) 进行转换。
+
+```shell
+python tools/dataset_converters/images2coco.py \
+    ${IMG_PATH} \
+    ${CLASSES} \
+    ${OUT} \
+    [--exclude-extensions]
+```
+
+参数：
+
+- `IMG_PATH`: 图片根路径。
+- `CLASSES`: 类列表文本文件名。文本中每一行存储一个类别。
+- `OUT`: 输出 json 文件名。 默认保存目录和 `IMG_PATH` 在同一级。
+- `exclude-extensions`: 待排除的文件后缀名。
+
+在转换完成后，使用如下命令进行测试
+
+```shell
+# 单 GPU 测试
+python tools/test.py \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    --format-only \
+    --options ${JSONFILE_PREFIX} \
+    [--show]
+
+# CPU 测试：禁用 GPU 并运行单 GPU 测试脚本
+export CUDA_VISIBLE_DEVICES=-1
+python tools/test.py \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    [--out ${RESULT_FILE}] \
+    [--eval ${EVAL_METRICS}] \
+    [--show]
+
+# 单节点多 GPU 测试
+bash tools/dist_test.sh \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    ${GPU_NUM} \
+    --format-only \
+    --options ${JSONFILE_PREFIX} \
+    [--show]
+```
+
+假设 [model zoo](https://mmdetection.readthedocs.io/en/latest/modelzoo_statistics.html) 中的 checkpoint 文件被下载到了 `checkpoints/` 文件夹下，
+我们可以使用以下命令，用 8 块 GPU 在 COCO test-dev 数据集上测试 Mask R-CNN，并且生成 JSON 文件。
+
+```sh
+./tools/dist_test.sh \
+    configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py \
+    checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \
+    8 \
+    --format-only \
+    --options "jsonfile_prefix=./mask_rcnn_test-dev_results"
+```
+
+这行命令生成两个 JSON 文件 `mask_rcnn_test-dev_results.bbox.json` 和 `mask_rcnn_test-dev_results.segm.json`。
+
+### 批量推理
+
+MMDetection 在测试模式下，既支持单张图片的推理，也支持对图像进行批量推理。默认情况下，我们使用单张图片的测试，你可以通过修改测试数据配置文件中的 `samples_per_gpu` 来开启批量测试。
+开启批量推理的配置文件修改方法为：
+
+```python
+data = dict(train=dict(...), val=dict(...), test=dict(samples_per_gpu=2, ...))
+```
+
+或者你可以通过将 `--cfg-options` 设置为 `--cfg-options data.test.samples_per_gpu=2` 来开启它。
+
+### 弃用 ImageToTensor
+
+在测试模式下，弃用 `ImageToTensor` 流程，取而代之的是 `DefaultFormatBundle`。建议在你的测试数据流程的配置文件中手动替换它，如：
+
+```python
+# （已弃用）使用 ImageToTensor
+pipelines = [
+   dict(type='LoadImageFromFile'),
+   dict(
+       type='MultiScaleFlipAug',
+       img_scale=(1333, 800),
+       flip=False,
+       transforms=[
+           dict(type='Resize', keep_ratio=True),
+           dict(type='RandomFlip'),
+           dict(type='Normalize', mean=[0, 0, 0], std=[1, 1, 1]),
+           dict(type='Pad', size_divisor=32),
+           dict(type='ImageToTensor', keys=['img']),
+           dict(type='Collect', keys=['img']),
+       ])
+   ]
+
+# （建议使用）手动将 ImageToTensor 替换为 DefaultFormatBundle
+pipelines = [
+   dict(type='LoadImageFromFile'),
+   dict(
+       type='MultiScaleFlipAug',
+       img_scale=(1333, 800),
+       flip=False,
+       transforms=[
+           dict(type='Resize', keep_ratio=True),
+           dict(type='RandomFlip'),
+           dict(type='Normalize', mean=[0, 0, 0], std=[1, 1, 1]),
+           dict(type='Pad', size_divisor=32),
+           dict(type='DefaultFormatBundle'),
+           dict(type='Collect', keys=['img']),
+       ])
+   ]
+```
+
+## 在标准数据集上训练预定义的模型
+
+MMDetection 也为训练检测模型提供了开箱即用的工具。本节将展示在标准数据集（比如 COCO）上如何训练一个预定义的模型。
+
+### 数据集
+
+训练需要准备好数据集，细节请参考 [数据集准备](#%E6%95%B0%E6%8D%AE%E9%9B%86%E5%87%86%E5%A4%87) 。
+
+**注意**：
+目前，`configs/cityscapes` 文件夹下的配置文件都是使用 COCO 预训练权值进行初始化的。如果网络连接不可用或者速度很慢，你可以提前下载现存的模型。否则可能在训练的开始会有错误发生。
+
+### 学习率自动缩放
+
+**注意**：在配置文件中的学习率是在 8 块 GPU，每块 GPU 有 2 张图像（批大小为 8\*2=16）的情况下设置的。该值已经设置在 `configs/_base_/default_runtime.py` 中的 `auto_scale_lr.base_batch_size`。当配置文件的批次大小为 `16` 时，学习率会基于该值进行自动缩放。同时，为了不影响其他基于 mmdet 的 codebase，启用自动缩放标志 `auto_scale_lr.enable` 默认设置为 `False`。
+
+如果要启用此功能，需在命令添加参数 `--auto-scale-lr`。并且在启动命令之前，请检查下即将使用的配置文件的名称，因为配置名称指示默认的批处理大小。
+在默认情况下，批次大小是 `8 x 2 = 16`，例如：`faster_rcnn_r50_caffe_fpn_90k_coco.py` 或者 `pisa_faster_rcnn_x101_32x4d_fpn_1x_coco.py`；若不是默认批次，你可以在配置文件看到像 `_NxM_` 字样的，例如：`cornernet_hourglass104_mstest_32x3_210e_coco.py` 的批次大小是 `32 x 3 = 96`, 或者 `scnet_x101_64x4d_fpn_8x1_20e_coco.py` 的批次大小是 `8 x 1 = 8`。
+
+**请记住：如果使用不是默认批次大小为`16`的配置文件，请检查配置文件中的底部，会有 `auto_scale_lr.base_batch_size`。如果找不到，可以在其继承的 `_base_=[xxx]` 文件中找到。另外，如果想使用自动缩放学习率的功能，请不要修改这些值。**
+
+学习率自动缩放基本用法如下：
+
+```shell
+python tools/train.py \
+    ${CONFIG_FILE} \
+    --auto-scale-lr \
+    [optional arguments]
+```
+
+执行命令之后，会根据机器的GPU数量和训练的批次大小对学习率进行自动缩放，缩放方式详见 [线性扩展规则](https://arxiv.org/abs/1706.02677) ，比如：在 4 块 GPU 并且每张 GPU 上有 2 张图片的情况下 `lr=0.01`，那么在 16 块 GPU 并且每张 GPU 上有 4 张图片的情况下, LR 会自动缩放至`lr=0.08`。
+
+如果不启用该功能，则需要根据 [线性扩展规则](https://arxiv.org/abs/1706.02677) 来手动计算并修改配置文件里面 `optimizer.lr` 的值。
+
+### 使用单 GPU 训练
+
+我们提供了 `tools/train.py` 来开启在单张 GPU 上的训练任务。基本使用如下：
+
+```shell
+python tools/train.py \
+    ${CONFIG_FILE} \
+    [optional arguments]
+```
+
+在训练期间，日志文件和 checkpoint 文件将会被保存在工作目录下，它需要通过配置文件中的 `work_dir` 或者 CLI 参数中的 `--work-dir` 来指定。
+
+默认情况下，模型将在每轮训练之后在 validation 集上进行测试，测试的频率可以通过设置配置文件来指定：
+
+```python
+# 每 12 轮迭代进行一次测试评估
+evaluation = dict(interval=12)
+```
+
+这个工具接受以下参数：
+
+- `--no-validate` (**不建议**): 在训练期间关闭测试.
+- `--work-dir ${WORK_DIR}`: 覆盖工作目录.
+- `--resume-from ${CHECKPOINT_FILE}`: 从某个 checkpoint 文件继续训练.
+- `--options 'Key=value'`: 覆盖使用的配置文件中的其他设置.
+
+**注意**：
+`resume-from` 和 `load-from` 的区别：
+
+`resume-from` 既加载了模型的权重和优化器的状态，也会继承指定 checkpoint 的迭代次数，不会重新开始训练。`load-from` 则是只加载模型的权重，它的训练是从头开始的，经常被用于微调模型。
+
+### 使用 CPU 训练
+
+使用 CPU 训练的流程和使用单 GPU 训练的流程一致，我们仅需要在训练流程开始前禁用 GPU。
+
+```shell
+export CUDA_VISIBLE_DEVICES=-1
+```
+
+之后运行单 GPU 训练脚本即可。
+
+**注意**：
+
+我们不推荐用户使用 CPU 进行训练，这太过缓慢。我们支持这个功能是为了方便用户在没有 GPU 的机器上进行调试。
+
+### 在多 GPU 上训练
+
+我们提供了 `tools/dist_train.sh` 来开启在多 GPU 上的训练。基本使用如下：
+
+```shell
+bash ./tools/dist_train.sh \
+    ${CONFIG_FILE} \
+    ${GPU_NUM} \
+    [optional arguments]
+```
+
+可选参数和单 GPU 训练的可选参数一致。
+
+#### 同时启动多个任务
+
+如果你想在一台机器上启动多个任务的话，比如在一个有 8 块 GPU 的机器上启动 2 个需要 4 块GPU的任务，你需要给不同的训练任务指定不同的端口（默认为 29500）来避免冲突。
+
+如果你使用 `dist_train.sh` 来启动训练任务，你可以使用命令来设置端口。
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4
+CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4
+```
+
+### 使用多台机器训练
+
+如果您想使用由 ethernet 连接起来的多台机器， 您可以使用以下命令:
+
+在第一台机器上:
+
+```shell
+NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS
+```
+
+在第二台机器上:
+
+```shell
+NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS
+```
+
+但是，如果您不使用高速网络连接这几台机器的话，训练将会非常慢。
+
+### 使用 Slurm 来管理任务
+
+Slurm 是一个常见的计算集群调度系统。在 Slurm 管理的集群上，你可以使用 `slurm_train.sh` 来开启训练任务。它既支持单节点训练也支持多节点训练。
+
+基本使用如下：
+
+```shell
+[GPUS=${GPUS}] ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${WORK_DIR}
+```
+
+以下是在一个名称为 _dev_ 的 Slurm 分区上，使用 16 块 GPU 来训练 Mask R-CNN 的例子，并且将 `work-dir` 设置在了某些共享文件系统下。
+
+```shell
+GPUS=16 ./tools/slurm_train.sh dev mask_r50_1x configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py /nfs/xxxx/mask_rcnn_r50_fpn_1x
+```
+
+你可以查看 [源码](https://github.com/open-mmlab/mmdetection/blob/master/tools/slurm_train.sh) 来检查全部的参数和环境变量.
+
+在使用 Slurm 时，端口需要以下方的某个方法之一来设置。
+
+1. 通过 `--options` 来设置端口。我们非常建议用这种方法，因为它无需改变原始的配置文件。
+
+   ```shell
+   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR} --options 'dist_params.port=29500'
+   CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR} --options 'dist_params.port=29501'
+   ```
+
+2. 修改配置文件来设置不同的交流端口。
+
+   在 `config1.py` 中，设置：
+
+   ```python
+   dist_params = dict(backend='nccl', port=29500)
+   ```
+
+   在 `config2.py` 中，设置：
+
+   ```python
+   dist_params = dict(backend='nccl', port=29501)
+   ```
+
+   然后你可以使用 `config1.py` 和 `config2.py` 来启动两个任务了。
+
+   ```shell
+   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR}
+   CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR}
+   ```
diff --git a/docs/zh_cn/2_new_data_model.md b/docs/zh_cn/2_new_data_model.md
new file mode 100755
index 0000000..f760c51
--- /dev/null
+++ b/docs/zh_cn/2_new_data_model.md
@@ -0,0 +1,267 @@
+# 2: 在自定义数据集上进行训练
+
+通过本文档，你将会知道如何使用自定义数据集对预先定义好的模型进行推理，测试以及训练。我们使用 [balloon dataset](https://github.com/matterport/Mask_RCNN/tree/master/samples/balloon) 作为例子来描述整个过程。
+
+基本步骤如下：
+
+1. 准备自定义数据集
+2. 准备配置文件
+3. 在自定义数据集上进行训练，测试和推理。
+
+## 准备自定义数据集
+
+MMDetection 一共支持三种形式应用新数据集：
+
+1. 将数据集重新组织为 COCO 格式。
+2. 将数据集重新组织为一个中间格式。
+3. 实现一个新的数据集。
+
+我们通常建议使用前面两种方法，因为它们通常来说比第三种方法要简单。
+
+在本文档中，我们展示一个例子来说明如何将数据转化为 COCO 格式。
+
+**注意**：MMDetection 现只支持对 COCO 格式的数据集进行 mask AP 的评测。
+
+所以用户如果要进行实例分割，只能将数据转成 COCO 格式。
+
+### COCO标注格式
+
+用于实例分割的 COCO 数据集格式如下所示，其中的键（key）都是必要的，参考[这里](https://cocodataset.org/#format-data)来获取更多细节。
+
+```json
+{
+    "images": [image],
+    "annotations": [annotation],
+    "categories": [category]
+}
+
+
+image = {
+    "id": int,
+    "width": int,
+    "height": int,
+    "file_name": str,
+}
+
+annotation = {
+    "id": int,
+    "image_id": int,
+    "category_id": int,
+    "segmentation": RLE or [polygon],
+    "area": float,
+    "bbox": [x,y,width,height],
+    "iscrowd": 0 or 1,
+}
+
+categories = [{
+    "id": int,
+    "name": str,
+    "supercategory": str,
+}]
+```
+
+现在假设我们使用 balloon dataset。
+
+下载了数据集之后，我们需要实现一个函数将标注格式转化为 COCO 格式。然后我们就可以使用已经实现的 `CocoDataset` 类来加载数据并进行训练以及评测。
+
+如果你浏览过新数据集，你会发现格式如下：
+
+```json
+{'base64_img_data': '',
+ 'file_attributes': {},
+ 'filename': '34020010494_e5cb88e1c4_k.jpg',
+ 'fileref': '',
+ 'regions': {'0': {'region_attributes': {},
+   'shape_attributes': {'all_points_x': [1020,
+     1000,
+     994,
+     1003,
+     1023,
+     1050,
+     1089,
+     1134,
+     1190,
+     1265,
+     1321,
+     1361,
+     1403,
+     1428,
+     1442,
+     1445,
+     1441,
+     1427,
+     1400,
+     1361,
+     1316,
+     1269,
+     1228,
+     1198,
+     1207,
+     1210,
+     1190,
+     1177,
+     1172,
+     1174,
+     1170,
+     1153,
+     1127,
+     1104,
+     1061,
+     1032,
+     1020],
+    'all_points_y': [963,
+     899,
+     841,
+     787,
+     738,
+     700,
+     663,
+     638,
+     621,
+     619,
+     643,
+     672,
+     720,
+     765,
+     800,
+     860,
+     896,
+     942,
+     990,
+     1035,
+     1079,
+     1112,
+     1129,
+     1134,
+     1144,
+     1153,
+     1166,
+     1166,
+     1150,
+     1136,
+     1129,
+     1122,
+     1112,
+     1084,
+     1037,
+     989,
+     963],
+    'name': 'polygon'}}},
+ 'size': 1115004}
+```
+
+标注文件是 JSON 格式的，其中所有键（key）组成了一张图片的所有标注。
+
+其中将 balloon dataset 转化为 COCO 格式的代码如下所示。
+
+```python
+
+import os.path as osp
+import mmcv
+
+def convert_balloon_to_coco(ann_file, out_file, image_prefix):
+    data_infos = mmcv.load(ann_file)
+
+    annotations = []
+    images = []
+    obj_count = 0
+    for idx, v in enumerate(mmcv.track_iter_progress(data_infos.values())):
+        filename = v['filename']
+        img_path = osp.join(image_prefix, filename)
+        height, width = mmcv.imread(img_path).shape[:2]
+
+        images.append(dict(
+            id=idx,
+            file_name=filename,
+            height=height,
+            width=width))
+
+        bboxes = []
+        labels = []
+        masks = []
+        for _, obj in v['regions'].items():
+            assert not obj['region_attributes']
+            obj = obj['shape_attributes']
+            px = obj['all_points_x']
+            py = obj['all_points_y']
+            poly = [(x + 0.5, y + 0.5) for x, y in zip(px, py)]
+            poly = [p for x in poly for p in x]
+
+            x_min, y_min, x_max, y_max = (
+                min(px), min(py), max(px), max(py))
+
+
+            data_anno = dict(
+                image_id=idx,
+                id=obj_count,
+                category_id=0,
+                bbox=[x_min, y_min, x_max - x_min, y_max - y_min],
+                area=(x_max - x_min) * (y_max - y_min),
+                segmentation=[poly],
+                iscrowd=0)
+            annotations.append(data_anno)
+            obj_count += 1
+
+    coco_format_json = dict(
+        images=images,
+        annotations=annotations,
+        categories=[{'id':0, 'name': 'balloon'}])
+    mmcv.dump(coco_format_json, out_file)
+```
+
+使用如上的函数，用户可以成功将标注文件转化为 JSON 格式，之后可以使用 `CocoDataset` 对模型进行训练和评测。
+
+## 准备配置文件
+
+第二步需要准备一个配置文件来成功加载数据集。假设我们想要用 balloon dataset 来训练配备了 FPN 的 Mask R-CNN ，如下是我们的配置文件。假设配置文件命名为 `mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_balloon.py`，相应保存路径为 `configs/balloon/`，配置文件内容如下所示。
+
+```python
+# 这个新的配置文件继承自一个原始配置文件，只需要突出必要的修改部分即可
+_base_ = 'mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py'
+
+# 我们需要对头中的类别数量进行修改来匹配数据集的标注
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(num_classes=1),
+        mask_head=dict(num_classes=1)))
+
+# 修改数据集相关设置
+dataset_type = 'CocoDataset'
+classes = ('balloon',)
+data = dict(
+    train=dict(
+        img_prefix='balloon/train/',
+        classes=classes,
+        ann_file='balloon/train/annotation_coco.json'),
+    val=dict(
+        img_prefix='balloon/val/',
+        classes=classes,
+        ann_file='balloon/val/annotation_coco.json'),
+    test=dict(
+        img_prefix='balloon/val/',
+        classes=classes,
+        ann_file='balloon/val/annotation_coco.json'))
+
+# 我们可以使用预训练的 Mask R-CNN 来获取更好的性能
+load_from = 'checkpoints/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth'
+```
+
+## 训练一个新的模型
+
+为了使用新的配置方法来对模型进行训练，你只需要运行如下命令。
+
+```shell
+python tools/train.py configs/balloon/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_balloon.py
+```
+
+参考[情况 1](./1_exist_data_model.md)来获取更多详细的使用方法。
+
+## 测试以及推理
+
+为了测试训练完毕的模型，你只需要运行如下命令。
+
+```shell
+python tools/test.py configs/balloon/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_balloon.py work_dirs/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_balloon/latest.pth --eval bbox segm
+```
+
+参考[情况 1](./1_exist_data_model.md)来获取更多详细的使用方法。
diff --git a/docs/zh_cn/3_exist_data_new_model.md b/docs/zh_cn/3_exist_data_new_model.md
new file mode 100755
index 0000000..e32e373
--- /dev/null
+++ b/docs/zh_cn/3_exist_data_new_model.md
@@ -0,0 +1,283 @@
+# 3: 在标准数据集上训练自定义模型
+
+在本文中，你将知道如何在标准数据集上训练、测试和推理自定义模型。我们将在 cityscapes 数据集上以自定义 Cascade Mask R-CNN R50 模型为例演示整个过程，为了方便说明，我们将 neck 模块中的 `FPN` 替换为 `AugFPN`，并且在训练中的自动增强类中增加 `Rotate` 或 `Translate`。
+
+基本步骤如下所示：
+
+1. 准备标准数据集
+2. 准备你的自定义模型
+3. 准备配置文件
+4. 在标准数据集上对模型进行训练、测试和推理
+
+## 准备标准数据集
+
+在本文中，我们使用 cityscapes 标准数据集为例进行说明。
+
+推荐将数据集根路径采用符号链接方式链接到 `$MMDETECTION/data`。
+
+如果你的文件结构不同，你可能需要在配置文件中进行相应的路径更改。标准的文件组织格式如下所示：
+
+```none
+mmdetection
+├── mmdet
+├── tools
+├── configs
+├── data
+│   ├── coco
+│   │   ├── annotations
+│   │   ├── train2017
+│   │   ├── val2017
+│   │   ├── test2017
+│   ├── cityscapes
+│   │   ├── annotations
+│   │   ├── leftImg8bit
+│   │   │   ├── train
+│   │   │   ├── val
+│   │   ├── gtFine
+│   │   │   ├── train
+│   │   │   ├── val
+│   ├── VOCdevkit
+│   │   ├── VOC2007
+│   │   ├── VOC2012
+```
+
+你也可以通过如下方式设定数据集根路径
+
+```bash
+export MMDET_DATASETS=$data_root
+```
+
+我们将会使用环境变量 `$MMDET_DATASETS` 作为数据集的根目录，因此你无需再修改相应配置文件的路径信息。
+
+你需要使用脚本 `tools/dataset_converters/cityscapes.py` 将 cityscapes 标注转化为 coco 标注格式。
+
+```shell
+pip install cityscapesscripts
+python tools/dataset_converters/cityscapes.py ./data/cityscapes --nproc 8 --out-dir ./data/cityscapes/annotations
+```
+
+目前在 `cityscapes` 文件夹中的配置文件所对应模型是采用 COCO 预训练权重进行初始化的。
+
+如果你的网络不可用或者比较慢，建议你先手动下载对应的预训练权重，否则可能在训练开始时候出现错误。
+
+## 准备你的自定义模型
+
+第二步是准备你的自定义模型或者训练相关配置。假设你想在已有的  Cascade Mask R-CNN R50 检测模型基础上，新增一个新的 neck 模块 `AugFPN` 去代替默认的 `FPN`，以下是具体实现：
+
+### 1 定义新的 neck (例如 AugFPN)
+
+首先创建新文件  `mmdet/models/necks/augfpn.py`.
+
+```python
+from ..builder import NECKS
+
+@NECKS.register_module()
+class AugFPN(nn.Module):
+
+    def __init__(self,
+                in_channels,
+                out_channels,
+                num_outs,
+                start_level=0,
+                end_level=-1,
+                add_extra_convs=False):
+        pass
+
+    def forward(self, inputs):
+        # implementation is ignored
+        pass
+```
+
+### 2 导入模块
+
+你可以采用两种方式导入模块，第一种是在  `mmdet/models/necks/__init__.py` 中添加如下内容
+
+```python
+from .augfpn import AugFPN
+```
+
+第二种是增加如下代码到对应配置中，这种方式的好处是不需要改动代码
+
+```python
+custom_imports = dict(
+    imports=['mmdet.models.necks.augfpn'],
+    allow_failed_imports=False)
+```
+
+### 3 修改配置
+
+```python
+neck=dict(
+    type='AugFPN',
+    in_channels=[256, 512, 1024, 2048],
+    out_channels=256,
+    num_outs=5)
+```
+
+关于自定义模型其余相关细节例如实现新的骨架网络，头部网络、损失函数，以及运行时训练配置例如定义新的优化器、使用梯度裁剪、定制训练调度策略和钩子等，请参考文档 [自定义模型](tutorials/customize_models.md) 和 [自定义运行时训练配置](tutorials/customize_runtime.md)。
+
+## 准备配置文件
+
+第三步是准备训练配置所需要的配置文件。假设你打算基于 cityscapes 数据集，在 Cascade Mask R-CNN R50 中新增 `AugFPN` 模块，同时增加 `Rotate` 或者 `Translate` 数据增强策略，假设你的配置文件位于 `configs/cityscapes/` 目录下，并且取名为 `cascade_mask_rcnn_r50_augfpn_autoaug_10e_cityscapes.py`，则配置信息如下：
+
+```python
+# 继承 base 配置，然后进行针对性修改
+_base_ = [
+    '../_base_/models/cascade_mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/cityscapes_instance.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    # 设置为 None，表示不加载 ImageNet 预训练权重，
+    # 后续可以设置 `load_from` 参数用来加载 COCO 预训练权重
+    backbone=dict(init_cfg=None),
+    pretrained=None,
+    # 使用新增的 `AugFPN` 模块代替默认的 `FPN`
+    neck=dict(
+        type='AugFPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    # 我们也需要将 num_classes 从 80 修改为 8 来匹配 cityscapes 数据集标注
+    # 这个修改包括 `bbox_head` 和 `mask_head`.
+    roi_head=dict(
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                # 将 COCO 类别修改为 cityscapes 类别
+                num_classes=8,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                # 将 COCO 类别修改为 cityscapes 类别
+                num_classes=8,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                # 将 COCO 类别修改为 cityscapes 类别
+                num_classes=8,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            # 将 COCO 类别修改为 cityscapes 类别
+            num_classes=8,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))))
+
+# 覆写 `train_pipeline`，然后新增 `AutoAugment` 训练配置
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='AutoAugment',
+        policies=[
+            [dict(
+                 type='Rotate',
+                 level=5,
+                 img_fill_val=(124, 116, 104),
+                 prob=0.5,
+                 scale=1)
+            ],
+            [dict(type='Rotate', level=7, img_fill_val=(124, 116, 104)),
+             dict(
+                 type='Translate',
+                 level=5,
+                 prob=0.5,
+                 img_fill_val=(124, 116, 104))
+            ],
+        ]),
+    dict(
+        type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+
+# 设置每张显卡的批处理大小，同时设置新的训练 pipeline
+data = dict(
+    samples_per_gpu=1,
+    workers_per_gpu=3,
+    # 用新的训练 pipeline 配置覆写 pipeline
+    train=dict(dataset=dict(pipeline=train_pipeline)))
+
+# 设置优化器
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# 设置定制的学习率策略
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[8])
+runner = dict(type='EpochBasedRunner', max_epochs=10)
+
+# 我们采用 COCO 预训练过的 Cascade Mask R-CNN R50 模型权重作为初始化权重，可以得到更加稳定的性能
+load_from = 'http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203-9d4dcb24.pth'
+```
+
+## 训练新模型
+
+为了能够使用新增配置来训练模型，你可以运行如下命令：
+
+```shell
+python tools/train.py configs/cityscapes/cascade_mask_rcnn_r50_augfpn_autoaug_10e_cityscapes.py
+```
+
+如果想了解更多用法，可以参考 [例子1](1_exist_data_model.md)。
+
+## 测试和推理
+
+为了能够测试训练好的模型，你可以运行如下命令：
+
+```shell
+python tools/test.py configs/cityscapes/cascade_mask_rcnn_r50_augfpn_autoaug_10e_cityscapes.py work_dirs/cascade_mask_rcnn_r50_augfpn_autoaug_10e_cityscapes/latest.pth --eval bbox segm
+```
+
+如果想了解更多用法，可以参考 [例子1](1_exist_data_model.md)。
diff --git a/docs/zh_cn/Makefile b/docs/zh_cn/Makefile
new file mode 100755
index 0000000..d4bb2cb
--- /dev/null
+++ b/docs/zh_cn/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/zh_cn/_static/css/readthedocs.css b/docs/zh_cn/_static/css/readthedocs.css
new file mode 100755
index 0000000..57ed0ad
--- /dev/null
+++ b/docs/zh_cn/_static/css/readthedocs.css
@@ -0,0 +1,6 @@
+.header-logo {
+    background-image: url("../image/mmdet-logo.png");
+    background-size: 156px 40px;
+    height: 40px;
+    width: 156px;
+}
diff --git a/docs/zh_cn/_static/image/mmdet-logo.png b/docs/zh_cn/_static/image/mmdet-logo.png
new file mode 100755
index 0000000..58e2b5e
Binary files /dev/null and b/docs/zh_cn/_static/image/mmdet-logo.png differ
diff --git a/docs/zh_cn/api.rst b/docs/zh_cn/api.rst
new file mode 100755
index 0000000..c75a467
--- /dev/null
+++ b/docs/zh_cn/api.rst
@@ -0,0 +1,108 @@
+mmdet.apis
+--------------
+.. automodule:: mmdet.apis
+    :members:
+
+mmdet.core
+--------------
+
+anchor
+^^^^^^^^^^
+.. automodule:: mmdet.core.anchor
+    :members:
+
+bbox
+^^^^^^^^^^
+.. automodule:: mmdet.core.bbox
+    :members:
+
+export
+^^^^^^^^^^
+.. automodule:: mmdet.core.export
+    :members:
+
+mask
+^^^^^^^^^^
+.. automodule:: mmdet.core.mask
+    :members:
+
+evaluation
+^^^^^^^^^^
+.. automodule:: mmdet.core.evaluation
+    :members:
+
+post_processing
+^^^^^^^^^^^^^^^
+.. automodule:: mmdet.core.post_processing
+    :members:
+
+utils
+^^^^^^^^^^
+.. automodule:: mmdet.core.utils
+    :members:
+
+mmdet.datasets
+--------------
+
+datasets
+^^^^^^^^^^
+.. automodule:: mmdet.datasets
+    :members:
+
+pipelines
+^^^^^^^^^^
+.. automodule:: mmdet.datasets.pipelines
+    :members:
+
+samplers
+^^^^^^^^^^
+.. automodule:: mmdet.datasets.samplers
+    :members:
+
+api_wrappers
+^^^^^^^^^^^^
+.. automodule:: mmdet.datasets.api_wrappers
+    :members:
+
+mmdet.models
+--------------
+
+detectors
+^^^^^^^^^^
+.. automodule:: mmdet.models.detectors
+    :members:
+
+backbones
+^^^^^^^^^^
+.. automodule:: mmdet.models.backbones
+    :members:
+
+necks
+^^^^^^^^^^^^
+.. automodule:: mmdet.models.necks
+    :members:
+
+dense_heads
+^^^^^^^^^^^^
+.. automodule:: mmdet.models.dense_heads
+    :members:
+
+roi_heads
+^^^^^^^^^^
+.. automodule:: mmdet.models.roi_heads
+    :members:
+
+losses
+^^^^^^^^^^
+.. automodule:: mmdet.models.losses
+    :members:
+
+utils
+^^^^^^^^^^
+.. automodule:: mmdet.models.utils
+    :members:
+
+mmdet.utils
+--------------
+.. automodule:: mmdet.utils
+    :members:
diff --git a/docs/zh_cn/article.md b/docs/zh_cn/article.md
new file mode 100755
index 0000000..9cd6fb6
--- /dev/null
+++ b/docs/zh_cn/article.md
@@ -0,0 +1,53 @@
+## 中文解读文案汇总
+
+### 1 官方解读文案
+
+#### 1.1 框架解读
+
+- **[轻松掌握 MMDetection 整体构建流程(一)](https://zhuanlan.zhihu.com/p/337375549)**
+- **[轻松掌握 MMDetection 整体构建流程(二)](https://zhuanlan.zhihu.com/p/341954021)**
+- **[轻松掌握 MMDetection 中 Head 流程](https://zhuanlan.zhihu.com/p/343433169)**
+
+#### 1.2 算法解读
+
+- **[轻松掌握 MMDetection 中常用算法(一)：RetinaNet 及配置详解](https://zhuanlan.zhihu.com/p/346198300)**
+- **[轻松掌握 MMDetection 中常用算法(二)：Faster R-CNN|Mask R-CNN](https://zhuanlan.zhihu.com/p/349807581)**
+- [轻松掌握 MMDetection 中常用算法(三)：FCOS](https://zhuanlan.zhihu.com/p/358056615)
+- [轻松掌握 MMDetection 中常用算法(四)：ATSS](https://zhuanlan.zhihu.com/p/358125611)
+- [轻松掌握 MMDetection 中常用算法(五)：Cascade R-CNN](https://zhuanlan.zhihu.com/p/360952172)
+- [轻松掌握 MMDetection 中常用算法(六)：YOLOF](https://zhuanlan.zhihu.com/p/370758213)
+- [轻松掌握 MMDetection 中常用算法(七)：CenterNet](https://zhuanlan.zhihu.com/p/374891478)
+- [轻松掌握 MMDetection 中常用算法(八)：YOLACT](https://zhuanlan.zhihu.com/p/376347955)
+- [轻松掌握 MMDetection 中常用算法(九)：AutoAssign](https://zhuanlan.zhihu.com/p/378581552)
+- [YOLOX 在 MMDetection 中复现全流程解析](https://zhuanlan.zhihu.com/p/398545304)
+- [喂喂喂！你可以减重了！小模型 - MMDetection 新增SSDLite 、 MobileNetV2YOLOV3 两大经典算法](https://zhuanlan.zhihu.com/p/402781143)
+
+#### 1.3 工具解读
+
+- [OpenMMLab 中混合精度训练 AMP 的正确打开方式](https://zhuanlan.zhihu.com/p/375224982)
+- [小白都能看懂！手把手教你使用混淆矩阵分析目标检测](https://zhuanlan.zhihu.com/p/443499860)
+- [MMDetection 图像缩放 Resize 详细说明 OpenMMLab](https://zhuanlan.zhihu.com/p/381117525)
+- [拿什么拯救我的 4G 显卡](https://zhuanlan.zhihu.com/p/430123077)
+- [MMDet居然能用MMCls的Backbone？论配置文件的打开方式](https://zhuanlan.zhihu.com/p/436865195)
+
+#### 1.4 知乎问答
+
+- [COCO数据集上1x模式下为什么不采用多尺度训练?](https://www.zhihu.com/question/462170786/answer/1915119662)
+- [MMDetection中SOTA论文源码中将训练过程中BN层的eval打开?](https://www.zhihu.com/question/471189603/answer/2195540892)
+- [基于PyTorch的MMDetection中训练的随机性来自何处？](https://www.zhihu.com/question/453511684/answer/1839683634)
+- [单阶段、双阶段、anchor-based、anchor-free 这四者之间有什么联系吗？](https://www.zhihu.com/question/428972054/answer/1619925296)
+- [目标检测的深度学习方法，有推荐的书籍或资料吗？](https://www.zhihu.com/question/391577080/answer/1612593817)
+- [大佬们，刚入学研究生，想入门目标检测，有什么学习路线可以入门的？](https://www.zhihu.com/question/343768934/answer/1612580715)
+- [目标检测领域还有什么可以做的？](https://www.zhihu.com/question/280703314/answer/1627885518)
+- [如何看待Transformer在CV上的应用前景，未来有可能替代CNN吗？](https://www.zhihu.com/question/437495132/answer/1686380553)
+- [MMDetection如何学习源码？](https://www.zhihu.com/question/451585041/answer/1832498963)
+- [如何具体上手实现目标检测呢？](https://www.zhihu.com/question/341401981/answer/1848561187)
+
+#### 1.5 其他
+
+- **[不得不知的 MMDetection 学习路线(个人经验版)](https://zhuanlan.zhihu.com/p/369826931)**
+- [OpenMMLab 社区专访之 YOLOX 复现篇](https://zhuanlan.zhihu.com/p/405913343)
+
+### 2 社区解读文案
+
+- [手把手带你实现经典检测网络 Mask R-CNN 的推理](https://zhuanlan.zhihu.com/p/414082071)
diff --git a/docs/zh_cn/compatibility.md b/docs/zh_cn/compatibility.md
new file mode 100755
index 0000000..e9ebdd9
--- /dev/null
+++ b/docs/zh_cn/compatibility.md
@@ -0,0 +1,177 @@
+# MMDetection v2.x 兼容性说明
+
+## MMDetection 2.25.0
+
+为了加入 Mask2Former 实例分割模型，对 Mask2Former 的配置文件进行了重命名 [PR #7571](https://github.com/open-mmlab/mmdetection/pull/7571)：
+
+<table align="center">
+    <thead>
+        <tr align='center'>
+            <td>在 v2.25.0 之前</td>
+            <td>v2.25.0 及之后</td>
+        </tr>
+    </thead>
+    <tbody><tr valign='top'>
+    <th>
+
+```
+'mask2former_xxx_coco.py' 代表全景分割的配置文件
+```
+
+</th>
+    <th>
+
+```
+'mask2former_xxx_coco.py' 代表实例分割的配置文件
+'mask2former_xxx_coco-panoptic.py' 代表全景分割的配置文件
+```
+
+</th></tr>
+  </tbody></table>
+
+## MMDetection 2.21.0
+
+为了支持 CPU 训练，MMCV 中进行批处理的 scatter 的代码逻辑已经被修改。我们推荐使用 MMCV v1.4.4 或更高版本，
+更多信息请参考 [MMCV PR #1621](https://github.com/open-mmlab/mmcv/pull/1621).
+
+## MMDetection 2.18.1
+
+### MMCV compatibility
+
+为了修复 BaseTransformerLayer 中的权重引用问题, MultiheadAttention 中 batch first 的逻辑有所改变。
+我们推荐使用 MMCV v1.3.17 或更高版本。 更多信息请参考 [MMCV PR #1418](https://github.com/open-mmlab/mmcv/pull/1418) 。
+
+## MMDetection 2.18.0
+
+### DIIHead 兼容性
+
+为了支持 QueryInst，在 DIIHead 的返回元组中加入了 attn_feats。
+
+## MMDetection v2.14.0
+
+### MMCV 版本
+
+为了修复 EvalHook 优先级过低的问题，MMCV v1.3.8 中所有 hook 的优先级都重新进行了调整，因此 MMDetection v2.14.0 需要依赖最新的 MMCV v1.3.8 版本。 相关信息请参考[PR #1120](https://github.com/open-mmlab/mmcv/pull/1120) ，相关问题请参考[#5343](https://github.com/open-mmlab/mmdetection/issues/5343) 。
+
+### SSD 兼容性
+
+在 v2.14.0 中，为了使 SSD 能够被更灵活地使用，[PR #5291](https://github.com/open-mmlab/mmdetection/pull/5291) 重构了 SSD 的 backbone、neck 和 head。用户可以使用 tools/model_converters/upgrade_ssd_version.py 转换旧版本训练的模型。
+
+```shell
+python tools/model_converters/upgrade_ssd_version.py ${OLD_MODEL_PATH} ${NEW_MODEL_PATH}
+
+```
+
+- OLD_MODEL_PATH：旧版 SSD 模型的路径。
+- NEW_MODEL_PATH：保存转换后模型权重的路径。
+
+## MMDetection v2.12.0
+
+在 v2.12.0 到 v2.18.0（或以上）版本的这段时间，为了提升通用性和便捷性，MMDetection 正在进行大规模重构。在升级到 v2.12.0 后 MMDetection 不可避免地带来了一些 BC Breaking，包括 MMCV 的版本依赖、模型初始化方式、模型 registry 和 mask AP 的评估。
+
+### MMCV 版本
+
+MMDetection v2.12.0 依赖 MMCV v1.3.3 中新增加的功能，包括：使用 `BaseModule` 统一参数初始化，模型 registry，以及[Deformable DETR](https://arxiv.org/abs/2010.04159) 中的 `MultiScaleDeformableAttn` CUDA 算子。
+注意，尽管 MMCV v1.3.2 已经包含了 MMDet 所需的功能，但是存在一些已知的问题。我们建议用户跳过 MMCV v1.3.2 使用 v1.3.3 版本。
+
+### 统一模型初始化
+
+为了统一 OpenMMLab 项目中的参数初始化方式，MMCV 新增加了 `BaseModule` 类，使用 `init_cfg` 参数对模块进行统一且灵活的初始化配置管理。
+现在用户需要在训练脚本中显式调用 `model.init_weights()` 来初始化模型（例如 [这行代码](https://github.com/open-mmlab/mmdetection/blob/master/tools/train.py#L162) ），在这之前则是在 detector 中进行处理的。
+**下游项目必须相应地更新模型初始化方式才能使用 MMDetection v2.12.0**。请参阅 [PR #4750](https://github.com/open-mmlab/mmdetection/pull/4750) 了解详情。
+
+### 统一模型 registry
+
+为了能够使用在其他 OpenMMLab 项目中实现的 backbone，MMDetection v2.12.0 继承了在 MMCV (#760) 中创建的模型 registry。
+这样，只要 OpenMMLab 项目实现了某个 backbone，并且该项目也使用 MMCV 中的 registry，那么用户只需修改配置即可在 MMDetection 中使用该 backbone，不再需要将代码复制到 MMDetection 中。 更多详细信息，请参阅 [PR #5059](https://github.com/open-mmlab/mmdetection/pull/5059) 。
+
+### Mask AP 评估
+
+在 [PR #4898](https://github.com/open-mmlab/mmdetection/pull/4898) 和 v2.12.0 之前，对小、中、大目标的 mask AP 的评估是基于其边界框区域而不是真正的 mask 区域。
+这导致 `APs` 和 `APm` 变得更高但 `APl` 变得更低，但是不会影响整体的 mask AP。 [PR #4898](https://github.com/open-mmlab/mmdetection/pull/4898) 删除了 mask AP 计算中的 `bbox` ，改为使用 mask 区域。
+新的计算方式不会影响整体的 mask AP 评估，与 [Detectron2](https://github.com/facebookresearch/detectron2/)一致。
+
+## 与 MMDetection v1.x 的兼容性
+
+MMDetection v2.0 经过了大规模重构并解决了许多遗留问题。 MMDetection v2.0 不兼容 v1.x 版本，在这两个版本中使用相同的模型权重运行推理会产生不同的结果。 因此，MMDetection v2.0 重新对所有模型进行了 benchmark，并在 model zoo 中提供了新模型的权重和训练记录。
+
+新旧版本的主要的区别有四方面：坐标系、代码库约定、训练超参和模块设计。
+
+### 坐标系
+
+新坐标系与 [Detectron2](https://github.com/facebookresearch/detectron2/) 一致，
+将最左上角的像素的中心视为坐标原点 (0, 0) 而不是最左上角像素的左上角。 因此 COCO 边界框和分割标注中的坐标被解析为范围 `[0, width]` 和 `[0, height]` 中的坐标。 这个修改影响了所有与 bbox 及像素选择相关的计算，使其变得更加自然且更加准确。
+
+- 在新坐标系中，左上角和右下角为 (x1, y1) (x2, y2) 的框的宽度及高度计算公式为 `width = x2 - x1` 和 `height = y2 - y1`。
+  在 MMDetection v1.x 和之前的版本中，高度和宽度都多了 `+ 1` 的操作。
+  本次修改包括三部分：
+
+  1. box 回归中的检测框变换以及编码/解码。
+  2. IoU 计算。这会影响 ground truth 和检测框之间的匹配以及 NMS 。但对兼容性的影响可以忽略不计。
+  3. Box 的角点坐标为浮点型，不再取整。这能使得检测结果更为准确，也使得检测框和 RoI 的最小尺寸不再为 1，但影响很小。
+
+- Anchor 的中心与特征图的网格点对齐，类型变为 float。
+  在 MMDetection v1.x 和之前的版本中，anchors 是 `int` 类型且没有居中对齐。
+  这会影响 RPN 中的 Anchor 生成和所有基于 Anchor 的方法。
+
+- ROIAlign 更好地与图像坐标系对齐。新的实现来自 [Detectron2](https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign) 。
+  当 RoI 用于提取 RoI 特征时，与 MMDetection v1.x 相比默认情况下相差半个像素。
+  能够通过设置 `aligned=False` 而不是 `aligned=True` 来维持旧版本的设置。
+
+- Mask 的裁剪和粘贴更准确。
+
+  1. 我们使用新的 RoIAlign 来提取 mask 目标。 在 MMDetection v1.x 中，bounding box 在提取 mask 目标之前被取整，裁剪过程是 numpy 实现的。 而在新版本中，裁剪的边界框不经过取整直接输入 RoIAlign。 此实现大大加快了训练速度（每次迭代约加速 0.1 秒，1x schedule 训练 Mask R50 时加速约 2 小时）并且理论上会更准确。
+  2. 在 MMDetection v2.0 中，修改后的 `paste_mask()` 函数应该比之前版本更准确。 此更改参考了 [Detectron2](https://github.com/facebookresearch/detectron2/blob/master/detectron2/structures/masks.py) 中的修改，可以将 COCO 上的 mask AP 提高约 0.5%。
+
+### 代码库约定
+
+- MMDetection v2.0 更改了类别标签的顺序，减少了回归和 mask 分支里的无用参数并使得顺序更加自然（没有 +1 和 -1）。
+  这会影响模型的所有分类层，使其输出的类别标签顺序发生改变。回归分支和 mask head 的最后一层不再为 K 个类别保留 K+1 个通道，类别顺序与分类分支一致。
+
+  - 在 MMDetection v2.0 中，标签 “K” 表示背景，标签 \[0, K-1\] 对应于 K = num_categories 个对象类别。
+
+  - 在 MMDetection v1.x 及之前的版本中，标签 “0” 表示背景，标签 \[1, K\] 对应 K 个类别。
+
+  - **注意**：softmax RPN 的类顺序在 version\<=2.4.0 中仍然和 1.x 中的一样，而 sigmoid RPN 不受影响。从 MMDetection v2.5.0 开始，所有 head 中的类顺序是统一的。
+
+- 不使用 R-CNN 中的低质量匹配。在 MMDetection v1.x 和之前的版本中，`max_iou_assigner` 会在 RPN 和 R-CNN 训练时给每个 ground truth 匹配低质量框。我们发现这会导致最佳的 GT 框不会被分配给某些边界框，
+  因此，在MMDetection v2.0 的 R-CNN 训练中默认不允许低质量匹配。这有时可能会稍微改善 box AP（约为 0.1%）。
+
+- 单独的宽高比例系数。在 MMDetection v1.x 和以前的版本中，`keep_ratio=True` 时比例系数是单个浮点数，这并不准确，因为宽度和高度的比例系数会有一定的差异。 MMDetection v2.0 对宽度和高度使用单独的比例系数，对 AP 的提升约为 0.1%。
+
+- 修改了 config 文件名称的规范。 由于 model zoo 中模型不断增多， MMDetection v2.0 采用新的命名规则：
+
+  ```shell
+  [model]_(model setting)_[backbone]_[neck]_(norm setting)_(misc)_(gpu x batch)_[schedule]_[dataset].py
+  ```
+
+  其中 (`misc`) 包括 DCN 和 GCBlock 等。更多详细信息在 [配置文件说明文档](config.md) 中说明
+
+- MMDetection v2.0 使用新的 ResNet Caffe backbone 来减少加载预训练模型时的警告。新 backbone 中的大部分权重与以前的相同，但没有 `conv.bias`，且它们使用不同的 `img_norm_cfg`。因此，新的 backbone 不会报 `unexpected keys` 的警告。
+
+### 训练超参
+
+训练超参的调整不会影响模型的兼容性，但会略微提高性能。主要有：
+
+- 通过设置 `nms_post=1000` 和 `max_num=1000`，将 nms 之后的 proposal 数量从 2000 更改为 1000。使 mask AP 和 bbox AP 提高了约 0.2%。
+
+- Mask R-CNN、Faster R-CNN 和 RetinaNet 的默认回归损失从 smooth L1 损失更改为 L1 损失，使得 box AP 整体上都有所提升（约 0.6%）。但是，将 L1-loss 用在 Cascade R-CNN 和 HTC 等其他方法上并不能提高性能，因此我们保留这些方法的原始设置。
+
+- 为简单起见，RoIAlign 层的 `sampling_ratio` 设置为 0。略微提升了 AP（约 0.2% 绝对值）。
+
+- 为了提升训练速度，默认设置在训练过程中不再使用梯度裁剪。大多数模型的性能不会受到影响。对于某些模型（例如 RepPoints），我们依旧使用梯度裁剪来稳定训练过程从而获得更好的性能。
+
+- 因为不再默认使用梯度裁剪，默认 warmup 比率从 1/3 更改为 0.001，以使模型训练预热更加平缓。不过我们重新进行基准测试时发现这种影响可以忽略不计。
+
+### 将模型从 v1.x 升级至 v2.0
+
+用户可以使用脚本 `tools/model_converters/upgrade_model_version.py` 来将 MMDetection 1.x 训练的模型转换为 MMDetection v2.0。转换后的模型可以在 MMDetection v2.0 中运行，但性能略有下降（小于 1% AP）。
+详细信息可以在 `configs/legacy` 中找到。
+
+## pycocotools 兼容性
+
+`mmpycocotools` 是 OpenMMLab 维护的 `pycocotools` 的复刻版，适用于 MMDetection 和 Detectron2。
+在 [PR #4939](https://github.com/open-mmlab/mmdetection/pull/4939) 之前，由于 `pycocotools` 和 `mmpycocotools` 具有相同的包名，如果用户已经安装了 `pycocotools`（在相同环境下先安装了 Detectron2 ），那么 MMDetection 的安装过程会跳过安装 `mmpycocotools`，导致 MMDetection 缺少 `mmpycocotools` 而报错。
+但如果在 Detectron2 之前安装 MMDetection，则可以在相同的环境下工作。
+[PR #4939](https://github.com/open-mmlab/mmdetection/pull/4939) 弃用 mmpycocotools，使用官方 pycocotools。
+在 [PR #4939](https://github.com/open-mmlab/mmdetection/pull/4939) 之后，用户能够在相同环境下安装 MMDetection 和 Detectron2，不再需要关注安装顺序。
diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py
new file mode 100755
index 0000000..1bb57a4
--- /dev/null
+++ b/docs/zh_cn/conf.py
@@ -0,0 +1,118 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import subprocess
+import sys
+
+import pytorch_sphinx_theme
+
+sys.path.insert(0, os.path.abspath('../../'))  # repo root, for autodoc
+
+# -- Project information -----------------------------------------------------
+
+project = 'MMDetection'
+copyright = '2018-2021, OpenMMLab'
+author = 'MMDetection Authors'
+version_file = '../../mmdet/version.py'  # read by get_version()
+
+
+def get_version():
+    with open(version_file, 'r') as f:
+        exec(compile(f.read(), version_file, 'exec'))
+    return locals()['__version__']
+
+
+# The full version, including alpha/beta/rc tags; taken from version_file.
+release = get_version()
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.napoleon',
+    'sphinx.ext.viewcode',
+    'myst_parser',
+    'sphinx_markdown_tables',
+    'sphinx_copybutton',
+]
+
+myst_enable_extensions = ['colon_fence']  # optional MyST syntax to turn on
+myst_heading_anchors = 3  # NOTE(review): presumably max heading level given anchors - confirm
+
+autodoc_mock_imports = [  # modules autodoc should mock instead of importing
+    'matplotlib', 'pycocotools', 'terminaltables', 'mmdet.version', 'mmcv.ops'
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# Each suffix is mapped to the name of the file type it is parsed as:
+#
+source_suffix = {
+    '.rst': 'restructuredtext',
+    '.md': 'markdown',
+}
+
+# The master toctree document.
+master_doc = 'index'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+# Alternative kept for reference: html_theme = 'sphinx_rtd_theme'
+html_theme = 'pytorch_sphinx_theme'
+html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]
+
+html_theme_options = {
+    'menu': [
+        {
+            'name': 'GitHub',
+            'url': 'https://github.com/open-mmlab/mmdetection'
+        },
+    ],
+    # Language of the shared menu ('cn' is the value used for this tree)
+    'menu_lang':
+    'cn',
+}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+html_css_files = ['css/readthedocs.css']  # NOTE(review): presumably looked up under html_static_path - confirm
+
+language = 'zh_CN'  # this conf.py belongs to the docs/zh_cn tree
+
+# -- Extension configuration -------------------------------------------------
+# Ignore the ">>> " and "... " prompts when copying code (regex below)
+copybutton_prompt_text = r'>>> |\.\.\. '
+copybutton_prompt_is_regexp = True
+
+
+def builder_inited_handler(app):
+    subprocess.run(['./stat.py'])
+
+
+def setup(app):  # registers builder_inited_handler for 'builder-inited'
+    app.connect('builder-inited', builder_inited_handler)
diff --git a/docs/zh_cn/conventions.md b/docs/zh_cn/conventions.md
new file mode 100755
index 0000000..acbb21e
--- /dev/null
+++ b/docs/zh_cn/conventions.md
@@ -0,0 +1,75 @@
+# 默认约定
+
+如果你想把 MMDetection 修改为自己的项目，请遵循下面的约定。
+
+## 损失
+
+在 MMDetection 中，`model(**data)` 的返回值是一个字典，其中包含所有的损失和评价指标。
+
+例如，在 bbox head 中，
+
+```python
+class BBoxHead(nn.Module):
+    ...
+    def loss(self, ...):
+        losses = dict()
+        # 分类损失
+        losses['loss_cls'] = self.loss_cls(...)
+        # 分类准确率
+        losses['acc'] = accuracy(...)
+        # 边界框损失
+        losses['loss_bbox'] = self.loss_bbox(...)
+        return losses
+```
+
+`'bbox_head.loss()'` 在模型 forward 阶段会被调用。返回的字典中包含了 `'loss_bbox'`,`'loss_cls'`,`'acc'`。只有 `'loss_bbox'`, `'loss_cls'` 会被用于反向传播，`'acc'` 只会被作为评价指标来监控训练过程。
+
+我们默认，只有那些键的名称中包含 `'loss'` 的值会被用于反向传播。这个行为可以通过修改 `BaseDetector.train_step()` 来改变。
+
+## 空 proposals
+
+在 MMDetection 中，我们为两阶段方法中空 proposals 的情况增加了特殊处理和单元测试。我们同时需要处理整个 batch 和单一图片中空 proposals 的情况。例如，在 CascadeRoIHead 中，
+
+```python
+# 简单的测试
+...
+
+# 在整个 batch中 都没有 proposals
+if rois.shape[0] == 0:
+    bbox_results = [[
+        np.zeros((0, 5), dtype=np.float32)
+        for _ in range(self.bbox_head[-1].num_classes)
+    ]] * num_imgs
+    if self.with_mask:
+        mask_classes = self.mask_head[-1].num_classes
+        segm_results = [[[] for _ in range(mask_classes)]
+                        for _ in range(num_imgs)]
+        results = list(zip(bbox_results, segm_results))
+    else:
+        results = bbox_results
+    return results
+...
+
+# 在单张图片中没有 proposals
+for i in range(self.num_stages):
+    ...
+    if i < self.num_stages - 1:
+          for j in range(num_imgs):
+                   # 处理空 proposals
+                   if rois[j].shape[0] > 0:
+                       bbox_label = cls_score[j][:, :-1].argmax(dim=1)
+                       refine_roi = self.bbox_head[i].regress_by_class(
+                            rois[j], bbox_label[j], bbox_pred[j], img_metas[j])
+                       refine_roi_list.append(refine_roi)
+```
+
+如果你有自定义的 `RoIHead`, 你可以参考上面的方法来处理空 proposals 的情况。
+
+## 全景分割数据集
+
+在 MMDetection 中，我们支持了 COCO 全景分割数据集 `CocoPanopticDataset`。对于它的实现，我们在这里声明一些默认约定。
+
+1. 在 mmdet\<=2.16.0 时，语义分割标注中的前景和背景标签范围与 MMDetection 中的默认规定有所不同。标签 `0` 代表 `VOID` 标签。
+   从 mmdet=2.17.0 开始，为了和框的类别标注保持一致，语义分割标注的类别标签也改为从 `0` 开始，标签 `255` 代表 `VOID` 类。
+   为了达成这一目标，我们在流程 `Pad` 里支持了设置 `seg` 的填充值的功能。
+2. 在评估中，全景分割结果必须是一个与原图大小相同的图。结果图中每个像素的值有如此形式：`instance_id * INSTANCE_OFFSET + category_id`。
diff --git a/docs/zh_cn/device/npu.md b/docs/zh_cn/device/npu.md
new file mode 100755
index 0000000..b332b4f
--- /dev/null
+++ b/docs/zh_cn/device/npu.md
@@ -0,0 +1,54 @@
+# NPU (华为 昇腾)
+
+## 使用方法
+
+请参考 [MMCV 的安装文档](https://mmcv.readthedocs.io/en/latest/get_started/build.html#build-mmcv-full-on-ascend-npu-machine) 来安装 NPU 版本的 MMCV。
+
+以下展示单机八卡场景的运行指令:
+
+```shell
+bash tools/dist_train.sh configs/ssd/ssd300_coco.py 8
+```
+
+以下展示单机单卡下的运行指令:
+
+```shell
+python tools/train.py configs/ssd/ssd300_coco.py
+```
+
+## 模型验证结果
+
+|        Model         | box AP | mask AP | Config                                                                                                                        | Download                                                                                                     |
+| :------------------: | :----: | :-----: | :---------------------------------------------------------------------------------------------------------------------------- | :----------------------------------------------------------------------------------------------------------- |
+|     [ssd300](<>)     |  25.6  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ssd/ssd300_fp16_coco.py)                               | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/ssd300_coco.log.json)                              |
+|     [ssd512](<>)     |  29.4  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ssd/ssd512_fp16_coco.py)                               | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/ssd512_coco.log.json)                              |
+| [ssdlite-mbv2\*](<>) |  20.2  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ssd/ssdlite_mobilenetv2_scratch_600e_coco.py)          | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/ssdlite_mobilenetv2_scratch_600e_coco.log.json)    |
+| [retinanet-r18](<>)  |  31.8  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/retinanet/retinanet_r18_fpn_1x8_1x_coco.py)            | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/retinanet_r18_fpn_1x8_1x_coco.log.json)            |
+| [retinanet-r50](<>)  |  36.6  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/retinanet/retinanet_r50_fpn_fp16_1x_coco.py)           | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/retinanet_r50_fpn_1x_coco.log.json)                |
+|   [yolov3-608](<>)   |  34.7  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolo/yolov3_d53_fp16_mstrain-608_273e_coco.py)         | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/yolov3_d53_fp16_mstrain-608_273e_coco.log.json)    |
+|  [yolox-s\*\*](<>)   |  39.9  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolox/yolox_s_8x8_300e_coco.py)                        | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/yolox_s_8x8_300e_coco.log.json)                    |
+| [centernet-r18](<>)  |  26.1  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/centernet/centernet_resnet18_140e_coco.py)             | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/centernet_resnet18_140e_coco.log.json)             |
+|   [fcos-r50\*](<>)   |  36.1  |   ---   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/fcos/fcos_r50_caffe_fpn_gn-head_fp16_1x_bs8x8_coco.py) | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/fcos_r50_caffe_fpn_gn-head_1x_coco_bs8x8.log.json) |
+|   [solov2-r50](<>)   |  ---   |  34.7   | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/solov2/solov2_r50_fpn_1x_coco.py)                      | [log](https://download.openmmlab.com/mmdetection/v2.0/npu/solov2_r50_fpn_1x_coco.log.json)                   |
+
+**注意:**
+
+- 如果没有特别标记，NPU 上的结果与使用 FP32 的 GPU 上的结果相同。
+- (\*) 这些模型在 NPU 上的结果与 GPU 上的混合精度训练结果一致，但低于 FP32 的结果。这种情况主要与模型本身在混合精度训练中的特点有关，
+  用户可以自行调整超参数来获得更高精度。
+- (\*\*) GPU 上 yolox-s 在混合精度下的精度为 40.1，低于 README 中 40.5 的水平；默认情况下，yolox-s 启用 `persistent_workers=True`，但这个参数
+  目前在 NPU 上存在一些 bug，会导致在最后几个 epoch 由于资源耗尽报错退出，对整体精度影响有限，可以忽略。
+
+## Ascend加速模块验证结果
+
+优化方案简介：
+
+1. 修改循环计算为一次整体计算，目的是减少下发指令数量。
+2. 修改索引计算为掩码计算，原因是SIMD架构芯片擅长处理连续数据计算。
+
+|           Model            |                                                          Config                                                           | v100 iter time |       910A iter time       |
+| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------: | :------------: | :------------------------: |
+|    [ascend-ssd300](<>)     |          [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/ssd/ascend_ssd300_fp16_coco.py)           |  0.165s/iter   | 0.383s/iter -> 0.13s/iter  |
+| [ascend-retinanet-r18](<>) | [config](https://github.com/open-mmlab/mmdetection/blob/master/configs/retinanet/ascend_retinanet_r18_fpn_1x8_1x_coco.py) |  0.567s/iter   | 0.780s/iter -> 0.420s/iter |
+
+**以上模型结果由华为昇腾团队提供**
diff --git a/docs/zh_cn/faq.md b/docs/zh_cn/faq.md
new file mode 100755
index 0000000..8f9bcf8
--- /dev/null
+++ b/docs/zh_cn/faq.md
@@ -0,0 +1,162 @@
+# 常见问题解答
+
+我们在这里列出了使用时的一些常见问题及其相应的解决方案。 如果您发现有一些问题被遗漏，请随时提 PR 丰富这个列表。 如果您无法在此获得帮助，请使用 [issue模板](https://github.com/open-mmlab/mmdetection/blob/master/.github/ISSUE_TEMPLATE/error-report.md/)创建问题，但是请在模板中填写所有必填信息，这有助于我们更快定位问题。
+
+## MMCV 安装相关
+
+- MMCV 与 MMDetection 的兼容问题: "ConvWS is already registered in conv layer"; "AssertionError: MMCV==xxx is used but incompatible. Please install mmcv>=xxx, \<=xxx."
+
+  请按 [安装说明](https://mmdetection.readthedocs.io/zh_CN/latest/get_started.html#installation) 为你的 MMDetection 安装正确版本的 MMCV 。
+
+- "No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'".
+
+  原因是安装了 `mmcv` 而不是 `mmcv-full`。
+
+  1. `pip uninstall mmcv` 卸载安装的 `mmcv`
+
+  2. 根据 [安装说明](https://mmcv.readthedocs.io/zh/latest/#installation) 安装 `mmcv-full`。
+
+## PyTorch/CUDA 环境相关
+
+- "RTX 30 series card fails when building MMCV or MMDet"
+
+  1. 临时解决方案为使用命令 `MMCV_WITH_OPS=1 MMCV_CUDA_ARGS='-gencode=arch=compute_80,code=sm_80' pip install -e .` 进行编译。 常见报错信息为 `nvcc fatal : Unsupported gpu architecture 'compute_86'`，意思是你的编译器不支持 sm_86 架构(包括英伟达 30 系列的显卡)的优化，至 CUDA toolkit 11.0 依旧未支持。 这个命令是通过增加宏 `MMCV_CUDA_ARGS='-gencode=arch=compute_80,code=sm_80'` 让 nvcc 编译器为英伟达 30 系列显卡进行 `sm_80` 的优化，虽然这有可能会无法发挥出显卡所有性能。
+
+  2. 有开发者已经在 [pytorch/pytorch#47585](https://github.com/pytorch/pytorch/pull/47585) 更新了 PyTorch 默认的编译 flag， 但是我们对此并没有进行测试。
+
+- "invalid device function" or "no kernel image is available for execution".
+
+  1. 检查您正常安装了 CUDA runtime (一般在`/usr/local/`)，或者使用 `nvcc --version` 检查本地版本，有时安装 PyTorch 会顺带安装一个 CUDA runtime，并且实际优先使用 conda 环境中的版本，你可以使用 `conda list cudatoolkit` 查看其版本。
+
+  2. 编译 extension 的 CUDA Toolkit 版本与运行时的 CUDA Toolkit 版本是否相符，
+
+     - 如果您是从源码自己编译的，使用 `python mmdet/utils/collect_env.py` 检查编译 extension 的 CUDA Toolkit 版本，然后使用 `conda list cudatoolkit` 检查当前 conda 环境是否有 CUDA Toolkit，若有则检查版本是否匹配，如不匹配，更换 conda 环境的 CUDA Toolkit，或者使用匹配的 CUDA Toolkit 中的 nvcc 编译即可。
+
+       如环境中无 CUDA Toolkit，可以使用 `nvcc -V` 等命令查看当前使用的 CUDA runtime。
+
+     - 如果您是通过 pip 下载的预编译好的版本，请确保与当前 CUDA runtime 一致。
+
+  3. 运行 `python mmdet/utils/collect_env.py` 检查是否为正确的 GPU 架构编译的 PyTorch， torchvision， 与 MMCV。 你或许需要设置 `TORCH_CUDA_ARCH_LIST` 来重新安装 MMCV，可以参考 [GPU 架构表](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#gpu-feature-list),
+     例如，运行 `TORCH_CUDA_ARCH_LIST=7.0 pip install mmcv-full` 为 Volta GPU 编译 MMCV。这种架构不匹配的问题一般会在使用一些旧型号的 GPU 时出现，例如 Tesla K80。
+
+- "undefined symbol" or "cannot open xxx.so".
+
+  1. 如果这些 symbol 属于 CUDA/C++ (如 libcudart.so 或者 GLIBCXX)，使用 `python mmdet/utils/collect_env.py`检查 CUDA/GCC runtime 与编译 MMCV 的 CUDA 版本是否相同。
+  2. 如果这些 symbol 属于 PyTorch（例如包含 caffe、aten 和 TH 的 symbol），检查当前 PyTorch 版本是否与编译 MMCV 时使用的版本一致。
+  3. 运行 `python mmdet/utils/collect_env.py` 检查 PyTorch， torchvision， MMCV 等的编译环境与运行环境一致。
+
+- setuptools.sandbox.UnpickleableException: DistutilsSetupError("each element of 'ext_modules' option must be an Extension instance or 2-tuple")
+
+  1. 如果你在使用 miniconda 而不是 anaconda，检查是否正确的安装了 Cython 如 [#3379](https://github.com/open-mmlab/mmdetection/issues/3379).
+  2. 检查环境中的 `setuptools`, `Cython`, and `PyTorch` 相互之间版本是否匹配。
+
+- "Segmentation fault".
+
+  1. 检查 GCC 的版本，通常是因为 PyTorch 版本与 GCC 版本不匹配 （例如 GCC \< 4.9 )，我们推荐用户使用 GCC 5.4，我们也不推荐使用 GCC 5.5， 因为有反馈 GCC 5.5 会导致 "segmentation fault" 并且切换到 GCC 5.4 就可以解决问题。
+
+  2. 检查是否正确安装了 CUDA 版本的 PyTorch 。
+
+     ```shell
+     python -c 'import torch; print(torch.cuda.is_available())'
+     ```
+
+     是否返回True。
+
+  3. 如果 `torch` 的安装是正确的，检查是否正确编译了 MMCV。
+
+     ```shell
+     python -c 'import mmcv; import mmcv.ops'
+     ```
+
+  4. 如果 MMCV 与 PyTorch 都被正确安装了，则使用 `ipdb`, `pdb` 设置断点，直接查找哪一部分的代码导致了 `segmentation fault`。
+
+## Training 相关
+
+- "Loss goes Nan"
+
+  1. 检查数据的标注是否正常， 长或宽为 0 的框可能会导致回归 loss 变为 nan，一些小尺寸（宽度或高度小于 1）的框在数据增强（例如，instaboost）后也会导致此问题。 因此，可以检查标注并过滤掉那些特别小甚至面积为 0 的框，并关闭一些可能会导致 0 面积框出现的数据增强。
+  2. 降低学习率：由于某些原因，例如 batch size 大小的变化， 导致当前学习率可能太大。 您可以降低为可以稳定训练模型的值。
+  3. 延长 warm up 的时间：一些模型在训练初始时对学习率很敏感，您可以把 `warmup_iters` 从 500 更改为 1000 或 2000。
+  4. 添加 gradient clipping：一些模型需要梯度裁剪来稳定训练过程。默认的 `grad_clip` 是 `None`，你可以在 config 中设置 `optimizer_config=dict(_delete_=True, grad_clip=dict(max_norm=35, norm_type=2))`。如果你的 config 没有继承任何包含 `optimizer_config=dict(grad_clip=None)` 的基础配置，你可以直接设置 `optimizer_config=dict(grad_clip=dict(max_norm=35, norm_type=2))`。
+
+- "GPU out of memory"
+
+  1. 存在大量 ground truth boxes 或者大量 anchor 的场景，可能在 assigner 会 OOM。 您可以在 assigner 的配置中设置 `gpu_assign_thr=N`，这样当超过 N 个 GT boxes 时，assigner 会通过 CPU 计算 IOU。
+
+  2. 在 backbone 中设置 `with_cp=True`。 这使用 PyTorch 中的 `sublinear strategy` 来降低 backbone 占用的 GPU 显存。
+
+  3. 使用 `config/fp16` 中的示例尝试混合精度训练。`loss_scale` 可能需要针对不同模型进行调整。
+
+  4. 你也可以尝试使用 `AvoidCUDAOOM` 来避免该问题。首先它将尝试调用 `torch.cuda.empty_cache()`。如果失败，将会尝试把输入类型转换到 FP16。如果仍然失败，将会把输入从 GPUs 转换到 CPUs 进行计算。这里提供了两个使用的例子：
+
+     ```python
+     from mmdet.utils import AvoidCUDAOOM
+
+     output = AvoidCUDAOOM.retry_if_cuda_oom(some_function)(input1, input2)
+     ```
+
+     你也可以使用 `AvoidCUDAOOM` 作为装饰器让代码遇到 OOM 的时候继续运行：
+
+     ```python
+     from mmdet.utils import AvoidCUDAOOM
+
+     @AvoidCUDAOOM.retry_if_cuda_oom
+     def function(*args, **kwargs):
+         ...
+         return xxx
+     ```
+
+- "RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one"
+
+  1. 这个错误出现在存在参数没有在 forward 中使用，容易在 DDP 中运行不同分支时发生。
+  2. 你可以在 config 设置 `find_unused_parameters = True` 进行训练 (会降低训练速度)。
+  3. 你也可以通过在 config 中的 `optimizer_config` 里设置 `detect_anomalous_params=True` 查找哪些参数没有用到，但是需要 MMCV 的版本 >= 1.4.1。
+
+- 训练中保存最好模型
+
+  可以通过配置 `evaluation = dict(save_best='auto')` 开启。在 auto 参数情况下会将返回的验证结果中的第一个 key 作为选择最优模型的依据，你也可以直接设置评估结果中的 key 来手动设置，例如 `evaluation = dict(save_best='mAP')`。
+
+- 在 Resume 训练中使用 `ExpMomentumEMAHook`
+
+  如果在训练中使用了 `ExpMomentumEMAHook`，那么 resume 时候不能仅仅通过命令行参数 `--resume-from` 或 `--cfg-options resume_from` 实现恢复模型参数功能例如 `python tools/train.py configs/yolox/yolox_s_8x8_300e_coco.py --resume-from ./work_dir/yolox_s_8x8_300e_coco/epoch_x.pth`。以 `yolox_s` 算法为例，由于 `ExpMomentumEMAHook` 需要重新加载权重，你可以通过如下做法实现：
+
+  ```python
+  # 直接打开 configs/yolox/yolox_s_8x8_300e_coco.py 修改所有 resume_from 字段
+  resume_from='./work_dir/yolox_s_8x8_300e_coco/epoch_x.pth'
+  custom_hooks=[...
+      dict(
+          type='ExpMomentumEMAHook',
+          resume_from='./work_dir/yolox_s_8x8_300e_coco/epoch_x.pth',
+          momentum=0.0001,
+          priority=49)
+      ]
+  ```
+
+## Evaluation 相关
+
+- 使用 COCO Dataset 的测评接口时, 测评结果中 AP 或者 AR = -1
+  1. 根据COCO数据集的定义，一张图像中的中等物体与小物体面积的阈值分别为 9216（96\*96）与 1024（32\*32）。
+  2. 如果在某个区间没有对应大小的标注物体，AP 与 AR 认定为 -1。
+
+## Model 相关
+
+- **ResNet style 参数说明**
+
+  ResNet style 可选参数允许 `pytorch` 和 `caffe`，其差别在于 Bottleneck 模块。Bottleneck 是 `1x1-3x3-1x1` 堆叠结构，在 `caffe` 模式下 stride=2 参数放置在第一个 `1x1` 卷积处，而 `pytorch` 模式下 stride=2 放在第二个 `3x3` 卷积处。一个简单示例如下：
+
+  ```python
+  if self.style == 'pytorch':
+        self.conv1_stride = 1
+        self.conv2_stride = stride
+  else:
+        self.conv1_stride = stride
+        self.conv2_stride = 1
+  ```
+
+- **ResNeXt 参数说明**
+
+  ResNeXt 来自论文 [`Aggregated Residual Transformations for Deep Neural Networks`](https://arxiv.org/abs/1611.05431)。其引入分组卷积，并且通过变量基数来控制组的数量达到精度和复杂度的平衡，其有两个超参 `baseWidth` 和 `cardinality` 来控制内部 Bottleneck 模块的基本宽度和分组数参数。以 MMDetection 中配置名为 `mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py` 为例，其中 `mask_rcnn` 代表算法采用 Mask R-CNN，`x101` 代表骨架网络采用 ResNeXt-101，`64x4d` 代表 Bottleneck 一共分成 64 组，每组的基本宽度是 4。
+
+- **骨架网络 eval 模式说明**
+
+  因为检测模型通常比较大且输入图片分辨率很高，这会导致检测模型的 batch 很小，通常是 2，这会使得 BatchNorm 在训练过程计算的统计量方差非常大，不如主干网络预训练时得到的统计量稳定，因此在训练时一般都会使用 `norm_eval=True` 模式，直接使用预训练主干网络中的 BatchNorm 统计量，少数使用大 batch 的算法是 `norm_eval=False` 模式，例如 NASFPN。对于没有 ImageNet 预训练的骨架网络，如果 batch 比较小，可以考虑使用 `SyncBN`。
diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md
new file mode 100755
index 0000000..40d123a
--- /dev/null
+++ b/docs/zh_cn/get_started.md
@@ -0,0 +1,264 @@
+## 依赖
+
+- Linux 和 macOS （Windows 理论上支持）
+- Python 3.7 +
+- PyTorch 1.3+
+- CUDA 9.2+ （如果基于 PyTorch 源码安装，也能够支持 CUDA 9.0）
+- GCC 5+
+- [MMCV](https://mmcv.readthedocs.io/en/latest/#installation)
+
+MMDetection 和 MMCV 版本兼容性如下所示，需要安装正确的 MMCV 版本以避免安装出现问题。
+
+| MMDetection 版本 |         MMCV 版本          |
+| :--------------: | :------------------------: |
+|      master      | mmcv-full>=1.3.17, \<1.8.0 |
+|      2.28.2      | mmcv-full>=1.3.17, \<1.8.0 |
+|      2.28.1      | mmcv-full>=1.3.17, \<1.8.0 |
+|      2.28.0      | mmcv-full>=1.3.17, \<1.8.0 |
+|      2.27.0      | mmcv-full>=1.3.17, \<1.8.0 |
+|      2.26.0      | mmcv-full>=1.3.17, \<1.8.0 |
+|      2.25.3      | mmcv-full>=1.3.17, \<1.7.0 |
+|      2.25.2      | mmcv-full>=1.3.17, \<1.7.0 |
+|      2.25.1      | mmcv-full>=1.3.17, \<1.6.0 |
+|      2.25.0      | mmcv-full>=1.3.17, \<1.6.0 |
+|      2.24.1      | mmcv-full>=1.3.17, \<1.6.0 |
+|      2.24.0      | mmcv-full>=1.3.17, \<1.6.0 |
+|      2.23.0      | mmcv-full>=1.3.17, \<1.5.0 |
+|      2.22.0      | mmcv-full>=1.3.17, \<1.5.0 |
+|      2.21.0      | mmcv-full>=1.3.17, \<1.5.0 |
+|      2.20.0      | mmcv-full>=1.3.17, \<1.5.0 |
+|      2.19.1      | mmcv-full>=1.3.17, \<1.5.0 |
+|      2.19.0      | mmcv-full>=1.3.17, \<1.5.0 |
+|      2.18.1      | mmcv-full>=1.3.17, \<1.4.0 |
+|      2.18.0      | mmcv-full>=1.3.14, \<1.4.0 |
+|      2.17.0      | mmcv-full>=1.3.14, \<1.4.0 |
+|      2.16.0      | mmcv-full>=1.3.8, \<1.4.0  |
+|      2.15.1      | mmcv-full>=1.3.8, \<1.4.0  |
+|      2.15.0      | mmcv-full>=1.3.8, \<1.4.0  |
+|      2.14.0      | mmcv-full>=1.3.8, \<1.4.0  |
+|      2.13.0      | mmcv-full>=1.3.3, \<1.4.0  |
+|      2.12.0      | mmcv-full>=1.3.3, \<1.4.0  |
+|      2.11.0      | mmcv-full>=1.2.4, \<1.4.0  |
+|      2.10.0      | mmcv-full>=1.2.4, \<1.4.0  |
+|      2.9.0       | mmcv-full>=1.2.4, \<1.4.0  |
+|      2.8.0       | mmcv-full>=1.2.4, \<1.4.0  |
+|      2.7.0       | mmcv-full>=1.1.5, \<1.4.0  |
+|      2.6.0       | mmcv-full>=1.1.5, \<1.4.0  |
+|      2.5.0       | mmcv-full>=1.1.5, \<1.4.0  |
+|      2.4.0       | mmcv-full>=1.1.1, \<1.4.0  |
+|      2.3.0       |      mmcv-full==1.0.5      |
+|     2.3.0rc0     |      mmcv-full>=1.0.2      |
+|      2.2.1       |        mmcv==0.6.2         |
+|      2.2.0       |        mmcv==0.6.2         |
+|      2.1.0       |   mmcv>=0.5.9, \<=0.6.1    |
+|      2.0.0       |   mmcv>=0.5.1, \<=0.5.8    |
+
+**注意**：如果已经安装了 mmcv，首先需要使用 `pip uninstall mmcv` 卸载已安装的 mmcv。如果同时安装了 mmcv 和 mmcv-full，将会报 `ModuleNotFoundError` 错误。
+
+## 安装流程
+
+### 从零开始设置脚本
+
+假设当前已经成功安装 CUDA 10.1，这里提供了一个完整的基于 conda 安装 MMDetection 的脚本。您可以参考下一节中的分步安装说明。
+
+```shell
+conda create -n openmmlab python=3.7 pytorch==1.6.0 cudatoolkit=10.1 torchvision -c pytorch -y
+conda activate openmmlab
+pip install openmim
+mim install mmcv-full
+git clone https://github.com/open-mmlab/mmdetection.git
+cd mmdetection
+pip install -r requirements/build.txt
+pip install -v -e .
+```
+
+### 准备环境
+
+1. 使用 conda 新建虚拟环境，并进入该虚拟环境；
+
+   ```shell
+   conda create -n open-mmlab python=3.7 -y
+   conda activate open-mmlab
+   ```
+
+2. 基于 [PyTorch 官网](https://pytorch.org/)安装 PyTorch 和 torchvision，例如：
+
+   ```shell
+   conda install pytorch torchvision -c pytorch
+   ```
+
+   **注意**：需要确保 CUDA 的编译版本和运行版本匹配。可以在 [PyTorch 官网](https://pytorch.org/)查看预编译包所支持的 CUDA 版本。
+
+   `例 1` 例如在 `/usr/local/cuda` 下安装了 CUDA 10.1， 并想安装 PyTorch 1.5，则需要安装支持 CUDA 10.1 的预构建 PyTorch：
+
+   ```shell
+   conda install pytorch cudatoolkit=10.1 torchvision -c pytorch
+   ```
+
+   `例 2` 例如在 `/usr/local/cuda` 下安装了 CUDA 9.2， 并想安装 PyTorch 1.3.1，则需要安装支持 CUDA 9.2 的预构建 PyTorch：
+
+   ```shell
+   conda install pytorch=1.3.1 cudatoolkit=9.2 torchvision=0.4.2 -c pytorch
+   ```
+
+   如果不是安装预构建的包，而是从源码中构建 PyTorch，则可以使用更多的 CUDA 版本，例如 CUDA 9.0。
+
+### 安装 MMDetection
+
+我们建议使用 [MIM](https://github.com/open-mmlab/mim) 来安装 MMDetection：
+
+```shell
+pip install openmim
+mim install mmdet
+```
+
+MIM 能够自动地安装 OpenMMLab 的项目以及对应的依赖包。
+
+或者，可以手动安装 MMDetection：
+
+1. 安装 mmcv-full，我们建议使用预构建包来安装：
+
+   ```shell
+   pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html
+   ```
+
+   需要把命令行中的 `{cu_version}` 和 `{torch_version}` 替换成对应的版本。例如：在 CUDA 11 和 PyTorch 1.7.0 的环境下，可以使用下面命令安装最新版本的 MMCV：
+
+   ```shell
+   pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html
+   ```
+
+   请参考 [MMCV](https://mmcv.readthedocs.io/en/latest/#installation) 获取不同版本的 MMCV 所兼容的不同的 PyTorch 和 CUDA 版本。同时，也可以通过以下命令行从源码编译 MMCV：
+
+   ```shell
+   git clone https://github.com/open-mmlab/mmcv.git
+   cd mmcv
+   MMCV_WITH_OPS=1 pip install -e .  # 安装好 mmcv-full
+   cd ..
+   ```
+
+   或者，可以直接使用命令行安装：
+
+   ```shell
+   pip install mmcv-full
+   ```
+
+   PyTorch 在 1.x.0 和 1.x.1 之间通常是兼容的，故 mmcv-full 只提供 1.x.0 的编译包。如果你的 PyTorch 版本是 1.x.1，你可以放心地安装在 1.x.0 版本编译的 mmcv-full。
+
+   ```shell
+   # 我们可以忽略 PyTorch 的小版本号
+   pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7/index.html
+   ```
+
+2. 安装 MMDetection：
+
+   你可以直接通过如下命令从 pip 安装使用 mmdetection:
+
+   ```shell
+   pip install mmdet
+   ```
+
+   或者从 git 仓库编译源码
+
+   ```shell
+   git clone https://github.com/open-mmlab/mmdetection.git
+   cd mmdetection
+   pip install -r requirements/build.txt
+   pip install -v -e .  # or "python setup.py develop"
+   ```
+
+3. 安装额外的依赖以使用 Instaboost, 全景分割, 或者 LVIS 数据集
+
+   ```shell
+   # 安装 instaboost 依赖
+   pip install instaboostfast
+   # 安装全景分割依赖
+   pip install git+https://github.com/cocodataset/panopticapi.git
+   # 安装 LVIS 数据集依赖
+   pip install git+https://github.com/lvis-dataset/lvis-api.git
+   # 安装 albumentations 依赖
+   pip install -r requirements/albu.txt
+   ```
+
+**注意：**
+
+(1) 按照上述说明，MMDetection 安装在 `dev` 模式下，因此在本地对代码做的任何修改都会生效，无需重新安装；
+
+(2) 如果希望使用 `opencv-python-headless` 而不是 `opencv-python`， 可以在安装 MMCV 之前安装；
+
+(3) 一些安装依赖是可以选择的。例如只需要安装最低运行要求的版本，则可以使用 `pip install -v -e .` 命令。如果希望使用可选择的像 `albumentations` 和 `imagecorruptions` 这种依赖项，可以使用 `pip install -r requirements/optional.txt` 进行手动安装，或者在使用 `pip` 时指定所需的附加功能（例如 `pip install -v -e .[optional]`），支持附加功能的有效键值包括 `all`、`tests`、`build` 以及 `optional` 。
+
+(4) 如果希望使用 `albumentations`，我们建议使用 `pip install -r requirements/albu.txt` 或者 `pip install -U albumentations --no-binary qudida,albumentations` 进行安装。 如果简单地使用 `pip install albumentations>=0.3.2` 进行安装，则会同时安装 `opencv-python-headless`（即便已经安装了 `opencv-python` 也会再次安装）。我们建议在安装 `albumentations` 后检查环境，以确保没有同时安装 `opencv-python` 和 `opencv-python-headless`，因为同时安装可能会导致一些问题。更多细节请参考[官方文档](https://albumentations.ai/docs/getting_started/installation/#note-on-opencv-dependencies)。
+
+### 只在 CPU 安装
+
+我们的代码能够在只使用 CPU 的环境（CUDA 不可用）下构建。
+
+在 CPU 模式下，可以进行模型训练（需要 MMCV 版本 >= 1.4.4）、测试或者推理，然而以下功能将在 CPU 模式下不能使用：
+
+- Deformable Convolution
+- Modulated Deformable Convolution
+- ROI pooling
+- Deformable ROI pooling
+- CARAFE: Content-Aware ReAssembly of FEatures
+- SyncBatchNorm
+- CrissCrossAttention: Criss-Cross Attention
+- MaskedConv2d
+- Temporal Interlace Shift
+- nms_cuda
+- sigmoid_focal_loss_cuda
+- bbox_overlaps
+
+因此，如果尝试使用包含上述操作的模型进行训练/测试/推理，将会报错。下表列出了由于依赖上述算子而无法在 CPU 上运行的相关模型：
+
+|                          操作                           |                                           模型                                           |
+| :-----------------------------------------------------: | :--------------------------------------------------------------------------------------: |
+| Deformable Convolution/Modulated Deformable Convolution | DCN、Guided Anchoring、RepPoints、CentripetalNet、VFNet、CascadeRPN、NAS-FCOS、DetectoRS |
+|                      MaskedConv2d                       |                                     Guided Anchoring                                     |
+|                         CARAFE                          |                                          CARAFE                                          |
+|                      SyncBatchNorm                      |                                         ResNeSt                                          |
+
+### 另一种选择： Docker 镜像
+
+我们提供了 [Dockerfile](https://github.com/open-mmlab/mmdetection/blob/master/docker/Dockerfile) 来生成镜像，请确保 [docker](https://docs.docker.com/engine/install/) 的版本 >= 19.03。
+
+```shell
+# 基于 PyTorch 1.6, CUDA 10.1 生成镜像
+docker build -t mmdetection docker/
+```
+
+运行命令：
+
+```shell
+docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmdetection/data mmdetection
+```
+
+### 使用多个 MMDetection 版本进行开发
+
+训练和测试的脚本已经修改了 PYTHONPATH，以确保脚本使用当前目录中的 MMDetection。
+
+要使用环境中默认安装的 MMDetection 而不是当前正在使用的，可以删除出现在相关脚本中的代码：
+
+```shell
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH
+```
+
+## 验证
+
+为了验证是否正确安装了 MMDetection 和所需的环境，我们可以运行示例的 Python 代码来初始化检测器并推理一个演示图像：
+
+```python
+from mmdet.apis import init_detector, inference_detector
+
+config_file = 'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+# 从 model zoo 下载 checkpoint 并放在 `checkpoints/` 文件夹下
+# 网址为: http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth
+checkpoint_file = 'checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'
+device = 'cuda:0'
+# 初始化检测器
+model = init_detector(config_file, checkpoint_file, device=device)
+# 推理演示图像
+inference_detector(model, 'demo/demo.jpg')
+```
+
+如果成功安装 MMDetection，则上面的代码可以完整地运行。
diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst
new file mode 100755
index 0000000..872606c
--- /dev/null
+++ b/docs/zh_cn/index.rst
@@ -0,0 +1,62 @@
+Welcome to MMDetection's documentation!
+=======================================
+
+.. toctree::
+   :maxdepth: 2
+   :caption: 开始你的第一步
+
+   get_started.md
+   model_zoo.md
+   article.md
+
+.. toctree::
+   :maxdepth: 2
+   :caption: 快速启动
+
+   1_exist_data_model.md
+   2_new_data_model.md
+   3_exist_data_new_model.md
+
+.. toctree::
+   :maxdepth: 2
+   :caption: 教程
+
+   tutorials/index.rst
+
+.. toctree::
+   :maxdepth: 2
+   :caption: 实用工具与脚本
+
+   useful_tools.md
+
+.. toctree::
+   :maxdepth: 2
+   :caption: 说明
+
+   conventions.md
+   compatibility.md
+   faq.md
+
+.. toctree::
+   :caption: 语言切换
+
+   switch_language.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: 接口文档（英文）
+
+   api.rst
+
+.. toctree::
+   :maxdepth: 1
+   :caption: 设备支持
+
+   device/npu.md
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`search`
diff --git a/docs/zh_cn/make.bat b/docs/zh_cn/make.bat
new file mode 100755
index 0000000..922152e
--- /dev/null
+++ b/docs/zh_cn/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/zh_cn/model_zoo.md b/docs/zh_cn/model_zoo.md
new file mode 100755
index 0000000..84f5727
--- /dev/null
+++ b/docs/zh_cn/model_zoo.md
@@ -0,0 +1,337 @@
+# 模型库
+
+## 镜像地址
+
+从 MMDetection V2.0 起，我们只通过阿里云维护模型库。V1.x 版本的模型已经弃用。
+
+## 共同设置
+
+- 所有模型都是在 `coco_2017_train` 上训练，在 `coco_2017_val` 上测试。
+- 我们使用分布式训练。
+- 所有 pytorch-style 的 ImageNet 预训练主干网络来自 PyTorch 的模型库，caffe-style 的预训练主干网络来自 detectron2 最新开源的模型。
+- 为了与其他代码库公平比较，文档中所写的 GPU 内存是8个 GPU 的 `torch.cuda.max_memory_allocated()` 的最大值，此值通常小于 nvidia-smi 显示的值。
+- 我们以网络 forward 和后处理的时间加和作为推理时间，不包含数据加载时间。所有结果通过 [benchmark.py](https://github.com/open-mmlab/mmdetection/blob/master/tools/analysis_tools/benchmark.py) 脚本计算所得。该脚本会计算推理 2000 张图像的平均时间。
+
+## ImageNet 预训练模型
+
+通过 ImageNet 分类任务预训练的主干网络进行初始化是很常见的操作。所有预训练模型的链接都可以在 [open_mmlab](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json) 中找到。根据 `img_norm_cfg` 和原始权重，我们可以将所有 ImageNet 预训练模型分为以下几种情况：
+
+- TorchVision：torchvision 模型权重，包含 ResNet50, ResNet101。`img_norm_cfg` 为 `dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)`。
+- Pycls：[pycls](https://github.com/facebookresearch/pycls) 模型权重，包含 RegNetX。`img_norm_cfg` 为 `dict(mean=[103.530, 116.280, 123.675], std=[57.375, 57.12, 58.395], to_rgb=False)`。
+- MSRA styles：[MSRA](https://github.com/KaimingHe/deep-residual-networks) 模型权重，包含 ResNet50_Caffe，ResNet101_Caffe。`img_norm_cfg` 为 `dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)`。
+- Caffe2 styles：现阶段只包含 ResNext101_32x8d。`img_norm_cfg` 为 `dict(mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False)`。
+- Other styles: SSD 的 `img_norm_cfg` 为 `dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)`，YOLOv3 的 `img_norm_cfg` 为 `dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True)`。
+
+MMDetection 常用到的主干网络细节如下表所示：
+
+| 模型             | 来源        | 链接                                                                                                                                                                                                | 描述                                                                                                                                                                                                                       |
+| ---------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ResNet50         | TorchVision | [torchvision 中的 ResNet-50](https://download.pytorch.org/models/resnet50-19c8e357.pth)                                                                                                             | 来自 [torchvision 中的 ResNet-50](https://download.pytorch.org/models/resnet50-19c8e357.pth)。                                                                                                                             |
+| ResNet101        | TorchVision | [torchvision 中的 ResNet-101](https://download.pytorch.org/models/resnet101-5d3b4d8f.pth)                                                                                                           | 来自 [torchvision 中的 ResNet-101](https://download.pytorch.org/models/resnet101-5d3b4d8f.pth)。                                                                                                                           |
+| RegNetX          | Pycls       | [RegNetX_3.2gf](https://download.openmmlab.com/pretrain/third_party/regnetx_3.2gf-c2599b0f.pth)，[RegNetX_800mf](https://download.openmmlab.com/pretrain/third_party/regnetx_800mf-1f4be4c7.pth) 等 | 来自 [pycls](https://github.com/facebookresearch/pycls)。                                                                                                                                                                  |
+| ResNet50_Caffe   | MSRA        | [MSRA 中的 ResNet-50](https://download.openmmlab.com/pretrain/third_party/resnet50_caffe-788b5fa3.pth)                                                                                              | 由 [Detectron2 中的 R-50.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-50.pkl) 转化的副本。原始权重文件来自 [MSRA 中的原始 ResNet-50](https://github.com/KaimingHe/deep-residual-networks)。    |
+| ResNet101_Caffe  | MSRA        | [MSRA 中的 ResNet-101](https://download.openmmlab.com/pretrain/third_party/resnet101_caffe-3ad79236.pth)                                                                                            | 由 [Detectron2 中的 R-101.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-101.pkl) 转化的副本。原始权重文件来自 [MSRA 中的原始 ResNet-101](https://github.com/KaimingHe/deep-residual-networks)。 |
+| ResNext101_32x8d | Caffe2      | [Caffe2 ResNext101_32x8d](https://download.openmmlab.com/pretrain/third_party/resnext101_32x8d-1516f1aa.pth)                                                                                        | 由 [Detectron2 中的 X-101-32x8d.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/FAIR/X-101-32x8d.pkl) 转化的副本。原始 ResNeXt-101-32x8d 由 FB 使用 Caffe2 训练。                                        |
+
+## Baselines
+
+### RPN
+
+请参考 [RPN](https://github.com/open-mmlab/mmdetection/blob/master/configs/rpn)。
+
+### Faster R-CNN
+
+请参考 [Faster R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn)。
+
+### Mask R-CNN
+
+请参考 [Mask R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask_rcnn)。
+
+### Fast R-CNN (使用提前计算的 proposals)
+
+请参考 [Fast R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/fast_rcnn)。
+
+### RetinaNet
+
+请参考 [RetinaNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/retinanet)。
+
+### Cascade R-CNN and Cascade Mask R-CNN
+
+请参考 [Cascade R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/cascade_rcnn)。
+
+### Hybrid Task Cascade (HTC)
+
+请参考 [HTC](https://github.com/open-mmlab/mmdetection/blob/master/configs/htc)。
+
+### SSD
+
+请参考 [SSD](https://github.com/open-mmlab/mmdetection/blob/master/configs/ssd)。
+
+### Group Normalization (GN)
+
+请参考 [Group Normalization](https://github.com/open-mmlab/mmdetection/blob/master/configs/gn)。
+
+### Weight Standardization
+
+请参考 [Weight Standardization](https://github.com/open-mmlab/mmdetection/blob/master/configs/gn+ws)。
+
+### Deformable Convolution v2
+
+请参考 [Deformable Convolutional Networks](https://github.com/open-mmlab/mmdetection/blob/master/configs/dcn)。
+
+### CARAFE: Content-Aware ReAssembly of FEatures
+
+请参考 [CARAFE](https://github.com/open-mmlab/mmdetection/blob/master/configs/carafe)。
+
+### Instaboost
+
+请参考 [Instaboost](https://github.com/open-mmlab/mmdetection/blob/master/configs/instaboost)。
+
+### Libra R-CNN
+
+请参考 [Libra R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/libra_rcnn)。
+
+### Guided Anchoring
+
+请参考 [Guided Anchoring](https://github.com/open-mmlab/mmdetection/blob/master/configs/guided_anchoring)。
+
+### FCOS
+
+请参考 [FCOS](https://github.com/open-mmlab/mmdetection/blob/master/configs/fcos)。
+
+### FoveaBox
+
+请参考 [FoveaBox](https://github.com/open-mmlab/mmdetection/blob/master/configs/foveabox)。
+
+### RepPoints
+
+请参考 [RepPoints](https://github.com/open-mmlab/mmdetection/blob/master/configs/reppoints)。
+
+### FreeAnchor
+
+请参考 [FreeAnchor](https://github.com/open-mmlab/mmdetection/blob/master/configs/free_anchor)。
+
+### Grid R-CNN (plus)
+
+请参考 [Grid R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/grid_rcnn)。
+
+### GHM
+
+请参考 [GHM](https://github.com/open-mmlab/mmdetection/blob/master/configs/ghm)。
+
+### GCNet
+
+请参考 [GCNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/gcnet)。
+
+### HRNet
+
+请参考 [HRNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/hrnet)。
+
+### Mask Scoring R-CNN
+
+请参考 [Mask Scoring R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/ms_rcnn)。
+
+### Train from Scratch
+
+请参考 [Rethinking ImageNet Pre-training](https://github.com/open-mmlab/mmdetection/blob/master/configs/scratch)。
+
+### NAS-FPN
+
+请参考 [NAS-FPN](https://github.com/open-mmlab/mmdetection/blob/master/configs/nas_fpn)。
+
+### ATSS
+
+请参考 [ATSS](https://github.com/open-mmlab/mmdetection/blob/master/configs/atss)。
+
+### FSAF
+
+请参考 [FSAF](https://github.com/open-mmlab/mmdetection/blob/master/configs/fsaf)。
+
+### RegNetX
+
+请参考 [RegNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/regnet)。
+
+### Res2Net
+
+请参考 [Res2Net](https://github.com/open-mmlab/mmdetection/blob/master/configs/res2net)。
+
+### GRoIE
+
+请参考 [GRoIE](https://github.com/open-mmlab/mmdetection/blob/master/configs/groie)。
+
+### Dynamic R-CNN
+
+请参考 [Dynamic R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/dynamic_rcnn)。
+
+### PointRend
+
+请参考 [PointRend](https://github.com/open-mmlab/mmdetection/blob/master/configs/point_rend)。
+
+### DetectoRS
+
+请参考 [DetectoRS](https://github.com/open-mmlab/mmdetection/blob/master/configs/detectors)。
+
+### Generalized Focal Loss
+
+请参考 [Generalized Focal Loss](https://github.com/open-mmlab/mmdetection/blob/master/configs/gfl)。
+
+### CornerNet
+
+请参考 [CornerNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/cornernet)。
+
+### YOLOv3
+
+请参考 [YOLOv3](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolo)。
+
+### PAA
+
+请参考 [PAA](https://github.com/open-mmlab/mmdetection/blob/master/configs/paa)。
+
+### SABL
+
+请参考 [SABL](https://github.com/open-mmlab/mmdetection/blob/master/configs/sabl)。
+
+### CentripetalNet
+
+请参考 [CentripetalNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/centripetalnet)。
+
+### ResNeSt
+
+请参考 [ResNeSt](https://github.com/open-mmlab/mmdetection/blob/master/configs/resnest)。
+
+### DETR
+
+请参考 [DETR](https://github.com/open-mmlab/mmdetection/blob/master/configs/detr)。
+
+### Deformable DETR
+
+请参考 [Deformable DETR](https://github.com/open-mmlab/mmdetection/blob/master/configs/deformable_detr)。
+
+### AutoAssign
+
+请参考 [AutoAssign](https://github.com/open-mmlab/mmdetection/blob/master/configs/autoassign)。
+
+### YOLOF
+
+请参考 [YOLOF](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolof)。
+
+### Seesaw Loss
+
+请参考 [Seesaw Loss](https://github.com/open-mmlab/mmdetection/blob/master/configs/seesaw_loss)。
+
+### CenterNet
+
+请参考 [CenterNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/centernet)。
+
+### YOLOX
+
+请参考 [YOLOX](https://github.com/open-mmlab/mmdetection/blob/master/configs/yolox)。
+
+### PVT
+
+请参考 [PVT](https://github.com/open-mmlab/mmdetection/blob/master/configs/pvt)。
+
+### SOLO
+
+请参考 [SOLO](https://github.com/open-mmlab/mmdetection/blob/master/configs/solo)。
+
+### QueryInst
+
+请参考 [QueryInst](https://github.com/open-mmlab/mmdetection/blob/master/configs/queryinst)。
+
+### RF-Next
+
+请参考 [RF-Next](https://github.com/open-mmlab/mmdetection/blob/master/configs/rfnext)。
+
+### Other datasets
+
+我们还在 [PASCAL VOC](https://github.com/open-mmlab/mmdetection/blob/master/configs/pascal_voc)，[Cityscapes](https://github.com/open-mmlab/mmdetection/blob/master/configs/cityscapes) 和 [WIDER FACE](https://github.com/open-mmlab/mmdetection/blob/master/configs/wider_face) 上对一些方法进行了基准测试。
+
+### Pre-trained Models
+
+我们还通过多尺度训练和更长的训练策略来训练用 ResNet-50 和 [RegNetX-3.2G](https://github.com/open-mmlab/mmdetection/blob/master/configs/regnet) 作为主干网络的 [Faster R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn) 和 [Mask R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask_rcnn)。这些模型可以作为下游任务的预训练模型。
+
+## 速度基准
+
+### 训练速度基准
+
+我们提供 [analyze_logs.py](https://github.com/open-mmlab/mmdetection/blob/master/tools/analysis_tools/analyze_logs.py) 来得到训练中每一次迭代的平均时间。示例请参考 [Log Analysis](https://mmdetection.readthedocs.io/en/latest/useful_tools.html#log-analysis)。
+
+我们与其他流行框架的 Mask R-CNN 训练速度进行比较（数据是从 [detectron2](https://github.com/facebookresearch/detectron2/blob/master/docs/notes/benchmarks.md/) 复制而来）。在 mmdetection 中，我们使用 [mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py) 进行基准测试。它与 detectron2 的 [mask_rcnn_R_50_FPN_noaug_1x.yaml](https://github.com/facebookresearch/detectron2/blob/master/configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml) 设置完全一样。同时，我们还提供了[模型权重](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug_compare_20200518-10127928.pth)和[训练 log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug_20200518_105755.log.json) 作为参考。为了跳过 GPU 预热时间，吞吐量按照100-500次迭代之间的平均吞吐量来计算。
+
+| 框架                                                                                   | 吞吐量 (img/s) |
+| -------------------------------------------------------------------------------------- | -------------- |
+| [Detectron2](https://github.com/facebookresearch/detectron2)                           | 62             |
+| [MMDetection](https://github.com/open-mmlab/mmdetection)                               | 61             |
+| [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark/)          | 53             |
+| [tensorpack](https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN) | 50             |
+| [simpledet](https://github.com/TuSimple/simpledet/)                                    | 39             |
+| [Detectron](https://github.com/facebookresearch/Detectron)                             | 19             |
+| [matterport/Mask_RCNN](https://github.com/matterport/Mask_RCNN/)                       | 14             |
+
+### 推理时间基准
+
+我们提供 [benchmark.py](https://github.com/open-mmlab/mmdetection/blob/master/tools/analysis_tools/benchmark.py) 对推理时间进行基准测试。此脚本将推理 2000 张图片并计算忽略前 5 次推理的平均推理时间。可以通过设置 `LOG-INTERVAL` 来改变 log 输出间隔（默认为 50）。
+
+```shell
+python tools/analysis_tools/benchmark.py ${CONFIG} ${CHECKPOINT} [--log-interval ${LOG-INTERVAL}] [--fuse-conv-bn]
+```
+
+模型库中，所有模型在基准测量推理时间时都没设置 `fuse-conv-bn`, 此设置可以使推理时间更短。
+
+## 与 Detectron2 对比
+
+我们在速度和精度方面对 mmdetection 和 [Detectron2](https://github.com/facebookresearch/detectron2.git) 进行对比。对比所使用的 detectron2 的 commit id 为 [185c27e](https://github.com/facebookresearch/detectron2/tree/185c27e4b4d2d4c68b5627b3765420c6d7f5a659)(30/4/2020)。
+为了公平对比，我们所有的实验都在同一机器下进行。
+
+### 硬件
+
+- 8 NVIDIA Tesla V100 (32G) GPUs
+- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+
+### 软件环境
+
+- Python 3.7
+- PyTorch 1.4
+- CUDA 10.1
+- CUDNN 7.6.03
+- NCCL 2.4.08
+
+### 精度
+
+| 模型                                                                                                                                   | 训练策略 | Detectron2                                                                                                                             | mmdetection | 下载                                                                                                                                                                                                                                                                                                                                                             |
+| -------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------- | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [Faster R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py) | 1x       | [37.9](https://github.com/facebookresearch/detectron2/blob/master/configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml)                 | 38.0        | [model](https://download.openmmlab.com/mmdetection/v2.0/benchmark/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-5324cff8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco_20200429_234554.log.json)             |
+| [Mask R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py)  | 1x       | [38.6 & 35.2](https://github.com/facebookresearch/detectron2/blob/master/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml) | 38.8 & 35.4 | [model](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco-dbecf295.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco_20200430_054239.log.json) |
+| [Retinanet](https://github.com/open-mmlab/mmdetection/blob/master/configs/retinanet/retinanet_r50_caffe_fpn_mstrain_1x_coco.py)        | 1x       | [36.5](https://github.com/facebookresearch/detectron2/blob/master/configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml)                   | 37.0        | [model](https://download.openmmlab.com/mmdetection/v2.0/benchmark/retinanet_r50_caffe_fpn_mstrain_1x_coco/retinanet_r50_caffe_fpn_mstrain_1x_coco-586977a0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/retinanet_r50_caffe_fpn_mstrain_1x_coco/retinanet_r50_caffe_fpn_mstrain_1x_coco_20200430_014748.log.json)                     |
+
+### 训练速度
+
+训练速度使用 s/iter 来度量。结果越低越好。
+
+| 模型         | Detectron2 | mmdetection |
+| ------------ | ---------- | ----------- |
+| Faster R-CNN | 0.210      | 0.216       |
+| Mask R-CNN   | 0.261      | 0.265       |
+| Retinanet    | 0.200      | 0.205       |
+
+### 推理速度
+
+推理速度通过单张 GPU 下的 fps(img/s) 来度量，越高越好。
+为了与 Detectron2 保持一致，我们所写的推理时间除去了数据加载时间。
+对于 Mask RCNN，我们去除了后处理中 RLE 编码的时间。
+我们在括号中给出了官方给出的速度。由于硬件差异，官方给出的速度会比我们所测试得到的速度快一些。
+
+| 模型         | Detectron2  | mmdetection |
+| ------------ | ----------- | ----------- |
+| Faster R-CNN | 25.6 (26.3) | 22.2        |
+| Mask R-CNN   | 22.5 (23.3) | 19.6        |
+| Retinanet    | 17.8 (18.2) | 20.6        |
+
+### 训练内存
+
+| 模型         | Detectron2 | mmdetection |
+| ------------ | ---------- | ----------- |
+| Faster R-CNN | 3.0        | 3.8         |
+| Mask R-CNN   | 3.4        | 3.9         |
+| Retinanet    | 3.9        | 3.4         |
diff --git a/docs/zh_cn/projects.md b/docs/zh_cn/projects.md
new file mode 100755
index 0000000..6b9d300
--- /dev/null
+++ b/docs/zh_cn/projects.md
@@ -0,0 +1,48 @@
+# 基于 MMDetection 的项目
+
+有许多开源项目都是基于 MMDetection 搭建的，我们在这里列举一部分作为样例，展示如何基于 MMDetection 搭建您自己的项目。
+由于这个页面列举的项目并不完全，我们欢迎社区提交 Pull Request 来更新这个文档。
+
+## MMDetection 的拓展项目
+
+一些项目拓展了 MMDetection 的边界，如将 MMDetection 拓展支持 3D 检测或者将 MMDetection 用于部署。
+它们展示了 MMDetection 的许多可能性，所以我们在这里也列举一些。
+
+- [OTEDetection](https://github.com/opencv/mmdetection): OpenVINO training extensions for object detection.
+- [MMDetection3d](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
+
+## 研究项目
+
+同样有许多研究论文是基于 MMDetection 进行的。许多论文都发表在了顶级的会议或期刊上，或者对社区产生了深远的影响。
+为了向社区提供一个可以参考的论文列表，帮助大家开发或者比较新的前沿算法，我们在这里也遵循会议的时间顺序列举了一些论文。
+MMDetection 中已经支持的算法不在此列。
+
+- Involution: Inverting the Inherence of Convolution for Visual Recognition, CVPR21. [\[paper\]](https://arxiv.org/abs/2103.06255)[\[github\]](https://github.com/d-li14/involution)
+- Multiple Instance Active Learning for Object Detection, CVPR 2021. [\[paper\]](https://openaccess.thecvf.com/content/CVPR2021/papers/Yuan_Multiple_Instance_Active_Learning_for_Object_Detection_CVPR_2021_paper.pdf)[\[github\]](https://github.com/yuantn/MI-AOD)
+- Adaptive Class Suppression Loss for Long-Tail Object Detection, CVPR 2021. [\[paper\]](https://arxiv.org/abs/2104.00885)[\[github\]](https://github.com/CASIA-IVA-Lab/ACSL)
+- Generalizable Pedestrian Detection: The Elephant In The Room, CVPR2021. [\[paper\]](https://arxiv.org/abs/2003.08799)[\[github\]](https://github.com/hasanirtiza/Pedestron)
+- Group Fisher Pruning for Practical Network Compression, ICML2021. [\[paper\]](https://github.com/jshilong/FisherPruning/blob/main/resources/paper.pdf)[\[github\]](https://github.com/jshilong/FisherPruning)
+- Overcoming Classifier Imbalance for Long-tail Object Detection with Balanced Group Softmax, CVPR2020. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/papers/Li_Overcoming_Classifier_Imbalance_for_Long-Tail_Object_Detection_With_Balanced_Group_CVPR_2020_paper.pdf)[\[github\]](https://github.com/FishYuLi/BalancedGroupSoftmax)
+- Coherent Reconstruction of Multiple Humans from a Single Image, CVPR2020. [\[paper\]](https://jiangwenpl.github.io/multiperson/)[\[github\]](https://github.com/JiangWenPL/multiperson)
+- Look-into-Object: Self-supervised Structure Modeling for Object Recognition, CVPR 2020. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/papers/Zhou_Look-Into-Object_Self-Supervised_Structure_Modeling_for_Object_Recognition_CVPR_2020_paper.pdf)[\[github\]](https://github.com/JDAI-CV/LIO)
+- Video Panoptic Segmentation, CVPR2020. [\[paper\]](https://arxiv.org/abs/2006.11339)[\[github\]](https://github.com/mcahny/vps)
+- D2Det: Towards High Quality Object Detection and Instance Segmentation, CVPR2020. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/html/Cao_D2Det_Towards_High_Quality_Object_Detection_and_Instance_Segmentation_CVPR_2020_paper.html)[\[github\]](https://github.com/JialeCao001/D2Det)
+- CentripetalNet: Pursuing High-quality Keypoint Pairs for Object Detection, CVPR2020. [\[paper\]](https://arxiv.org/abs/2003.09119)[\[github\]](https://github.com/KiveeDong/CentripetalNet)
+- Learning a Unified Sample Weighting Network for Object Detection, CVPR 2020. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/html/Cai_Learning_a_Unified_Sample_Weighting_Network_for_Object_Detection_CVPR_2020_paper.html)[\[github\]](https://github.com/caiqi/sample-weighting-network)
+- Scale-equalizing Pyramid Convolution for Object Detection, CVPR2020. [\[paper\]](https://arxiv.org/abs/2005.03101) [\[github\]](https://github.com/jshilong/SEPC)
+- Revisiting the Sibling Head in Object Detector, CVPR2020. [\[paper\]](https://arxiv.org/abs/2003.07540)[\[github\]](https://github.com/Sense-X/TSD)
+- PolarMask: Single Shot Instance Segmentation with Polar Representation, CVPR2020. [\[paper\]](https://arxiv.org/abs/1909.13226)[\[github\]](https://github.com/xieenze/PolarMask)
+- Hit-Detector: Hierarchical Trinity Architecture Search for Object Detection, CVPR2020. [\[paper\]](https://arxiv.org/abs/2003.11818)[\[github\]](https://github.com/ggjy/HitDet.pytorch)
+- ZeroQ: A Novel Zero Shot Quantization Framework, CVPR2020. [\[paper\]](https://arxiv.org/abs/2001.00281)[\[github\]](https://github.com/amirgholami/ZeroQ)
+- CBNet: A Novel Composite Backbone Network Architecture for Object Detection, AAAI2020. [\[paper\]](https://aaai.org/Papers/AAAI/2020GB/AAAI-LiuY.1833.pdf)[\[github\]](https://github.com/VDIGPKU/CBNet)
+- RDSNet: A New Deep Architecture for Reciprocal Object Detection and Instance Segmentation, AAAI2020. [\[paper\]](https://arxiv.org/abs/1912.05070)[\[github\]](https://github.com/wangsr126/RDSNet)
+- Training-Time-Friendly Network for Real-Time Object Detection, AAAI2020. [\[paper\]](https://arxiv.org/abs/1909.00700)[\[github\]](https://github.com/ZJULearning/ttfnet)
+- Cascade RPN: Delving into High-Quality Region Proposal Network with Adaptive Convolution, NeurIPS 2019. [\[paper\]](https://arxiv.org/abs/1909.06720)[\[github\]](https://github.com/thangvubk/Cascade-RPN)
+- Reasoning R-CNN: Unifying Adaptive Global Reasoning into Large-scale Object Detection, CVPR2019. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2019/papers/Xu_Reasoning-RCNN_Unifying_Adaptive_Global_Reasoning_Into_Large-Scale_Object_Detection_CVPR_2019_paper.pdf)[\[github\]](https://github.com/chanyn/Reasoning-RCNN)
+- Learning RoI Transformer for Oriented Object Detection in Aerial Images, CVPR2019. [\[paper\]](https://arxiv.org/abs/1812.00155)[\[github\]](https://github.com/dingjiansw101/AerialDetection)
+- SOLO: Segmenting Objects by Locations. [\[paper\]](https://arxiv.org/abs/1912.04488)[\[github\]](https://github.com/WXinlong/SOLO)
+- SOLOv2: Dynamic, Faster and Stronger. [\[paper\]](https://arxiv.org/abs/2003.10152)[\[github\]](https://github.com/WXinlong/SOLO)
+- Dense RepPoints: Representing Visual Objects with Dense Point Sets. [\[paper\]](https://arxiv.org/abs/1912.11473)[\[github\]](https://github.com/justimyhxu/Dense-RepPoints)
+- IterDet: Iterative Scheme for Object Detection in Crowded Environments. [\[paper\]](https://arxiv.org/abs/2005.05708)[\[github\]](https://github.com/saic-vul/iterdet)
+- Cross-Iteration Batch Normalization. [\[paper\]](https://arxiv.org/abs/2002.05712)[\[github\]](https://github.com/Howal/Cross-iterationBatchNorm)
+- A Ranking-based, Balanced Loss Function Unifying Classification and Localisation in Object Detection, NeurIPS2020. [\[paper\]](https://arxiv.org/abs/2009.13592)[\[github\]](https://github.com/kemaloksuz/aLRPLoss)
diff --git a/docs/zh_cn/robustness_benchmarking.md b/docs/zh_cn/robustness_benchmarking.md
new file mode 100755
index 0000000..28a6759
--- /dev/null
+++ b/docs/zh_cn/robustness_benchmarking.md
@@ -0,0 +1,109 @@
+# 检测器鲁棒性检查
+
+## 介绍
+
+我们提供了在 [Benchmarking Robustness in Object Detection: Autonomous Driving when Winter is Coming](https://arxiv.org/abs/1907.07484) 中定义的「图像损坏基准测试」上测试目标检测和实例分割模型的工具。
+此页面提供了如何使用该基准测试的基本教程。
+
+```latex
+@article{michaelis2019winter,
+  title={Benchmarking Robustness in Object Detection:
+    Autonomous Driving when Winter is Coming},
+  author={Michaelis, Claudio and Mitzkus, Benjamin and
+    Geirhos, Robert and Rusak, Evgenia and
+    Bringmann, Oliver and Ecker, Alexander S. and
+    Bethge, Matthias and Brendel, Wieland},
+  journal={arXiv:1907.07484},
+  year={2019}
+}
+```
+
+![image corruption example](../resources/corruptions_sev_3.png)
+
+## 关于基准测试
+
+要将结果提交到基准测试，请访问[基准测试主页](https://github.com/bethgelab/robust-detection-benchmark)
+
+基准测试是仿照 [imagenet-c 基准测试](https://github.com/hendrycks/robustness)，由 Dan Hendrycks 和 Thomas Dietterich 在[Benchmarking Neural Network Robustness to Common Corruptions and Perturbations](https://arxiv.org/abs/1903.12261)(ICLR 2019)中发表。
+
+图像损坏变换功能包含在此库中，但可以使用以下方法单独安装：
+
+```shell
+pip install imagecorruptions
+```
+
+与 imagenet-c 相比，我们必须进行一些更改以处理任意大小的图像和灰度图像。
+我们还修改了“运动模糊”和“雪”损坏，以解除对于 linux 特定库的依赖，
+否则必须单独安装这些库。有关详细信息，请参阅 [imagecorruptions](https://github.com/bethgelab/imagecorruptions)。
+
+## 使用预训练模型进行推理
+
+我们提供了一个测试脚本来评估模型在基准测试中提供的各种损坏变换组合下的性能。
+
+### 在数据集上测试
+
+- [x] 单张 GPU 测试
+- [ ] 多张 GPU 测试
+- [ ] 可视化检测结果
+
+您可以使用以下命令在基准测试中使用 15 种损坏变换来测试模型性能。
+
+```shell
+# single-gpu testing
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}]
+```
+
+也可以选择其它不同类型的损坏变换。
+
+```shell
+# noise
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions noise
+
+# blur
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions blur
+
+# weather
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions weather
+
+# digital
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions digital
+```
+
+或者使用一组自定义的损坏变换，例如：
+
+```shell
+# gaussian noise, zoom blur and snow
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions gaussian_noise zoom_blur snow
+```
+
+最后，我们也可以选择施加在图像上的损坏变换的严重程度。
+严重程度从 1 到 5 逐级增强，0 表示不对图像施加损坏变换，即原始图像数据。
+
+```shell
+# severity 1
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --severities 1
+
+# severities 0,2,4
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --severities 0 2 4
+```
+
+## 模型测试结果
+
+下表是各模型在 COCO 2017val 上的测试结果。
+
+|        Model        |      Backbone       |  Style  | Lr schd | box AP clean | box AP corr. | box % | mask AP clean | mask AP corr. | mask % |
+| :-----------------: | :-----------------: | :-----: | :-----: | :----------: | :----------: | :---: | :-----------: | :-----------: | :----: |
+|    Faster R-CNN     |      R-50-FPN       | pytorch |   1x    |     36.3     |     18.2     | 50.2  |       -       |       -       |   -    |
+|    Faster R-CNN     |      R-101-FPN      | pytorch |   1x    |     38.5     |     20.9     | 54.2  |       -       |       -       |   -    |
+|    Faster R-CNN     |   X-101-32x4d-FPN   | pytorch |   1x    |     40.1     |     22.3     | 55.5  |       -       |       -       |   -    |
+|    Faster R-CNN     |   X-101-64x4d-FPN   | pytorch |   1x    |     41.3     |     23.4     | 56.6  |       -       |       -       |   -    |
+|    Faster R-CNN     |    R-50-FPN-DCN     | pytorch |   1x    |     40.0     |     22.4     | 56.1  |       -       |       -       |   -    |
+|    Faster R-CNN     | X-101-32x4d-FPN-DCN | pytorch |   1x    |     43.4     |     26.7     | 61.6  |       -       |       -       |   -    |
+|     Mask R-CNN      |      R-50-FPN       | pytorch |   1x    |     37.3     |     18.7     | 50.1  |     34.2      |     16.8      |  49.1  |
+|     Mask R-CNN      |    R-50-FPN-DCN     | pytorch |   1x    |     41.1     |     23.3     | 56.7  |     37.2      |     20.7      |  55.7  |
+|    Cascade R-CNN    |      R-50-FPN       | pytorch |   1x    |     40.4     |     20.1     | 49.7  |       -       |       -       |   -    |
+| Cascade Mask R-CNN  |      R-50-FPN       | pytorch |   1x    |     41.2     |     20.7     | 50.2  |     35.7      |     17.6      |  49.3  |
+|      RetinaNet      |      R-50-FPN       | pytorch |   1x    |     35.6     |     17.8     | 50.1  |       -       |       -       |   -    |
+| Hybrid Task Cascade | X-101-64x4d-FPN-DCN | pytorch |   1x    |     50.6     |     32.7     | 64.7  |     43.8      |     28.1      |  64.0  |
+
+由于对图像的损坏变换存在随机性，测试结果可能略有不同。
diff --git a/docs/zh_cn/stat.py b/docs/zh_cn/stat.py
new file mode 100755
index 0000000..9625c62
--- /dev/null
+++ b/docs/zh_cn/stat.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+"""Collect model zoo statistics from the config READMEs into markdown."""
+import functools as func
+import glob
+import os.path as osp
+import re
+
+import numpy as np
+
+url_prefix = 'https://github.com/open-mmlab/mmdetection/blob/master/'
+files = sorted(glob.glob('../configs/*/README.md'))
+
+stats = []  # one (paper set, checkpoint set, markdown bullet) per README
+titles = []
+num_ckpts = 0
+
+for f in files:
+    url = osp.dirname(f.replace('../', url_prefix))
+    # READMEs may contain non-ASCII text, so do not rely on the locale.
+    with open(f, 'r', encoding='utf-8') as content_file:
+        content = content_file.read()
+    title = content.split('\n')[0].replace('# ', '').strip()
+    # Unique, lower-cased checkpoint URLs linked as "[model](http...)".
+    ckpts = set(x.lower().strip()
+                for x in re.findall(r'\[model\]\((https?.*)\)', content))
+    if not ckpts:
+        continue  # READMEs without checkpoint links are not counted
+
+    # The first upper-case bracketed tag in the README is the paper type.
+    _papertype = re.findall(r'\[([A-Z]+)\]', content)
+    if not _papertype:
+        raise ValueError(f'no upper-case [TAG] found in {f}')
+    papertype = _papertype[0]
+    paper = {(papertype, title)}
+
+    titles.append(title)
+    num_ckpts += len(ckpts)
+    statsmsg = f"""
+\t* [{papertype}] [{title}]({url}) ({len(ckpts)} ckpts)
+"""
+    stats.append((paper, ckpts, statsmsg))
+
+# The empty-set initializer keeps reduce() from raising on an empty `stats`.
+allpapers = func.reduce(set.union, [p for p, _, _ in stats], set())
+msglist = '\n'.join(x for _, _, x in stats)
+
+papertypes, papercounts = np.unique([t for t, _ in allpapers],
+                                    return_counts=True)
+countstr = '\n'.join(
+    [f'   - {t}: {c}' for t, c in zip(papertypes, papercounts)])
+
+modelzoo = f"""
+# Model Zoo Statistics
+
+* Number of papers: {len(set(titles))}
+{countstr}
+
+* Number of checkpoints: {num_ckpts}
+
+{msglist}
+"""
+
+with open('modelzoo_statistics.md', 'w', encoding='utf-8') as f:
+    f.write(modelzoo)
diff --git a/docs/zh_cn/switch_language.md b/docs/zh_cn/switch_language.md
new file mode 100755
index 0000000..b2c4ad9
--- /dev/null
+++ b/docs/zh_cn/switch_language.md
@@ -0,0 +1,3 @@
+## <a href='https://mmdetection.readthedocs.io/en/latest/'>English</a>
+
+## <a href='https://mmdetection.readthedocs.io/zh_CN/latest/'>简体中文</a>
diff --git a/docs/zh_cn/tutorials/config.md b/docs/zh_cn/tutorials/config.md
new file mode 100755
index 0000000..34ef58b
--- /dev/null
+++ b/docs/zh_cn/tutorials/config.md
@@ -0,0 +1,528 @@
+# 教程 1: 学习配置文件
+
+我们在配置文件中支持了继承和模块化，这便于进行各种实验。如果需要检查配置文件，可以通过运行 `python tools/misc/print_config.py /PATH/TO/CONFIG` 来查看完整的配置。
+
+## 通过脚本参数修改配置
+
+当运行 `tools/train.py` 和 `tools/test.py` 时，可以通过 `--cfg-options` 来修改配置文件。
+
+- 更新字典链中的配置
+
+  可以按照原始配置文件中的 dict 键顺序地指定配置预选项。例如，使用 `--cfg-options model.backbone.norm_eval=False` 将模型主干网络中的所有 BN 模块都改为 `train` 模式。
+
+- 更新配置列表中的键
+
+  在配置文件里，一些字典型的配置被包含在列表中。例如，数据训练流程 `data.train.pipeline` 通常是一个列表，比如 `[dict(type='LoadImageFromFile'), ...]`。如果需要将 `'LoadImageFromFile'` 改成 `'LoadImageFromWebcam'`，需要写成下述形式： `--cfg-options data.train.pipeline.0.type=LoadImageFromWebcam`。
+
+- 更新列表或元组的值
+
+  如果要更新的值是列表或元组。例如，配置文件通常设置 `workflow=[('train', 1)]`，如果需要改变这个键，可以通过 `--cfg-options workflow="[(train,1),(val,1)]"` 来重新设置。需要注意，引号 " 是支持列表或元组数据类型所必需的，并且在指定值的引号内**不允许**有空格。
+
+## 配置文件结构
+
+在 `configs/_base_` 文件夹下有 4 个基本组件类型，分别是：数据集(dataset)，模型(model)，训练策略(schedule)和运行时的默认设置(default runtime)。许多方法，例如 Faster R-CNN、Mask R-CNN、Cascade R-CNN、RPN、SSD 能够很容易地构建出来。由 `_base_` 下的组件组成的配置，被我们称为 _原始配置(primitive)_。
+
+对于同一文件夹下的所有配置，推荐**只有一个**对应的**原始配置**文件。所有其他的配置文件都应该继承自这个**原始配置**文件。这样就能保证配置文件的最大继承深度为 3。
+
+为了便于理解，我们建议贡献者继承现有方法。例如，如果在 Faster R-CNN 的基础上做了一些修改，用户首先可以通过指定 `_base_ = ../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py` 来继承基础的 Faster R-CNN 结构，然后修改配置文件中的必要参数以完成继承。
+
+如果你在构建一个与任何现有方法不共享结构的全新方法，那么可以在 `configs` 文件夹下创建一个新的例如 `xxx_rcnn` 文件夹。更多细节请参考 [MMCV](https://mmcv.readthedocs.io/en/latest/understand_mmcv/config.html) 文档。
+
+## 配置文件名称风格
+
+我们遵循以下样式来命名配置文件。建议贡献者遵循相同的风格。
+
+```
+{model}_[model setting]_{backbone}_{neck}_[norm setting]_[misc]_[gpu x batch_per_gpu]_{schedule}_{dataset}
+```
+
+`{xxx}` 是必填字段，`[yyy]` 是可选字段。
+
+- `{model}`： 模型种类，例如 `faster_rcnn`, `mask_rcnn` 等。
+- `[model setting]`： 特定的模型，例如 `htc` 中的`without_semantic`， `reppoints` 中的 `moment` 等。
+- `{backbone}`： 主干网络种类例如 `r50` (ResNet-50), `x101` (ResNeXt-101) 等。
+- `{neck}`： Neck 模型的种类，包括 `fpn`, `pafpn`, `nasfpn`, `c4` 等。
+- `[norm_setting]`： 默认使用 `bn` (Batch Normalization)，其他指定可以有 `gn` (Group Normalization)， `syncbn` (Synchronized Batch Normalization) 等。
+  `gn-head`/`gn-neck` 表示 GN 仅应用于网络的 Head 或 Neck， `gn-all` 表示 GN 用于整个模型， 例如主干网络、Neck 和 Head。
+- `[misc]`： 模型中各式各样的设置/插件，例如 `dconv`、 `gcb`、 `attention`、`albu`、 `mstrain` 等。
+- `[gpu x batch_per_gpu]`：GPU 数量和每个 GPU 的样本数，默认使用 `8x2`。
+- `{schedule}`： 训练方案，选项是 `1x`、 `2x`、 `20e` 等。`1x`  和 `2x` 分别代表 12 epoch 和 24 epoch，`20e` 在级联模型中使用，表示 20 epoch。对于 `1x`/`2x`，初始学习率在第 8/16 和第 11/22 epoch 衰减 10 倍；对于 `20e` ，初始学习率在第 16 和第 19 epoch 衰减 10 倍。
+- `{dataset}`：数据集，例如 `coco`、 `cityscapes`、 `voc_0712`、 `wider_face` 等。
+
+## 弃用的 train_cfg/test_cfg
+
+`train_cfg` 和 `test_cfg` 在配置文件中已弃用，请在模型配置中指定它们。原始配置结构如下：
+
+```python
+# 已经弃用的形式
+model = dict(
+    type=...,
+    ...
+)
+train_cfg=dict(...)
+test_cfg=dict(...)
+```
+
+推荐的配置结构如下：
+
+```python
+# 推荐的形式
+model = dict(
+    type=...,
+    ...
+    train_cfg=dict(...),
+    test_cfg=dict(...),
+)
+```
+
+## Mask R-CNN 配置文件示例
+
+为了帮助用户对 MMDetection 检测系统中的完整配置和模块有一个基本的了解，我们对使用 ResNet50 和 FPN 的 Mask R-CNN 的配置文件进行简要注释说明。更详细的用法和各个模块对应的替代方案，请参考 API 文档。
+
+```python
+model = dict(
+    type='MaskRCNN',  # 检测器(detector)名称
+    backbone=dict(  # 主干网络的配置文件
+        type='ResNet',  # 主干网络的类别，可用选项请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/backbones/resnet.py#L308
+        depth=50,  # 主干网络的深度，对于 ResNet 和 ResNext 通常设置为 50 或 101。
+        num_stages=4,  # 主干网络阶段(stages)的数目，这些阶段产生的特征图作为后续的 head 的输入。
+        out_indices=(0, 1, 2, 3),  # 每个阶段产生的特征图输出的索引。
+        frozen_stages=1,  # 第一个阶段的权重被冻结
+        norm_cfg=dict(  # 归一化层(norm layer)的配置项。
+            type='BN',  # 归一化层的类别，通常是 BN 或 GN。
+            requires_grad=True),  # 是否训练归一化里的 gamma 和 beta。
+        norm_eval=True,  # 是否冻结 BN 里的统计项。
+        style='pytorch',  # 主干网络的风格，'pytorch' 意思是步长为2的层为 3x3 卷积， 'caffe' 意思是步长为2的层为 1x1 卷积。
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),  # 加载通过 ImageNet 预训练的模型
+    neck=dict(
+        type='FPN',  # 检测器的 neck 是 FPN，我们同样支持 'NASFPN', 'PAFPN' 等，更多细节可以参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/necks/fpn.py#L10。
+        in_channels=[256, 512, 1024, 2048],  # 输入通道数，这与主干网络的输出通道一致
+        out_channels=256,  # 金字塔特征图每一层的输出通道
+        num_outs=5),  # 输出的范围(scales)
+    rpn_head=dict(
+        type='RPNHead',  # RPN_head 的类型是 'RPNHead', 我们也支持 'GARPNHead' 等，更多细节可以参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/rpn_head.py#L12。
+        in_channels=256,  # 每个输入特征图的输入通道，这与 neck 的输出通道一致。
+        feat_channels=256,  # head 卷积层的特征通道。
+        anchor_generator=dict(  # 锚点(Anchor)生成器的配置。
+            type='AnchorGenerator',  # 大多数方法使用 AnchorGenerator 作为锚点生成器, SSD 检测器使用 `SSDAnchorGenerator`。更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/anchor/anchor_generator.py#L10。
+            scales=[8],  # 锚点的基本比例，特征图某一位置的锚点面积为 scale * base_sizes
+            ratios=[0.5, 1.0, 2.0],  # 高度和宽度之间的比率。
+            strides=[4, 8, 16, 32, 64]),  # 锚生成器的步幅。这与 FPN 特征步幅一致。 如果未设置 base_sizes，则当前步幅值将被视为 base_sizes。
+        bbox_coder=dict(  # 在训练和测试期间对框进行编码和解码。
+            type='DeltaXYWHBBoxCoder',  # 框编码器的类别，'DeltaXYWHBBoxCoder' 是最常用的，更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py#L9。
+            target_means=[0.0, 0.0, 0.0, 0.0],  # 用于编码和解码框的目标均值
+            target_stds=[1.0, 1.0, 1.0, 1.0]),  # 用于编码和解码框的标准差
+        loss_cls=dict(  # 分类分支的损失函数配置
+            type='CrossEntropyLoss',  # 分类分支的损失类型，我们也支持 FocalLoss 等。
+            use_sigmoid=True,  # RPN通常进行二分类，所以通常使用sigmoid函数。
+            loss_weight=1.0),  # 分类分支的损失权重。
+        loss_bbox=dict(  # 回归分支的损失函数配置。
+            type='L1Loss',  # 损失类型，我们还支持许多 IoU Losses 和 Smooth L1-loss 等，更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/smooth_l1_loss.py#L56。
+            loss_weight=1.0)),  # 回归分支的损失权重。
+    roi_head=dict(  # RoIHead 封装了两步(two-stage)/级联(cascade)检测器的第二步。
+        type='StandardRoIHead',  # RoI head 的类型，更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/roi_heads/standard_roi_head.py#L10。
+        bbox_roi_extractor=dict(  # 用于 bbox 回归的 RoI 特征提取器。
+            type='SingleRoIExtractor',  # RoI 特征提取器的类型，大多数方法使用  SingleRoIExtractor，更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/roi_heads/roi_extractors/single_level.py#L10。
+            roi_layer=dict(  # RoI 层的配置
+                type='RoIAlign',  # RoI 层的类别, 也支持 DeformRoIPoolingPack 和 ModulatedDeformRoIPoolingPack，更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/roi_align/roi_align.py#L79。
+                output_size=7,  # 特征图的输出大小。
+                sampling_ratio=0),  # 提取 RoI 特征时的采样率。0 表示自适应比率。
+            out_channels=256,  # 提取特征的输出通道。
+            featmap_strides=[4, 8, 16, 32]),  # 多尺度特征图的步幅，应该与主干的架构保持一致。
+        bbox_head=dict(  # RoIHead 中 box head 的配置.
+            type='Shared2FCBBoxHead',  # bbox head 的类别，更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py#L177。
+            in_channels=256,  # bbox head 的输入通道。 这与 roi_extractor 中的 out_channels 一致。
+            fc_out_channels=1024,  # FC 层的输出特征通道。
+            roi_feat_size=7,  # 候选区域(Region of Interest)特征的大小。
+            num_classes=80,  # 分类的类别数量。
+            bbox_coder=dict(  # 第二阶段使用的框编码器。
+                type='DeltaXYWHBBoxCoder',  # 框编码器的类别，大多数情况使用 'DeltaXYWHBBoxCoder'。
+                target_means=[0.0, 0.0, 0.0, 0.0],  # 用于编码和解码框的均值
+                target_stds=[0.1, 0.1, 0.2, 0.2]),  # 编码和解码的标准差。因为框更准确，所以值更小，常规设置时 [0.1, 0.1, 0.2, 0.2]。
+            reg_class_agnostic=False,  # 回归是否与类别无关。
+            loss_cls=dict(  # 分类分支的损失函数配置
+                type='CrossEntropyLoss',  # 分类分支的损失类型，我们也支持 FocalLoss 等。
+                use_sigmoid=False,  # 是否使用 sigmoid。
+                loss_weight=1.0),  # 分类分支的损失权重。
+            loss_bbox=dict(  # 回归分支的损失函数配置。
+                type='L1Loss',  # 损失类型，我们还支持许多 IoU Losses 和 Smooth L1-loss 等。
+                loss_weight=1.0)),  # 回归分支的损失权重。
+        mask_roi_extractor=dict(  # 用于 mask 生成的 RoI 特征提取器。
+            type='SingleRoIExtractor',  # RoI 特征提取器的类型，大多数方法使用 SingleRoIExtractor。
+            roi_layer=dict(  # 提取实例分割特征的 RoI 层配置
+                type='RoIAlign',  # RoI 层的类型，也支持 DeformRoIPoolingPack 和 ModulatedDeformRoIPoolingPack。
+                output_size=14,  # 特征图的输出大小。
+                sampling_ratio=0),  # 提取 RoI 特征时的采样率。
+            out_channels=256,  # 提取特征的输出通道。
+            featmap_strides=[4, 8, 16, 32]),  # 多尺度特征图的步幅。
+        mask_head=dict(  # mask 预测 head 模型
+            type='FCNMaskHead',  # mask head 的类型，更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py#L21。
+            num_convs=4,  # mask head 中的卷积层数
+            in_channels=256,  # 输入通道，应与 mask roi extractor 的输出通道一致。
+            conv_out_channels=256,  # 卷积层的输出通道。
+            num_classes=80,  # 要分割的类别数。
+            loss_mask=dict(  # mask 分支的损失函数配置。
+                type='CrossEntropyLoss',  # 用于分割的损失类型。
+                use_mask=True,  # 是否只在正确的类中训练 mask。
+                loss_weight=1.0))),  # mask 分支的损失权重.
+    train_cfg = dict(  # rpn 和 rcnn 训练超参数的配置
+        rpn=dict(  # rpn 的训练配置
+            assigner=dict(  # 分配器(assigner)的配置
+                type='MaxIoUAssigner',  # 分配器的类型，MaxIoUAssigner 用于许多常见的检测器，更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/max_iou_assigner.py#L10。
+                pos_iou_thr=0.7,  # IoU >= 0.7(阈值) 被视为正样本。
+                neg_iou_thr=0.3,  # IoU < 0.3(阈值) 被视为负样本。
+                min_pos_iou=0.3,  # 将框作为正样本的最小 IoU 阈值。
+                match_low_quality=True,  # 是否匹配低质量的框(更多细节见 API 文档).
+                ignore_iof_thr=-1),  # 忽略 bbox 的 IoF 阈值。
+            sampler=dict(  # 正/负采样器(sampler)的配置
+                type='RandomSampler',  # 采样器类型，还支持 PseudoSampler 和其他采样器，更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/samplers/random_sampler.py#L8。
+                num=256,  # 样本数量。
+                pos_fraction=0.5,  # 正样本占总样本的比例。
+                neg_pos_ub=-1,  # 基于正样本数量的负样本上限。
+                add_gt_as_proposals=False),  # 采样后是否添加 GT 作为 proposal。
+            allowed_border=-1,  # 填充有效锚点后允许的边框。
+            pos_weight=-1,  # 训练期间正样本的权重。
+            debug=False),  # 是否设置调试(debug)模式
+        rpn_proposal=dict(  # 在训练期间生成 proposals 的配置
+            nms_across_levels=False,  # 是否对跨层的 box 做 NMS。仅适用于 `GARPNHead` ，naive rpn 不支持 nms cross levels。
+            nms_pre=2000,  # NMS 前的 box 数
+            nms_post=1000,  # NMS 要保留的 box 的数量，只在 GARPNHead 中起作用。
+            max_per_img=1000,  # NMS 后要保留的 box 数量。
+            nms=dict( # NMS 的配置
+                type='nms',  # NMS 的类别
+                iou_threshold=0.7 # NMS 的阈值
+                ),
+            min_bbox_size=0),  # 允许的最小 box 尺寸
+        rcnn=dict(  # roi head 的配置。
+            assigner=dict(  # 第二阶段分配器的配置，这与 rpn 中的不同
+                type='MaxIoUAssigner',  # 分配器的类型，MaxIoUAssigner 目前用于所有 roi_heads。更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/max_iou_assigner.py#L10。
+                pos_iou_thr=0.5,  # IoU >= 0.5(阈值)被认为是正样本。
+                neg_iou_thr=0.5,  # IoU < 0.5(阈值)被认为是负样本。
+                min_pos_iou=0.5,  # 将 box 作为正样本的最小 IoU 阈值
+                match_low_quality=False,  # 是否匹配低质量下的 box(有关更多详细信息，请参阅 API 文档)。
+                ignore_iof_thr=-1),  # 忽略 bbox 的 IoF 阈值
+            sampler=dict(
+                type='RandomSampler',  #采样器的类型，还支持 PseudoSampler 和其他采样器，更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/samplers/random_sampler.py#L8。
+                num=512,  # 样本数量
+                pos_fraction=0.25,  # 正样本占总样本的比例。
+                neg_pos_ub=-1,  # 基于正样本数量的负样本上限。
+                add_gt_as_proposals=True
+            ),  # 采样后是否添加 GT 作为 proposal。
+            mask_size=28,  # mask 的大小
+            pos_weight=-1,  # 训练期间正样本的权重。
+            debug=False)),  # 是否设置调试模式。
+    test_cfg = dict(  # 用于测试 rpn 和 rcnn 超参数的配置
+        rpn=dict(  # 测试阶段生成 proposals 的配置
+            nms_across_levels=False,  # 是否对跨层的 box 做 NMS。仅适用于`GARPNHead`，naive rpn 不支持做 NMS cross levels。
+            nms_pre=1000,  # NMS 前的 box 数
+            nms_post=1000,  # NMS 要保留的 box 的数量，只在`GARPNHead`中起作用。
+            max_per_img=1000,  # NMS 后要保留的 box 数量
+            nms=dict( # NMS 的配置
+                type='nms',  # NMS 的类型
+                iou_threshold=0.7 # NMS 阈值
+                ),
+            min_bbox_size=0),  # box 允许的最小尺寸
+        rcnn=dict(  # roi heads 的配置
+            score_thr=0.05,  # bbox 的分数阈值
+            nms=dict(  # 第二步的 NMS 配置
+                type='nms',  # NMS 的类型
+                iou_threshold=0.5),  # NMS 的阈值
+            max_per_img=100,  # 每张图像的最大检测次数
+            mask_thr_binary=0.5)))  # mask 预测的阈值
+
+dataset_type = 'CocoDataset'  # 数据集类型，这将被用来定义数据集。
+data_root = 'data/coco/'  # 数据的根路径。
+img_norm_cfg = dict(  # 图像归一化配置，用来归一化输入的图像。
+    mean=[123.675, 116.28, 103.53],  # 预训练里用于预训练主干网络模型的平均值。
+    std=[58.395, 57.12, 57.375],  # 预训练里用于预训练主干网络模型的标准差。
+    to_rgb=True
+)  #  预训练里用于预训练主干网络的图像的通道顺序。
+train_pipeline = [  # 训练流程
+    dict(type='LoadImageFromFile'),  # 第 1 个流程，从文件路径里加载图像。
+    dict(
+        type='LoadAnnotations',  # 第 2 个流程，对于当前图像，加载它的注释信息。
+        with_bbox=True,  # 是否使用标注框(bounding box)， 目标检测需要设置为 True。
+        with_mask=True,  # 是否使用 instance mask，实例分割需要设置为 True。
+        poly2mask=False),  # 是否将 polygon mask 转化为 instance mask, 设置为 False 以加速和节省内存。
+    dict(
+        type='Resize',  # 变化图像和其注释大小的数据增广的流程。
+        img_scale=(1333, 800),  # 图像的最大规模。
+        keep_ratio=True
+    ),  # 是否保持图像的长宽比。
+    dict(
+        type='RandomFlip',  #  翻转图像和其注释大小的数据增广的流程。
+        flip_ratio=0.5),  # 翻转图像的概率。
+    dict(
+        type='Normalize',  # 归一化当前图像的数据增广的流程。
+        mean=[123.675, 116.28, 103.53],  # 这些键与 img_norm_cfg 一致，因为 img_norm_cfg 被
+        std=[58.395, 57.12, 57.375],     # 用作参数。
+        to_rgb=True),
+    dict(
+        type='Pad',  # 填充当前图像到指定大小的数据增广的流程。
+        size_divisor=32),  # 填充图像可以被当前值整除。
+    dict(type='DefaultFormatBundle'),  # 流程里收集数据的默认格式捆。
+    dict(
+        type='Collect',  # 决定数据中哪些键应该传递给检测器的流程
+        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),  # 第 1 个流程，从文件路径里加载图像。
+    dict(
+        type='MultiScaleFlipAug',  # 封装测试时数据增广(test time augmentations)。
+        img_scale=(1333, 800),  # 决定测试时可改变图像的最大规模。用于改变图像大小的流程。
+        flip=False,  # 测试时是否翻转图像。
+        transforms=[
+            dict(type='Resize',  # 使用改变图像大小的数据增广。
+                 keep_ratio=True),  # 是否保持宽和高的比例，这里的图像比例设置将覆盖上面的图像规模大小的设置。
+            dict(type='RandomFlip'),  # 考虑到 RandomFlip 已经被添加到流程里，当 flip=False 时它将不被使用。
+            dict(
+                type='Normalize',  #  归一化配置项，值来自 img_norm_cfg。
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375],
+                to_rgb=True),
+            dict(
+                type='Pad',  # 填充配置，将图像填充至可被 32 整除的尺寸。
+                size_divisor=32),
+            dict(
+                type='ImageToTensor',  # 将图像转为张量
+                keys=['img']),
+            dict(
+                type='Collect',  # 收集测试时必须的键的收集流程。
+                keys=['img'])
+        ])
+]
+data = dict(
+    samples_per_gpu=2,  # 单个 GPU 的 Batch size
+    workers_per_gpu=2,  # 单个 GPU 分配的数据加载线程数
+    train=dict(  # 训练数据集配置
+        type='CocoDataset',  # 数据集的类别, 更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/coco.py#L19。
+        ann_file='data/coco/annotations/instances_train2017.json',  # 注释文件路径
+        img_prefix='data/coco/train2017/',  # 图片路径前缀
+        pipeline=[  # 流程, 这是由之前创建的 train_pipeline 传递的。
+            dict(type='LoadImageFromFile'),
+            dict(
+                type='LoadAnnotations',
+                with_bbox=True,
+                with_mask=True,
+                poly2mask=False),
+            dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+            dict(type='RandomFlip', flip_ratio=0.5),
+            dict(
+                type='Normalize',
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375],
+                to_rgb=True),
+            dict(type='Pad', size_divisor=32),
+            dict(type='DefaultFormatBundle'),
+            dict(
+                type='Collect',
+                keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
+        ]),
+    val=dict(  # 验证数据集的配置
+        type='CocoDataset',
+        ann_file='data/coco/annotations/instances_val2017.json',
+        img_prefix='data/coco/val2017/',
+        pipeline=[  # 由之前创建的 test_pipeline 传递的流程。
+            dict(type='LoadImageFromFile'),
+            dict(
+                type='MultiScaleFlipAug',
+                img_scale=(1333, 800),
+                flip=False,
+                transforms=[
+                    dict(type='Resize', keep_ratio=True),
+                    dict(type='RandomFlip'),
+                    dict(
+                        type='Normalize',
+                        mean=[123.675, 116.28, 103.53],
+                        std=[58.395, 57.12, 57.375],
+                        to_rgb=True),
+                    dict(type='Pad', size_divisor=32),
+                    dict(type='ImageToTensor', keys=['img']),
+                    dict(type='Collect', keys=['img'])
+                ])
+        ]),
+    test=dict(  # 测试数据集配置，修改测试开发/测试(test-dev/test)提交的 ann_file
+        type='CocoDataset',
+        ann_file='data/coco/annotations/instances_val2017.json',
+        img_prefix='data/coco/val2017/',
+        pipeline=[  # 由之前创建的 test_pipeline 传递的流程。
+            dict(type='LoadImageFromFile'),
+            dict(
+                type='MultiScaleFlipAug',
+                img_scale=(1333, 800),
+                flip=False,
+                transforms=[
+                    dict(type='Resize', keep_ratio=True),
+                    dict(type='RandomFlip'),
+                    dict(
+                        type='Normalize',
+                        mean=[123.675, 116.28, 103.53],
+                        std=[58.395, 57.12, 57.375],
+                        to_rgb=True),
+                    dict(type='Pad', size_divisor=32),
+                    dict(type='ImageToTensor', keys=['img']),
+                    dict(type='Collect', keys=['img'])
+                ])
+        ],
+        samples_per_gpu=2  # 单个 GPU 测试时的 Batch size
+    ))
+evaluation = dict(  # evaluation hook 的配置，更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/evaluation/eval_hooks.py#L7。
+    interval=1,  # 验证的间隔。
+    metric=['bbox', 'segm'])  # 验证期间使用的指标。
+optimizer = dict(  # 用于构建优化器的配置文件。支持 PyTorch 中的所有优化器，同时它们的参数与 PyTorch 里的优化器参数一致。
+    type='SGD',  # 优化器种类，更多细节可参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/optimizer/default_constructor.py#L13。
+    lr=0.02,  # 优化器的学习率，参数的使用细节请参照对应的 PyTorch 文档。
+    momentum=0.9,  # 动量(Momentum)
+    weight_decay=0.0001)  # SGD 的衰减权重(weight decay)。
+optimizer_config = dict(  # optimizer hook 的配置文件，执行细节请参考 https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/optimizer.py#L8。
+    grad_clip=None)  # 大多数方法不使用梯度限制(grad_clip)。
+lr_config = dict(  # 学习率调整配置，用于注册 LrUpdater hook。
+    policy='step',  # 调度流程(scheduler)的策略，也支持 CosineAnnealing, Cyclic, 等。请从 https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py#L9 参考 LrUpdater 的细节。
+    warmup='linear',  # 预热(warmup)策略，也支持 `exp` 和 `constant`。
+    warmup_iters=500,  # 预热的迭代次数
+    warmup_ratio=
+    0.001,  # 用于热身的起始学习率的比率
+    step=[8, 11])  # 在这些回合(第 8 和第 11 回合)衰减学习率
+runner = dict(
+    type='EpochBasedRunner',  # 将使用的 runner 的类别 (例如 IterBasedRunner 或 EpochBasedRunner)。
+    max_epochs=12) # runner 总回合数， 对于 IterBasedRunner 使用 `max_iters`
+checkpoint_config = dict(  # Checkpoint hook 的配置文件。执行时请参考 https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/checkpoint.py。
+    interval=1)  # 保存的间隔是 1。
+log_config = dict(  # register logger hook 的配置文件。
+    interval=50,  # 打印日志的间隔
+    hooks=[ # 训练期间执行的钩子
+        dict(type='TextLoggerHook', by_epoch=False),
+        dict(type='TensorboardLoggerHook', by_epoch=False),
+        dict(type='MMDetWandbHook', by_epoch=False, # 还支持 Wandb 记录器，它需要安装 `wandb`。
+             init_kwargs={'entity': "OpenMMLab", # 用于登录wandb的实体
+                          'project': "MMDet", # WandB中的项目名称
+                          'config': cfg_dict}), # 检查 https://docs.wandb.ai/ref/python/init 以获取更多初始化参数
+    ])  # 用于记录训练过程的记录器(logger)。
+
+dist_params = dict(backend='nccl')  # 用于设置分布式训练的参数，端口也同样可被设置。
+log_level = 'INFO'  # 日志的级别。
+load_from = None  # 从一个给定路径里加载模型作为预训练模型，它并不会消耗训练时间。
+resume_from = None  # 从给定路径里恢复检查点(checkpoints)，训练模式将从检查点保存的轮次开始恢复训练。
+workflow = [('train', 1)]  # runner 的工作流程，[('train', 1)] 表示只有一个工作流且工作流仅执行一次。根据 total_epochs 工作流训练 12个回合。
+work_dir = 'work_dir'  # 用于保存当前实验的模型检查点和日志的目录。
+```
+
+## 常问问题 (FAQ)
+
+### 忽略基础配置文件里的部分内容
+
+有时，您也许会设置 `_delete_=True` 去忽略基础配置文件里的一些域内容。 您也许可以参照 [mmcv](https://mmcv.readthedocs.io/en/latest/understand_mmcv/config.html#inherit-from-base-config-with-ignored-fields) 来获得一些简单的指导。
+
+在 MMDetection里，例如为了改变  Mask R-CNN 的主干网络的某些内容：
+
+```python
+model = dict(
+    type='MaskRCNN',
+    pretrained='torchvision://resnet50',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch'),
+    neck=dict(...),
+    rpn_head=dict(...),
+    roi_head=dict(...))
+```
+
+基础配置的 `Mask R-CNN` 使用 `ResNet-50`，在需要将主干网络改成 `HRNet` 的时候，因为 `HRNet` 和 `ResNet` 中有不同的字段，需要使用 `_delete_=True` 将新的键去替换 `backbone` 域内所有老的键。
+
+```python
+_base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    pretrained='open-mmlab://msra/hrnetv2_w32',
+    backbone=dict(
+        _delete_=True,
+        type='HRNet',
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256)))),
+    neck=dict(...))
+```
+
+### 使用配置文件里的中间变量
+
+配置文件里会使用一些中间变量，例如数据集里的 `train_pipeline`/`test_pipeline`。我们在定义新的 `train_pipeline`/`test_pipeline` 之后，需要将它们传递到 `data` 里。例如，我们想在训练或测试时，改变 Mask R-CNN 的多尺度策略 (multi scale strategy)，`train_pipeline`/`test_pipeline` 是我们想要修改的中间变量。
+
+```python
+_base_ = './mask_rcnn_r50_fpn_1x_coco.py'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                   (1333, 768), (1333, 800)],
+        multiscale_mode="value",
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+```
+
+我们首先定义新的 `train_pipeline`/`test_pipeline` 然后传递到 `data` 里。
+
+同样的，如果我们想从 `SyncBN` 切换到 `BN` 或者 `MMSyncBN`，我们需要修改配置文件里的每一个  `norm_cfg`。
+
+```python
+_base_ = './mask_rcnn_r50_fpn_1x_coco.py'
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    backbone=dict(norm_cfg=norm_cfg),
+    neck=dict(norm_cfg=norm_cfg),
+    ...)
+```
diff --git a/docs/zh_cn/tutorials/customize_dataset.md b/docs/zh_cn/tutorials/customize_dataset.md
new file mode 100755
index 0000000..8468e40
--- /dev/null
+++ b/docs/zh_cn/tutorials/customize_dataset.md
@@ -0,0 +1,456 @@
+# 教程 2: 自定义数据集
+
+## 支持新的数据格式
+
+为了支持新的数据格式，可以选择将数据转换成现成的格式（COCO 或者 PASCAL）或将其转换成中间格式。当然也可以选择以离线的形式（在训练之前使用脚本转换）或者在线的形式（实现一个新的 dataset 在训练中进行转换）来转换数据。
+
+在 MMDetection 中，建议将数据转换成 COCO 格式并以离线的方式进行，因此在完成数据转换后只需修改配置文件中的标注数据的路径和类别即可。
+
+### 将新的数据格式转换为现有的数据格式
+
+最简单的方法就是将你的数据集转换成现有的数据格式（COCO 或者 PASCAL VOC）
+
+COCO 格式的 json 标注文件有如下必要的字段：
+
+```python
+'images': [
+    {
+        'file_name': 'COCO_val2014_000000001268.jpg',
+        'height': 427,
+        'width': 640,
+        'id': 1268
+    },
+    ...
+],
+
+'annotations': [
+    {
+        'segmentation': [[192.81,
+            247.09,
+            ...
+            219.03,
+            249.06]],  # 如果有 mask 标签
+        'area': 1035.749,
+        'iscrowd': 0,
+        'image_id': 1268,
+        'bbox': [192.81, 224.8, 74.73, 33.43],
+        'category_id': 16,
+        'id': 42986
+    },
+    ...
+],
+
+'categories': [
+    {'id': 0, 'name': 'car'},
+ ]
+```
+
+在 json 文件中有三个必要的键：
+
+- `images`: 包含多个图片以及它们的信息的数组，例如 `file_name`、`height`、`width` 和 `id`。
+- `annotations`: 包含多个实例标注信息的数组。
+- `categories`: 包含多个类别名字和 ID 的数组。
+
+在数据预处理之后，使用现有的数据格式来训练自定义的新数据集有如下两步（以 COCO 为例）：
+
+1. 为自定义数据集修改配置文件。
+2. 检查自定义数据集的标注。
+
+这里我们举一个例子来展示上面的两个步骤，这个例子使用包括 5 个类别的 COCO 格式的数据集来训练一个现有的 Cascade Mask R-CNN R50-FPN 检测器
+
+#### 1. 为自定义数据集修改配置文件
+
+配置文件的修改涉及两个方面：
+
+1. `data` 部分。需要在 `data.train`、`data.val` 和 `data.test` 中添加 `classes`。
+2. `model` 部分中的 `num_classes`。需要将默认值（COCO 数据集中为 80）修改为自定义数据集中的类别数。
+
+`configs/my_custom_config.py` 内容如下：
+
+```python
+
+# 新的配置来自基础的配置以更好地说明需要修改的地方
+_base_ = './cascade_mask_rcnn_r50_fpn_1x_coco.py'
+
+# 1. 数据集设定
+dataset_type = 'CocoDataset'
+classes = ('a', 'b', 'c', 'd', 'e')
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        # 将类别名字添加至 `classes` 字段中
+        classes=classes,
+        ann_file='path/to/your/train/annotation_data',
+        img_prefix='path/to/your/train/image_data'),
+    val=dict(
+        type=dataset_type,
+        # 将类别名字添加至 `classes` 字段中
+        classes=classes,
+        ann_file='path/to/your/val/annotation_data',
+        img_prefix='path/to/your/val/image_data'),
+    test=dict(
+        type=dataset_type,
+        # 将类别名字添加至 `classes` 字段中
+        classes=classes,
+        ann_file='path/to/your/test/annotation_data',
+        img_prefix='path/to/your/test/image_data'))
+
+# 2. 模型设置
+
+# 将所有的 `num_classes` 默认值修改为5（原来为80）
+model = dict(
+    roi_head=dict(
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                # 将所有的 `num_classes` 默认值修改为 5（原来为 80）
+                num_classes=5),
+            dict(
+                type='Shared2FCBBoxHead',
+                # 将所有的 `num_classes` 默认值修改为 5（原来为 80）
+                num_classes=5),
+            dict(
+                type='Shared2FCBBoxHead',
+                # 将所有的 `num_classes` 默认值修改为 5（原来为 80）
+                num_classes=5)],
+    # 将所有的 `num_classes` 默认值修改为 5（原来为 80）
+    mask_head=dict(num_classes=5)))
+```
+
+#### 2. 检查自定义数据集的标注
+
+假设你自己的数据集是 COCO 格式，那么需要保证数据的标注没有问题：
+
+1. 标注文件中 `categories` 的长度要与配置中的 `classes` 元组长度相匹配，它们都表示有几类。（如例子中有 5 个类别）
+2. 配置文件中 `classes` 字段应与标注文件里 `categories` 下的 `name` 有相同的元素且顺序一致。MMDetection 会自动将 `categories` 中不连续的 `id` 映射成连续的索引，因此 `categories` 下的 `name`的字符串顺序会影响标签的索引。同时，配置文件中的 `classes` 的字符串顺序也会影响到预测框可视化时的标签。
+3. `annotations` 中的 `category_id` 必须是有效的值。比如所有 `category_id` 的值都应该属于 `categories` 中的 `id`。
+
+下面是一个有效标注的例子：
+
+```python
+
+'annotations': [
+    {
+        'segmentation': [[192.81,
+            247.09,
+            ...
+            219.03,
+            249.06]],  #如果有 mask 标签。
+        'area': 1035.749,
+        'iscrowd': 0,
+        'image_id': 1268,
+        'bbox': [192.81, 224.8, 74.73, 33.43],
+        'category_id': 16,
+        'id': 42986
+    },
+    ...
+],
+
+# MMDetection 会自动将 `categories` 中不连续的 `id` 映射成连续的索引。
+'categories': [
+    {'id': 1, 'name': 'a'}, {'id': 3, 'name': 'b'}, {'id': 4, 'name': 'c'}, {'id': 16, 'name': 'd'}, {'id': 17, 'name': 'e'},
+ ]
+```
+
+我们使用这种方式来支持 CityScapes 数据集。脚本在[cityscapes.py](https://github.com/open-mmlab/mmdetection/blob/master/tools/dataset_converters/cityscapes.py) 并且我们提供了微调的[configs](https://github.com/open-mmlab/mmdetection/blob/master/configs/cityscapes).
+
+**注意**
+
+1. 对于实例分割数据集, **MMDetection 目前只支持评估 COCO 格式的 mask AP**.
+2. 推荐训练之前进行离线转换，这样就可以继续使用 `CocoDataset` 且只需修改标注文件的路径以及训练的种类。
+
+### 调整新的数据格式为中间格式
+
+如果不想将标注格式转换为 COCO 或者 PASCAL 格式也是可行的。实际上，我们定义了一种简单的标注格式并且与所有现有的数据格式兼容，也能进行离线或者在线转换。
+
+数据集的标注是包含多个字典（dict）的列表，每个字典（dict）都与一张图片对应。测试时需要用到 `filename`（相对路径）、`width` 和 `height` 三个字段；训练时则额外需要 `ann`。`ann` 也是至少包含了两个字段的字典：`bboxes` 和 `labels`，它们都是 numpy array。有些数据集可能会提供如：crowd/difficult/ignored bboxes 标注，那么我们使用 `bboxes_ignore` 以及 `labels_ignore` 来包含它们。
+
+下面给出一个例子。
+
+```python
+
+[
+    {
+        'filename': 'a.jpg',
+        'width': 1280,
+        'height': 720,
+        'ann': {
+            'bboxes': <np.ndarray, float32> (n, 4),
+            'labels': <np.ndarray, int64> (n, ),
+            'bboxes_ignore': <np.ndarray, float32> (k, 4),
+            'labels_ignore': <np.ndarray, int64> (k, ) （可选字段）
+        }
+    },
+    ...
+]
+```
+
+有两种方法处理自定义数据。
+
+- 在线转换（online conversion）
+
+  可以新写一个继承自 `CustomDataset` 的 Dataset 类，并重写 `load_annotations(self, ann_file)` 以及 `get_ann_info(self, idx)` 这两个方法，正如[CocoDataset](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/coco.py)与[VOCDataset](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/voc.py).
+
+- 离线转换（offline conversion）
+
+  可以将标注格式转换为上述的任意格式并将其保存为 pickle 或者 json 文件，例如[pascal_voc.py](https://github.com/open-mmlab/mmdetection/blob/master/tools/dataset_converters/pascal_voc.py)。
+  然后使用`CustomDataset`。
+
+### 自定义数据集的例子：
+
+假设文本文件中表示的是一种全新的标注格式。边界框的标注信息保存在 `annotation.txt` 中，内容如下：
+
+```
+#
+000001.jpg
+1280 720
+2
+10 20 40 60 1
+20 40 50 60 2
+#
+000002.jpg
+1280 720
+3
+50 20 40 60 2
+20 40 30 45 2
+30 40 50 60 3
+```
+
+我们可以在 `mmdet/datasets/my_dataset.py` 中创建一个新的 dataset 用以加载数据。
+
+```python
+import mmcv
+import numpy as np
+
+from .builder import DATASETS
+from .custom import CustomDataset
+
+
+@DATASETS.register_module()
+class MyDataset(CustomDataset):
+
+    CLASSES = ('person', 'bicycle', 'car', 'motorcycle')
+
+    def load_annotations(self, ann_file):
+        ann_list = mmcv.list_from_file(ann_file)
+
+        data_infos = []
+        for i, ann_line in enumerate(ann_list):
+            if ann_line != '#':
+                continue
+
+            img_shape = ann_list[i + 2].split(' ')
+            width = int(img_shape[0])
+            height = int(img_shape[1])
+            bbox_number = int(ann_list[i + 3])
+
+            bboxes = []
+            labels = []
+            for bbox_line in ann_list[i + 4:i + 4 + bbox_number]:
+                anns = bbox_line.split(' ')
+                bboxes.append([float(ann) for ann in anns[:4]])
+                labels.append(int(anns[4]))
+
+            data_infos.append(
+                dict(
+                    filename=ann_list[i + 1],
+                    width=width,
+                    height=height,
+                    ann=dict(
+                        bboxes=np.array(bboxes).astype(np.float32),
+                        labels=np.array(labels).astype(np.int64))
+                ))
+
+        return data_infos
+
+    def get_ann_info(self, idx):
+        return self.data_infos[idx]['ann']
+
+```
+
+配置文件中，可以使用 `MyDataset` 进行如下修改
+
+```python
+dataset_A_train = dict(
+    type='MyDataset',
+    ann_file = 'image_list.txt',
+    pipeline=train_pipeline
+)
+```
+
+## 使用 dataset 包装器自定义数据集
+
+MMDetection 也支持非常多的数据集包装器（wrapper）来混合数据集或在训练时修改数据集的分布。
+目前 MMDetection 支持如下三种数据集包装器：
+
+- `RepeatDataset`：将整个数据集简单地重复。
+- `ClassBalancedDataset`：以类别均衡的方式重复数据集。
+- `ConcatDataset`：合并数据集。
+
+### 重复数据集（Repeat dataset）
+
+使用 `RepeatDataset` 包装器来重复数据集。例如，假设原始数据集为 `Dataset_A`，重复它过后，其配置如下：
+
+```python
+dataset_A_train = dict(
+        type='RepeatDataset',
+        times=N,
+        dataset=dict(  # Dataset_A 的原始配置信息
+            type='Dataset_A',
+            ...
+            pipeline=train_pipeline
+        )
+    )
+```
+
+### 类别均衡数据集（Class balanced dataset）
+
+使用 `ClassBalancedDataset` 作为包装器，基于类别出现的频率来重复数据集。被重复的数据集需要实现 `self.get_cat_ids(idx)` 函数以支持 `ClassBalancedDataset`。
+比如，以 `oversample_thr=1e-3` 来重复数据集 `Dataset_A`，其配置如下：
+
+```python
+dataset_A_train = dict(
+        type='ClassBalancedDataset',
+        oversample_thr=1e-3,
+        dataset=dict(  # Dataset_A 的原始配置信息
+            type='Dataset_A',
+            ...
+            pipeline=train_pipeline
+        )
+    )
+```
+
+更多细节请参考[源码](../../mmdet/datasets/dataset_wrappers.py)。
+
+### 合并数据集（Concatenate dataset）
+
+合并数据集有以下几种方法：
+
+1. 如果要合并的数据集类型一致但有多个的标注文件，那么可以使用如下配置将其合并。
+
+   ```python
+   dataset_A_train = dict(
+       type='Dataset_A',
+       ann_file = ['anno_file_1', 'anno_file_2'],
+       pipeline=train_pipeline
+   )
+   ```
+
+   如果合并的数据集用于测试或者评估，那么这种方式支持每个数据集分开进行评估。如果想要将合并的数据集作为整体用于评估，那么可以像如下一样设置 `separate_eval=False`。
+
+   ```python
+   dataset_A_train = dict(
+       type='Dataset_A',
+       ann_file = ['anno_file_1', 'anno_file_2'],
+       separate_eval=False,
+       pipeline=train_pipeline
+   )
+   ```
+
+2. 如果想要合并的是不同数据集，那么可以使用如下配置。
+
+   ```python
+   dataset_A_val = dict()
+   dataset_B_val = dict()
+
+   data = dict(
+       imgs_per_gpu=2,
+       workers_per_gpu=2,
+       train=dataset_A_train,
+       val=dict(
+           type='ConcatDataset',
+           datasets=[dataset_A_val, dataset_B_val],
+           separate_eval=False))
+   ```
+
+   只需设置 `separate_eval=False`，用户就可以将所有的数据集作为一个整体来评估。
+
+**注意**
+
+1. 在做评估时，`separate_eval=False` 选项是假设数据集使用了 `self.data_infos`。因此COCO数据集不支持此项操作，因为COCO数据集在做评估时并不是所有都依赖 `self.data_infos`。组合不同类型的数据集并将其作为一个整体来评估，这种做法没有得到测试，也不建议这样做。
+
+2. 因为不支持评估 `ClassBalancedDataset` 和 `RepeatDataset`，所以也不支持评估它们的组合。
+
+一个更复杂的例子则是分别将 `Dataset_A` 和 `Dataset_B` 重复N和M次，然后进行如下合并。
+
+```python
+dataset_A_train = dict(
+    type='RepeatDataset',
+    times=N,
+    dataset=dict(
+        type='Dataset_A',
+        ...
+        pipeline=train_pipeline
+    )
+)
+dataset_A_val = dict(
+    ...
+    pipeline=test_pipeline
+)
+dataset_A_test = dict(
+    ...
+    pipeline=test_pipeline
+)
+dataset_B_train = dict(
+    type='RepeatDataset',
+    times=M,
+    dataset=dict(
+        type='Dataset_B',
+        ...
+        pipeline=train_pipeline
+    )
+)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train = [
+        dataset_A_train,
+        dataset_B_train
+    ],
+    val = dataset_A_val,
+    test = dataset_A_test
+)
+
+```
+
+## 修改数据集的类别
+
+根据现有数据集的类型，我们可以修改它们的类别名称来训练其标注的子集。
+例如，如果只想训练当前数据集中的三个类别，那么就可以修改数据集的类别元组。
+数据集就会自动屏蔽掉其他类别的真实框。
+
+```python
+classes = ('person', 'bicycle', 'car')
+data = dict(
+    train=dict(classes=classes),
+    val=dict(classes=classes),
+    test=dict(classes=classes))
+```
+
+MMDetection V2.0 也支持从文件中读取类别名称，这种方式在实际应用中很常见。
+假设存在文件 `classes.txt`，其包含了如下的类别名称。
+
+```
+person
+bicycle
+car
+```
+
+用户可以将类别设置成文件路径，数据集就会自动将其加载并转换成一个列表。
+
+```python
+classes = 'path/to/classes.txt'
+data = dict(
+    train=dict(classes=classes),
+    val=dict(classes=classes),
+    test=dict(classes=classes))
+```
+
+**注意**
+
+- 在 MMDetection v2.5.0 之前，如果设置了类别，数据集将自动过滤掉不包含 GT 的图片，且没办法通过修改配置将其关闭。这是一种不可取的行为而且会引起混淆，因为当没有设置类别时，数据集只有在 `filter_empty_gt=True` 以及 `test_mode=False` 的情况下才会过滤掉不包含 GT 的图片。在 MMDetection v2.5.0 之后，我们将图片的过滤以及类别的修改进行解耦，即数据集只有在 `filter_empty_gt=True` 和 `test_mode=False` 的情况下才会过滤掉不包含 GT 的图片，无论是否设置了类别。设置类别只会影响用于训练的标注类别，用户可以自行决定是否过滤不包含 GT 的图片。
+- 因为中间格式只有框的标签并不包含类别的名字，所以使用 `CustomDataset` 时用户不能通过修改配置来过滤不含 GT 的图片。但是可以通过离线的方式来解决。
+- 当设置数据集中的 `classes` 时，记得修改 `num_classes`。从 v2.9.0 (PR#4508) 之后，我们实现了[NumClassCheckHook](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/utils.py)来检查类别数是否一致。
+- 我们在未来将会重构设置数据集类别以及数据集过滤的特性，使其更加地方便用户使用。
diff --git a/docs/zh_cn/tutorials/customize_losses.md b/docs/zh_cn/tutorials/customize_losses.md
new file mode 100755
index 0000000..f721e77
--- /dev/null
+++ b/docs/zh_cn/tutorials/customize_losses.md
@@ -0,0 +1,125 @@
+# 教程 6: 自定义损失函数
+
+MMDetection 为用户提供了不同的损失函数。但是默认的配置可能无法适应不同的数据和模型，所以用户可能会希望修改某一个损失函数来适应新的情况。
+
+本教程首先详细的解释计算损失的过程然后给出一些关于如何修改每一个步骤的指导。对损失的修改可以被分为微调和加权。
+
+## 一个损失的计算过程
+
+给定输入（包括预测和目标，以及权重），损失函数会把输入的张量映射到最后的损失标量。映射过程可以分为下面五个步骤：
+
+1. 设置采样方法为对正负样本进行采样。
+
+2. 通过损失核函数获取**元素**或者**样本**损失。
+
+3. 通过权重张量来给损失**逐元素**权重。
+
+4. 把损失张量归纳为一个**标量**。
+
+5. 用一个**标量**给当前损失一个权重。
+
+## 设置采样方法（步骤 1）
+
+对于一些损失函数，需要采样策略来避免正负样本之间的不平衡。
+
+例如，在RPN head中使用`CrossEntropyLoss`时，我们需要在`train_cfg`中设置`RandomSampler`
+
+```python
+train_cfg=dict(
+    rpn=dict(
+        sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False))
+```
+
+对于其他一些具有正负样本平衡机制的损失，例如 Focal Loss、GHMC 和 QualityFocalLoss，不再需要进行采样。
+
+## 微调损失
+
+微调一个损失主要与步骤 2，4，5 有关，大部分的修改可以在配置文件中指定。这里我们用 [Focal Loss (FL)](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/focal_loss.py) 作为例子。
+下面的代码分别是构建 FL 的方法和它的配置文件，他们是一一对应的。
+
+```python
+@LOSSES.register_module()
+class FocalLoss(nn.Module):
+
+    def __init__(self,
+                 use_sigmoid=True,
+                 gamma=2.0,
+                 alpha=0.25,
+                 reduction='mean',
+                 loss_weight=1.0):
+```
+
+```python
+loss_cls=dict(
+    type='FocalLoss',
+    use_sigmoid=True,
+    gamma=2.0,
+    alpha=0.25,
+    loss_weight=1.0)
+```
+
+### 微调超参数（步骤2）
+
+`gamma` 和 `alpha` 是 Focal Loss 中的两个超参数。如果我们想把 `gamma` 的值设为 1.5，把 `alpha` 的值设为 0.5，我们可以在配置文件中按照如下指定：
+
+```python
+loss_cls=dict(
+    type='FocalLoss',
+    use_sigmoid=True,
+    gamma=1.5,
+    alpha=0.5,
+    loss_weight=1.0)
+```
+
+### 微调归纳方式（步骤4）
+
+Focal Loss 默认的归纳方式是 `mean`。如果我们想把归纳方式从 `mean` 改成 `sum`，我们可以在配置文件中按照如下指定：
+
+```python
+loss_cls=dict(
+    type='FocalLoss',
+    use_sigmoid=True,
+    gamma=2.0,
+    alpha=0.25,
+    loss_weight=1.0,
+    reduction='sum')
+```
+
+### 微调损失权重（步骤5）
+
+这里的损失权重是一个标量，他用来控制多任务学习中不同损失的重要程度，例如，分类损失和回归损失。如果我们想把分类损失的权重设为 0.5，我们可以在配置文件中如下指定：
+
+```python
+loss_cls=dict(
+    type='FocalLoss',
+    use_sigmoid=True,
+    gamma=2.0,
+    alpha=0.25,
+    loss_weight=0.5)
+```
+
+## 加权损失（步骤3）
+
+加权损失就是我们逐元素修改损失权重。更具体来说，我们给损失张量乘以一个与它有相同形状的权重张量。所以，损失中不同的元素可以被赋予不同的比例，所以这里叫做逐元素。损失的权重在不同模型中变化很大，而且与上下文相关，但是总的来说主要有两种损失权重：分类损失的 `label_weights` 和边界框的 `bbox_weights`。你可以在相应的头中的 `get_targets` 方法中找到它们。这里我们使用 [ATSSHead](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/atss_head.py#L530) 作为一个例子。它继承了 [AnchorHead](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/anchor_head.py)，但是我们重写它的
+`get_targets` 方法来产生不同的 `label_weights` 和 `bbox_weights`。
+
+```
+class ATSSHead(AnchorHead):
+
+    ...
+
+    def get_targets(self,
+                    anchor_list,
+                    valid_flag_list,
+                    gt_bboxes_list,
+                    img_metas,
+                    gt_bboxes_ignore_list=None,
+                    gt_labels_list=None,
+                    label_channels=1,
+                    unmap_outputs=True):
+```
diff --git a/docs/zh_cn/tutorials/customize_models.md b/docs/zh_cn/tutorials/customize_models.md
new file mode 100755
index 0000000..b29254a
--- /dev/null
+++ b/docs/zh_cn/tutorials/customize_models.md
@@ -0,0 +1,359 @@
+# 教程 4: 自定义模型
+
+我们简单地把模型的各个组件分为五类：
+
+- 主干网络 (backbone)：通常是一个用来提取特征图 (feature map) 的全卷积网络 (FCN network)，例如：ResNet, MobileNet。
+- Neck：主干网络和 Head 之间的连接部分，例如：FPN, PAFPN。
+- Head：用于具体任务的组件，例如：边界框预测和掩码预测。
+- 区域提取器 (roi extractor)：从特征图中提取 RoI 特征，例如：RoI Align。
+- 损失 (loss)：在 Head 组件中用于计算损失的部分，例如：FocalLoss, L1Loss, GHMLoss.
+
+## 开发新的组件
+
+### 添加一个新的主干网络
+
+这里，我们以 MobileNet 为例来展示如何开发新组件。
+
+#### 1. 定义一个新的主干网络（以 MobileNet 为例）
+
+新建一个文件 `mmdet/models/backbones/mobilenet.py`
+
+```python
+import torch.nn as nn
+
+from ..builder import BACKBONES
+
+
+@BACKBONES.register_module()
+class MobileNet(nn.Module):
+
+    def __init__(self, arg1, arg2):
+        pass
+
+    def forward(self, x):  # should return a tuple
+        pass
+```
+
+#### 2. 导入该模块
+
+你可以添加下述代码到 `mmdet/models/backbones/__init__.py`
+
+```python
+from .mobilenet import MobileNet
+```
+
+或添加：
+
+```python
+custom_imports = dict(
+    imports=['mmdet.models.backbones.mobilenet'],
+    allow_failed_imports=False)
+```
+
+到配置文件以避免原始代码被修改。
+
+#### 3. 在你的配置文件中使用该主干网络
+
+```python
+model = dict(
+    ...
+    backbone=dict(
+        type='MobileNet',
+        arg1=xxx,
+        arg2=xxx),
+    ...
+```
+
+### 添加新的 Neck
+
+#### 1. 定义一个 Neck（以 PAFPN 为例）
+
+新建一个文件 `mmdet/models/necks/pafpn.py`
+
+```python
+from ..builder import NECKS
+
+@NECKS.register_module()
+class PAFPN(nn.Module):
+
+    def __init__(self,
+                in_channels,
+                out_channels,
+                num_outs,
+                start_level=0,
+                end_level=-1,
+                add_extra_convs=False):
+        pass
+
+    def forward(self, inputs):
+        # implementation is ignored
+        pass
+```
+
+#### 2. 导入该模块
+
+你可以添加下述代码到 `mmdet/models/necks/__init__.py`
+
+```python
+from .pafpn import PAFPN
+```
+
+或添加：
+
+```python
+custom_imports = dict(
+    imports=['mmdet.models.necks.pafpn'],
+    allow_failed_imports=False)
+```
+
+到配置文件以避免原始代码被修改。
+
+#### 3. 修改配置文件
+
+```python
+neck=dict(
+    type='PAFPN',
+    in_channels=[256, 512, 1024, 2048],
+    out_channels=256,
+    num_outs=5)
+```
+
+### 添加新的 Head
+
+我们以 [Double Head R-CNN](https://arxiv.org/abs/1904.06493) 为例来展示如何添加一个新的 Head。
+
+首先，添加一个新的 bbox head 到 `mmdet/models/roi_heads/bbox_heads/double_bbox_head.py`。
+Double Head R-CNN 在目标检测上实现了一个新的 bbox head。为了实现 bbox head，我们需要使用如下的新模块中三个函数。
+
+```python
+from mmdet.models.builder import HEADS
+from .bbox_head import BBoxHead
+
+@HEADS.register_module()
+class DoubleConvFCBBoxHead(BBoxHead):
+    r"""Bbox head used in Double-Head R-CNN
+
+                                      /-> cls
+                  /-> shared convs ->
+                                      \-> reg
+    roi features
+                                      /-> cls
+                  \-> shared fc    ->
+                                      \-> reg
+    """  # noqa: W605
+
+    def __init__(self,
+                 num_convs=0,
+                 num_fcs=0,
+                 conv_out_channels=1024,
+                 fc_out_channels=1024,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 **kwargs):
+        kwargs.setdefault('with_avg_pool', True)
+        super(DoubleConvFCBBoxHead, self).__init__(**kwargs)
+
+
+    def forward(self, x_cls, x_reg):
+
+```
+
+然后，如有必要，实现一个新的 RoI Head。我们打算从 `StandardRoIHead` 来继承新的 `DoubleHeadRoIHead`。我们可以发现 `StandardRoIHead` 已经实现了下述函数。
+
+```python
+import torch
+
+from mmdet.core import bbox2result, bbox2roi, build_assigner, build_sampler
+from ..builder import HEADS, build_head, build_roi_extractor
+from .base_roi_head import BaseRoIHead
+from .test_mixins import BBoxTestMixin, MaskTestMixin
+
+
+@HEADS.register_module()
+class StandardRoIHead(BaseRoIHead, BBoxTestMixin, MaskTestMixin):
+    """Simplest base roi head including one bbox head and one mask head.
+    """
+
+    def init_assigner_sampler(self):
+
+    def init_bbox_head(self, bbox_roi_extractor, bbox_head):
+
+    def init_mask_head(self, mask_roi_extractor, mask_head):
+
+
+    def forward_dummy(self, x, proposals):
+
+
+    def forward_train(self,
+                      x,
+                      img_metas,
+                      proposal_list,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None,
+                      gt_masks=None):
+
+    def _bbox_forward(self, x, rois):
+
+    def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels,
+                            img_metas):
+
+    def _mask_forward_train(self, x, sampling_results, bbox_feats, gt_masks,
+                            img_metas):
+
+    def _mask_forward(self, x, rois=None, pos_inds=None, bbox_feats=None):
+
+
+    def simple_test(self,
+                    x,
+                    proposal_list,
+                    img_metas,
+                    proposals=None,
+                    rescale=False):
+        """Test without augmentation."""
+
+```
+
+Double Head 的修改主要在 `_bbox_forward` 的逻辑中，且它从 `StandardRoIHead` 中继承了其他逻辑。在 `mmdet/models/roi_heads/double_roi_head.py` 中，我们用下述代码实现新的 RoI Head：
+
+```python
+from ..builder import HEADS
+from .standard_roi_head import StandardRoIHead
+
+
+@HEADS.register_module()
+class DoubleHeadRoIHead(StandardRoIHead):
+    """RoI head for Double Head RCNN
+
+    https://arxiv.org/abs/1904.06493
+    """
+
+    def __init__(self, reg_roi_scale_factor, **kwargs):
+        super(DoubleHeadRoIHead, self).__init__(**kwargs)
+        self.reg_roi_scale_factor = reg_roi_scale_factor
+
+    def _bbox_forward(self, x, rois):
+        bbox_cls_feats = self.bbox_roi_extractor(
+            x[:self.bbox_roi_extractor.num_inputs], rois)
+        bbox_reg_feats = self.bbox_roi_extractor(
+            x[:self.bbox_roi_extractor.num_inputs],
+            rois,
+            roi_scale_factor=self.reg_roi_scale_factor)
+        if self.with_shared_head:
+            bbox_cls_feats = self.shared_head(bbox_cls_feats)
+            bbox_reg_feats = self.shared_head(bbox_reg_feats)
+        cls_score, bbox_pred = self.bbox_head(bbox_cls_feats, bbox_reg_feats)
+
+        bbox_results = dict(
+            cls_score=cls_score,
+            bbox_pred=bbox_pred,
+            bbox_feats=bbox_cls_feats)
+        return bbox_results
+```
+
+最终，用户需要把该模块添加到 `mmdet/models/roi_heads/bbox_heads/__init__.py` 和 `mmdet/models/roi_heads/__init__.py` 以使相关的注册表可以找到并加载它们。
+
+或者，用户可以添加：
+
+```python
+custom_imports=dict(
+    imports=['mmdet.models.roi_heads.double_roi_head', 'mmdet.models.roi_heads.bbox_heads.double_bbox_head'])
+```
+
+到配置文件并实现相同的目的。
+
+Double Head R-CNN 的配置文件如下：
+
+```python
+_base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        type='DoubleHeadRoIHead',
+        reg_roi_scale_factor=1.3,
+        bbox_head=dict(
+            _delete_=True,
+            type='DoubleConvFCBBoxHead',
+            num_convs=4,
+            num_fcs=2,
+            in_channels=256,
+            conv_out_channels=1024,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=2.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=2.0))))
+
+```
+
+从 MMDetection 2.0 版本起，配置系统支持继承配置以使用户可以专注于修改。
+Double Head R-CNN 主要使用了一个新的 DoubleHeadRoIHead 和一个新的 `DoubleConvFCBBoxHead`，参数需要根据每个模块的 `__init__` 函数来设置。
+
+### 添加新的损失
+
+假设你想添加一个新的损失 `MyLoss` 用于边界框回归。
+为了添加一个新的损失函数，用户需要在 `mmdet/models/losses/my_loss.py` 中实现。
+装饰器 `weighted_loss` 可以使损失逐元素加权。
+
+```python
+import torch
+import torch.nn as nn
+
+from ..builder import LOSSES
+from .utils import weighted_loss
+
+@weighted_loss
+def my_loss(pred, target):
+    assert pred.size() == target.size() and target.numel() > 0
+    loss = torch.abs(pred - target)
+    return loss
+
+@LOSSES.register_module()
+class MyLoss(nn.Module):
+
+    def __init__(self, reduction='mean', loss_weight=1.0):
+        super(MyLoss, self).__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss_bbox = self.loss_weight * my_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+        return loss_bbox
+```
+
+然后，用户需要把它加到 `mmdet/models/losses/__init__.py`。
+
+```python
+from .my_loss import MyLoss, my_loss
+
+```
+
+或者，你可以添加：
+
+```python
+custom_imports=dict(
+    imports=['mmdet.models.losses.my_loss'])
+```
+
+到配置文件来实现相同的目的。
+
+如使用，请修改 `loss_xxx` 字段。
+因为 MyLoss 是用于回归的，你需要在 Head 中修改 `loss_xxx` 字段。
+
+```python
+loss_bbox=dict(type='MyLoss', loss_weight=1.0)
+```
diff --git a/docs/zh_cn/tutorials/customize_runtime.md b/docs/zh_cn/tutorials/customize_runtime.md
new file mode 100755
index 0000000..8d998c3
--- /dev/null
+++ b/docs/zh_cn/tutorials/customize_runtime.md
@@ -0,0 +1 @@
+# 教程 5: 自定义训练配置
diff --git a/docs/zh_cn/tutorials/data_pipeline.md b/docs/zh_cn/tutorials/data_pipeline.md
new file mode 100755
index 0000000..2fd7f8f
--- /dev/null
+++ b/docs/zh_cn/tutorials/data_pipeline.md
@@ -0,0 +1,190 @@
+# 教程 3: 自定义数据预处理流程
+
+## 数据流程的设计
+
+按照惯例，我们使用 `Dataset` 和 `DataLoader` 进行多进程的数据加载。`Dataset` 返回字典类型的数据，数据内容为模型 `forward` 方法的各个参数。由于在目标检测中，输入的图像数据具有不同的大小，我们在 `MMCV` 里引入一个新的 `DataContainer` 类去收集和分发不同大小的输入数据。更多细节请参考[这里](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py)。
+
+数据的准备流程和数据集是解耦的。通常一个数据集定义了如何处理标注数据（annotations）信息，而一个数据流程定义了准备一个数据字典的所有步骤。一个流程包括一系列的操作，每个操作都把一个字典作为输入，然后再输出一个新的字典给下一个变换操作。
+
+我们在下图展示了一个经典的数据处理流程。蓝色块是数据处理操作，随着数据流程的处理，每个操作都可以在结果字典中加入新的键（标记为绿色）或更新现有的键（标记为橙色）。
+
+![pipeline figure](../../../resources/data_pipeline.png)
+
+这些操作可以分为数据加载（data loading）、预处理（pre-processing）、格式变化（formatting）和测试时数据增强（test-time augmentation）。
+
+下面的例子是 `Faster R-CNN` 的一个流程：
+
+```python
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+```
+
+对于每个操作，我们列出它添加、更新、移除的相关字典域 (dict fields)：
+
+### 数据加载 Data loading
+
+`LoadImageFromFile`
+
+- 增加：img, img_shape, ori_shape
+
+`LoadAnnotations`
+
+- 增加：gt_bboxes, gt_bboxes_ignore, gt_labels, gt_masks, gt_semantic_seg, bbox_fields, mask_fields
+
+`LoadProposals`
+
+- 增加：proposals
+
+### 预处理 Pre-processing
+
+`Resize`
+
+- 增加：scale, scale_idx, pad_shape, scale_factor, keep_ratio
+- 更新：img, img_shape, \*bbox_fields, \*mask_fields, \*seg_fields
+
+`RandomFlip`
+
+- 增加：flip
+- 更新：img, \*bbox_fields, \*mask_fields, \*seg_fields
+
+`Pad`
+
+- 增加：pad_fixed_size, pad_size_divisor
+- 更新：img, pad_shape, \*mask_fields, \*seg_fields
+
+`RandomCrop`
+
+- 更新：img, pad_shape, gt_bboxes, gt_labels, gt_masks, \*bbox_fields
+
+`Normalize`
+
+- 增加：img_norm_cfg
+- 更新：img
+
+`SegRescale`
+
+- 更新：gt_semantic_seg
+
+`PhotoMetricDistortion`
+
+- 更新：img
+
+`Expand`
+
+- 更新：img, gt_bboxes
+
+`MinIoURandomCrop`
+
+- 更新：img, gt_bboxes, gt_labels
+
+`Corrupt`
+
+- 更新：img
+
+### 格式 Formatting
+
+`ToTensor`
+
+- 更新：由 `keys` 指定
+
+`ImageToTensor`
+
+- 更新：由 `keys` 指定
+
+`Transpose`
+
+- 更新：由 `keys` 指定
+
+`ToDataContainer`
+
+- 更新：由 `keys` 指定
+
+`DefaultFormatBundle`
+
+- 更新：img, proposals, gt_bboxes, gt_bboxes_ignore, gt_labels, gt_masks, gt_semantic_seg
+
+`Collect`
+
+- 增加：img_metas（img_metas 的键（key）被 `meta_keys` 指定）
+- 移除：除了 `keys` 指定的键（key）之外的所有其他的键（key）
+
+### 测试时数据增强 Test time augmentation
+
+`MultiScaleFlipAug`
+
+## 拓展和使用自定义的流程
+
+1. 在任意文件里写一个新的流程，例如在 `my_pipeline.py`，它以一个字典作为输入并且输出一个字典：
+
+   ```python
+   import random
+   from mmdet.datasets import PIPELINES
+
+
+   @PIPELINES.register_module()
+   class MyTransform:
+       """Add your transform
+
+       Args:
+           p (float): Probability of shifts. Default 0.5.
+       """
+
+       def __init__(self, p=0.5):
+           self.p = p
+
+       def __call__(self, results):
+           if random.random() > self.p:
+               results['dummy'] = True
+           return results
+   ```
+
+2. 在配置文件里调用并使用你写的数据处理流程，需要确保你的训练脚本能够正确导入新增模块：
+
+   ```python
+   custom_imports = dict(imports=['path.to.my_pipeline'], allow_failed_imports=False)
+
+   img_norm_cfg = dict(
+       mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+   train_pipeline = [
+       dict(type='LoadImageFromFile'),
+       dict(type='LoadAnnotations', with_bbox=True),
+       dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+       dict(type='RandomFlip', flip_ratio=0.5),
+       dict(type='Normalize', **img_norm_cfg),
+       dict(type='Pad', size_divisor=32),
+       dict(type='MyTransform', p=0.2),
+       dict(type='DefaultFormatBundle'),
+       dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+   ]
+   ```
+
+3. 可视化数据增强处理流程的结果
+
+   如果想要可视化数据增强处理流程的结果，可以使用 `tools/misc/browse_dataset.py` 直观
+   地浏览检测数据集（图像和标注信息），或将图像保存到指定目录。
+   使用方法请参考[日志分析](../useful_tools.md)
diff --git a/docs/zh_cn/tutorials/finetune.md b/docs/zh_cn/tutorials/finetune.md
new file mode 100755
index 0000000..349660e
--- /dev/null
+++ b/docs/zh_cn/tutorials/finetune.md
@@ -0,0 +1,87 @@
+# 教程 7: 模型微调
+
+在 COCO 数据集上预训练的检测器可以作为其他数据集（例如 CityScapes 和 KITTI 数据集）优质的预训练模型。
+本教程将指导用户如何把 [ModelZoo](../model_zoo.md) 中提供的模型用于其他数据集中并使得当前所训练的模型获得更好性能。
+
+以下是在新数据集中微调模型需要的两个步骤。
+
+- 按 [教程2：自定义数据集的方法](customize_dataset.md) 中的方法对新数据集添加支持
+- 按照本教程中所讨论方法，修改配置信息
+
+接下来将会以 Cityscapes Dataset 上的微调过程作为例子，具体讲述用户需要在配置中修改的五个部分。
+
+## 继承基础配置
+
+为了减轻编写整个配置的负担并减少漏洞的数量，MMDetection V2.0 支持从多个现有配置中继承配置信息。微调 MaskRCNN 模型的时候，新的配置信息需要使用从 `_base_/models/mask_rcnn_r50_fpn.py`中继承的配置信息来构建模型的基本结构。当使用 Cityscapes 数据集时，新的配置信息可以简便地从`_base_/datasets/cityscapes_instance.py`中继承。对于训练过程的运行设置部分，新配置需要从 `_base_/default_runtime.py`中继承。这些配置文件位于 `configs` 目录下，用户也可以选择重新编写全部内容而不是使用继承方法。
+
+```python
+_base_ = [
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/cityscapes_instance.py', '../_base_/default_runtime.py'
+]
+```
+
+## Head 的修改
+
+接下来新的配置还需要根据新数据集的类别数量对 Head 进行修改。只需要对 roi_head 中的 `num_classes`进行修改。修改后除了最后的预测模型的 Head 之外，预训练模型的权重的大部分都会被重新使用。
+
+```python
+model = dict(
+    pretrained=None,
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=8,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=8,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))))
+```
+
+## 数据集的修改
+
+用户可能还需要准备数据集并编写有关数据集的配置。目前 MMDetection V2.0 的配置文件已经支持 VOC、WIDER FACE、COCO 和 Cityscapes Dataset 的数据集信息。
+
+## 训练策略的修改
+
+微调超参数与默认的训练策略不同。它通常需要更小的学习率和更少的训练回合。
+
+```python
+# 优化器
+# batch size 为 8 时的 lr 配置
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# 学习策略
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[7])
+# runner 中的 max_epochs 和 lr_config 中的 step 需要针对自定义数据集进行专门调整
+runner = dict(max_epochs=8)
+log_config = dict(interval=100)
+```
+
+## 使用预训练模型
+
+如果要使用预训练模型，新的配置需要在 `load_from` 中添加预训练模型的链接。用户需要在训练开始之前下载好需要的模型权重，从而避免在训练过程中浪费宝贵时间。
+
+```python
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth'  # noqa
+```
diff --git a/docs/zh_cn/tutorials/how_to.md b/docs/zh_cn/tutorials/how_to.md
new file mode 100755
index 0000000..3587d32
--- /dev/null
+++ b/docs/zh_cn/tutorials/how_to.md
@@ -0,0 +1,203 @@
+# 教程 11: How to xxx
+
+本教程收集了关于如何使用 MMDetection 进行 xxx 的各类问题的答案。如果您遇到有关`如何做`的问题及答案，请随时更新此文档！
+
+## 使用 MMClassification 的骨干网络
+
+MMDet、MMCls、MMSeg 中的模型注册表都继承自 MMCV 中的根注册表，允许这些存储库直接使用彼此已经实现的模块。因此用户可以在 MMDetection 中使用来自 MMClassification 的骨干网络，而无需重新实现 MMClassification 中已经存在的网络。
+
+### 使用在 MMClassification 中实现的骨干网络
+
+假设想将 `MobileNetV3-small` 作为 `RetinaNet` 的骨干网络，则配置文件如下。
+
+```python
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# please install mmcls>=0.20.0
+# import mmcls.models to trigger register_module in mmcls
+custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False)
+pretrained = 'https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth'
+model = dict(
+    backbone=dict(
+        _delete_=True, # 将 _base_ 中关于 backbone 的字段删除
+        type='mmcls.MobileNetV3', # 使用 mmcls 中的 MobileNetV3
+        arch='small',
+        out_indices=(3, 8, 11), # 修改 out_indices
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint=pretrained,
+            prefix='backbone.')), # MMCls 中骨干网络的预训练权重含有 prefix='backbone.'，为了正常加载权重，需要把这个 prefix 去掉。
+    # 修改 in_channels
+    neck=dict(in_channels=[24, 48, 96], start_level=0))
+```
+
+### 通过 MMClassification 使用 TIMM 中实现的骨干网络
+
+由于 MMClassification 提供了 Py**T**orch **Im**age **M**odels (`timm`) 骨干网络的封装，用户也可以通过 MMClassification 直接使用 `timm` 中的骨干网络。假设想将 [`EfficientNet-B1`](https://github.com/open-mmlab/mmdetection/blob/master/configs/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py) 作为 `RetinaNet` 的骨干网络，则配置文件如下。
+
+```python
+# https://github.com/open-mmlab/mmdetection/blob/master/configs/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# please install mmcls>=0.20.0
+# import mmcls.models to trigger register_module in mmcls
+custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False)
+model = dict(
+    backbone=dict(
+        _delete_=True, # 将 _base_ 中关于 backbone 的字段删除
+        type='mmcls.TIMMBackbone', # 使用 mmcls 中 timm 骨干网络
+        model_name='efficientnet_b1',
+        features_only=True,
+        pretrained=True,
+        out_indices=(1, 2, 3, 4)), # 修改 out_indices
+    neck=dict(in_channels=[24, 40, 112, 320])) # 修改 in_channels
+
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+```
+
+`type='mmcls.TIMMBackbone'` 表示在 MMDetection 中使用 MMClassification 中的 `TIMMBackbone` 类，并且使用的模型为 `EfficientNet-B1`，其中 `mmcls` 表示 MMClassification 库，而 `TIMMBackbone` 表示 MMClassification 中实现的 TIMMBackbone 包装器。
+
+关于层次注册器的具体原理可以参考 [MMCV 文档](https://github.com/open-mmlab/mmcv/blob/master/docs/zh_cn/understand_mmcv/registry.md#%E6%B3%A8%E5%86%8C%E5%99%A8%E5%B1%82%E7%BB%93%E6%9E%84)，关于如何使用 MMClassification 中的其他 backbone，可以参考 [MMClassification 文档](https://github.com/open-mmlab/mmclassification/blob/master/docs/zh_CN/tutorials/config.md)。
+
+## 使用马赛克数据增强
+
+如果你想在训练中使用 `Mosaic`，那么请确保你同时使用 `MultiImageMixDataset`。以 `Faster R-CNN` 算法为例，你可以通过如下做法实现：
+
+```python
+# 直接打开 configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py ,增添如下字段
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+img_scale = (1333, 800)
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+train_pipeline = [
+    dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
+    dict(
+        type='RandomAffine',
+        scaling_ratio_range=(0.1, 2),
+        border=(-img_scale[0] // 2, -img_scale[1] // 2)), # 图像经过马赛克处理后会放大4倍，所以我们使用仿射变换来恢复图像的大小。
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+
+train_dataset = dict(
+    _delete_ = True, # 删除不必要的设置
+    type='MultiImageMixDataset',
+    dataset=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=[
+            dict(type='LoadImageFromFile'),
+            dict(type='LoadAnnotations', with_bbox=True)
+        ],
+        filter_empty_gt=False,
+    ),
+    pipeline=train_pipeline
+    )
+
+data = dict(
+    train=train_dataset
+    )
+```
+
+## 在配置文件中冻结骨干网络后在训练中解冻骨干网络
+
+如果你在配置文件中已经冻结了骨干网络并希望在几个训练周期后解冻它，你可以通过 hook 来实现这个功能。以用 ResNet 为骨干网络的 Faster R-CNN 为例，你可以冻结骨干网络的一个阶段（stage）并在配置文件中添加如下 `custom_hooks`:
+
+```python
+_base_ = [
+    '../_base_/models/faster_rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    # freeze one stage of the backbone network.
+    backbone=dict(frozen_stages=1),
+)
+custom_hooks = [dict(type="UnfreezeBackboneEpochBasedHook", unfreeze_epoch=1)]
+```
+
+同时在 `mmdet/core/hook/unfreeze_backbone_epoch_based_hook.py` 当中书写 `UnfreezeBackboneEpochBasedHook` 类
+
+```python
+from mmcv.parallel import is_module_wrapper
+from mmcv.runner.hooks import HOOKS, Hook
+
+
+@HOOKS.register_module()
+class UnfreezeBackboneEpochBasedHook(Hook):
+    """Unfreeze backbone network Hook.
+
+    Args:
+        unfreeze_epoch (int): The epoch unfreezing the backbone network.
+    """
+
+    def __init__(self, unfreeze_epoch=1):
+        self.unfreeze_epoch = unfreeze_epoch
+
+    def before_train_epoch(self, runner):
+        # Unfreeze the backbone network.
+        # Only valid for resnet.
+        if runner.epoch == self.unfreeze_epoch:
+            model = runner.model
+            if is_module_wrapper(model):
+                model = model.module
+            backbone = model.backbone
+            if backbone.frozen_stages >= 0:
+                if backbone.deep_stem:
+                    backbone.stem.train()
+                    for param in backbone.stem.parameters():
+                        param.requires_grad = True
+                else:
+                    backbone.norm1.train()
+                    for m in [backbone.conv1, backbone.norm1]:
+                        for param in m.parameters():
+                            param.requires_grad = True
+
+            for i in range(1, backbone.frozen_stages + 1):
+                m = getattr(backbone, f'layer{i}')
+                m.train()
+                for param in m.parameters():
+                    param.requires_grad = True
+```
+
+## 获得新的骨干网络的通道数
+
+如果你想获得一个新骨干网络的通道数，你可以单独构建这个骨干网络并输入一个伪造的图片来获取每一个阶段的输出。
+
+以 `ResNet` 为例：
+
+```python
+from mmdet.models import ResNet
+import torch
+self = ResNet(depth=18)
+self.eval()
+inputs = torch.rand(1, 3, 32, 32)
+level_outputs = self.forward(inputs)
+for level_out in level_outputs:
+    print(tuple(level_out.shape))
+
+```
+
+以上脚本的输出为:
+
+```python
+(1, 64, 8, 8)
+(1, 128, 4, 4)
+(1, 256, 2, 2)
+(1, 512, 1, 1)
+```
+
+用户可以通过将脚本中的 `ResNet(depth=18)` 替换为自己的骨干网络配置来得到新的骨干网络的通道数。
diff --git a/docs/zh_cn/tutorials/index.rst b/docs/zh_cn/tutorials/index.rst
new file mode 100755
index 0000000..eaf4907
--- /dev/null
+++ b/docs/zh_cn/tutorials/index.rst
@@ -0,0 +1,14 @@
+.. toctree::
+   :maxdepth: 2
+
+   config.md
+   customize_dataset.md
+   data_pipeline.md
+   customize_models.md
+   customize_runtime.md
+   customize_losses.md
+   finetune.md
+   pytorch2onnx.md
+   onnx2tensorrt.md
+   init_cfg.md
+   how_to.md
diff --git a/docs/zh_cn/tutorials/init_cfg.md b/docs/zh_cn/tutorials/init_cfg.md
new file mode 100755
index 0000000..f6f5968
--- /dev/null
+++ b/docs/zh_cn/tutorials/init_cfg.md
@@ -0,0 +1,161 @@
+# 教程 10: 权重初始化
+
+在训练过程中，适当的初始化策略有利于加快训练速度或获得更高的性能。 [MMCV](https://github.com/open-mmlab/mmcv/blob/master/mmcv/cnn/utils/weight_init.py) 提供了一些常用的初始化模块的方法，如 `nn.Conv2d`。 MMDetection 中的模型初始化主要使用 `init_cfg`。用户可以通过以下两个步骤来初始化模型：
+
+1. 在 `model_cfg` 中为模型或其组件定义 `init_cfg`，但子组件的 `init_cfg` 优先级更高，会覆盖父模块的 `init_cfg` 。
+2. 像往常一样构建模型，然后显式调用 `model.init_weights()` 方法，此时模型参数将会被按照配置文件写法进行初始化。
+
+MMDetection 初始化工作流的高层 API 调用流程是：
+
+model_cfg(init_cfg) -> build_from_cfg -> model -> init_weights() -> initialize(self, self.init_cfg) -> children's init_weights()
+
+### 描述
+
+它的数据类型是 dict 或者 list\[dict\]，包含了下列键值:
+
+- `type` (str)，包含 `INITIALIZERS` 中的初始化器名称，后面跟着初始化器的参数。
+- `layer`（str 或 list\[str\]），包含 Pytorch 或 MMCV 中基本层的名称，以及将被初始化的可学习参数，例如 `'Conv2d'`，`'DeformConv2d'`。
+- `override` (dict 或 list\[dict\])，包含不继承自 `BaseModule` 且其初始化配置与 `layer` 键中的其他层不同的子模块。 `type` 中定义的初始化器将适用于 `layer` 中定义的所有层，因此如果子模块不是 `BaseModule` 的派生类但可以与 `layer` 中的层相同的方式初始化，则不需要使用 `override`。`override` 包含了：
+  - `type` 后跟初始化器的参数；
+  - `name` 用以指示将被初始化的子模块。
+
+### 初始化参数
+
+从 `mmcv.runner.BaseModule` 或 `mmdet.models` 继承一个新模型。这里我们用 FooModel 来举个例子。
+
+```python
+import torch.nn as nn
+from mmcv.runner import BaseModule
+
+class FooModel(BaseModule):
+	def __init__(self,
+                 arg1,
+                 arg2,
+                 init_cfg=None):
+    	super(FooModel, self).__init__(init_cfg)
+		...
+```
+
+- 直接在代码中使用 `init_cfg` 初始化模型
+
+  ```python
+  import torch.nn as nn
+  from mmcv.runner import BaseModule
+  # or directly inherit mmdet models
+
+  class FooModel(BaseModule):
+  	def __init__(self,
+                  arg1,
+                  arg2,
+                  init_cfg=XXX):
+    		super(FooModel, self).__init__(init_cfg)
+    	    ...
+  ```
+
+- 在 `mmcv.Sequential` 或 `mmcv.ModuleList` 代码中直接使用 `init_cfg` 初始化模型
+
+  ```python
+  from mmcv.runner import BaseModule, ModuleList
+
+  class FooModel(BaseModule):
+  	def __init__(self,
+                	arg1,
+                	arg2,
+                	init_cfg=None):
+    		super(FooModel, self).__init__(init_cfg)
+        	...
+        	self.conv1 = ModuleList(init_cfg=XXX)
+  ```
+
+- 使用配置文件中的 `init_cfg` 初始化模型
+
+  ```python
+  model = dict(
+  	...
+    	model = dict(
+        	type='FooModel',
+        	arg1=XXX,
+        	arg2=XXX,
+        	init_cfg=XXX),
+            ...
+  ```
+
+### init_cfg 的使用
+
+1. 用 `layer` 键初始化模型
+
+   如果我们只定义了 `layer`, 它只会在 `layer` 键中初始化网络层。
+
+   注意： `layer` 键对应的值是 Pytorch 的带有 weights 和 bias 属性的类名（因此不支持 `MultiheadAttention` 层）。
+
+- 定义用于初始化具有相同配置的模块的 `layer` 键。
+
+  ```python
+  init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d', 'Linear'], val=1)
+  # 用相同的配置初始化整个模块
+  ```
+
+- 定义用于初始化具有不同配置的层的 `layer` 键。
+
+  ```python
+  init_cfg = [dict(type='Constant', layer='Conv1d', val=1),
+              dict(type='Constant', layer='Conv2d', val=2),
+              dict(type='Constant', layer='Linear', val=3)]
+  # nn.Conv1d 将被初始化为 dict(type='Constant', val=1)
+  # nn.Conv2d 将被初始化为 dict(type='Constant', val=2)
+  # nn.Linear 将被初始化为 dict(type='Constant', val=3)
+  ```
+
+2. 使用 `override` 键初始化模型
+
+- 当使用属性名初始化某些特定部分时，我们可以使用 `override` 键， `override` 中的值将忽略 init_cfg 中的值。
+
+  ```python
+  # layers：
+  # self.feat = nn.Conv1d(3, 1, 3)
+  # self.reg = nn.Conv2d(3, 3, 3)
+  # self.cls = nn.Linear(1,2)
+
+  init_cfg = dict(type='Constant',
+                  layer=['Conv1d','Conv2d'], val=1, bias=2,
+                  override=dict(type='Constant', name='reg', val=3, bias=4))
+  # self.feat 将被初始化为 dict(type='Constant', val=1, bias=2)；self.cls (nn.Linear) 不在 `layer` 中，不会被该配置初始化
+  # 叫 'reg' 的模块将被初始化为 dict(type='Constant', val=3, bias=4)
+  ```
+
+- 如果 init_cfg 中的 `layer` 为 None，则只会初始化 override 中有 name 的子模块，而 override 中的 type 和其他参数可以省略。
+
+  ```python
+  # layers：
+  # self.feat = nn.Conv1d(3, 1, 3)
+  # self.reg = nn.Conv2d(3, 3, 3)
+  # self.cls = nn.Linear(1,2)
+
+  init_cfg = dict(type='Constant', val=1, bias=2, 	override=dict(name='reg'))
+
+  # self.feat and self.cls 将被 Pytorch 初始化
+  # 叫 'reg' 的模块将被 dict(type='Constant', val=1, bias=2) 初始化
+  ```
+
+- 如果我们不定义 `layer` 或 `override` 键，它不会初始化任何东西。
+
+- 无效的使用
+
+  ```python
+  # override 没有 name 键的话是无效的
+  init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2,
+              	override=dict(type='Constant', val=3, bias=4))
+
+  # override 有 name 键和其他参数但是没有 type 键也是无效的
+  init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2,
+                  override=dict(name='reg', val=3, bias=4))
+  ```
+
+3. 使用预训练模型初始化模型
+
+   ```python
+   init_cfg = dict(type='Pretrained',
+                checkpoint='torchvision://resnet50')
+   ```
+
+更多细节可以参考 [MMCV](https://mmcv.readthedocs.io/en/latest/cnn.html#weight-initialization) 的文档和 MMCV [PR #780](https://github.com/open-mmlab/mmcv/pull/780)
diff --git a/docs/zh_cn/tutorials/onnx2tensorrt.md b/docs/zh_cn/tutorials/onnx2tensorrt.md
new file mode 100755
index 0000000..678a131
--- /dev/null
+++ b/docs/zh_cn/tutorials/onnx2tensorrt.md
@@ -0,0 +1,106 @@
+# 教程 9: ONNX 到 TensorRT 的模型转换（实验性支持）
+
+> ## [尝试使用新的 MMDeploy 来部署你的模型](https://mmdeploy.readthedocs.io/)
+
+<!-- TOC -->
+
+- [教程 9: ONNX 到 TensorRT 的模型转换（实验性支持）](#%E6%95%99%E7%A8%8B-9-onnx-%E5%88%B0-tensorrt-%E7%9A%84%E6%A8%A1%E5%9E%8B%E8%BD%AC%E6%8D%A2%E5%AE%9E%E9%AA%8C%E6%80%A7%E6%94%AF%E6%8C%81)
+  - [如何将模型从 ONNX 转换为 TensorRT](#%E5%A6%82%E4%BD%95%E5%B0%86%E6%A8%A1%E5%9E%8B%E4%BB%8E-onnx-%E8%BD%AC%E6%8D%A2%E4%B8%BA-tensorrt)
+    - [先决条件](#%E5%85%88%E5%86%B3%E6%9D%A1%E4%BB%B6)
+    - [用法](#%E7%94%A8%E6%B3%95)
+  - [如何评估导出的模型](#%E5%A6%82%E4%BD%95%E8%AF%84%E4%BC%B0%E5%AF%BC%E5%87%BA%E7%9A%84%E6%A8%A1%E5%9E%8B)
+  - [支持转换为 TensorRT 的模型列表](#%E6%94%AF%E6%8C%81%E8%BD%AC%E6%8D%A2%E4%B8%BA-tensorrt-%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%88%97%E8%A1%A8)
+  - [提醒](#%E6%8F%90%E9%86%92)
+  - [常见问题](#%E5%B8%B8%E8%A7%81%E9%97%AE%E9%A2%98)
+
+<!-- TOC -->
+
+## 如何将模型从 ONNX 转换为 TensorRT
+
+### 先决条件
+
+1. 请参考 [get_started.md](https://mmdetection.readthedocs.io/en/latest/get_started.html) 从源码安装 MMCV 和 MMDetection。
+2. 请参考 [ONNXRuntime in mmcv](https://mmcv.readthedocs.io/en/latest/deployment/onnxruntime_op.html) 和 [TensorRT plugin in mmcv](https://github.com/open-mmlab/mmcv/blob/master/docs/en/deployment/tensorrt_plugin.md/) 安装支持 ONNXRuntime 自定义操作和 TensorRT 插件的 `mmcv-full`。
+3. 使用工具 [pytorch2onnx](https://mmdetection.readthedocs.io/en/latest/tutorials/pytorch2onnx.html) 将模型从 PyTorch 转换为 ONNX。
+
+### 用法
+
+```bash
+python tools/deployment/onnx2tensorrt.py \
+    ${CONFIG} \
+    ${MODEL} \
+    --trt-file ${TRT_FILE} \
+    --input-img ${INPUT_IMAGE_PATH} \
+    --shape ${INPUT_IMAGE_SHAPE} \
+    --min-shape ${MIN_IMAGE_SHAPE} \
+    --max-shape ${MAX_IMAGE_SHAPE} \
+    --workspace-size ${WORKSPACE_SIZE} \
+    --show \
+    --verify \
+```
+
+所有参数的说明：
+
+- `config`: 模型配置文件的路径。
+- `model`: ONNX 模型文件的路径。
+- `--trt-file`: 输出 TensorRT 引擎文件的路径。如果未指定，它将被设置为 `tmp.trt`。
+- `--input-img`: 用于追踪和转换的输入图像的路径。默认情况下，它将设置为 `demo/demo.jpg`。
+- `--shape`: 模型输入的高度和宽度。如果未指定，它将设置为 `400 600`。
+- `--min-shape`: 模型输入的最小高度和宽度。如果未指定，它将被设置为与 `--shape` 相同。
+- `--max-shape`: 模型输入的最大高度和宽度。如果未指定，它将被设置为与 `--shape` 相同。
+- `--workspace-size`: 构建 TensorRT 引擎所需的 GPU 工作空间大小（以 GiB 为单位）。如果未指定，它将设置为 `1` GiB。
+- `--show`: 确定是否显示模型的输出。如果未指定，它将设置为 `False`。
+- `--verify`: 确定是否在 ONNXRuntime 和 TensorRT 之间验证模型的正确性。如果未指定，它将设置为 `False`。
+- `--verbose`: 确定是否打印日志消息。它对调试很有用。如果未指定，它将设置为 `False`。
+
+例子:
+
+```bash
+python tools/deployment/onnx2tensorrt.py \
+    configs/retinanet/retinanet_r50_fpn_1x_coco.py \
+    checkpoints/retinanet_r50_fpn_1x_coco.onnx \
+    --trt-file checkpoints/retinanet_r50_fpn_1x_coco.trt \
+    --input-img demo/demo.jpg \
+    --shape 400 600 \
+    --show \
+    --verify \
+```
+
+## 如何评估导出的模型
+
+我们准备了一个工具 `tools/deployment/test.py` 来评估 TensorRT 模型。
+
+请参阅以下链接以获取更多信息。
+
+- [如何评估导出的模型](pytorch2onnx.md#how-to-evaluate-the-exported-models)
+- [结果和模型](pytorch2onnx.md#results-and-models)
+
+## 支持转换为 TensorRT 的模型列表
+
+下表列出了确定可转换为 TensorRT 的模型。
+
+|       Model        |                              Config                              | Dynamic Shape | Batch Inference | Note |
+| :----------------: | :--------------------------------------------------------------: | :-----------: | :-------------: | :--: |
+|        SSD         |                   `configs/ssd/ssd300_coco.py`                   |       Y       |        Y        |      |
+|        FSAF        |              `configs/fsaf/fsaf_r50_fpn_1x_coco.py`              |       Y       |        Y        |      |
+|        FCOS        |         `configs/fcos/fcos_r50_caffe_fpn_4x4_1x_coco.py`         |       Y       |        Y        |      |
+|       YOLOv3       |        `configs/yolo/yolov3_d53_mstrain-608_273e_coco.py`        |       Y       |        Y        |      |
+|     RetinaNet      |         `configs/retinanet/retinanet_r50_fpn_1x_coco.py`         |       Y       |        Y        |      |
+|    Faster R-CNN    |       `configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py`       |       Y       |        Y        |      |
+|   Cascade R-CNN    |      `configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py`      |       Y       |        Y        |      |
+|     Mask R-CNN     |         `configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py`         |       Y       |        Y        |      |
+| Cascade Mask R-CNN |   `configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py`    |       Y       |        Y        |      |
+|     PointRend      | `configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py` |       Y       |        Y        |      |
+
+注意:
+
+- *以上所有模型通过 Pytorch==1.6.0, onnx==1.7.0 与 TensorRT-7.2.1.6.Ubuntu-16.04.x86_64-gnu.cuda-10.2.cudnn8.0 测试*
+
+## 提醒
+
+- 如果您在上面列出的模型中遇到任何问题，请创建 issue，我们会尽快处理。对于未包含在列表中的模型，由于资源有限，我们可能无法在此提供太多帮助。请尝试深入挖掘并自行调试。
+- 由于此功能是实验性的，并且可能会快速更改，因此请始终尝试使用最新的 `mmcv` 和 `mmdetection`。
+
+## 常见问题
+
+- 空
diff --git a/docs/zh_cn/tutorials/pytorch2onnx.md b/docs/zh_cn/tutorials/pytorch2onnx.md
new file mode 100755
index 0000000..93a647e
--- /dev/null
+++ b/docs/zh_cn/tutorials/pytorch2onnx.md
@@ -0,0 +1,3 @@
+# 教程 8: Pytorch 到 ONNX 的模型转换（实验性支持）
+
+> ## [尝试使用新的 MMDeploy 来部署你的模型](https://mmdeploy.readthedocs.io/)
diff --git a/docs/zh_cn/useful_tools.md b/docs/zh_cn/useful_tools.md
new file mode 100755
index 0000000..922164c
--- /dev/null
+++ b/docs/zh_cn/useful_tools.md
@@ -0,0 +1 @@
+## 日志分析
diff --git a/mmdet/.mim/configs b/mmdet/.mim/configs
new file mode 120000
index 0000000..5992d10
--- /dev/null
+++ b/mmdet/.mim/configs
@@ -0,0 +1 @@
+../../configs
\ No newline at end of file
diff --git a/mmdet/.mim/demo b/mmdet/.mim/demo
new file mode 120000
index 0000000..bf71256
--- /dev/null
+++ b/mmdet/.mim/demo
@@ -0,0 +1 @@
+../../demo
\ No newline at end of file
diff --git a/mmdet/.mim/model-index.yml b/mmdet/.mim/model-index.yml
new file mode 120000
index 0000000..a18c0b3
--- /dev/null
+++ b/mmdet/.mim/model-index.yml
@@ -0,0 +1 @@
+../../model-index.yml
\ No newline at end of file
diff --git a/mmdet/.mim/tools b/mmdet/.mim/tools
new file mode 120000
index 0000000..31941e9
--- /dev/null
+++ b/mmdet/.mim/tools
@@ -0,0 +1 @@
+../../tools
\ No newline at end of file
diff --git a/mmdet/__init__.py b/mmdet/__init__.py
new file mode 100755
index 0000000..4df16af
--- /dev/null
+++ b/mmdet/__init__.py
@@ -0,0 +1,29 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+
+from .version import __version__, short_version
+
+
+def digit_version(version_str):
+    """Convert a dotted version string into a list of ints.
+
+    Each ``.``-separated component is handled on its own:
+
+    - an all-digit component contributes its integer value;
+    - a component containing ``rc`` contributes two entries, the number
+      before ``rc`` minus one followed by the number after ``rc``, so
+      that e.g. ``'1.3.0rc1'`` compares lower than ``'1.3.0'``;
+    - any other component is skipped.
+
+    Args:
+        version_str (str): Version string such as ``'1.3.17'`` or
+            ``'2.0.0rc1'``.
+
+    Returns:
+        list[int]: Parsed components, suitable for list comparison.
+    """
+    parsed = []
+    for part in version_str.split('.'):
+        if part.isdigit():
+            parsed.append(int(part))
+            continue
+        if 'rc' not in part:
+            # Neither numeric nor a release candidate: ignored.
+            continue
+        pieces = part.split('rc')
+        parsed.extend((int(pieces[0]) - 1, int(pieces[1])))
+    return parsed
+
+
+# Inclusive bounds of the mmcv versions accepted when mmdet is imported.
+mmcv_minimum_version = '1.3.17'
+mmcv_maximum_version = '1.8.0'
+mmcv_version = digit_version(mmcv.__version__)
+
+
+# Abort the import when the installed mmcv lies outside the bounds above.
+assert (digit_version(mmcv_minimum_version) <= mmcv_version
+        <= digit_version(mmcv_maximum_version)), \
+    f'MMCV=={mmcv.__version__} is used but incompatible. ' \
+    f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.'
+
+__all__ = ['__version__', 'short_version']
diff --git a/mmdet/apis/__init__.py b/mmdet/apis/__init__.py
new file mode 100755
index 0000000..a865e94
--- /dev/null
+++ b/mmdet/apis/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .inference import (async_inference_detector, inference_detector,
+                        init_detector, show_result_pyplot)
+from .test import multi_gpu_test, single_gpu_test
+from .train import (get_root_logger, init_random_seed, set_random_seed,
+                    train_detector)
+
+__all__ = [
+    'get_root_logger', 'set_random_seed', 'train_detector', 'init_detector',
+    'async_inference_detector', 'inference_detector', 'show_result_pyplot',
+    'multi_gpu_test', 'single_gpu_test', 'init_random_seed'
+]
diff --git a/mmdet/apis/inference.py b/mmdet/apis/inference.py
new file mode 100755
index 0000000..f0858a7
--- /dev/null
+++ b/mmdet/apis/inference.py
@@ -0,0 +1,257 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from pathlib import Path
+
+import mmcv
+import numpy as np
+import torch
+from mmcv.ops import RoIPool
+from mmcv.parallel import collate, scatter
+from mmcv.runner import load_checkpoint
+
+from mmdet.core import get_classes
+from mmdet.datasets import replace_ImageToTensor
+from mmdet.datasets.pipelines import Compose
+from mmdet.models import build_detector
+
+
+def init_detector(config, checkpoint=None, device='cuda:0', cfg_options=None):
+    """Initialize a detector from config file.
+
+    Args:
+        config (str, :obj:`Path`, or :obj:`mmcv.Config`): Config file path,
+            :obj:`Path`, or the config object.
+        checkpoint (str, optional): Checkpoint path. If left as None, the model
+            will not load any weights and ``model.CLASSES`` is not set here.
+        device (str): Device passed to ``model.to``. Defaults to 'cuda:0'.
+            When it equals ``'npu'`` the model is additionally wrapped in
+            ``NPUDataParallel``.
+        cfg_options (dict): Options to override some settings in the used
+            config.
+
+    Returns:
+        nn.Module: The constructed detector, switched to eval mode, with the
+            config stored on it as ``cfg``.
+
+    Raises:
+        TypeError: If ``config`` is neither a path nor an ``mmcv.Config``.
+    """
+    if isinstance(config, (str, Path)):
+        config = mmcv.Config.fromfile(config)
+    elif not isinstance(config, mmcv.Config):
+        raise TypeError('config must be a filename or Config object, '
+                        f'but got {type(config)}')
+    if cfg_options is not None:
+        config.merge_from_dict(cfg_options)
+    # Clear pretrained-weight settings before the model is built; presumably
+    # so that only `checkpoint` (if given) supplies weights - TODO confirm.
+    if 'pretrained' in config.model:
+        config.model.pretrained = None
+    elif 'init_cfg' in config.model.backbone:
+        config.model.backbone.init_cfg = None
+    # The training config is dropped from the model config before building.
+    config.model.train_cfg = None
+    model = build_detector(config.model, test_cfg=config.get('test_cfg'))
+    if checkpoint is not None:
+        # `checkpoint` is rebound from the path to what load_checkpoint
+        # returns; its 'meta' entry is inspected for class names below.
+        checkpoint = load_checkpoint(model, checkpoint, map_location='cpu')
+        if 'CLASSES' in checkpoint.get('meta', {}):
+            model.CLASSES = checkpoint['meta']['CLASSES']
+        else:
+            # NOTE: simplefilter changes the process-wide warning filter.
+            warnings.simplefilter('once')
+            warnings.warn('Class names are not saved in the checkpoint\'s '
+                          'meta data, use COCO classes by default.')
+            model.CLASSES = get_classes('coco')
+    model.cfg = config  # save the config in the model for convenience
+    model.to(device)
+    model.eval()
+
+    if device == 'npu':
+        from mmcv.device.npu import NPUDataParallel
+        model = NPUDataParallel(model)
+        # The wrapper is a new object, so the config is attached to it too.
+        model.cfg = config
+
+    return model
+
+
+class LoadImage:
+    """Deprecated pipeline step that reads an image into ``results``.
+
+    Each call issues a deprecation warning that points to
+    ``LoadImageFromWebcam`` as the replacement.
+    """
+
+    def __call__(self, results):
+        """Load ``results['img']`` and record its name and shape.
+
+        Args:
+            results (dict): Must contain the key ``'img'``. A ``str`` value
+                is also recorded as the file name; any other value leaves
+                the file name entries as None.
+
+        Returns:
+            dict: The same ``results`` object, updated in place with
+                ``filename``, ``ori_filename``, ``img``, ``img_fields``,
+                ``img_shape`` and ``ori_shape``.
+        """
+        warnings.simplefilter('once')
+        warnings.warn('`LoadImage` is deprecated and will be removed in '
+                      'future releases. You may use `LoadImageFromWebcam` '
+                      'from `mmdet.datasets.pipelines.` instead.')
+        source = results['img']
+        # Only a string source carries a file name worth recording.
+        name = source if isinstance(source, str) else None
+        results['filename'] = name
+        results['ori_filename'] = name
+        image = mmcv.imread(source)
+        results['img'] = image
+        results['img_fields'] = ['img']
+        # Both shape entries are taken from the loaded image itself.
+        results['img_shape'] = image.shape
+        results['ori_shape'] = image.shape
+        return results
+
+
+def inference_detector(model, imgs):
+    """Inference image(s) with the detector.
+
+    Args:
+        model (nn.Module): The loaded detector. It must expose ``cfg`` (as
+            attached by ``init_detector``) with a ``data.test.pipeline``.
+        imgs (str/ndarray or list[str/ndarray] or tuple[str/ndarray]):
+           Either image files or loaded images. A list/tuple must be
+           non-empty because its first element is inspected.
+
+    Returns:
+        If imgs is a list or tuple, the same length list type results
+        will be returned, otherwise return the detection results directly.
+    """
+
+    # Normalise the input to a list and remember whether to unwrap later.
+    if isinstance(imgs, (list, tuple)):
+        is_batch = True
+    else:
+        imgs = [imgs]
+        is_batch = False
+
+    cfg = model.cfg
+    device = next(model.parameters()).device  # model device
+
+    # Only the first element decides whether inputs are treated as arrays.
+    if isinstance(imgs[0], np.ndarray):
+        cfg = cfg.copy()
+        # set loading pipeline type
+        cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam'
+
+    # NOTE(review): without the copy above this assignment writes to
+    # `model.cfg` itself - confirm that is intended.
+    cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
+    test_pipeline = Compose(cfg.data.test.pipeline)
+
+    datas = []
+    for img in imgs:
+        # prepare data
+        if isinstance(img, np.ndarray):
+            # directly add img
+            data = dict(img=img)
+        else:
+            # add information into dict
+            data = dict(img_info=dict(filename=img), img_prefix=None)
+        # build the data pipeline
+        data = test_pipeline(data)
+        datas.append(data)
+
+    # All inputs are collated into a single batch.
+    data = collate(datas, samples_per_gpu=len(imgs))
+    # just get the actual data from DataContainer
+    data['img_metas'] = [img_metas.data[0] for img_metas in data['img_metas']]
+    data['img'] = [img.data[0] for img in data['img']]
+    if next(model.parameters()).is_cuda:
+        # scatter to specified GPU
+        data = scatter(data, [device])[0]
+    else:
+        # Non-CUDA path: refuse models that contain an RoIPool module.
+        for m in model.modules():
+            assert not isinstance(
+                m, RoIPool
+            ), 'CPU inference with RoIPool is not supported currently.'
+
+    # forward the model
+    with torch.no_grad():
+        results = model(return_loss=False, rescale=True, **data)
+
+    # A single (non-list) input gets its single result back unwrapped.
+    if not is_batch:
+        return results[0]
+    else:
+        return results
+
+
+async def async_inference_detector(model, imgs):
+    """Async inference image(s) with the detector.
+
+    Unlike ``inference_detector``, the results are returned as produced by
+    ``model.aforward_test`` even when a single image is passed, and
+    gradient computation is switched off globally without being restored.
+
+    Args:
+        model (nn.Module): The loaded detector. It must expose ``cfg`` with
+            a ``data.test.pipeline`` and an ``aforward_test`` coroutine.
+        imgs (str | ndarray | list | tuple): Either image files or loaded
+            images; a single item is wrapped into a list.
+
+    Returns:
+        Awaitable detection results.
+    """
+    if not isinstance(imgs, (list, tuple)):
+        imgs = [imgs]
+
+    cfg = model.cfg
+    device = next(model.parameters()).device  # model device
+
+    # Only the first element decides whether inputs are treated as arrays.
+    if isinstance(imgs[0], np.ndarray):
+        cfg = cfg.copy()
+        # set loading pipeline type
+        cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam'
+
+    cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
+    test_pipeline = Compose(cfg.data.test.pipeline)
+
+    datas = []
+    for img in imgs:
+        # prepare data
+        if isinstance(img, np.ndarray):
+            # directly add img
+            data = dict(img=img)
+        else:
+            # add information into dict
+            data = dict(img_info=dict(filename=img), img_prefix=None)
+        # build the data pipeline
+        data = test_pipeline(data)
+        datas.append(data)
+
+    # All inputs are collated into a single batch.
+    data = collate(datas, samples_per_gpu=len(imgs))
+    # just get the actual data from DataContainer
+    data['img_metas'] = [img_metas.data[0] for img_metas in data['img_metas']]
+    data['img'] = [img.data[0] for img in data['img']]
+    if next(model.parameters()).is_cuda:
+        # scatter to specified GPU
+        data = scatter(data, [device])[0]
+    else:
+        # Non-CUDA path: refuse models that contain an RoIPool module.
+        for m in model.modules():
+            assert not isinstance(
+                m, RoIPool
+            ), 'CPU inference with RoIPool is not supported currently.'
+
+    # We don't restore `torch.is_grad_enabled()` value during concurrent
+    # inference since execution can overlap
+    torch.set_grad_enabled(False)
+    results = await model.aforward_test(rescale=True, **data)
+    return results
+
+
+def show_result_pyplot(model,
+                       img,
+                       result,
+                       score_thr=0.3,
+                       title='result',
+                       wait_time=0,
+                       palette=None,
+                       out_file=None):
+    """Draw detection results on an image via ``model.show_result``.
+
+    If ``model`` has a ``module`` attribute, the call is forwarded to
+    ``model.module`` instead of ``model``.
+
+    Args:
+        model (nn.Module): The loaded detector.
+        img (str or np.ndarray): Image filename or loaded image.
+        result (tuple[list] or list): The detection result, can be either
+            (bbox, segm) or just bbox.
+        score_thr (float): The threshold to visualize the bboxes and masks.
+        title (str): Title of the pyplot figure; forwarded as ``win_name``.
+        wait_time (float): Value of waitKey param. Default: 0.
+        palette (str or tuple(int) or :obj:`Color`): Color, used for both
+            boxes and masks. The tuple of color should be in BGR order.
+        out_file (str or None): The path to write the image.
+            Default: None.
+    """
+    detector = getattr(model, 'module', model)
+    # `show` is always True and the text colour is fixed; the remaining
+    # options come straight from the caller.
+    render_options = dict(
+        score_thr=score_thr,
+        show=True,
+        wait_time=wait_time,
+        win_name=title,
+        bbox_color=palette,
+        text_color=(200, 200, 200),
+        mask_color=palette,
+        out_file=out_file)
+    detector.show_result(img, result, **render_options)
diff --git a/mmdet/apis/test.py b/mmdet/apis/test.py
new file mode 100755
index 0000000..973d362
--- /dev/null
+++ b/mmdet/apis/test.py
@@ -0,0 +1,209 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import pickle
+import shutil
+import tempfile
+import time
+
+import mmcv
+import torch
+import torch.distributed as dist
+from mmcv.image import tensor2imgs
+from mmcv.runner import get_dist_info
+
+from mmdet.core import encode_mask_results
+
+
+def single_gpu_test(model, data_loader, show=False, out_dir=None,
+                    show_score_thr=0.3):
+    """Run inference over ``data_loader`` with one model and gather results.
+
+    Args:
+        model (nn.Module): Model to be tested. It is called directly for
+            inference, while ``model.module.show_result`` does the drawing.
+        data_loader (DataLoader): Loader whose batches are dicts that are
+            unpacked as keyword arguments of the model.
+        show (bool): Passed to ``show_result`` as ``show``. Default: False.
+        out_dir (str | None): If set, drawn images are written under it.
+        show_score_thr (float): Score threshold used when drawing.
+
+    Returns:
+        list: Results of all batches concatenated, with masks encoded.
+    """
+    model.eval()
+    dataset = data_loader.dataset
+    palette = getattr(dataset, 'PALETTE', None)
+    progress = mmcv.ProgressBar(len(dataset))
+    collected = []
+    for data in data_loader:
+        with torch.no_grad():
+            result = model(return_loss=False, rescale=True, **data)
+        batch_size = len(result)
+        if show or out_dir:
+            img_tensor = data['img'][0]
+            if not (batch_size == 1 and isinstance(img_tensor, torch.Tensor)):
+                img_tensor = img_tensor.data[0]
+            img_metas = data['img_metas'][0].data[0]
+            imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg'])
+            assert len(imgs) == len(img_metas)
+            for idx, (img, img_meta) in enumerate(zip(imgs, img_metas)):
+                # Crop to ``img_shape``, then resize to ``ori_shape``.
+                h, w, _ = img_meta['img_shape']
+                ori_h, ori_w = img_meta['ori_shape'][:-1]
+                img_show = mmcv.imresize(img[:h, :w, :], (ori_w, ori_h))
+                out_file = (osp.join(out_dir, img_meta['ori_filename'])
+                            if out_dir else None)
+                model.module.show_result(
+                    img_show,
+                    result[idx],
+                    bbox_color=palette,
+                    text_color=palette,
+                    mask_color=palette,
+                    show=show,
+                    out_file=out_file,
+                    score_thr=show_score_thr)
+
+        # encode mask results
+        if isinstance(result[0], tuple):
+            result = [(bboxes, encode_mask_results(masks))
+                      for bboxes, masks in result]
+        # This logic is only used in panoptic segmentation test.
+        elif isinstance(result[0], dict) and 'ins_results' in result[0]:
+            for res in result:
+                bboxes, masks = res['ins_results']
+                res['ins_results'] = (bboxes, encode_mask_results(masks))
+        collected.extend(result)
+        for _ in range(batch_size):
+            progress.update()
+    return collected
+
+
+def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
+    """Test model with multiple gpus.
+
+    Each rank iterates over ``data_loader`` and the per-rank results are
+    then collected under one of two modes: gpu and cpu modes. By setting
+    'gpu_collect=True' the results are encoded to gpu tensors and gathered
+    through gpu communication. On cpu mode every rank saves its results to
+    'tmpdir' and they are collected by the rank 0 worker.
+
+    Args:
+        model (nn.Module): Model to be tested.
+        data_loader (nn.Dataloader): Pytorch data loader.
+        tmpdir (str): Path of directory to save the temporary results from
+            different gpus under cpu mode. If None, the cpu collection
+            routine creates a temporary directory itself.
+        gpu_collect (bool): Option to use either gpu or cpu to collect
+            results.
+
+    Returns:
+        list | None: The prediction results on rank 0; ``None`` on the
+            other ranks.
+    """
+    model.eval()
+    part_results = []
+    dataset = data_loader.dataset
+    rank, world_size = get_dist_info()
+    # Only rank 0 owns a progress bar.
+    if rank == 0:
+        prog_bar = mmcv.ProgressBar(len(dataset))
+    time.sleep(2)  # This line can prevent deadlock problem in some cases.
+    for data in data_loader:
+        with torch.no_grad():
+            result = model(return_loss=False, rescale=True, **data)
+            # encode mask results
+            if isinstance(result[0], tuple):
+                result = [(bboxes, encode_mask_results(masks))
+                          for bboxes, masks in result]
+            # This logic is only used in panoptic segmentation test.
+            elif isinstance(result[0], dict) and 'ins_results' in result[0]:
+                for res in result:
+                    bboxes, masks = res['ins_results']
+                    res['ins_results'] = (bboxes, encode_mask_results(masks))
+        part_results.extend(result)
+
+        if rank == 0:
+            # advance on behalf of all ranks, assuming equal batch sizes
+            for _ in range(len(result) * world_size):
+                prog_bar.update()
+
+    # collect results from all ranks
+    if gpu_collect:
+        return collect_results_gpu(part_results, len(dataset))
+    return collect_results_cpu(part_results, len(dataset), tmpdir)
+
+
+def collect_results_cpu(result_part, size, tmpdir=None):
+    """Merge per-rank results through pickle files in a shared directory.
+
+    Args:
+        result_part (list): Results produced by the current rank.
+        size (int): Number of results to keep after merging.
+        tmpdir (str | None): Directory for the part files. If None, rank 0
+            creates one under ``.dist_test`` and broadcasts its path.
+
+    Returns:
+        list | None: Interleaved results on rank 0, None on other ranks.
+    """
+    rank, world_size = get_dist_info()
+    if tmpdir is None:
+        # Broadcast the directory name through a fixed-length uint8 tensor
+        # padded with whitespace (byte value 32).
+        MAX_LEN = 512
+        dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8,
+                                device='cuda')
+        if rank == 0:
+            mmcv.mkdir_or_exist('.dist_test')
+            created = tempfile.mkdtemp(dir='.dist_test')
+            encoded = torch.tensor(
+                bytearray(created.encode()), dtype=torch.uint8, device='cuda')
+            dir_tensor[:len(encoded)] = encoded
+        dist.broadcast(dir_tensor, 0)
+        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
+    else:
+        mmcv.mkdir_or_exist(tmpdir)
+    # dump the part result to the dir
+    mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
+    dist.barrier()
+    if rank != 0:
+        return None
+    # load results of all parts from tmp dir
+    parts = [mmcv.load(osp.join(tmpdir, f'part_{i}.pkl'))
+             for i in range(world_size)]
+    # interleave the parts; the dataloader may pad some samples
+    merged = [item for group in zip(*parts) for item in group][:size]
+    shutil.rmtree(tmpdir)
+    return merged
+
+
+def collect_results_gpu(result_part, size):
+    """Merge per-rank results by all-gathering their pickled bytes on gpu.
+
+    Args:
+        result_part (list): Results produced by the current rank.
+        size (int): Number of results to keep after merging.
+
+    Returns:
+        list | None: Interleaved results on rank 0, None on other ranks.
+    """
+    rank, world_size = get_dist_info()
+    # dump result part to tensor with pickle
+    part_tensor = torch.tensor(
+        bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda')
+    # exchange byte lengths so that every rank can pad to the longest part
+    shape_tensor = torch.tensor(part_tensor.shape, device='cuda')
+    shape_list = [shape_tensor.clone() for _ in range(world_size)]
+    dist.all_gather(shape_list, shape_tensor)
+    shape_max = torch.tensor(shape_list).max()
+    part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda')
+    part_send[:shape_tensor[0]] = part_tensor
+    recv_list = [part_tensor.new_zeros(shape_max) for _ in range(world_size)]
+    dist.all_gather(recv_list, part_send)
+    if rank != 0:
+        return None
+    part_list = [
+        pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())
+        for recv, shape in zip(recv_list, shape_list)
+    ]
+    # interleave the parts; the dataloader may pad some samples
+    return [item for group in zip(*part_list) for item in group][:size]
diff --git a/mmdet/apis/train.py b/mmdet/apis/train.py
new file mode 100755
index 0000000..4795385
--- /dev/null
+++ b/mmdet/apis/train.py
@@ -0,0 +1,246 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import random
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from mmcv.runner import (DistSamplerSeedHook, EpochBasedRunner,
+                         Fp16OptimizerHook, OptimizerHook, build_runner,
+                         get_dist_info)
+
+from mmdet.core import DistEvalHook, EvalHook, build_optimizer
+from mmdet.datasets import (build_dataloader, build_dataset,
+                            replace_ImageToTensor)
+from mmdet.utils import (build_ddp, build_dp, compat_cfg,
+                         find_latest_checkpoint, get_root_logger)
+
+
+def init_random_seed(seed=None, device='cuda'):
+    """Initialize random seed.
+
+    A given seed is returned untouched. Otherwise a seed is drawn at random
+    and, when several processes are running, the value drawn on rank 0 is
+    broadcast so that all processes end up with the same seed.
+
+    Args:
+        seed (int, Optional): The seed. Default to None.
+        device (str): The device where the seed will be put on.
+            Default to 'cuda'.
+
+    Returns:
+        int: Seed to be used.
+    """
+    if seed is not None:
+        return seed
+
+    # Make sure all ranks share the same random seed to prevent
+    # some potential bugs. Please refer to
+    # https://github.com/open-mmlab/mmdetection/issues/6339
+    rank, world_size = get_dist_info()
+    seed = np.random.randint(2**31)
+    if world_size == 1:
+        return seed
+
+    # Non-zero ranks contribute a placeholder that the broadcast overwrites.
+    local_value = seed if rank == 0 else 0
+    random_num = torch.tensor(local_value, dtype=torch.int32, device=device)
+    dist.broadcast(random_num, src=0)
+    return random_num.item()
+
+
+def set_random_seed(seed, deterministic=False):
+    """Seed the ``random``, ``numpy`` and ``torch`` (CPU and CUDA) RNGs.
+
+    Args:
+        seed (int): Seed to be used.
+        deterministic (bool): Whether to set the deterministic option for
+            CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
+            to True and `torch.backends.cudnn.benchmark` to False.
+            Default: False.
+    """
+    for seed_fn in (random.seed, np.random.seed, torch.manual_seed,
+                    torch.cuda.manual_seed_all):
+        seed_fn(seed)
+    if not deterministic:
+        return
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+
+def auto_scale_lr(cfg, distributed, logger):
+    """Automatically scaling LR according to GPU number and sample per GPU.
+
+    ``cfg.optimizer.lr`` is overwritten in place when scaling applies.
+
+    Args:
+        cfg (config): Training config.
+        distributed (bool): Using distributed or not.
+        logger (logging.Logger): Logger.
+    """
+    enabled = 'auto_scale_lr' in cfg and \
+        cfg.auto_scale_lr.get('enable', False)
+    if not enabled:
+        logger.info('Automatic scaling of learning rate (LR)'
+                    ' has been disabled.')
+        return
+
+    # Without a reference batch size there is nothing to scale against.
+    base_batch_size = cfg.auto_scale_lr.get('base_batch_size', None)
+    if base_batch_size is None:
+        return
+
+    # The world size stands in for the GPU count when distributed.
+    if distributed:
+        _, num_gpus = get_dist_info()
+    else:
+        num_gpus = len(cfg.gpu_ids)
+
+    samples_per_gpu = cfg.data.train_dataloader.samples_per_gpu
+    batch_size = num_gpus * samples_per_gpu
+    logger.info(f'Training with {num_gpus} GPU(s) with {samples_per_gpu} '
+                f'samples per GPU. The total batch size is {batch_size}.')
+
+    if batch_size == base_batch_size:
+        logger.info('The batch size match the '
+                    f'base batch size: {base_batch_size}, '
+                    f'will not scaling the LR ({cfg.optimizer.lr}).')
+        return
+
+    # scale LR with [linear scaling rule](https://arxiv.org/abs/1706.02677)
+    scaled_lr = (batch_size / base_batch_size) * cfg.optimizer.lr
+    logger.info('LR has been automatically scaled '
+                f'from {cfg.optimizer.lr} to {scaled_lr}')
+    cfg.optimizer.lr = scaled_lr
+
+
+def train_detector(model, dataset, cfg, distributed=False, validate=False,
+                   timestamp=None, meta=None):
+    """Train a detector according to ``cfg``.
+
+    Args:
+        model (nn.Module): The detector to train.
+        dataset (Dataset | list | tuple): One data loader is built per item.
+        cfg (Config): Training config. ``optimizer.lr`` may be auto-scaled
+            and ``resume_from`` filled in when ``auto_resume`` is set.
+        distributed (bool): Wrap the model with DDP instead of DP.
+        validate (bool): Whether to register an evaluation hook.
+        timestamp (str | None): Stored on the runner as ``timestamp``.
+        meta (dict | None): Passed to the runner as ``meta``.
+    """
+    cfg = compat_cfg(cfg)
+    logger = get_root_logger(log_level=cfg.log_level)
+
+    # prepare data loaders
+    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
+
+    runner_type = 'EpochBasedRunner' if 'runner' not in cfg else cfg.runner[
+        'type']
+
+    train_dataloader_default_args = dict(
+        samples_per_gpu=2,
+        workers_per_gpu=2,
+        # `num_gpus` will be ignored if distributed
+        num_gpus=len(cfg.gpu_ids),
+        dist=distributed,
+        seed=cfg.seed,
+        runner_type=runner_type,
+        persistent_workers=False)
+
+    train_loader_cfg = {
+        **train_dataloader_default_args,
+        **cfg.data.get('train_dataloader', {})
+    }
+
+    data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset]
+
+    # put model on gpus
+    if distributed:
+        find_unused_parameters = cfg.get('find_unused_parameters', False)
+        # Sets the `find_unused_parameters` parameter in
+        # torch.nn.parallel.DistributedDataParallel
+        model = build_ddp(
+            model,
+            cfg.device,
+            device_ids=[int(os.environ['LOCAL_RANK'])],
+            broadcast_buffers=False,
+            find_unused_parameters=find_unused_parameters)
+    else:
+        model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids)
+
+    # build optimizer
+    auto_scale_lr(cfg, distributed, logger)
+    optimizer = build_optimizer(model, cfg.optimizer)
+
+    runner = build_runner(
+        cfg.runner,
+        default_args=dict(
+            model=model, optimizer=optimizer, work_dir=cfg.work_dir,
+            logger=logger, meta=meta))
+
+    # an ugly workaround to make .log and .log.json filenames the same
+    runner.timestamp = timestamp
+
+    # fp16 setting
+    fp16_cfg = cfg.get('fp16', None)
+    if fp16_cfg is None and cfg.get('device', None) == 'npu':
+        fp16_cfg = dict(loss_scale='dynamic')
+    if fp16_cfg is not None:
+        optimizer_config = Fp16OptimizerHook(
+            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
+    elif distributed and 'type' not in cfg.optimizer_config:
+        optimizer_config = OptimizerHook(**cfg.optimizer_config)
+    else:
+        optimizer_config = cfg.optimizer_config
+
+    # register hooks
+    runner.register_training_hooks(
+        cfg.lr_config,
+        optimizer_config,
+        cfg.checkpoint_config,
+        cfg.log_config,
+        cfg.get('momentum_config', None),
+        custom_hooks_config=cfg.get('custom_hooks', None))
+
+    if distributed and isinstance(runner, EpochBasedRunner):
+        runner.register_hook(DistSamplerSeedHook())
+
+    # register eval hooks
+    if validate:
+        val_dataloader_default_args = dict(
+            samples_per_gpu=1, workers_per_gpu=2, dist=distributed,
+            shuffle=False, persistent_workers=False)
+        val_dataloader_args = {
+            **val_dataloader_default_args,
+            **cfg.data.get('val_dataloader', {})
+        }
+        # Support batch_size > 1 in validation
+        if val_dataloader_args['samples_per_gpu'] > 1:
+            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
+            cfg.data.val.pipeline = replace_ImageToTensor(
+                cfg.data.val.pipeline)
+        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
+        val_dataloader = build_dataloader(val_dataset, **val_dataloader_args)
+
+        eval_cfg = cfg.get('evaluation', {})
+        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
+        # Default to ``start=0`` but let ``cfg.evaluation`` override it;
+        # passing both used to raise a duplicate-keyword TypeError.
+        eval_cfg.setdefault('start', 0)
+        eval_hook = DistEvalHook if distributed else EvalHook
+        # In this PR (https://github.com/open-mmlab/mmcv/pull/1193), the
+        # priority of IterTimerHook has been modified from 'NORMAL' to 'LOW'.
+        runner.register_hook(
+            eval_hook(val_dataloader, **eval_cfg), priority='LOW')
+
+    resume_from = None
+    if cfg.resume_from is None and cfg.get('auto_resume'):
+        resume_from = find_latest_checkpoint(cfg.work_dir)
+    if resume_from is not None:
+        cfg.resume_from = resume_from
+
+    if cfg.resume_from:
+        runner.resume(cfg.resume_from)
+    elif cfg.load_from:
+        runner.load_checkpoint(cfg.load_from)
+    runner.run(data_loaders, cfg.workflow)
diff --git a/mmdet/core/__init__.py b/mmdet/core/__init__.py
new file mode 100755
index 0000000..2a62038
--- /dev/null
+++ b/mmdet/core/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .anchor import *  # noqa: F401, F403
+from .bbox import *  # noqa: F401, F403
+from .data_structures import *  # noqa: F401, F403
+from .evaluation import *  # noqa: F401, F403
+from .hook import *  # noqa: F401, F403
+from .mask import *  # noqa: F401, F403
+from .optimizers import *  # noqa: F401, F403
+from .post_processing import *  # noqa: F401, F403
+from .utils import *  # noqa: F401, F403
diff --git a/mmdet/core/anchor/__init__.py b/mmdet/core/anchor/__init__.py
new file mode 100755
index 0000000..fcc7e4a
--- /dev/null
+++ b/mmdet/core/anchor/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .anchor_generator import (AnchorGenerator, LegacyAnchorGenerator,
+                               YOLOAnchorGenerator)
+from .builder import (ANCHOR_GENERATORS, PRIOR_GENERATORS,
+                      build_anchor_generator, build_prior_generator)
+from .point_generator import MlvlPointGenerator, PointGenerator
+from .utils import anchor_inside_flags, calc_region, images_to_levels
+
+__all__ = [
+    'AnchorGenerator', 'LegacyAnchorGenerator', 'anchor_inside_flags',
+    'PointGenerator', 'images_to_levels', 'calc_region',
+    'build_anchor_generator', 'ANCHOR_GENERATORS', 'YOLOAnchorGenerator',
+    'build_prior_generator', 'PRIOR_GENERATORS', 'MlvlPointGenerator'
+]
diff --git a/mmdet/core/anchor/anchor_generator.py b/mmdet/core/anchor/anchor_generator.py
new file mode 100755
index 0000000..20886fb
--- /dev/null
+++ b/mmdet/core/anchor/anchor_generator.py
@@ -0,0 +1,866 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import mmcv
+import numpy as np
+import torch
+from torch.nn.modules.utils import _pair
+
+from .builder import PRIOR_GENERATORS
+
+
+@PRIOR_GENERATORS.register_module()
+class AnchorGenerator:
+    """Standard anchor generator for 2D anchor-based detectors.
+
+    Args:
+        strides (list[int] | list[tuple[int, int]]): Strides of anchors
+            in multiple feature levels in order (w, h).
+        ratios (list[float]): The list of ratios between the height and width
+            of anchors in a single level.
+        scales (list[int] | None): Anchor scales for anchors in a single level.
+            It cannot be set at the same time if `octave_base_scale` and
+            `scales_per_octave` are set.
+        base_sizes (list[int] | None): The basic sizes
+            of anchors in multiple levels.
+            If None is given, strides will be used as base_sizes.
+            (If strides are non square, the shortest stride is taken.)
+        scale_major (bool): Whether to multiply scales first when generating
+            base anchors. If true, the anchors in the same row will have the
+            same scales. By default it is True in V2.0
+        octave_base_scale (int): The base scale of octave.
+        scales_per_octave (int): Number of scales for each octave.
+            `octave_base_scale` and `scales_per_octave` are usually used in
+            retinanet and the `scales` should be None when they are set.
+        centers (list[tuple[float, float]] | None): The centers of the anchor
+            relative to the feature grid center in multiple feature levels.
+            By default it is set to be None and not used. If a list of tuple of
+            float is given, they will be used to shift the centers of anchors.
+        center_offset (float): The offset of center in proportion to anchors'
+            width and height. By default it is 0 in V2.0.
+
+    Examples:
+        >>> from mmdet.core import AnchorGenerator
+        >>> self = AnchorGenerator([16], [1.], [1.], [9])
+        >>> all_anchors = self.grid_priors([(2, 2)], device='cpu')
+        >>> print(all_anchors)
+        [tensor([[-4.5000, -4.5000,  4.5000,  4.5000],
+                [11.5000, -4.5000, 20.5000,  4.5000],
+                [-4.5000, 11.5000,  4.5000, 20.5000],
+                [11.5000, 11.5000, 20.5000, 20.5000]])]
+        >>> self = AnchorGenerator([16, 32], [1.], [1.], [9, 18])
+        >>> all_anchors = self.grid_priors([(2, 2), (1, 1)], device='cpu')
+        >>> print(all_anchors)
+        [tensor([[-4.5000, -4.5000,  4.5000,  4.5000],
+                [11.5000, -4.5000, 20.5000,  4.5000],
+                [-4.5000, 11.5000,  4.5000, 20.5000],
+                [11.5000, 11.5000, 20.5000, 20.5000]]), \
+        tensor([[-9., -9., 9., 9.]])]
+    """
+
+    def __init__(self,
+                 strides,
+                 ratios,
+                 scales=None,
+                 base_sizes=None,
+                 scale_major=True,
+                 octave_base_scale=None,
+                 scales_per_octave=None,
+                 centers=None,
+                 center_offset=0.):
+        # ``centers`` and a non-zero ``center_offset`` exclude each other
+        if center_offset != 0:
+            assert centers is None, 'center cannot be set when center_offset' \
+                                    f'!=0, {centers} is given.'
+        if not (0 <= center_offset <= 1):
+            raise ValueError('center_offset should be in range [0, 1], '
+                             f'{center_offset} is given.')
+        if centers is not None:
+            assert len(centers) == len(strides), \
+                'The number of strides should be the same as centers, got ' \
+                f'{strides} and {centers}'
+
+        # base sizes default to the shorter side of each (paired) stride
+        self.strides = [_pair(stride) for stride in strides]
+        if base_sizes is None:
+            base_sizes = [min(stride) for stride in self.strides]
+        self.base_sizes = base_sizes
+        assert len(self.base_sizes) == len(self.strides), \
+            'The number of strides should be the same as base sizes, got ' \
+            f'{self.strides} and {self.base_sizes}'
+
+        # scales come either from ``scales`` or from the octave settings
+        use_octave = (octave_base_scale is not None
+                      and scales_per_octave is not None)
+        assert use_octave ^ (scales is not None), \
+            'scales and octave_base_scale with scales_per_octave cannot' \
+            ' be set at the same time'
+        if scales is not None:
+            self.scales = torch.Tensor(scales)
+        elif use_octave:
+            octave_scales = np.array(
+                [2**(i / scales_per_octave) for i in range(scales_per_octave)])
+            self.scales = torch.Tensor(octave_scales * octave_base_scale)
+        else:
+            raise ValueError('Either scales or octave_base_scale with '
+                             'scales_per_octave should be set')
+        self.octave_base_scale = octave_base_scale
+        self.scales_per_octave = scales_per_octave
+        self.ratios = torch.Tensor(ratios)
+        self.scale_major = scale_major
+        self.centers = centers
+        self.center_offset = center_offset
+        self.base_anchors = self.gen_base_anchors()
+
+    @property
+    def num_base_anchors(self):
+        """list[int]: base anchors per level, same as ``num_base_priors``"""
+        return self.num_base_priors
+
+    @property
+    def num_base_priors(self):
+        """list[int]: The number of priors (anchors) at a point
+        on the feature grid, one entry per feature level"""
+        return [anchors.size(0) for anchors in self.base_anchors]
+
+    @property
+    def num_levels(self):
+        """int: number of feature levels the generator is applied to"""
+        return len(self.strides)
+
+    def gen_base_anchors(self):
+        """Generate base anchors.
+
+        Returns:
+            list(torch.Tensor): Base anchors of a feature grid in multiple \
+                feature levels, one tensor per entry of \
+                ``self.base_sizes``.
+        """
+        # without explicit centers every level is generated with ``None``
+        if self.centers is None:
+            centers = [None] * len(self.base_sizes)
+        else:
+            centers = self.centers
+        return [
+            self.gen_single_level_base_anchors(
+                base_size, scales=self.scales, ratios=self.ratios,
+                center=centers[level])
+            for level, base_size in enumerate(self.base_sizes)
+        ]
+
+    def gen_single_level_base_anchors(self,
+                                      base_size,
+                                      scales,
+                                      ratios,
+                                      center=None):
+        """Generate base anchors of a single level.
+
+        Args:
+            base_size (int | float): Basic size of an anchor.
+            scales (torch.Tensor): Scales of the anchor.
+            ratios (torch.Tensor): The ratio between the height
+                and width of anchors in a single level.
+            center (tuple[float], optional): The center of the base anchor
+                related to a single feature grid. Defaults to None, in
+                which case ``center_offset * base_size`` is used.
+
+        Returns:
+            torch.Tensor: Anchors in a single-level feature maps, one
+                (x1, y1, x2, y2) row per ratio/scale combination.
+        """
+        if center is None:
+            # the base box is square, hence one offset serves both axes
+            x_center = y_center = self.center_offset * base_size
+        else:
+            x_center, y_center = center
+
+        # ratio = h / w: scale h by sqrt(ratio) and w by its inverse
+        h_ratios = torch.sqrt(ratios)
+        w_ratios = 1 / h_ratios
+        if self.scale_major:
+            ws = base_size * w_ratios[:, None] * scales[None, :]
+            hs = base_size * h_ratios[:, None] * scales[None, :]
+        else:
+            ws = base_size * scales[:, None] * w_ratios[None, :]
+            hs = base_size * scales[:, None] * h_ratios[None, :]
+        half_w = 0.5 * ws.view(-1)
+        half_h = 0.5 * hs.view(-1)
+
+        # use float anchor and the anchor's center is aligned with the
+        # pixel center
+        return torch.stack([
+            x_center - half_w, y_center - half_h, x_center + half_w,
+            y_center + half_h
+        ], dim=-1)
+
+    def _meshgrid(self, x, y, row_major=True):
+        """Generate mesh grid of x and y.
+
+        Args:
+            x (torch.Tensor): Grids of x dimension.
+            y (torch.Tensor): Grids of y dimension.
+            row_major (bool, optional): If True, return ``(xx, yy)``;
+                otherwise return ``(yy, xx)``. Defaults to True.
+
+        Returns:
+            tuple[torch.Tensor]: The flattened mesh grids of x and y, each
+                with ``len(x) * len(y)`` elements; x varies fastest.
+        """
+        # use shape instead of len to keep tracing while exporting to onnx
+        num_x, num_y = x.shape[0], y.shape[0]
+        xx = x.repeat(num_y)
+        # repeat every y value num_x times so that x varies fastest
+        yy = y.view(-1, 1).repeat(1, num_x).view(-1)
+        return (xx, yy) if row_major else (yy, xx)
+
+    def grid_priors(self, featmap_sizes, dtype=torch.float32, device='cuda'):
+        """Generate grid anchors in multiple feature levels.
+
+        Args:
+            featmap_sizes (list[tuple]): Feature map sizes, one (h, w)
+                tuple per feature level.
+            dtype (:obj:`torch.dtype`): Dtype of priors.
+                Default: torch.float32.
+            device (str): The device where the anchors will be put on.
+
+        Returns:
+            list[torch.Tensor]: Anchors in multiple feature levels. \
+                The sizes of each tensor should be [N, 4], where \
+                N = width * height * num_base_anchors, width and height \
+                are the sizes of the corresponding feature level, \
+                num_base_anchors is the number of anchors for that level.
+        """
+        assert self.num_levels == len(featmap_sizes)
+        levels = range(self.num_levels)
+        return [
+            self.single_level_grid_priors(
+                featmap_sizes[lvl], level_idx=lvl, dtype=dtype, device=device)
+            for lvl in levels
+        ]
+
+    def single_level_grid_priors(self,
+                                 featmap_size,
+                                 level_idx,
+                                 dtype=torch.float32,
+                                 device='cuda'):
+        """Generate grid anchors of a single level.
+
+        Note:
+            This function is usually called by method ``self.grid_priors``.
+
+        Args:
+            featmap_size (tuple[int]): Size of the feature maps, arranged
+                as (h, w).
+            level_idx (int): The index of corresponding feature map level.
+            dtype (obj:`torch.dtype`): Data type of points. Defaults to
+                ``torch.float32``.
+            device (str, optional): The device the tensor will be put on.
+                Defaults to 'cuda'.
+
+        Returns:
+            torch.Tensor: Anchors in the overall feature maps.
+        """
+
+        feat_h, feat_w = featmap_size
+        stride_w, stride_h = self.strides[level_idx]
+        base_anchors = self.base_anchors[level_idx].to(device).to(dtype)
+
+        # First create Range with the default dtype, than convert to
+        # target `dtype` for onnx exporting.
+        xs = torch.arange(0, feat_w, device=device).to(dtype) * stride_w
+        ys = torch.arange(0, feat_h, device=device).to(dtype) * stride_h
+        grid_x, grid_y = self._meshgrid(xs, ys)
+
+        # one (x, y, x, y) shift per grid cell; the first feat_w elements
+        # correspond to the first row of shifts
+        shifts = torch.stack([grid_x, grid_y, grid_x, grid_y], dim=-1)
+
+        # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get (K, A, 4) and
+        # flatten to (K*A, 4): the first A rows are the anchors of (0, 0) in
+        # the feature map, then (0, 1), (0, 2), ...
+        return (base_anchors[None, :, :] + shifts[:, None, :]).view(-1, 4)
+
+    def sparse_priors(self,
+                      prior_idxs,
+                      featmap_size,
+                      level_idx,
+                      dtype=torch.float32,
+                      device='cuda'):
+        """Generate sparse anchors according to the ``prior_idxs``.
+
+        Args:
+            prior_idxs (Tensor): The index of corresponding anchors
+                in the feature map.
+            featmap_size (tuple[int]): feature map size arrange as (h, w).
+            level_idx (int): The level index of corresponding feature map.
+            dtype (obj:`torch.dtype`): Data type of points. Defaults to
+                ``torch.float32``.
+            device (obj:`torch.device`): The device where the points is
+                located.
+
+        Returns:
+            Tensor: Anchor with shape (N, 4), N should be equal to
+                the length of ``prior_idxs``.
+        """
+
+        feat_h, feat_w = featmap_size
+        strides = self.strides[level_idx]
+        per_cell = self.num_base_anchors[level_idx]
+        # split the flat index into base-anchor id and grid column / row
+        anchor_ids = prior_idxs % per_cell
+        x = (prior_idxs // per_cell) % feat_w * strides[0]
+        y = (prior_idxs // feat_w // per_cell) % feat_h * strides[1]
+
+        shifts = torch.stack([x, y, x, y], 1).to(dtype).to(device)
+        selected = self.base_anchors[level_idx][anchor_ids, :].to(device)
+        return shifts + selected
+
+    def grid_anchors(self, featmap_sizes, device='cuda'):
+        """Generate dense anchors for every feature level (legacy API).
+
+        Warns that ``grid_priors`` should be used instead, then delegates
+        each level to ``single_level_grid_anchors``.
+
+        Args:
+            featmap_sizes (list[tuple]): Feature map size of each level;
+                its length must equal ``self.num_levels``.
+            device (str): Device where the anchors will be put on.
+
+        Return:
+            list[torch.Tensor]: Anchors in multiple feature levels, one
+                tensor per level as returned by
+                ``single_level_grid_anchors``.
+        """
+        warnings.warn('``grid_anchors`` would be deprecated soon. '
+                      'Please use ``grid_priors`` ')
+
+        assert self.num_levels == len(featmap_sizes)
+        # Pair each level's base anchors and stride with its map size.
+        return [
+            self.single_level_grid_anchors(
+                self.base_anchors[lvl].to(device),
+                featmap_sizes[lvl],
+                self.strides[lvl],
+                device=device) for lvl in range(self.num_levels)
+        ]
+
+    def single_level_grid_anchors(self,
+                                  base_anchors,
+                                  featmap_size,
+                                  stride=(16, 16),
+                                  device='cuda'):
+        """Tile the base anchors over one feature map (legacy API).
+
+        Note:
+            Usually reached through ``self.grid_anchors``. Every call warns
+            that ``single_level_grid_priors`` should be used instead.
+
+        Args:
+            base_anchors (torch.Tensor): Base anchors of one grid cell,
+                indexed as (A, 4).
+            featmap_size (tuple[int]): Feature map size, unpacked as (h, w).
+            stride (tuple[int], optional): Stride of the feature map in order
+                (w, h). Defaults to (16, 16).
+            device (str, optional): Device the grid offsets are created on.
+                Defaults to 'cuda'.
+
+        Returns:
+            torch.Tensor: Anchors of the whole feature map, flattened to
+                shape (K * A, 4) for K grid cells.
+        """
+
+        warnings.warn(
+            '``single_level_grid_anchors`` would be deprecated soon. '
+            'Please use ``single_level_grid_priors`` ')
+
+        feat_h, feat_w = featmap_size
+        xs = torch.arange(0, feat_w, device=device) * stride[0]
+        ys = torch.arange(0, feat_h, device=device) * stride[1]
+        grid_x, grid_y = self._meshgrid(xs, ys)
+
+        # One (x, y, x, y) offset per grid cell, cast to the type of the
+        # base anchors before the two are added.
+        shifts = torch.stack([grid_x, grid_y, grid_x, grid_y],
+                             dim=-1).type_as(base_anchors)
+
+        # (1, A, 4) base anchors + (K, 1, 4) offsets -> (K, A, 4). After
+        # flattening, the first A rows are the anchors of cell (0, 0), then
+        # (0, 1), (0, 2), ...
+        all_anchors = base_anchors[None, :, :] + shifts[:, None, :]
+        return all_anchors.view(-1, 4)
+
+    def valid_flags(self, featmap_sizes, pad_shape, device='cuda'):
+        """Generate per-level valid flags of anchors w.r.t. ``pad_shape``.
+
+        Args:
+            featmap_sizes (list(tuple)): Feature map size of each level.
+            pad_shape (tuple): The padded shape of the image; only its
+                first two entries (h, w) are used.
+            device (str): Device where the flags will be put on.
+
+        Return:
+            list(torch.Tensor): Valid flags of anchors in multiple levels.
+        """
+        assert self.num_levels == len(featmap_sizes)
+        multi_level_flags = []
+        for lvl in range(self.num_levels):
+            stride = self.strides[lvl]
+            feat_h, feat_w = featmap_sizes[lvl]
+            pad_h, pad_w = pad_shape[:2]
+            # Cells needed to cover the padded image, capped at the map size.
+            valid_size = (min(int(np.ceil(pad_h / stride[1])), feat_h),
+                          min(int(np.ceil(pad_w / stride[0])), feat_w))
+            flags = self.single_level_valid_flags(
+                (feat_h, feat_w), valid_size, self.num_base_anchors[lvl],
+                device=device)
+            multi_level_flags.append(flags)
+        return multi_level_flags
+
+    def single_level_valid_flags(self,
+                                 featmap_size,
+                                 valid_size,
+                                 num_base_anchors,
+                                 device='cuda'):
+        """Build the per-anchor valid mask of a single feature map.
+
+        Args:
+            featmap_size (tuple[int]): The size of feature maps, arrange
+                as (h, w).
+            valid_size (tuple[int]): The valid size of the feature maps,
+                unpacked as (h, w).
+            num_base_anchors (int): The number of base anchors.
+            device (str, optional): Device where the flags will be put on.
+                Defaults to 'cuda'.
+
+        Returns:
+            torch.Tensor: Flattened bool flags, one per anchor.
+        """
+        feat_h, feat_w = featmap_size
+        valid_h, valid_w = valid_size
+        assert valid_h <= feat_h and valid_w <= feat_w
+        col_ok = torch.zeros(feat_w, dtype=torch.bool, device=device)
+        row_ok = torch.zeros(feat_h, dtype=torch.bool, device=device)
+        col_ok[:valid_w] = True
+        row_ok[:valid_h] = True
+        grid_cols, grid_rows = self._meshgrid(col_ok, row_ok)
+        cell_ok = grid_cols & grid_rows
+        # Repeat the flag of each cell once per base anchor.
+        flags = cell_ok[:, None].expand(cell_ok.size(0), num_base_anchors)
+        return flags.contiguous().view(-1)
+
+    def __repr__(self):
+        """Return a multi-line description of the generator's settings.
+
+        Returns:
+            str: The class name followed by one indented ``name=value``
+                entry per configuration attribute, separated by commas.
+        """
+        # Joining guarantees a uniform ',\n' separator; the hand-written
+        # version forgot the comma after ``num_levels``.
+        field_names = ('strides', 'ratios', 'scales', 'base_sizes',
+                       'scale_major', 'octave_base_scale',
+                       'scales_per_octave', 'num_levels', 'centers',
+                       'center_offset')
+        indent_str = '    '
+        body = ',\n'.join(f'{indent_str}{name}={getattr(self, name)}'
+                          for name in field_names)
+        return f'{self.__class__.__name__}(\n{body})'
+
+
+@PRIOR_GENERATORS.register_module()
+class SSDAnchorGenerator(AnchorGenerator):
+    """Anchor generator for SSD.
+
+    Args:
+        strides (list[int]  | list[tuple[int, int]]): Strides of anchors
+            in multiple feature levels.
+        ratios (list[float]): The list of ratios between the height and width
+            of anchors in a single level.
+        min_sizes (list[float]): Minimum anchor size on each level.
+        max_sizes (list[float]): Maximum anchor size on each level. Must be
+            given together with ``min_sizes`` or not at all.
+        basesize_ratio_range (tuple(float)): Ratio range of anchors. Being
+            used when not setting min_sizes and max_sizes.
+        input_size (int): Size of the input image, 300 for SSD300, 512 for
+            SSD512. Being used when not setting min_sizes and max_sizes.
+        scale_major (bool): Whether to multiply scales first when generating
+            base anchors. If true, the anchors in the same row will have the
+            same scales. It is always set to be False in SSD.
+    """
+
+    def __init__(self,
+                 strides,
+                 ratios,
+                 min_sizes=None,
+                 max_sizes=None,
+                 basesize_ratio_range=(0.15, 0.9),
+                 input_size=300,
+                 scale_major=True):
+        assert len(strides) == len(ratios)
+        assert not (min_sizes is None) ^ (max_sizes is None)
+        self.strides = [_pair(stride) for stride in strides]
+        self.centers = [(stride[0] / 2., stride[1] / 2.)
+                        for stride in self.strides]
+        # Stored unconditionally: ``__repr__`` reads both attributes even
+        # when ``min_sizes`` and ``max_sizes`` are given explicitly.
+        self.input_size = input_size
+        self.basesize_ratio_range = basesize_ratio_range
+
+        if min_sizes is None and max_sizes is None:
+            # use hard code to generate SSD anchor ratios and sizes
+            assert mmcv.is_tuple_of(basesize_ratio_range, float)
+            min_ratio, max_ratio = basesize_ratio_range
+            min_ratio = int(min_ratio * 100)
+            max_ratio = int(max_ratio * 100)
+            step = int(np.floor(max_ratio - min_ratio) / (self.num_levels - 2))
+            min_sizes = []
+            max_sizes = []
+            for ratio in range(int(min_ratio), int(max_ratio) + 1, step):
+                min_sizes.append(int(self.input_size * ratio / 100))
+                max_sizes.append(int(self.input_size * (ratio + step) / 100))
+            if self.input_size == 300:
+                if basesize_ratio_range[0] == 0.15:  # SSD300 COCO
+                    min_sizes.insert(0, int(self.input_size * 7 / 100))
+                    max_sizes.insert(0, int(self.input_size * 15 / 100))
+                elif basesize_ratio_range[0] == 0.2:  # SSD300 VOC
+                    min_sizes.insert(0, int(self.input_size * 10 / 100))
+                    max_sizes.insert(0, int(self.input_size * 20 / 100))
+                else:
+                    raise ValueError(
+                        'basesize_ratio_range[0] should be either 0.15 '
+                        'or 0.2 when input_size is 300, got '
+                        f'{basesize_ratio_range[0]}.')
+            elif self.input_size == 512:
+                if basesize_ratio_range[0] == 0.1:  # SSD512 COCO
+                    min_sizes.insert(0, int(self.input_size * 4 / 100))
+                    max_sizes.insert(0, int(self.input_size * 10 / 100))
+                elif basesize_ratio_range[0] == 0.15:  # SSD512 VOC
+                    min_sizes.insert(0, int(self.input_size * 7 / 100))
+                    max_sizes.insert(0, int(self.input_size * 15 / 100))
+                else:
+                    raise ValueError(
+                        'When not setting min_sizes and max_sizes, '
+                        'basesize_ratio_range[0] should be either 0.1 '
+                        'or 0.15 when input_size is 512, got '
+                        f'{basesize_ratio_range[0]}.')
+            else:
+                raise ValueError(
+                    'Only support 300 or 512 in SSDAnchorGenerator when '
+                    'not setting min_sizes and max_sizes, '
+                    f'got {self.input_size}.')
+
+        assert len(min_sizes) == len(max_sizes) == len(strides)
+
+        anchor_ratios = []
+        anchor_scales = []
+        for k in range(len(self.strides)):
+            scales = [1., np.sqrt(max_sizes[k] / min_sizes[k])]
+            anchor_ratio = [1.]
+            for r in ratios[k]:
+                anchor_ratio += [1 / r, r]  # 4 or 6 ratio
+            anchor_ratios.append(torch.Tensor(anchor_ratio))
+            anchor_scales.append(torch.Tensor(scales))
+
+        self.base_sizes = min_sizes
+        self.scales = anchor_scales
+        self.ratios = anchor_ratios
+        self.scale_major = scale_major
+        self.center_offset = 0
+        self.base_anchors = self.gen_base_anchors()
+
+    def gen_base_anchors(self):
+        """Generate base anchors.
+
+        Returns:
+            list(torch.Tensor): Base anchors of a feature grid in multiple \
+                feature levels.
+        """
+        multi_level_base_anchors = []
+        for i, base_size in enumerate(self.base_sizes):
+            base_anchors = self.gen_single_level_base_anchors(
+                base_size,
+                scales=self.scales[i],
+                ratios=self.ratios[i],
+                center=self.centers[i])
+            # keep rows [0, n, 1, ..., n - 1] with n = len(self.ratios[i])
+            indices = list(range(len(self.ratios[i])))
+            indices.insert(1, len(indices))
+            base_anchors = torch.index_select(base_anchors, 0,
+                                              torch.LongTensor(indices))
+            multi_level_base_anchors.append(base_anchors)
+        return multi_level_base_anchors
+
+    def __repr__(self):
+        """str: a string that describes the module"""
+        indent_str = '    '
+        repr_str = self.__class__.__name__ + '(\n'
+        repr_str += f'{indent_str}strides={self.strides},\n'
+        repr_str += f'{indent_str}scales={self.scales},\n'
+        repr_str += f'{indent_str}scale_major={self.scale_major},\n'
+        repr_str += f'{indent_str}input_size={self.input_size},\n'
+        repr_str += f'{indent_str}ratios={self.ratios},\n'
+        repr_str += f'{indent_str}num_levels={self.num_levels},\n'
+        repr_str += f'{indent_str}base_sizes={self.base_sizes},\n'
+        repr_str += f'{indent_str}basesize_ratio_range='
+        repr_str += f'{self.basesize_ratio_range})'
+        return repr_str
+
+
+@PRIOR_GENERATORS.register_module()
+class LegacyAnchorGenerator(AnchorGenerator):
+    """Legacy anchor generator used in MMDetection V1.x.
+
+    Note:
+        Difference to the V2.0 anchor generator:
+
+        1. The center offset of V1.x anchors is set to 0.5 rather than 0.
+        2. The width/height are reduced by 1 when calculating the anchors' \
+            centers and corners to meet the V1.x coordinate system.
+        3. The anchors' corners are quantized.
+
+    Args:
+        strides (list[int] | list[tuple[int]]): Strides of anchors
+            in multiple feature levels.
+        ratios (list[float]): The list of ratios between the height and width
+            of anchors in a single level.
+        scales (list[int] | None): Anchor scales for anchors in a single level.
+            It cannot be set at the same time if `octave_base_scale` and
+            `scales_per_octave` are set.
+        base_sizes (list[int]): The basic sizes of anchors in multiple levels.
+            If None is given, strides will be used to generate base_sizes.
+        scale_major (bool): Whether to multiply scales first when generating
+            base anchors. If true, the anchors in the same row will have the
+            same scales. By default it is True in V2.0
+        octave_base_scale (int): The base scale of octave.
+        scales_per_octave (int): Number of scales for each octave.
+            `octave_base_scale` and `scales_per_octave` are usually used in
+            retinanet and the `scales` should be None when they are set.
+        centers (list[tuple[float, float]] | None): The centers of the anchor
+            relative to the feature grid center in multiple feature levels.
+            By default it is set to be None and not used. If a list of float
+            is given, this list will be used to shift the centers of anchors.
+        center_offset (float): The offset of center in proportion to anchors'
+            width and height. By default it is 0 in V2.0 but it should be 0.5
+            in v1.x models.
+
+    Examples:
+        >>> from mmdet.core import LegacyAnchorGenerator
+        >>> self = LegacyAnchorGenerator(
+        >>>     [16], [1.], [1.], [9], center_offset=0.5)
+        >>> all_anchors = self.grid_anchors(((2, 2),), device='cpu')
+        >>> print(all_anchors)
+        [tensor([[ 0.,  0.,  8.,  8.],
+                [16.,  0., 24.,  8.],
+                [ 0., 16.,  8., 24.],
+                [16., 16., 24., 24.]])]
+    """
+
+    def gen_single_level_base_anchors(self,
+                                      base_size,
+                                      scales,
+                                      ratios,
+                                      center=None):
+        """Generate the quantized V1.x base anchors of a single level.
+
+        Note:
+            The width/height of anchors are reduced by 1 when calculating \
+                the centers and corners to meet the V1.x coordinate system.
+
+        Args:
+            base_size (int | float): Basic size of an anchor.
+            scales (torch.Tensor): Scales of the anchor.
+            ratios (torch.Tensor): The ratio between the height
+                and width of anchors in a single level.
+            center (tuple[float], optional): The center of the base anchor
+                related to a single feature grid. Defaults to None.
+
+        Returns:
+            torch.Tensor: Anchors in a single-level feature map.
+        """
+        if center is not None:
+            x_center, y_center = center
+        else:
+            # No explicit center: offset it relative to (base_size - 1).
+            x_center = y_center = self.center_offset * (base_size - 1)
+
+        h_ratios = torch.sqrt(ratios)
+        w_ratios = 1 / h_ratios
+        if self.scale_major:
+            ws = (base_size * w_ratios[:, None] * scales[None, :]).view(-1)
+            hs = (base_size * h_ratios[:, None] * scales[None, :]).view(-1)
+        else:
+            ws = (base_size * scales[:, None] * w_ratios[None, :]).view(-1)
+            hs = (base_size * scales[:, None] * h_ratios[None, :]).view(-1)
+
+        # Half extents are measured on (size - 1), following the V1.x
+        # coordinate system.
+        half_w = 0.5 * (ws - 1)
+        half_h = 0.5 * (hs - 1)
+        corners = torch.stack([
+            x_center - half_w, y_center - half_h,
+            x_center + half_w, y_center + half_h
+        ], dim=-1)
+
+        # Quantize the corners by rounding.
+        return corners.round()
+
+
+@PRIOR_GENERATORS.register_module()
+class LegacySSDAnchorGenerator(SSDAnchorGenerator, LegacyAnchorGenerator):
+    """Legacy anchor generator used in MMDetection V1.x.
+
+    The difference between `LegacySSDAnchorGenerator` and `SSDAnchorGenerator`
+    can be found in `LegacyAnchorGenerator`.
+    """
+
+    def __init__(self,
+                 strides,
+                 ratios,
+                 basesize_ratio_range,
+                 input_size=300,
+                 scale_major=True):
+        super(LegacySSDAnchorGenerator, self).__init__(
+            strides=strides, ratios=ratios,
+            basesize_ratio_range=basesize_ratio_range,
+            input_size=input_size,
+            scale_major=scale_major)
+        # Use the normalized (w, h) pairs so that tuple strides work too.
+        self.centers = [((stride_w - 1) / 2., (stride_h - 1) / 2.)
+                        for stride_w, stride_h in self.strides]
+        self.base_anchors = self.gen_base_anchors()
+
+
+@PRIOR_GENERATORS.register_module()
+class YOLOAnchorGenerator(AnchorGenerator):
+    """Anchor generator for YOLO.
+
+    Args:
+        strides (list[int] | list[tuple[int, int]]): Strides of anchors
+            in multiple feature levels.
+        base_sizes (list[list[tuple[int, int]]]): The basic sizes
+            of anchors in multiple levels.
+    """
+
+    def __init__(self, strides, base_sizes):
+        self.strides = [_pair(stride) for stride in strides]
+        self.centers = [(stride[0] / 2., stride[1] / 2.)
+                        for stride in self.strides]
+        expected = len(base_sizes[0])
+        self.base_sizes = []
+        for level_sizes in base_sizes:
+            # every level must provide the same number of anchor sizes
+            assert expected == len(level_sizes)
+            self.base_sizes.append([_pair(size) for size in level_sizes])
+        self.base_anchors = self.gen_base_anchors()
+
+    @property
+    def num_levels(self):
+        """int: number of feature levels that the generator will be applied"""
+        return len(self.base_sizes)
+
+    def gen_base_anchors(self):
+        """Generate the base anchors of every feature level.
+
+        Returns:
+            list(torch.Tensor): Base anchors of a feature grid in multiple \
+                feature levels.
+        """
+        # Each level is paired with its own center; ``None`` is forwarded
+        # only when ``self.centers`` itself is ``None``.
+        return [
+            self.gen_single_level_base_anchors(
+                level_sizes,
+                self.centers[lvl] if self.centers is not None else None)
+            for lvl, level_sizes in enumerate(self.base_sizes)
+        ]
+
+    def gen_single_level_base_anchors(self, base_sizes_per_level, center=None):
+        """Generate base anchors of a single level.
+
+        Args:
+            base_sizes_per_level (list[tuple[int, int]]): Basic sizes of
+                anchors, each unpacked as (w, h).
+            center (tuple[float], optional): The center of the base anchor
+                related to a single feature grid. Although it defaults to
+                None, it is unpacked as (x, y) unconditionally, so a pair
+                has to be passed.
+
+        Returns:
+            torch.Tensor: Anchors in a single-level feature maps, one row
+                (x1, y1, x2, y2) per base size.
+        """
+        x_center, y_center = center
+        # use float anchor and the anchor's center is aligned with the
+        # pixel center
+        return torch.stack([
+            torch.Tensor([
+                x_center - 0.5 * w, y_center - 0.5 * h, x_center + 0.5 * w,
+                y_center + 0.5 * h
+            ]) for w, h in base_sizes_per_level
+        ], dim=0)
+
+    def responsible_flags(self, featmap_sizes, gt_bboxes, device='cuda'):
+        """Generate responsible anchor flags of grid cells in multiple scales.
+
+        Args:
+            featmap_sizes (list(tuple)): List of feature map sizes in multiple
+                feature levels.
+            gt_bboxes (Tensor): Ground truth boxes, shape (n, 4).
+            device (str): Device where the anchors will be put on.
+
+        Return:
+            list(torch.Tensor): responsible flags of anchors in multiple
+                levels, one tensor per level.
+        """
+        assert self.num_levels == len(featmap_sizes)
+        return [
+            self.single_level_responsible_flags(
+                featmap_sizes[lvl],
+                gt_bboxes,
+                self.strides[lvl],
+                self.num_base_anchors[lvl],
+                device=device) for lvl in range(self.num_levels)
+        ]
+
+    def single_level_responsible_flags(self,
+                                       featmap_size,
+                                       gt_bboxes,
+                                       stride,
+                                       num_base_anchors,
+                                       device='cuda'):
+        """Generate the responsible flags of anchor in a single feature map.
+
+        A grid cell is responsible when the center of at least one ground
+        truth box falls into it; all base anchors of that cell are flagged.
+
+        Args:
+            featmap_size (tuple[int]): The size of feature maps, unpacked
+                as (h, w).
+            gt_bboxes (Tensor): Ground truth boxes, shape (n, 4).
+            stride (tuple(int)): stride of current level, indexed as (w, h).
+            num_base_anchors (int): The number of base anchors.
+            device (str, optional): Device where the flags will be put on.
+                Defaults to 'cuda'.
+
+        Returns:
+            torch.Tensor: The responsible flags (``torch.uint8``) of each
+                anchor in a single level feature map.
+        """
+        feat_h, feat_w = featmap_size
+        # Box centers and the grid cell each of them falls into.
+        center_x = ((gt_bboxes[:, 0] + gt_bboxes[:, 2]) * 0.5).to(device)
+        center_y = ((gt_bboxes[:, 1] + gt_bboxes[:, 3]) * 0.5).to(device)
+        cell_x = torch.floor(center_x / stride[0]).long()
+        cell_y = torch.floor(center_y / stride[1]).long()
+
+        # row major indexing
+        cell_idx = cell_y * feat_w + cell_x
+
+        cell_flags = torch.zeros(
+            feat_h * feat_w, dtype=torch.uint8, device=device)
+        cell_flags[cell_idx] = 1
+
+        # Every base anchor of a responsible cell inherits the cell's flag.
+        anchor_flags = cell_flags[:, None].expand(
+            cell_flags.size(0), num_base_anchors)
+        return anchor_flags.contiguous().view(-1)
diff --git a/mmdet/core/anchor/builder.py b/mmdet/core/anchor/builder.py
new file mode 100755
index 0000000..ddb25ad
--- /dev/null
+++ b/mmdet/core/anchor/builder.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+from mmcv.utils import Registry, build_from_cfg
+
+PRIOR_GENERATORS = Registry('Generator for anchors and points')
+# ``ANCHOR_GENERATORS`` is a second name bound to the same registry object.
+ANCHOR_GENERATORS = PRIOR_GENERATORS
+
+
+def build_prior_generator(cfg, default_args=None):
+    return build_from_cfg(cfg, PRIOR_GENERATORS, default_args=default_args)
+
+
+def build_anchor_generator(cfg, default_args=None):
+    """Deprecated alias of ``build_prior_generator`` (warns, then builds)."""
+    warnings.warn('``build_anchor_generator`` would be deprecated soon, '
+                  'please use ``build_prior_generator`` ')
+    return build_prior_generator(cfg, default_args)
diff --git a/mmdet/core/anchor/point_generator.py b/mmdet/core/anchor/point_generator.py
new file mode 100755
index 0000000..cc9c388
--- /dev/null
+++ b/mmdet/core/anchor/point_generator.py
@@ -0,0 +1,263 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from torch.nn.modules.utils import _pair
+
+from .builder import PRIOR_GENERATORS
+
+
+@PRIOR_GENERATORS.register_module()
+class PointGenerator:
+    """Generate (x, y, stride) points and valid flags for one feature map."""
+
+    def _meshgrid(self, x, y, row_major=True):
+        """Return the flattened coordinate grids of 1-D ``x`` and ``y``."""
+        xx = x.repeat(len(y))
+        yy = y.view(-1, 1).repeat(1, len(x)).view(-1)
+        return (xx, yy) if row_major else (yy, xx)
+
+    def grid_points(self, featmap_size, stride=16, device='cuda'):
+        """Return an (h * w, 3) tensor whose rows are (x, y, stride)."""
+        feat_h, feat_w = featmap_size
+        xs = torch.arange(0., feat_w, device=device) * stride
+        ys = torch.arange(0., feat_h, device=device) * stride
+        grid_x, grid_y = self._meshgrid(xs, ys)
+        # The third column repeats the stride for every point.
+        stride_col = xs.new_full((grid_x.shape[0], ), stride)
+        points = torch.stack([grid_x, grid_y, stride_col], dim=-1)
+        return points.to(device)
+
+    def valid_flags(self, featmap_size, valid_size, device='cuda'):
+        """Return a flat bool mask marking cells inside ``valid_size``."""
+        feat_h, feat_w = featmap_size
+        valid_h, valid_w = valid_size
+        assert valid_h <= feat_h and valid_w <= feat_w
+        col_ok = torch.zeros(feat_w, dtype=torch.bool, device=device)
+        row_ok = torch.zeros(feat_h, dtype=torch.bool, device=device)
+        col_ok[:valid_w] = True
+        row_ok[:valid_h] = True
+        grid_cols, grid_rows = self._meshgrid(col_ok, row_ok)
+        return grid_cols & grid_rows
+
+
+@PRIOR_GENERATORS.register_module()
+class MlvlPointGenerator:
+    """Standard points generator for multi-level (Mlvl) feature maps in 2D
+    points-based detectors.
+
+    Args:
+        strides (list[int] | list[tuple[int, int]]): Strides of anchors
+            in multiple feature levels in order (w, h).
+        offset (float): The offset of points, the value is normalized with
+            corresponding stride. Defaults to 0.5.
+    """
+
+    def __init__(self, strides, offset=0.5):
+        self.offset = offset
+        self.strides = list(map(_pair, strides))
+
+    @property
+    def num_levels(self):
+        """int: number of feature levels, i.e. the number of strides"""
+        return len(self.strides)
+
+    @property
+    def num_base_priors(self):
+        """list[int]: The number of priors (points) at a point
+        on the feature grid, which is 1 for every level"""
+        return [1] * len(self.strides)
+
+    def _meshgrid(self, x, y, row_major=True):
+        """Flatten the 2-D grids of ``x`` and ``y`` into 1-D coordinates.
+
+        ``reshape`` is used on purpose: ``.flatten()`` would cause an error
+        in ONNX exporting.
+        """
+        yy, xx = torch.meshgrid(y, x)
+        flat_x, flat_y = xx.reshape(-1), yy.reshape(-1)
+        return (flat_x, flat_y) if row_major else (flat_y, flat_x)
+
+    def grid_priors(self,
+                    featmap_sizes,
+                    dtype=torch.float32,
+                    device='cuda',
+                    with_stride=False):
+        """Generate grid points of multiple feature levels.
+
+        Each level is delegated to ``single_level_grid_priors``.
+
+        Args:
+            featmap_sizes (list[tuple]): List of feature map sizes in
+                multiple feature levels, each size arranged as (h, w).
+            dtype (:obj:`dtype`): Dtype of priors. Default: torch.float32.
+            device (str): The device where the points will be put on.
+            with_stride (bool): Whether to concatenate the stride to
+                the last dimension of points.
+
+        Return:
+            list[torch.Tensor]: Points of multiple feature levels.
+            The sizes of each tensor should be (N, 2) when with stride is
+            ``False``, where N = width * height, width and height
+            are the sizes of the corresponding feature level,
+            and the last dimension 2 represent (coord_x, coord_y),
+            otherwise the shape should be (N, 4),
+            and the last dimension 4 represent
+            (coord_x, coord_y, stride_w, stride_h).
+        """
+
+        assert self.num_levels == len(featmap_sizes)
+        return [
+            self.single_level_grid_priors(
+                featmap_sizes[lvl],
+                level_idx=lvl,
+                dtype=dtype,
+                device=device,
+                with_stride=with_stride)
+            for lvl in range(self.num_levels)
+        ]
+
+    def single_level_grid_priors(self,
+                                 featmap_size,
+                                 level_idx,
+                                 dtype=torch.float32,
+                                 device='cuda',
+                                 with_stride=False):
+        """Generate grid Points of a single level.
+
+        A point of column ``i`` and row ``j`` gets the coordinates
+        ``((i + self.offset) * stride_w, (j + self.offset) * stride_h)``.
+
+        Note:
+            This function is usually called by method ``self.grid_priors``.
+
+        Args:
+            featmap_size (tuple[int]): Size of the feature maps, arrange as
+                (h, w).
+            level_idx (int): The index of corresponding feature map level.
+            dtype (:obj:`dtype`): Dtype of priors. Default: torch.float32.
+            device (str, optional): The device the tensor will be put on.
+                Defaults to 'cuda'.
+            with_stride (bool): Concatenate the stride to the last dimension
+                of points.
+
+        Return:
+            Tensor: Points of single feature levels.
+            The shape of tensor should be (N, 2) when with stride is
+            ``False``, where N = width * height, width and height
+            are the sizes of the corresponding feature level,
+            and the last dimension 2 represent (coord_x, coord_y),
+            otherwise the shape should be (N, 4),
+            and the last dimension 4 represent
+            (coord_x, coord_y, stride_w, stride_h).
+        """
+        feat_h, feat_w = featmap_size
+        stride_w, stride_h = self.strides[level_idx]
+
+        # Cell indices are shifted by ``self.offset`` and scaled by the
+        # stride of the level, then cast to ``dtype``.
+        xs = (torch.arange(0, feat_w, device=device) + self.offset) * stride_w
+        ys = (torch.arange(0, feat_h, device=device) + self.offset) * stride_h
+        # Flattened coordinates of all grid points.
+        grid_x, grid_y = self._meshgrid(xs.to(dtype), ys.to(dtype))
+
+        columns = [grid_x, grid_y]
+        if with_stride:
+            # Append two constant columns holding the level's stride.
+            # use `shape[0]` instead of `len(grid_x)` for ONNX export
+            num_points_x = grid_x.shape[0]
+            num_points_y = grid_y.shape[0]
+            columns += [
+                grid_x.new_full((num_points_x, ), stride_w).to(dtype),
+                grid_x.new_full((num_points_y, ), stride_h).to(dtype),
+            ]
+
+        all_points = torch.stack(columns, dim=-1)
+        return all_points.to(device)
+
+    def valid_flags(self, featmap_sizes, pad_shape, device='cuda'):
+        """Generate valid flags of points of multiple feature levels.
+
+        Args:
+            featmap_sizes (list(tuple)): List of feature map sizes in
+                multiple feature levels, each size arranged as
+                (h, w).
+            pad_shape (tuple(int)): The padded shape of the image,
+                arranged as (h, w).
+            device (str): The device where the flags will be put on.
+
+        Return:
+            list(torch.Tensor): Valid flags of points of multiple levels.
+        """
+        assert self.num_levels == len(featmap_sizes)
+        multi_level_flags = []
+        for i in range(self.num_levels):
+            point_stride = self.strides[i]
+            feat_h, feat_w = featmap_sizes[i]
+            h, w = pad_shape[:2]
+            valid_feat_h = min(int(np.ceil(h / point_stride[1])), feat_h)
+            valid_feat_w = min(int(np.ceil(w / point_stride[0])), feat_w)
+            flags = self.single_level_valid_flags((feat_h, feat_w),
+                                                  (valid_feat_h, valid_feat_w),
+                                                  device=device)
+            multi_level_flags.append(flags)
+        return multi_level_flags
+
+    def single_level_valid_flags(self,
+                                 featmap_size,
+                                 valid_size,
+                                 device='cuda'):
+        """Generate the valid flags of points of a single feature map.
+
+        Args:
+            featmap_size (tuple[int]): The size of feature maps, arranged
+                as (h, w).
+            valid_size (tuple[int]): The valid size of the feature maps.
+                The size is arranged as (h, w) and must not exceed it.
+            device (str, optional): The device where the flags will be put on.
+                Defaults to 'cuda'.
+
+        Returns:
+            torch.Tensor: The valid flags of each points in a single level \
+                feature map.
+        """
+        feat_h, feat_w = featmap_size
+        valid_h, valid_w = valid_size
+        assert valid_h <= feat_h and valid_w <= feat_w
+        valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device)
+        valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device)
+        valid_x[:valid_w] = 1
+        valid_y[:valid_h] = 1
+        valid_xx, valid_yy = self._meshgrid(valid_x, valid_y)
+        valid = valid_xx & valid_yy
+        return valid
+
+    def sparse_priors(self,
+                      prior_idxs,
+                      featmap_size,
+                      level_idx,
+                      dtype=torch.float32,
+                      device='cuda'):
+        """Generate sparse points according to the ``prior_idxs``.
+
+        Args:
+            prior_idxs (Tensor): The index of corresponding anchors
+                in the feature map.
+            featmap_size (tuple[int]): feature map size arranged as (h, w).
+            level_idx (int): The level index of corresponding feature
+                map.
+            dtype (obj:`torch.dtype`): Data type of points. Defaults to
+                ``torch.float32``.
+            device (obj:`torch.device`): The device where the points are
+                located.
+        Returns:
+            Tensor: Points with shape (N, 2), N should be equal to
+            the length of ``prior_idxs``. And last dimension
+            2 represent (coord_x, coord_y).
+        """
+        height, width = featmap_size
+        x = (prior_idxs % width + self.offset) * self.strides[level_idx][0]
+        y = ((prior_idxs // width) % height +
+             self.offset) * self.strides[level_idx][1]
+        prioris = torch.stack([x, y], 1).to(dtype)
+        prioris = prioris.to(device)
+        return prioris
diff --git a/mmdet/core/anchor/utils.py b/mmdet/core/anchor/utils.py
new file mode 100755
index 0000000..c2f2024
--- /dev/null
+++ b/mmdet/core/anchor/utils.py
@@ -0,0 +1,72 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+
+def images_to_levels(target, num_levels):
+    """Convert targets by image to targets by feature level.
+
+    [target_img0, target_img1] -> [target_level0, target_level1, ...]
+    """
+    target = torch.stack(target, 0)
+    level_targets = []
+    start = 0
+    for n in num_levels:
+        end = start + n
+        # slice dim 1 by the per-level count; dim 0 (images) is not squeezed
+        level_targets.append(target[:, start:end])
+        start = end
+    return level_targets
+
+
+def anchor_inside_flags(flat_anchors,
+                        valid_flags,
+                        img_shape,
+                        allowed_border=0):
+    """Check whether the anchors are inside the border.
+
+    Args:
+        flat_anchors (torch.Tensor): Flattened anchors, shape (n, 4).
+        valid_flags (torch.Tensor): An existing valid flags of anchors.
+        img_shape (tuple(int)): Shape of current image, read as (h, w, ...).
+        allowed_border (int, optional): The border to allow the valid anchor.
+            A negative value skips the border check. Defaults to 0.
+
+    Returns:
+        torch.Tensor: Flags indicating whether the anchors are inside a \
+            valid range.
+    """
+    img_h, img_w = img_shape[:2]
+    if allowed_border >= 0:
+        inside_flags = valid_flags & \
+            (flat_anchors[:, 0] >= -allowed_border) & \
+            (flat_anchors[:, 1] >= -allowed_border) & \
+            (flat_anchors[:, 2] < img_w + allowed_border) & \
+            (flat_anchors[:, 3] < img_h + allowed_border)
+    else:
+        inside_flags = valid_flags
+    return inside_flags
+
+
+def calc_region(bbox, ratio, featmap_size=None):
+    """Calculate a proportional bbox region.
+
+    The center is kept; the new w' and h' are (1 - 2 * ratio) times w and h.
+
+    Args:
+        bbox (Tensor): Bbox(es) as (x1, y1, x2, y2), shape (4, ) or (n, 4).
+        ratio (float): Fraction of w / h trimmed from each side of the bbox.
+        featmap_size (tuple): (h, w) of the feature map, used for clipping.
+
+    Returns:
+        tuple: x1, y1, x2, y2
+    """
+    x1 = torch.round((1 - ratio) * bbox[..., 0] + ratio * bbox[..., 2]).long()
+    y1 = torch.round((1 - ratio) * bbox[..., 1] + ratio * bbox[..., 3]).long()
+    x2 = torch.round(ratio * bbox[..., 0] + (1 - ratio) * bbox[..., 2]).long()
+    y2 = torch.round(ratio * bbox[..., 1] + (1 - ratio) * bbox[..., 3]).long()
+    if featmap_size is not None:
+        x1 = x1.clamp(min=0, max=featmap_size[1])
+        y1 = y1.clamp(min=0, max=featmap_size[0])
+        x2 = x2.clamp(min=0, max=featmap_size[1])
+        y2 = y2.clamp(min=0, max=featmap_size[0])
+    return (x1, y1, x2, y2)
diff --git a/mmdet/core/bbox/__init__.py b/mmdet/core/bbox/__init__.py
new file mode 100755
index 0000000..371eba1
--- /dev/null
+++ b/mmdet/core/bbox/__init__.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .assigners import (AssignResult, BaseAssigner, CenterRegionAssigner,
+                        MaxIoUAssigner, RegionAssigner)
+from .builder import build_assigner, build_bbox_coder, build_sampler
+from .coder import (BaseBBoxCoder, DeltaXYWHBBoxCoder, DistancePointBBoxCoder,
+                    PseudoBBoxCoder, TBLRBBoxCoder)
+from .iou_calculators import BboxOverlaps2D, bbox_overlaps
+from .samplers import (BaseSampler, CombinedSampler,
+                       InstanceBalancedPosSampler, IoUBalancedNegSampler,
+                       OHEMSampler, PseudoSampler, RandomSampler,
+                       SamplingResult, ScoreHLRSampler)
+from .transforms import (bbox2distance, bbox2result, bbox2roi,
+                         bbox_cxcywh_to_xyxy, bbox_flip, bbox_mapping,
+                         bbox_mapping_back, bbox_rescale, bbox_xyxy_to_cxcywh,
+                         distance2bbox, find_inside_bboxes, roi2bbox)
+
+__all__ = [
+    'bbox_overlaps', 'BboxOverlaps2D', 'BaseAssigner', 'MaxIoUAssigner',
+    'AssignResult', 'BaseSampler', 'PseudoSampler', 'RandomSampler',
+    'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler',
+    'OHEMSampler', 'SamplingResult', 'ScoreHLRSampler', 'build_assigner',
+    'build_sampler', 'bbox_flip', 'bbox_mapping', 'bbox_mapping_back',
+    'bbox2roi', 'roi2bbox', 'bbox2result', 'distance2bbox', 'bbox2distance',
+    'build_bbox_coder', 'BaseBBoxCoder', 'PseudoBBoxCoder',
+    'DeltaXYWHBBoxCoder', 'TBLRBBoxCoder', 'DistancePointBBoxCoder',
+    'CenterRegionAssigner', 'bbox_rescale', 'bbox_cxcywh_to_xyxy',
+    'bbox_xyxy_to_cxcywh', 'RegionAssigner', 'find_inside_bboxes'
+]
diff --git a/mmdet/core/bbox/assigners/__init__.py b/mmdet/core/bbox/assigners/__init__.py
new file mode 100755
index 0000000..d6480a7
--- /dev/null
+++ b/mmdet/core/bbox/assigners/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .approx_max_iou_assigner import ApproxMaxIoUAssigner
+from .ascend_assign_result import AscendAssignResult
+from .ascend_max_iou_assigner import AscendMaxIoUAssigner
+from .assign_result import AssignResult
+from .atss_assigner import ATSSAssigner
+from .base_assigner import BaseAssigner
+from .center_region_assigner import CenterRegionAssigner
+from .grid_assigner import GridAssigner
+from .hungarian_assigner import HungarianAssigner
+from .mask_hungarian_assigner import MaskHungarianAssigner
+from .max_iou_assigner import MaxIoUAssigner
+from .point_assigner import PointAssigner
+from .region_assigner import RegionAssigner
+from .sim_ota_assigner import SimOTAAssigner
+from .task_aligned_assigner import TaskAlignedAssigner
+from .uniform_assigner import UniformAssigner
+
+__all__ = [
+    'BaseAssigner', 'MaxIoUAssigner', 'ApproxMaxIoUAssigner', 'AssignResult',
+    'PointAssigner', 'ATSSAssigner', 'CenterRegionAssigner', 'GridAssigner',
+    'HungarianAssigner', 'RegionAssigner', 'UniformAssigner', 'SimOTAAssigner',
+    'TaskAlignedAssigner', 'MaskHungarianAssigner', 'AscendAssignResult',
+    'AscendMaxIoUAssigner'
+]
diff --git a/mmdet/core/bbox/assigners/approx_max_iou_assigner.py b/mmdet/core/bbox/assigners/approx_max_iou_assigner.py
new file mode 100755
index 0000000..304d09c
--- /dev/null
+++ b/mmdet/core/bbox/assigners/approx_max_iou_assigner.py
@@ -0,0 +1,146 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from ..builder import BBOX_ASSIGNERS
+from ..iou_calculators import build_iou_calculator
+from .max_iou_assigner import MaxIoUAssigner
+
+
+@BBOX_ASSIGNERS.register_module()
+class ApproxMaxIoUAssigner(MaxIoUAssigner):
+    """Assign a corresponding gt bbox or background to each bbox.
+
+    Each proposal will be assigned an integer indicating the ground truth
+    index. (semi-positive index: gt label (0-based), -1: background)
+
+    - -1: negative sample, no assigned gt
+    - semi-positive integer: positive sample, index (0-based) of assigned gt
+
+    Args:
+        pos_iou_thr (float): IoU threshold for positive bboxes.
+        neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
+        min_pos_iou (float): Minimum iou for a bbox to be considered as a
+            positive bbox. Positive samples can have smaller IoU than
+            pos_iou_thr due to the 5th step (assign max IoU sample to each gt).
+        gt_max_assign_all (bool): Whether to assign all bboxes with the same
+            highest overlap with some gt to that gt.
+        ignore_iof_thr (float): IoF threshold for ignoring bboxes (if
+            `gt_bboxes_ignore` is specified). Negative values mean not
+            ignoring any bboxes.
+        ignore_wrt_candidates (bool): Whether to compute the iof between
+            `bboxes` and `gt_bboxes_ignore`, or the contrary.
+        match_low_quality (bool): Whether to allow low quality matches. This is
+            usually allowed for RPN and single stage detectors, but not allowed
+            in the second stage.
+        gpu_assign_thr (int): The upper bound of the number of GT for GPU
+            assign. When the number of gt is above this threshold, will assign
+            on CPU device. Negative values mean not assign on CPU.
+    """
+
+    def __init__(self,
+                 pos_iou_thr,
+                 neg_iou_thr,
+                 min_pos_iou=.0,
+                 gt_max_assign_all=True,
+                 ignore_iof_thr=-1,
+                 ignore_wrt_candidates=True,
+                 match_low_quality=True,
+                 gpu_assign_thr=-1,
+                 iou_calculator=dict(type='BboxOverlaps2D')):
+        self.pos_iou_thr = pos_iou_thr
+        self.neg_iou_thr = neg_iou_thr
+        self.min_pos_iou = min_pos_iou
+        self.gt_max_assign_all = gt_max_assign_all
+        self.ignore_iof_thr = ignore_iof_thr
+        self.ignore_wrt_candidates = ignore_wrt_candidates
+        self.gpu_assign_thr = gpu_assign_thr
+        self.match_low_quality = match_low_quality
+        self.iou_calculator = build_iou_calculator(iou_calculator)
+
+    def assign(self,
+               approxs,
+               squares,
+               approxs_per_octave,
+               gt_bboxes,
+               gt_bboxes_ignore=None,
+               gt_labels=None):
+        """Assign gt to approxs.
+
+        This method assigns a gt bbox to each group of approxs (bboxes),
+        each group of approxs is represented by a base approx (bbox) and
+        will be assigned with -1, or a semi-positive number.
+        background_label (-1) means negative sample,
+        semi-positive number is the index (0-based) of assigned gt.
+        The assignment is done in following steps, the order matters.
+
+        1. assign every bbox to background_label (-1)
+        2. use the max IoU of each group of approxs to assign
+        3. assign proposals whose iou with all gts < neg_iou_thr to background
+        4. for each bbox, if the iou with its nearest gt >= pos_iou_thr,
+           assign it to that bbox
+        5. for each gt bbox, assign its nearest proposals (may be more than
+           one) to itself
+
+        Args:
+            approxs (Tensor): Bounding boxes to be assigned,
+                shape(approxs_per_octave*n, 4).
+            squares (Tensor): Base Bounding boxes to be assigned,
+                shape(n, 4).
+            approxs_per_octave (int): number of approxs per octave
+            gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4).
+            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+                labelled as `ignored`, e.g., crowd boxes in COCO.
+            gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ).
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+        """
+        num_squares = squares.size(0)
+        num_gts = gt_bboxes.size(0)
+
+        if num_squares == 0 or num_gts == 0:
+            # No predictions and/or truth, return empty assignment
+            overlaps = approxs.new(num_gts, num_squares)
+            assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)
+            return assign_result
+
+        # re-organize anchors by approxs_per_octave x num_squares
+        approxs = torch.transpose(
+            approxs.view(num_squares, approxs_per_octave, 4), 0,
+            1).contiguous().view(-1, 4)
+        assign_on_cpu = 0 < self.gpu_assign_thr < num_gts
+        # compute overlap and assign gt on CPU when number of GT is large
+        if assign_on_cpu:
+            device = approxs.device
+            approxs = approxs.cpu()
+            squares = squares.cpu()  # also compared with gt_bboxes_ignore
+            gt_bboxes = gt_bboxes.cpu()
+            if gt_bboxes_ignore is not None:
+                gt_bboxes_ignore = gt_bboxes_ignore.cpu()
+            if gt_labels is not None:
+                gt_labels = gt_labels.cpu()
+        all_overlaps = self.iou_calculator(approxs, gt_bboxes)
+
+        overlaps, _ = all_overlaps.view(approxs_per_octave, num_squares,
+                                        num_gts).max(dim=0)
+        overlaps = torch.transpose(overlaps, 0, 1)
+
+        if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None
+                and gt_bboxes_ignore.numel() > 0 and squares.numel() > 0):
+            if self.ignore_wrt_candidates:
+                ignore_overlaps = self.iou_calculator(
+                    squares, gt_bboxes_ignore, mode='iof')
+                ignore_max_overlaps, _ = ignore_overlaps.max(dim=1)
+            else:
+                ignore_overlaps = self.iou_calculator(
+                    gt_bboxes_ignore, squares, mode='iof')
+                ignore_max_overlaps, _ = ignore_overlaps.max(dim=0)
+            overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1
+
+        assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)
+        if assign_on_cpu:
+            assign_result.gt_inds = assign_result.gt_inds.to(device)
+            assign_result.max_overlaps = assign_result.max_overlaps.to(device)
+            if assign_result.labels is not None:
+                assign_result.labels = assign_result.labels.to(device)
+        return assign_result
diff --git a/mmdet/core/bbox/assigners/ascend_assign_result.py b/mmdet/core/bbox/assigners/ascend_assign_result.py
new file mode 100755
index 0000000..03d33c2
--- /dev/null
+++ b/mmdet/core/bbox/assigners/ascend_assign_result.py
@@ -0,0 +1,34 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.utils import util_mixins
+
+
+class AscendAssignResult(util_mixins.NiceRepr):
+    """Stores ascend assignments between predicted and truth boxes.
+
+    Args:
+        batch_num_gts (list[int]): the number of truth boxes considered.
+        batch_pos_mask (IntTensor): Positive samples mask in all images.
+        batch_neg_mask (IntTensor): Negative samples mask in all images.
+        batch_max_overlaps (FloatTensor): The max overlaps of all bboxes
+            and ground truth boxes.
+        batch_anchor_gt_indes (None | LongTensor): The assigned truth
+            box index of all anchors.
+        batch_anchor_gt_labels (None | LongTensor): The gt labels
+            of all anchors.
+    """
+
+    def __init__(self,
+                 batch_num_gts,
+                 batch_pos_mask,
+                 batch_neg_mask,
+                 batch_max_overlaps,
+                 batch_anchor_gt_indes=None,
+                 batch_anchor_gt_labels=None):
+        self.batch_num_gts = batch_num_gts
+        self.batch_pos_mask = batch_pos_mask
+        self.batch_neg_mask = batch_neg_mask
+        self.batch_max_overlaps = batch_max_overlaps
+        self.batch_anchor_gt_indes = batch_anchor_gt_indes
+        self.batch_anchor_gt_labels = batch_anchor_gt_labels
+        # Interface for possible user-defined properties
+        self._extra_properties = {}
diff --git a/mmdet/core/bbox/assigners/ascend_max_iou_assigner.py b/mmdet/core/bbox/assigners/ascend_max_iou_assigner.py
new file mode 100755
index 0000000..f8f528a
--- /dev/null
+++ b/mmdet/core/bbox/assigners/ascend_max_iou_assigner.py
@@ -0,0 +1,178 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from ....utils import masked_fill
+from ..builder import BBOX_ASSIGNERS
+from ..iou_calculators import build_iou_calculator
+from .ascend_assign_result import AscendAssignResult
+from .base_assigner import BaseAssigner
+
+
+@BBOX_ASSIGNERS.register_module()
+class AscendMaxIoUAssigner(BaseAssigner):
+    """Assign a corresponding gt bbox or background to each bbox.
+
+    Each proposal will be assigned with `-1`, or a semi-positive integer
+    indicating the ground truth index.
+
+    - -1: negative sample, no assigned gt
+    - semi-positive integer: positive sample, index (0-based) of assigned gt
+
+    Args:
+        pos_iou_thr (float): IoU threshold for positive bboxes.
+        neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
+        min_pos_iou (float): Minimum iou for a bbox to be considered as a
+            positive bbox. Positive samples can have smaller IoU than
+            pos_iou_thr due to the 4th step (assign max IoU sample to each gt).
+            `min_pos_iou` is set to avoid assigning bboxes that have extremely
+            small iou with GT as positive samples. It brings about 0.3 mAP
+            improvements in 1x schedule but does not affect the performance of
+            3x schedule. More comparisons can be found in
+            `PR #7464 <https://github.com/open-mmlab/mmdetection/pull/7464>`_.
+        gt_max_assign_all (bool): Whether to assign all bboxes with the same
+            highest overlap with some gt to that gt.
+        ignore_iof_thr (float): IoF threshold for ignoring bboxes (if
+            `gt_bboxes_ignore` is specified). Negative values mean not
+            ignoring any bboxes.
+        ignore_wrt_candidates (bool): Whether to compute the iof between
+            `bboxes` and `gt_bboxes_ignore`, or the contrary.
+        match_low_quality (bool): Whether to allow low quality matches. This is
+            usually allowed for RPN and single stage detectors, but not allowed
+            in the second stage. Details are demonstrated in Step 4.
+        gpu_assign_thr (int): The upper bound of the number of GT for GPU
+            assign. When the number of gt is above this threshold, will assign
+            on CPU device. Negative values mean not assign on CPU.
+    """
+
+    def __init__(self,
+                 pos_iou_thr,
+                 neg_iou_thr,
+                 min_pos_iou=.0,
+                 gt_max_assign_all=True,
+                 ignore_iof_thr=-1,
+                 ignore_wrt_candidates=True,
+                 match_low_quality=True,
+                 gpu_assign_thr=-1,
+                 iou_calculator=dict(type='BboxOverlaps2D')):
+        self.pos_iou_thr = pos_iou_thr
+        self.neg_iou_thr = neg_iou_thr
+        self.min_pos_iou = min_pos_iou
+        self.gt_max_assign_all = gt_max_assign_all
+        self.ignore_iof_thr = ignore_iof_thr
+        self.ignore_wrt_candidates = ignore_wrt_candidates
+        self.gpu_assign_thr = gpu_assign_thr
+        self.match_low_quality = match_low_quality
+        self.iou_calculator = build_iou_calculator(iou_calculator)
+
+    def assign(self,
+               batch_bboxes,
+               batch_gt_bboxes,
+               batch_gt_bboxes_ignore=None,
+               batch_gt_labels=None,
+               batch_bboxes_ignore_mask=None,
+               batch_num_gts=None):
+        """Assign gt to bboxes.
+
+        Args:
+            batch_bboxes (Tensor): Bounding boxes to be assigned,
+                shape(b, n, 4).
+            batch_gt_bboxes (Tensor): Ground truth boxes,
+                shape (b, k, 4).
+            batch_gt_bboxes_ignore (Tensor, optional): Ground truth
+                bboxes that are labelled as `ignored`,
+                e.g., crowd boxes in COCO.
+            batch_gt_labels (Tensor, optional): Label of gt_bboxes,
+                shape (b, k, ).
+            batch_bboxes_ignore_mask (Tensor): Shape (b, n); must be given.
+            batch_num_gts: Shape (b, ); stored unchanged in the result.
+        Returns:
+            :obj:`AscendAssignResult`: The assign result.
+        """
+        batch_overlaps = self.iou_calculator(batch_gt_bboxes, batch_bboxes)
+        batch_overlaps = masked_fill(
+            batch_overlaps,
+            batch_bboxes_ignore_mask.unsqueeze(1).float(),
+            -1,
+            neg=True)
+        if self.ignore_iof_thr > 0 and batch_gt_bboxes_ignore is not None:
+            if self.ignore_wrt_candidates:
+                batch_ignore_overlaps = self.iou_calculator(
+                    batch_bboxes, batch_gt_bboxes_ignore, mode='iof')
+                batch_ignore_overlaps = masked_fill(batch_ignore_overlaps,
+                                                    batch_bboxes_ignore_mask,
+                                                    -1)
+                batch_ignore_max_overlaps, _ = batch_ignore_overlaps.max(dim=2)
+            else:
+                batch_ignore_overlaps = self.iou_calculator(
+                    batch_gt_bboxes_ignore, batch_bboxes, mode='iof')
+                batch_ignore_overlaps = masked_fill(batch_ignore_overlaps,
+                                                    batch_bboxes_ignore_mask,
+                                                    -1)
+                batch_ignore_max_overlaps, _ = \
+                    batch_ignore_overlaps.max(dim=1)
+            batch_ignore_mask = \
+                batch_ignore_max_overlaps > self.ignore_iof_thr
+            batch_overlaps = masked_fill(batch_overlaps, batch_ignore_mask, -1)
+        batch_assign_result = self.batch_assign_wrt_overlaps(
+            batch_overlaps, batch_gt_labels, batch_num_gts)
+        return batch_assign_result
+
+    def batch_assign_wrt_overlaps(self,
+                                  batch_overlaps,
+                                  batch_gt_labels=None,
+                                  batch_num_gts=None):
+        num_images, num_gts, num_bboxes = batch_overlaps.size()
+        batch_max_overlaps, batch_argmax_overlaps = batch_overlaps.max(dim=1)
+        if isinstance(self.neg_iou_thr, float):
+            batch_neg_mask = \
+                ((batch_max_overlaps >= 0)
+                 & (batch_max_overlaps < self.neg_iou_thr)).int()
+        elif isinstance(self.neg_iou_thr, tuple):
+            assert len(self.neg_iou_thr) == 2
+            batch_neg_mask = \
+                ((batch_max_overlaps >= self.neg_iou_thr[0])
+                 & (batch_max_overlaps < self.neg_iou_thr[1])).int()
+        else:
+            batch_neg_mask = torch.zeros(
+                batch_max_overlaps.size(),
+                dtype=torch.int,
+                device=batch_max_overlaps.device)
+        batch_pos_mask = (batch_max_overlaps >= self.pos_iou_thr).int()
+        if self.match_low_quality:
+            batch_gt_max_overlaps, batch_gt_argmax_overlaps = \
+                batch_overlaps.max(dim=2)
+            batch_index_bool = (batch_gt_max_overlaps >= self.min_pos_iou) & \
+                               (batch_gt_max_overlaps > 0)
+            if self.gt_max_assign_all:
+                pos_inds_low_quality = \
+                    (batch_overlaps == batch_gt_max_overlaps.unsqueeze(2)) & \
+                    batch_index_bool.unsqueeze(2)
+                for i in range(num_gts):
+                    pos_inds_low_quality_gt = pos_inds_low_quality[:, i, :]
+                    batch_argmax_overlaps[pos_inds_low_quality_gt] = i
+                    batch_pos_mask[pos_inds_low_quality_gt] = 1
+            else:
+                index_temp = torch.arange(
+                    0, num_gts, device=batch_max_overlaps.device)
+                for index_image in range(num_images):
+                    gt_argmax_overlaps = batch_gt_argmax_overlaps[index_image]
+                    index_bool = batch_index_bool[index_image]
+                    pos_inds_low_quality = gt_argmax_overlaps[index_bool]
+                    batch_argmax_overlaps[index_image][pos_inds_low_quality] \
+                        = index_temp[index_bool]
+                    batch_pos_mask[index_image][pos_inds_low_quality] = 1
+        batch_neg_mask = batch_neg_mask * (1 - batch_pos_mask)
+        if batch_gt_labels is not None:
+            batch_anchor_gt_labels = torch.zeros((num_images, num_bboxes),
+                                                 dtype=batch_gt_labels.dtype,
+                                                 device=batch_gt_labels.device)
+            for index_image in range(num_images):
+                batch_anchor_gt_labels[index_image] = torch.index_select(
+                    batch_gt_labels[index_image], 0,
+                    batch_argmax_overlaps[index_image])
+        else:
+            batch_anchor_gt_labels = None
+        return AscendAssignResult(batch_num_gts, batch_pos_mask,
+                                  batch_neg_mask, batch_max_overlaps,
+                                  batch_argmax_overlaps,
+                                  batch_anchor_gt_labels)
diff --git a/mmdet/core/bbox/assigners/assign_result.py b/mmdet/core/bbox/assigners/assign_result.py
new file mode 100755
index 0000000..488010b
--- /dev/null
+++ b/mmdet/core/bbox/assigners/assign_result.py
@@ -0,0 +1,206 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet.utils import util_mixins
+
+
+class AssignResult(util_mixins.NiceRepr):
+    """Stores assignments between predicted and truth boxes.
+
+    Attributes:
+        num_gts (int): the number of truth boxes considered when computing this
+            assignment
+
+        gt_inds (LongTensor): for each predicted box indicates the 1-based
+            index of the assigned truth box. 0 means unassigned and -1 means
+            ignore.
+
+        max_overlaps (FloatTensor): the iou between the predicted box and its
+            assigned truth box.
+
+        labels (None | LongTensor): If specified, for each predicted box
+            indicates the category label of the assigned truth box.
+
+    Example:
+        >>> # An assign result between 4 predicted boxes and 9 true boxes
+        >>> # where only two boxes were assigned.
+        >>> num_gts = 9
+        >>> max_overlaps = torch.FloatTensor([0, .5, .9, 0])
+        >>> gt_inds = torch.LongTensor([-1, 1, 2, 0])
+        >>> labels = torch.LongTensor([0, 3, 4, 0])
+        >>> self = AssignResult(num_gts, gt_inds, max_overlaps, labels)
+        >>> print(str(self))  # xdoctest: +IGNORE_WANT
+        <AssignResult(num_gts=9, gt_inds.shape=(4,), max_overlaps.shape=(4,),
+                      labels.shape=(4,))>
+        >>> # Force addition of gt labels (when adding gt as proposals)
+        >>> new_labels = torch.LongTensor([3, 4, 5])
+        >>> self.add_gt_(new_labels)
+        >>> print(str(self))  # xdoctest: +IGNORE_WANT
+        <AssignResult(num_gts=9, gt_inds.shape=(7,), max_overlaps.shape=(7,),
+                      labels.shape=(7,))>
+    """
+
+    def __init__(self, num_gts, gt_inds, max_overlaps, labels=None):
+        self.num_gts = num_gts
+        self.gt_inds = gt_inds
+        self.max_overlaps = max_overlaps
+        self.labels = labels
+        # Interface for possible user-defined properties
+        self._extra_properties = {}
+
+    @property
+    def num_preds(self):
+        """int: the number of predictions in this assignment"""
+        return len(self.gt_inds)
+
+    def set_extra_property(self, key, value):
+        """Set a new user-defined property; ``key`` must not be in ``info``."""
+        assert key not in self.info
+        self._extra_properties[key] = value
+
+    def get_extra_property(self, key):
+        """Get a user-defined property, or None when ``key`` was never set."""
+        return self._extra_properties.get(key, None)
+
+    @property
+    def info(self):
+        """dict: the basic fields merged with the user-defined properties"""
+        basic_info = {
+            'num_gts': self.num_gts,
+            'num_preds': self.num_preds,
+            'gt_inds': self.gt_inds,
+            'max_overlaps': self.max_overlaps,
+            'labels': self.labels,
+        }
+        basic_info.update(self._extra_properties)
+        return basic_info
+
+    def __nice__(self):
+        """str: a "nice" summary string describing this assign result"""
+        parts = []
+        parts.append(f'num_gts={self.num_gts!r}')
+        if self.gt_inds is None:
+            parts.append(f'gt_inds={self.gt_inds!r}')
+        else:
+            parts.append(f'gt_inds.shape={tuple(self.gt_inds.shape)!r}')
+        if self.max_overlaps is None:
+            parts.append(f'max_overlaps={self.max_overlaps!r}')
+        else:
+            parts.append('max_overlaps.shape='
+                         f'{tuple(self.max_overlaps.shape)!r}')
+        if self.labels is None:
+            parts.append(f'labels={self.labels!r}')
+        else:
+            parts.append(f'labels.shape={tuple(self.labels.shape)!r}')
+        return ', '.join(parts)
+
+    @classmethod
+    def random(cls, **kwargs):
+        """Create random AssignResult for tests or debugging.
+
+        Args:
+            num_preds: number of predicted boxes
+            num_gts: number of true boxes
+            p_ignore (float): probability of a predicted box assigned to an
+                ignored truth
+            p_assigned (float): probability of a predicted box being
+                assigned to a truth box
+            p_use_label (float | bool): with labels or not
+            num_classes (int): number of classes to draw labels from
+            rng (None | int | numpy.random.RandomState): seed or state
+
+        Returns:
+            :obj:`AssignResult`: Randomly generated assign results.
+
+        Example:
+            >>> from mmdet.core.bbox.assigners.assign_result import *  # NOQA
+            >>> self = AssignResult.random()
+            >>> print(self.info)
+        """
+        from mmdet.core.bbox import demodata
+        rng = demodata.ensure_rng(kwargs.get('rng', None))
+
+        num_gts = kwargs.get('num_gts', None)
+        num_preds = kwargs.get('num_preds', None)
+        p_ignore = kwargs.get('p_ignore', 0.3)
+        p_assigned = kwargs.get('p_assigned', 0.7)
+        p_use_label = kwargs.get('p_use_label', 0.5)
+        num_classes = kwargs.get('num_classes', 3)
+
+        if num_gts is None:
+            num_gts = rng.randint(0, 8)
+        if num_preds is None:
+            num_preds = rng.randint(0, 16)
+
+        if num_gts == 0:
+            max_overlaps = torch.zeros(num_preds, dtype=torch.float32)
+            gt_inds = torch.zeros(num_preds, dtype=torch.int64)
+            if p_use_label is True or p_use_label < rng.rand():
+                labels = torch.zeros(num_preds, dtype=torch.int64)
+            else:
+                labels = None
+        else:
+            import numpy as np
+
+            # Create an overlap for each predicted box
+            max_overlaps = torch.from_numpy(rng.rand(num_preds))
+
+            # Construct gt_inds for each predicted box
+            is_assigned = torch.from_numpy(rng.rand(num_preds) < p_assigned)
+            # maximum number of assignments constraints (kept a plain int)
+            n_assigned = min(num_preds, num_gts, int(is_assigned.sum()))
+
+            assigned_idxs = np.where(is_assigned)[0]
+            rng.shuffle(assigned_idxs)
+            assigned_idxs = assigned_idxs[0:n_assigned]
+            assigned_idxs.sort()
+
+            is_assigned[:] = 0
+            is_assigned[assigned_idxs] = True
+
+            is_ignore = torch.from_numpy(
+                rng.rand(num_preds) < p_ignore) & is_assigned
+
+            gt_inds = torch.zeros(num_preds, dtype=torch.int64)
+            # NOTE: overwritten below; kept so the rng stream stays unchanged
+            true_idxs = np.arange(num_gts)
+            rng.shuffle(true_idxs)
+            true_idxs = torch.from_numpy(true_idxs)
+            gt_inds[is_assigned] = true_idxs[:n_assigned].long()
+
+            gt_inds = torch.from_numpy(
+                rng.randint(1, num_gts + 1, size=num_preds))
+            gt_inds[is_ignore] = -1
+            gt_inds[~is_assigned] = 0
+            max_overlaps[~is_assigned] = 0
+
+            if p_use_label is True or p_use_label < rng.rand():
+                if num_classes == 0:
+                    labels = torch.zeros(num_preds, dtype=torch.int64)
+                else:
+                    labels = torch.from_numpy(
+                        # remind that we set FG labels to [0, num_class-1]
+                        # since mmdet v2.0
+                        # BG cat_id: num_class
+                        rng.randint(0, num_classes, size=num_preds))
+                    labels[~is_assigned] = 0
+            else:
+                labels = None
+
+        return cls(num_gts, gt_inds, max_overlaps, labels)
+
+    def add_gt_(self, gt_labels):
+        """Add ground truth as assigned results.
+
+        Args:
+            gt_labels (torch.Tensor): Labels of gt boxes
+        """
+        self_inds = torch.arange(
+            1, len(gt_labels) + 1, dtype=torch.long, device=gt_labels.device)
+        self.gt_inds = torch.cat([self_inds, self.gt_inds])
+
+        self.max_overlaps = torch.cat(
+            [self.max_overlaps.new_ones(len(gt_labels)), self.max_overlaps])
+
+        if self.labels is not None:
+            self.labels = torch.cat([gt_labels, self.labels])
diff --git a/mmdet/core/bbox/assigners/atss_assigner.py b/mmdet/core/bbox/assigners/atss_assigner.py
new file mode 100755
index 0000000..79c8281
--- /dev/null
+++ b/mmdet/core/bbox/assigners/atss_assigner.py
@@ -0,0 +1,234 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+
+from ..builder import BBOX_ASSIGNERS
+from ..iou_calculators import build_iou_calculator
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+@BBOX_ASSIGNERS.register_module()
+class ATSSAssigner(BaseAssigner):
+    """Assign a corresponding gt bbox or background to each bbox.
+
+    Each proposal will be assigned `-1`, `0` or a positive integer
+    indicating the ground truth index.
+
+    - -1: ignored sample (only produced when ``ignore_iof_thr`` > 0)
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    If ``alpha`` is not None, it means that the dynamic cost
+    ATSSAssigner is adopted, which is currently only used in the DDOD.
+
+    Args:
+        topk (int): number of bbox selected in each level.
+        alpha (float, optional): param of cost rate for each proposal,
+            only used in DDOD. Default None.
+        iou_calculator (dict): config used to build the IoU calculator.
+            Default dict(type='BboxOverlaps2D').
+        ignore_iof_thr (float): IoF threshold with ``gt_bboxes_ignore``
+            above which a bbox is marked as ignored. The check only runs
+            when this value is positive. Default -1.
+    """
+
+    def __init__(self,
+                 topk,
+                 alpha=None,
+                 iou_calculator=dict(type='BboxOverlaps2D'),
+                 ignore_iof_thr=-1):
+        """Store the hyper-parameters and build the IoU calculator."""
+        self.topk = topk
+        self.alpha = alpha
+        self.iou_calculator = build_iou_calculator(iou_calculator)
+        self.ignore_iof_thr = ignore_iof_thr
+
+    # https://github.com/sfzhang15/ATSS/blob/master/atss_core/modeling/rpn/atss/loss.py
+    def assign(self,
+               bboxes,
+               num_level_bboxes,
+               gt_bboxes,
+               gt_bboxes_ignore=None,
+               gt_labels=None,
+               cls_scores=None,
+               bbox_preds=None):
+        """Assign gt to bboxes.
+
+        The assignment is done in following steps
+
+        1. compute iou between all bbox (bbox of all pyramid levels) and gt
+        2. compute center distance between all bbox and gt
+        3. on each pyramid level, for each gt, select k bbox whose center
+           are closest to the gt center, so we total select k*l bbox as
+           candidates for each gt
+        4. get corresponding iou for these candidates, and compute the
+           mean and std, set mean + std as the iou threshold
+        5. select these candidates whose iou are greater than or equal to
+           the threshold as positive
+        6. limit the positive sample's center in gt
+
+        If ``alpha`` is not None, and ``cls_scores`` and `bbox_preds`
+        are not None, the overlaps calculation in the first step
+        will also include dynamic cost, which is currently only used in
+        the DDOD.
+
+        Args:
+            bboxes (Tensor): Bounding boxes to be assigned, shape(n, 4).
+            num_level_bboxes (List): num of bboxes in each level
+            gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4).
+            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+                labelled as `ignored`, e.g., crowd boxes in COCO. Default None.
+            gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ).
+            cls_scores (Tensor, optional): Classification scores of the
+                bboxes; indexed as ``cls_scores[:, gt_labels]``, so presumably
+                of shape (n, num_classes). Used with ``alpha``. Default None.
+            bbox_preds (Tensor, optional): Predicted boxes passed to the IoU
+                calculator together with ``gt_bboxes``, so presumably of
+                shape (n, 4). Used with ``alpha``. Default None.
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+        """
+        INF = 100000000
+        bboxes = bboxes[:, :4]
+        num_gt, num_bboxes = gt_bboxes.size(0), bboxes.size(0)
+
+        message = 'Invalid alpha parameter because cls_scores or ' \
+                  'bbox_preds are None. If you want to use the ' \
+                  'cost-based ATSSAssigner,  please set cls_scores, ' \
+                  'bbox_preds and self.alpha at the same time. '
+
+        if self.alpha is None:
+            # ATSSAssigner
+            overlaps = self.iou_calculator(bboxes, gt_bboxes)
+            if cls_scores is not None or bbox_preds is not None:
+                warnings.warn(message)
+        else:
+            # Dynamic cost ATSSAssigner in DDOD
+            assert cls_scores is not None and bbox_preds is not None, message
+
+            # compute cls cost for bbox and GT
+            cls_cost = torch.sigmoid(cls_scores[:, gt_labels])
+
+            # compute iou between all bbox and gt
+            overlaps = self.iou_calculator(bbox_preds, gt_bboxes)
+
+            # make sure that we are in element-wise multiplication
+            assert cls_cost.shape == overlaps.shape
+
+            # overlaps is actually a cost matrix
+            overlaps = cls_cost**(1 - self.alpha) * overlaps**self.alpha
+
+        # assign 0 by default
+        assigned_gt_inds = overlaps.new_full((num_bboxes, ),
+                                             0,
+                                             dtype=torch.long)
+
+        if num_gt == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = overlaps.new_zeros((num_bboxes, ))
+            if num_gt == 0:
+                # No truth, assign everything to background
+                assigned_gt_inds[:] = 0
+            if gt_labels is None:
+                assigned_labels = None
+            else:
+                assigned_labels = overlaps.new_full((num_bboxes, ),
+                                                    -1,
+                                                    dtype=torch.long)
+            return AssignResult(
+                num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+
+        # compute center distance between all bbox and gt
+        gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0
+        gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0
+        gt_points = torch.stack((gt_cx, gt_cy), dim=1)
+
+        bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0
+        bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0
+        bboxes_points = torch.stack((bboxes_cx, bboxes_cy), dim=1)
+
+        distances = (bboxes_points[:, None, :] -
+                     gt_points[None, :, :]).pow(2).sum(-1).sqrt()
+
+        if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None
+                and gt_bboxes_ignore.numel() > 0 and bboxes.numel() > 0):
+            ignore_overlaps = self.iou_calculator(
+                bboxes, gt_bboxes_ignore, mode='iof')
+            ignore_max_overlaps, _ = ignore_overlaps.max(dim=1)
+            ignore_idxs = ignore_max_overlaps > self.ignore_iof_thr
+            distances[ignore_idxs, :] = INF
+            assigned_gt_inds[ignore_idxs] = -1
+
+        # Selecting candidates based on the center distance
+        candidate_idxs = []
+        start_idx = 0
+        for level, bboxes_per_level in enumerate(num_level_bboxes):
+            # on each pyramid level, for each gt,
+            # select k bbox whose center are closest to the gt center
+            end_idx = start_idx + bboxes_per_level
+            distances_per_level = distances[start_idx:end_idx, :]
+            selectable_k = min(self.topk, bboxes_per_level)
+
+            _, topk_idxs_per_level = distances_per_level.topk(
+                selectable_k, dim=0, largest=False)
+            candidate_idxs.append(topk_idxs_per_level + start_idx)
+            start_idx = end_idx
+        candidate_idxs = torch.cat(candidate_idxs, dim=0)
+
+        # get corresponding iou for the these candidates, and compute the
+        # mean and std, set mean + std as the iou threshold
+        candidate_overlaps = overlaps[candidate_idxs, torch.arange(num_gt)]
+        overlaps_mean_per_gt = candidate_overlaps.mean(0)
+        overlaps_std_per_gt = candidate_overlaps.std(0)
+        overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt
+
+        is_pos = candidate_overlaps >= overlaps_thr_per_gt[None, :]
+
+        # limit the positive sample's center in gt
+        # shift each gt column by gt_idx * num_bboxes so the candidate indices
+        # address the flattened (num_gt * num_bboxes) layout built below
+        for gt_idx in range(num_gt):
+            candidate_idxs[:, gt_idx] += gt_idx * num_bboxes
+        ep_bboxes_cx = bboxes_cx.view(1, -1).expand(
+            num_gt, num_bboxes).contiguous().view(-1)
+        ep_bboxes_cy = bboxes_cy.view(1, -1).expand(
+            num_gt, num_bboxes).contiguous().view(-1)
+        candidate_idxs = candidate_idxs.view(-1)
+
+        # calculate the left, top, right, bottom distance between positive
+        # bbox center and gt side
+        l_ = ep_bboxes_cx[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 0]
+        t_ = ep_bboxes_cy[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 1]
+        r_ = gt_bboxes[:, 2] - ep_bboxes_cx[candidate_idxs].view(-1, num_gt)
+        b_ = gt_bboxes[:, 3] - ep_bboxes_cy[candidate_idxs].view(-1, num_gt)
+        is_in_gts = torch.stack([l_, t_, r_, b_], dim=1).min(dim=1)[0] > 0.01
+
+        is_pos = is_pos & is_in_gts
+
+        # if an anchor box is assigned to multiple gts,
+        # the one with the highest IoU will be selected.
+        overlaps_inf = torch.full_like(overlaps,
+                                       -INF).t().contiguous().view(-1)
+        # only positive (bbox, gt) pairs keep their overlap; rest stay -INF
+        index = candidate_idxs.view(-1)[is_pos.view(-1)]
+        overlaps_inf[index] = overlaps.t().contiguous().view(-1)[index]
+        overlaps_inf = overlaps_inf.view(num_gt, -1).t()
+
+        max_overlaps, argmax_overlaps = overlaps_inf.max(dim=1)
+        assigned_gt_inds[
+            max_overlaps != -INF] = argmax_overlaps[max_overlaps != -INF] + 1
+
+        if gt_labels is not None:
+            assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
+            pos_inds = torch.nonzero(
+                assigned_gt_inds > 0, as_tuple=False).squeeze()
+            if pos_inds.numel() > 0:
+                assigned_labels[pos_inds] = gt_labels[
+                    assigned_gt_inds[pos_inds] - 1]
+        else:
+            assigned_labels = None
+        return AssignResult(
+            num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
diff --git a/mmdet/core/bbox/assigners/base_assigner.py b/mmdet/core/bbox/assigners/base_assigner.py
new file mode 100755
index 0000000..3c2d597
--- /dev/null
+++ b/mmdet/core/bbox/assigners/base_assigner.py
@@ -0,0 +1,10 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+
+class BaseAssigner(metaclass=ABCMeta):
+    """Abstract base class for assigners that match boxes to gt boxes."""
+
+    @abstractmethod
+    def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None):
+        """Assign each box to a ground truth box or mark it as negative."""
diff --git a/mmdet/core/bbox/assigners/center_region_assigner.py b/mmdet/core/bbox/assigners/center_region_assigner.py
new file mode 100755
index 0000000..86e7859
--- /dev/null
+++ b/mmdet/core/bbox/assigners/center_region_assigner.py
@@ -0,0 +1,336 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from ..builder import BBOX_ASSIGNERS
+from ..iou_calculators import build_iou_calculator
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+def scale_boxes(bboxes, scale):
+    """Rescale boxes about their centers by a multiplicative factor.
+
+    Args:
+        bboxes (Tensor): Boxes given as (x1, y1, x2, y2) rows. Shape (m, 4)
+        scale (float): Factor applied to the width and height of each box
+
+    Returns:
+        (Tensor): Shape (m, 4). Boxes with unchanged centers and scaled
+            width and height
+    """
+    assert bboxes.size(1) == 4
+    x1, y1, x2, y2 = bboxes.unbind(dim=1)
+    # half extents are fresh tensors, so scaling them in place is safe
+    half_w = (x2 - x1) * .5
+    half_h = (y2 - y1) * .5
+    half_w *= scale
+    half_h *= scale
+    ctr_x = (x2 + x1) * .5
+    ctr_y = (y2 + y1) * .5
+
+    scaled = torch.zeros_like(bboxes)
+    scaled[:, 0], scaled[:, 2] = ctr_x - half_w, ctr_x + half_w
+    scaled[:, 1], scaled[:, 3] = ctr_y - half_h, ctr_y + half_h
+    return scaled
+
+
+def is_located_in(points, bboxes):
+    """Check which points fall strictly inside which bboxes.
+
+    Args:
+      points (Tensor): Points, shape: (m, 2).
+      bboxes (Tensor): Bounding boxes, shape: (n, 4).
+
+    Return:
+      Tensor: Flags indicating if points are located in bboxes, shape: (m, n).
+    """
+    assert points.size(1) == 2
+    assert bboxes.size(1) == 4
+    px, py = points[:, 0, None], points[:, 1, None]  # (m, 1) columns
+    x1, y1, x2, y2 = (bboxes[None, :, i] for i in range(4))  # (1, n) rows
+    # broadcasting (m, 1) against (1, n) yields the (m, n) flag matrix
+    return (px > x1) & (px < x2) & (py > y1) & (py < y2)
+
+
+def bboxes_area(bboxes):
+    """Return the width-times-height area of each bbox.
+
+    Args:
+        bboxes (Tensor): The coordinates of bboxes. Shape: (m, 4)
+
+    Returns:
+        Tensor: Area of the bboxes. Shape: (m, )
+    """
+    assert bboxes.size(1) == 4
+    # extents along x and y, computed for both axes in one subtraction
+    extents = bboxes[:, 2:4] - bboxes[:, 0:2]
+    # no clamping is applied: inverted boxes give negative extents
+    return extents[:, 0] * extents[:, 1]
+
+
+@BBOX_ASSIGNERS.register_module()
+class CenterRegionAssigner(BaseAssigner):
+    """Assign pixels at the center region of a bbox as positive.
+
+    Each proposals will be assigned with `-1`, `0`, or a positive integer
+    indicating the ground truth index.
+    - -1: ignored sample; 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    Args:
+        pos_scale (float): Scale applied to each gt box to obtain its core
+          region, within which pixels are labelled as positive.
+        neg_scale (float): Scale applied to each gt box to obtain its shadow
+          region; pixels there but not in the core are shadowed.
+        min_pos_iof (float): Minimum iof of a pixel with a gt to be
+          labelled as positive. Default: 1e-2
+        ignore_gt_scale (float): Scale applied to ``gt_bboxes_ignore``; bboxes
+          whose center falls inside are assigned -1. Default: 0.5
+        foreground_dominate (bool): If True, the bbox will be assigned as
+          positive when a gt's kernel region overlaps with another's shadowed
+          (ignored) region, otherwise it is set as ignored. Default to False.
+    """
+
+    def __init__(self,
+                 pos_scale,
+                 neg_scale,
+                 min_pos_iof=1e-2,
+                 ignore_gt_scale=0.5,
+                 foreground_dominate=False,
+                 iou_calculator=dict(type='BboxOverlaps2D')):
+        self.pos_scale = pos_scale
+        self.neg_scale = neg_scale
+        self.min_pos_iof = min_pos_iof
+        self.ignore_gt_scale = ignore_gt_scale
+        self.foreground_dominate = foreground_dominate
+        self.iou_calculator = build_iou_calculator(iou_calculator)
+
+    def get_gt_priorities(self, gt_bboxes):
+        """Get gt priorities according to their areas.
+
+        Smaller gt has higher priority.
+
+        Args:
+            gt_bboxes (Tensor): Ground truth boxes, shape (k, 4).
+
+        Returns:
+            Tensor: The priority of gts so that gts with larger priority is \
+              more likely to be assigned. Shape (k, )
+        """
+        gt_areas = bboxes_area(gt_bboxes)
+        # Rank all gt bbox areas. Smaller objects has larger priority
+        _, sort_idx = gt_areas.sort(descending=True)
+        sort_idx = sort_idx.argsort()
+        return sort_idx
+
+    def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None):
+        """Assign gt to bboxes.
+
+        This method assigns gts to every bbox (proposal/anchor), each bbox \
+        will be assigned with -1, 0, or a positive number. -1 means the \
+        bbox is ignored, 0 means negative sample, and a positive number \
+        is the index (1-based) of the assigned gt.
+
+        Args:
+            bboxes (Tensor): Bounding boxes to be assigned, shape(n, 4).
+            gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4).
+            gt_bboxes_ignore (tensor, optional): Ground truth bboxes that are
+              labelled as `ignored`, e.g., crowd boxes in COCO.
+            gt_labels (tensor, optional): Label of gt_bboxes, shape (num_gts,).
+
+        Returns:
+            :obj:`AssignResult`: The assigned result. Note that \
+              shadowed_labels of shape (N, 2) is also added as an \
+              `assign_result` attribute. `shadowed_labels` is a tensor \
+              composed of N pairs of [anchor_ind, class_label], where N \
+              is the number of anchors that lie in the outer region of a \
+              gt, anchor_ind is the shadowed anchor index and class_label \
+              is the shadowed class label.
+
+        Example:
+            >>> self = CenterRegionAssigner(0.2, 0.2)
+            >>> bboxes = torch.Tensor([[0, 0, 10, 10], [10, 10, 20, 20]])
+            >>> gt_bboxes = torch.Tensor([[0, 0, 10, 10]])
+            >>> assign_result = self.assign(bboxes, gt_bboxes)
+            >>> expected_gt_inds = torch.LongTensor([1, 0])
+            >>> assert torch.all(assign_result.gt_inds == expected_gt_inds)
+        """
+        # There are in total 5 steps in the pixel assignment
+        # 1. Find core (the center region, say inner 0.2)
+        #     and shadow (the relatively outer part, say inner 0.2-0.5)
+        #     regions of every gt.
+        # 2. Find all prior bboxes that lie in gt_core and gt_shadow regions
+        # 3. Assign prior bboxes in gt_core with a one-hot id of the gt in
+        #      the image.
+        #    3.1. For overlapping objects, the prior bboxes in gt_core is
+        #           assigned with the object with smallest area
+        # 4. Assign prior bboxes with class label according to its gt id.
+        #    4.1. Assign -1 to prior bboxes lying in shadowed gts
+        #    4.2. Assign positive prior boxes with the corresponding label
+        # 5. Find pixels lying in the shadow of an object and assign them with
+        #      background label, but set the loss weight of its corresponding
+        #      gt to zero.
+        assert bboxes.size(1) == 4, 'bboxes must have size of 4'
+        # 1. Find core positive and shadow region of every gt
+        gt_core = scale_boxes(gt_bboxes, self.pos_scale)
+        gt_shadow = scale_boxes(gt_bboxes, self.neg_scale)
+
+        # 2. Find prior bboxes that lie in gt_core and gt_shadow regions
+        bbox_centers = (bboxes[:, 2:4] + bboxes[:, 0:2]) / 2
+        # The center points lie within the gt boxes
+        is_bbox_in_gt = is_located_in(bbox_centers, gt_bboxes)
+        # Only calculate bbox and gt_core IoF. This enables small prior bboxes
+        #   to match large gts
+        bbox_and_gt_core_overlaps = self.iou_calculator(
+            bboxes, gt_core, mode='iof')
+        # The center point of effective priors should be within the gt box
+        is_bbox_in_gt_core = is_bbox_in_gt & (
+            bbox_and_gt_core_overlaps > self.min_pos_iof)  # shape (n, k)
+
+        is_bbox_in_gt_shadow = (
+            self.iou_calculator(bboxes, gt_shadow, mode='iof') >
+            self.min_pos_iof)
+        # Rule out center effective positive pixels
+        is_bbox_in_gt_shadow &= (~is_bbox_in_gt_core)
+
+        num_gts, num_bboxes = gt_bboxes.size(0), bboxes.size(0)
+        if num_gts == 0 or num_bboxes == 0:
+            # If no gts exist, assign all pixels to negative
+            assigned_gt_ids = \
+                is_bbox_in_gt_core.new_zeros((num_bboxes,),
+                                             dtype=torch.long)
+            pixels_in_gt_shadow = assigned_gt_ids.new_empty((0, 2))
+        else:
+            # Step 3: assign a one-hot gt id to each pixel, and smaller objects
+            #    have high priority to assign the pixel.
+            sort_idx = self.get_gt_priorities(gt_bboxes)
+            assigned_gt_ids, pixels_in_gt_shadow = \
+                self.assign_one_hot_gt_indices(is_bbox_in_gt_core,
+                                               is_bbox_in_gt_shadow,
+                                               gt_priority=sort_idx)
+
+        if gt_bboxes_ignore is not None and gt_bboxes_ignore.numel() > 0:
+            # Mark bboxes whose center lies in a scaled ignored gt with -1
+            gt_bboxes_ignore = scale_boxes(
+                gt_bboxes_ignore, scale=self.ignore_gt_scale)
+            is_bbox_in_ignored_gts = is_located_in(bbox_centers,
+                                                   gt_bboxes_ignore)
+            is_bbox_in_ignored_gts = is_bbox_in_ignored_gts.any(dim=1)
+            assigned_gt_ids[is_bbox_in_ignored_gts] = -1
+
+        # 4. Assign prior bboxes with class label according to its gt id.
+        assigned_labels = None
+        shadowed_pixel_labels = None
+        if gt_labels is not None:
+            # Default assigned label is the background (-1)
+            assigned_labels = assigned_gt_ids.new_full((num_bboxes, ), -1)
+            pos_inds = torch.nonzero(
+                assigned_gt_ids > 0, as_tuple=False).squeeze()
+            if pos_inds.numel() > 0:
+                assigned_labels[pos_inds] = gt_labels[assigned_gt_ids[pos_inds]
+                                                      - 1]
+            # 5. Find pixels lying in the shadow of an object
+            shadowed_pixel_labels = pixels_in_gt_shadow.clone()
+            if pixels_in_gt_shadow.numel() > 0:
+                pixel_idx, gt_idx =\
+                    pixels_in_gt_shadow[:, 0], pixels_in_gt_shadow[:, 1]
+                assert (assigned_gt_ids[pixel_idx] != gt_idx).all(), \
+                    'Some pixels are dually assigned to ignore and gt!'
+                shadowed_pixel_labels[:, 1] = gt_labels[gt_idx - 1]
+                override = (
+                    assigned_labels[pixel_idx] == shadowed_pixel_labels[:, 1])
+                if self.foreground_dominate:
+                    # When a pixel is both positive and shadowed, set it as pos
+                    shadowed_pixel_labels = shadowed_pixel_labels[~override]
+                else:
+                    # When a pixel is both pos and shadowed, set it as shadowed
+                    assigned_labels[pixel_idx[override]] = -1
+                    assigned_gt_ids[pixel_idx[override]] = 0
+
+        assign_result = AssignResult(
+            num_gts, assigned_gt_ids, None, labels=assigned_labels)
+        # Add shadowed_labels as assign_result property. Shape: (num_shadow, 2)
+        assign_result.set_extra_property('shadowed_labels',
+                                         shadowed_pixel_labels)
+        return assign_result
+
+    def assign_one_hot_gt_indices(self,
+                                  is_bbox_in_gt_core,
+                                  is_bbox_in_gt_shadow,
+                                  gt_priority=None):
+        """Assign only one gt index to each prior box.
+
+        Gts with large gt_priority are more likely to be assigned.
+
+        Args:
+            is_bbox_in_gt_core (Tensor): Bool tensor indicating the bbox center
+              is in the core area of a gt (e.g. 0-0.2).
+              Shape: (num_prior, num_gt).
+            is_bbox_in_gt_shadow (Tensor): Bool tensor indicating the bbox
+              center is in the shadowed area of a gt (e.g. 0.2-0.5).
+              Shape: (num_prior, num_gt).
+            gt_priority (Tensor): Priorities of gts. The gt with a higher
+              priority is more likely to be assigned to the bbox when the bbox
+              match with multiple gts. Shape: (num_gt, ).
+
+        Returns:
+            tuple: Returns (assigned_gt_inds, shadowed_gt_inds).
+
+                - assigned_gt_inds: The assigned gt index of each prior bbox \
+                    (i.e. index from 1 to num_gts). Shape: (num_prior, ).
+                - shadowed_gt_inds: shadowed gt indices. It is a tensor of \
+                    shape (num_ignore, 2) with first column being the \
+                    shadowed prior bbox indices and the second column the \
+                    shadowed gt indices (1-based).
+        """
+        num_bboxes, num_gts = is_bbox_in_gt_core.shape
+
+        if gt_priority is None:
+            gt_priority = torch.arange(
+                num_gts, device=is_bbox_in_gt_core.device)
+        assert gt_priority.size(0) == num_gts
+        # The bigger gt_priority, the more preferable to be assigned
+        # The assigned inds are by default 0 (background)
+        assigned_gt_inds = is_bbox_in_gt_core.new_zeros((num_bboxes, ),
+                                                        dtype=torch.long)
+        # Shadowed bboxes are assigned to be background. But the corresponding
+        #   label is ignored during loss calculation, which is done through
+        #   shadowed_gt_inds
+        shadowed_gt_inds = torch.nonzero(is_bbox_in_gt_shadow, as_tuple=False)
+        if is_bbox_in_gt_core.sum() == 0:  # No gt match
+            shadowed_gt_inds[:, 1] += 1  # 1-based. For consistency issue
+            return assigned_gt_inds, shadowed_gt_inds
+
+        # The priority of each prior box and gt pair. If one prior box is
+        #  matched to multiple gts, only the pair with the highest priority
+        #  is saved
+        pair_priority = is_bbox_in_gt_core.new_full((num_bboxes, num_gts),
+                                                    -1,
+                                                    dtype=torch.long)
+
+        # Each bbox could match with multiple gts.
+        # The following codes deal with this situation
+        # Matched  bboxes (to any gt). Shape: (num_pos_anchor, )
+        inds_of_match = torch.any(is_bbox_in_gt_core, dim=1)
+        # The matched gt index of each positive bbox. Length >= num_pos_anchor
+        #   , since one bbox could match multiple gts
+        matched_bbox_gt_inds = torch.nonzero(
+            is_bbox_in_gt_core, as_tuple=False)[:, 1]
+        # Assign priority to each bbox-gt pair.
+        pair_priority[is_bbox_in_gt_core] = gt_priority[matched_bbox_gt_inds]
+        _, argmax_priority = pair_priority[inds_of_match].max(dim=1)
+        assigned_gt_inds[inds_of_match] = argmax_priority + 1  # 1-based
+        # Zero-out the assigned anchor box to filter the shadowed gt indices
+        is_bbox_in_gt_core[inds_of_match, argmax_priority] = 0
+        # Concat the shadowed indices due to overlapping with that out side of
+        #   effective scale. shape: (total_num_ignore, 2)
+        shadowed_gt_inds = torch.cat(
+            (shadowed_gt_inds, torch.nonzero(
+                is_bbox_in_gt_core, as_tuple=False)),
+            dim=0)
+        # `is_bbox_in_gt_core` should be changed back to keep arguments intact.
+        is_bbox_in_gt_core[inds_of_match, argmax_priority] = 1
+        # 1-based shadowed gt indices, to be consistent with `assigned_gt_inds`
+        if shadowed_gt_inds.numel() > 0:
+            shadowed_gt_inds[:, 1] += 1
+        return assigned_gt_inds, shadowed_gt_inds
diff --git a/mmdet/core/bbox/assigners/grid_assigner.py b/mmdet/core/bbox/assigners/grid_assigner.py
new file mode 100755
index 0000000..a0c814e
--- /dev/null
+++ b/mmdet/core/bbox/assigners/grid_assigner.py
@@ -0,0 +1,156 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from ..builder import BBOX_ASSIGNERS
+from ..iou_calculators import build_iou_calculator
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+@BBOX_ASSIGNERS.register_module()
+class GridAssigner(BaseAssigner):
+    """Assign a corresponding gt bbox or background to each bbox.
+
+    Each proposal will be assigned with `-1`, `0`, or a positive integer
+    indicating the ground truth index.
+
+    - -1: don't care
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    Args:
+        pos_iou_thr (float): IoU threshold for positive bboxes.
+        neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
+        min_pos_iou (float): Minimum iou for a bbox to be considered as a
+            positive bbox. Positive samples can have smaller IoU than
+            pos_iou_thr due to the 4th step (assign max IoU sample to each gt).
+        gt_max_assign_all (bool): Whether to assign all bboxes with the same
+            highest overlap with some gt to that gt.
+    """
+
+    def __init__(self,
+                 pos_iou_thr,
+                 neg_iou_thr,
+                 min_pos_iou=.0,
+                 gt_max_assign_all=True,
+                 iou_calculator=dict(type='BboxOverlaps2D')):
+        self.pos_iou_thr = pos_iou_thr
+        self.neg_iou_thr = neg_iou_thr
+        self.min_pos_iou = min_pos_iou
+        self.gt_max_assign_all = gt_max_assign_all
+        self.iou_calculator = build_iou_calculator(iou_calculator)
+
+    def assign(self, bboxes, box_responsible_flags, gt_bboxes, gt_labels=None):
+        """Assign gt to bboxes. The process is very much like the max iou
+        assigner, except that positive samples are constrained within the cell
+        that the gt boxes fell in.
+
+        This method assigns a gt bbox to every bbox (proposal/anchor). Each
+        bbox will be assigned with -1, 0, or a positive number. -1 means don't
+        care, 0 means negative sample, and a positive number is the index
+        (1-based) of the assigned gt.
+        The assignment is done in the following steps; the order matters.
+
+        1. assign every bbox to -1
+        2. assign proposals whose iou with all gts <= neg_iou_thr to 0
+        3. for each bbox within a cell, if the iou with its nearest gt >
+            pos_iou_thr and the center of that gt falls inside the cell,
+            assign that gt to the bbox
+        4. for each gt bbox, assign its nearest proposals within the cell the
+            gt bbox falls in to itself.
+
+        Args:
+            bboxes (Tensor): Bounding boxes to be assigned, shape(n, 4).
+            box_responsible_flags (Tensor): flag to indicate whether box is
+                responsible for prediction, shape(n, )
+            gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4).
+            gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ).
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+        """
+        num_gts, num_bboxes = gt_bboxes.size(0), bboxes.size(0)
+
+        # compute iou between all gt and bboxes
+        overlaps = self.iou_calculator(gt_bboxes, bboxes)
+
+        # 1. assign -1 by default
+        assigned_gt_inds = overlaps.new_full((num_bboxes, ),
+                                             -1,
+                                             dtype=torch.long)
+
+        if num_gts == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = overlaps.new_zeros((num_bboxes, ))
+            if num_gts == 0:
+                # No truth, assign everything to background
+                assigned_gt_inds[:] = 0
+            if gt_labels is None:
+                assigned_labels = None
+            else:
+                assigned_labels = overlaps.new_full((num_bboxes, ),
+                                                    -1,
+                                                    dtype=torch.long)
+            return AssignResult(
+                num_gts,
+                assigned_gt_inds,
+                max_overlaps,
+                labels=assigned_labels)
+
+        # 2. assign negative: max IoU not above the negative IoU threshold
+        # for each anchor, which gt best overlaps with it
+        # for each anchor, the max iou of all gts
+        # shape of max_overlaps == argmax_overlaps == num_bboxes
+        max_overlaps, argmax_overlaps = overlaps.max(dim=0)
+
+        if isinstance(self.neg_iou_thr, float):
+            assigned_gt_inds[(max_overlaps >= 0)
+                             & (max_overlaps <= self.neg_iou_thr)] = 0
+        elif isinstance(self.neg_iou_thr, (tuple, list)):
+            assert len(self.neg_iou_thr) == 2
+            assigned_gt_inds[(max_overlaps > self.neg_iou_thr[0])
+                             & (max_overlaps <= self.neg_iou_thr[1])] = 0
+
+        # 3. assign positive: falls into responsible cell and above
+        # positive IOU threshold, the order matters.
+        # the prior condition of comparison is to filter out all
+        # unrelated anchors, i.e. not box_responsible_flags
+        overlaps[:, ~box_responsible_flags.type(torch.bool)] = -1.
+
+        # calculate max_overlaps again, but this time we only consider IOUs
+        # for anchors responsible for prediction
+        max_overlaps, argmax_overlaps = overlaps.max(dim=0)
+
+        # for each gt, which anchor best overlaps with it
+        # for each gt, the max iou of all proposals
+        # shape of gt_max_overlaps == gt_argmax_overlaps == num_gts
+        gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1)
+
+        pos_inds = (max_overlaps >
+                    self.pos_iou_thr) & box_responsible_flags.type(torch.bool)
+        assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1
+
+        # 4. assign positive to max overlapped anchors within responsible cell
+        for i in range(num_gts):
+            if gt_max_overlaps[i] > self.min_pos_iou:
+                if self.gt_max_assign_all:
+                    max_iou_inds = (overlaps[i, :] == gt_max_overlaps[i]) & \
+                         box_responsible_flags.type(torch.bool)
+                    assigned_gt_inds[max_iou_inds] = i + 1
+                elif box_responsible_flags[gt_argmax_overlaps[i]]:
+                    assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1
+
+        # assign labels of positive anchors
+        if gt_labels is not None:
+            assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
+            pos_inds = torch.nonzero(
+                assigned_gt_inds > 0, as_tuple=False).squeeze()
+            if pos_inds.numel() > 0:
+                assigned_labels[pos_inds] = gt_labels[
+                    assigned_gt_inds[pos_inds] - 1]
+
+        else:
+            assigned_labels = None
+
+        return AssignResult(
+            num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
diff --git a/mmdet/core/bbox/assigners/hungarian_assigner.py b/mmdet/core/bbox/assigners/hungarian_assigner.py
new file mode 100755
index 0000000..435612a
--- /dev/null
+++ b/mmdet/core/bbox/assigners/hungarian_assigner.py
@@ -0,0 +1,139 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from scipy.optimize import linear_sum_assignment
+
+from ..builder import BBOX_ASSIGNERS
+from ..match_costs import build_match_cost
+from ..transforms import bbox_cxcywh_to_xyxy
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+@BBOX_ASSIGNERS.register_module()
+class HungarianAssigner(BaseAssigner):
+    """Computes one-to-one matching between predictions and ground truth.
+
+    This class computes an assignment between the targets and the predictions
+    based on the costs. The costs are weighted sum of three components:
+    classification cost, regression L1 cost and regression iou cost. The
+    targets don't include the no_object, so generally there are more
+    predictions than targets. After the one-to-one matching, the un-matched
+    are treated as backgrounds. Thus each query prediction will be assigned
+    with `0` or a positive integer indicating the ground truth index:
+
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    Args:
+        cls_cost (dict, optional): Config of the classification cost; its
+            `weight` is the scale factor of that cost. Default
+            `dict(type='ClassificationCost', weight=1.)`.
+        reg_cost (dict, optional): Config of the regression L1 cost; its
+            `weight` is the scale factor of that cost. Default
+            `dict(type='BBoxL1Cost', weight=1.0)`.
+        iou_cost (dict, optional): Config of the regression iou cost, built
+            with `build_match_cost`. Its `iou_mode` may be "iou" (intersection
+            over union), "iof" (intersection over foreground), or "giou"
+            (generalized intersection over union). Default
+            `dict(type='IoUCost', iou_mode='giou', weight=1.0)`.
+    """
+
+    def __init__(self,
+                 cls_cost=dict(type='ClassificationCost', weight=1.),
+                 reg_cost=dict(type='BBoxL1Cost', weight=1.0),
+                 iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0)):
+        self.cls_cost = build_match_cost(cls_cost)
+        self.reg_cost = build_match_cost(reg_cost)
+        self.iou_cost = build_match_cost(iou_cost)
+
+    def assign(self,
+               bbox_pred,
+               cls_pred,
+               gt_bboxes,
+               gt_labels,
+               img_meta,
+               gt_bboxes_ignore=None,
+               eps=1e-7):
+        """Computes one-to-one matching based on the weighted costs.
+
+        This method assigns each query prediction to a ground truth or
+        background. In `assigned_gt_inds`, -1 means don't care, 0 means
+        negative sample, and a positive number is the index (1-based) of
+        the assigned gt.
+        The assignment is done in the following steps, the order matters.
+
+        1. assign every prediction to -1
+        2. compute the weighted costs
+        3. do Hungarian matching on CPU based on the costs
+        4. assign all to 0 (background) first, then for each matched pair
+           between predictions and gts, treat this prediction as foreground
+           and assign the corresponding gt index (plus 1) to it.
+
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (cx, cy, w, h), which are all in range [0, 1]. Shape
+                [num_query, 4].
+            cls_pred (Tensor): Predicted classification logits, shape
+                [num_query, num_class].
+            gt_bboxes (Tensor): Ground truth boxes with unnormalized
+                coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+            img_meta (dict): Meta information for current image.
+            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+                labelled as `ignored`. Default None.
+            eps (int | float, optional): A value meant for numerical stability
+                of a denominator; unused by this method. Default 1e-7.
+
+        Returns:
+            :obj:`AssignResult`: The assigned result.
+        """
+        assert gt_bboxes_ignore is None, \
+            'Only case when gt_bboxes_ignore is None is supported.'
+        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
+
+        # 1. assign -1 by default
+        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
+                                              -1,
+                                              dtype=torch.long)
+        assigned_labels = bbox_pred.new_full((num_bboxes, ),
+                                             -1,
+                                             dtype=torch.long)
+        if num_gts == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            if num_gts == 0:
+                # No ground truth, assign all to background
+                assigned_gt_inds[:] = 0
+            return AssignResult(
+                num_gts, assigned_gt_inds, None, labels=assigned_labels)
+        img_h, img_w, _ = img_meta['img_shape']
+        factor = gt_bboxes.new_tensor([img_w, img_h, img_w,
+                                       img_h]).unsqueeze(0)
+
+        # 2. compute the weighted costs
+        # classification and bboxcost.
+        cls_cost = self.cls_cost(cls_pred, gt_labels)
+        # regression L1 cost
+        normalize_gt_bboxes = gt_bboxes / factor
+        reg_cost = self.reg_cost(bbox_pred, normalize_gt_bboxes)
+        # regression iou cost; giou is used by default in official DETR.
+        bboxes = bbox_cxcywh_to_xyxy(bbox_pred) * factor
+        iou_cost = self.iou_cost(bboxes, gt_bboxes)
+        # weighted sum of above three costs
+        cost = cls_cost + reg_cost + iou_cost
+
+        # 3. do Hungarian matching on CPU using linear_sum_assignment
+        cost = cost.detach().cpu()
+        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+        matched_row_inds = torch.from_numpy(matched_row_inds).to(
+            bbox_pred.device)
+        matched_col_inds = torch.from_numpy(matched_col_inds).to(
+            bbox_pred.device)
+
+        # 4. assign backgrounds and foregrounds
+        # assign all indices to backgrounds first
+        assigned_gt_inds[:] = 0
+        # assign foregrounds based on matching results
+        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
+        return AssignResult(
+            num_gts, assigned_gt_inds, None, labels=assigned_labels)
diff --git a/mmdet/core/bbox/assigners/mask_hungarian_assigner.py b/mmdet/core/bbox/assigners/mask_hungarian_assigner.py
new file mode 100755
index 0000000..d83def1
--- /dev/null
+++ b/mmdet/core/bbox/assigners/mask_hungarian_assigner.py
@@ -0,0 +1,125 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from scipy.optimize import linear_sum_assignment
+
+from mmdet.core.bbox.builder import BBOX_ASSIGNERS
+from mmdet.core.bbox.match_costs.builder import build_match_cost
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+@BBOX_ASSIGNERS.register_module()
+class MaskHungarianAssigner(BaseAssigner):
+    """Computes one-to-one matching between predictions and ground truth for
+    mask.
+
+    This class computes an assignment between the targets and the predictions
+    based on the costs. The costs are weighted sum of three components:
+    classification cost, mask focal cost and mask dice cost. The
+    targets don't include the no_object, so generally there are more
+    predictions than targets. After the one-to-one matching, the un-matched
+    are treated as backgrounds. Thus each query prediction will be assigned
+    with `0` or a positive integer indicating the ground truth index:
+
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    Args:
+        cls_cost (:obj:`mmcv.ConfigDict` | dict): Classification cost config.
+        mask_cost (:obj:`mmcv.ConfigDict` | dict): Mask cost config.
+        dice_cost (:obj:`mmcv.ConfigDict` | dict): Dice cost config.
+    """
+
+    def __init__(self,
+                 cls_cost=dict(type='ClassificationCost', weight=1.0),
+                 mask_cost=dict(
+                     type='FocalLossCost', weight=1.0, binary_input=True),
+                 dice_cost=dict(type='DiceCost', weight=1.0)):
+        self.cls_cost = build_match_cost(cls_cost)
+        self.mask_cost = build_match_cost(mask_cost)
+        self.dice_cost = build_match_cost(dice_cost)
+
+    def assign(self,
+               cls_pred,
+               mask_pred,
+               gt_labels,
+               gt_mask,
+               img_meta,
+               gt_bboxes_ignore=None,
+               eps=1e-7):
+        """Computes one-to-one matching based on the weighted costs.
+
+        Args:
+            cls_pred (Tensor | None): Class prediction in shape
+                (num_query, cls_out_channels).
+            mask_pred (Tensor): Mask prediction in shape (num_query, H, W).
+            gt_labels (Tensor): Label of 'gt_mask' in shape = (num_gt, ).
+            gt_mask (Tensor): Ground truth mask in shape = (num_gt, H, W).
+            img_meta (dict): Meta information for current image.
+            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+                labelled as `ignored`. Default None.
+            eps (int | float, optional): A value meant for numerical stability
+                of a denominator; unused by this method. Default 1e-7.
+
+        Returns:
+            :obj:`AssignResult`: The assigned result.
+        """
+        assert gt_bboxes_ignore is None, \
+            'Only case when gt_bboxes_ignore is None is supported.'
+        # K-Net sometimes passes cls_pred=None to this assigner.
+        # So we should use the shape of mask_pred
+        num_gt, num_query = gt_labels.shape[0], mask_pred.shape[0]
+
+        # 1. assign -1 by default
+        assigned_gt_inds = mask_pred.new_full((num_query, ),
+                                              -1,
+                                              dtype=torch.long)
+        assigned_labels = mask_pred.new_full((num_query, ),
+                                             -1,
+                                             dtype=torch.long)
+        if num_gt == 0 or num_query == 0:
+            # No ground truth or boxes, return empty assignment
+            if num_gt == 0:
+                # No ground truth, assign all to background
+                assigned_gt_inds[:] = 0
+            return AssignResult(
+                num_gt, assigned_gt_inds, None, labels=assigned_labels)
+
+        # 2. compute the weighted costs
+        # classification and maskcost.
+        if self.cls_cost.weight != 0 and cls_pred is not None:
+            cls_cost = self.cls_cost(cls_pred, gt_labels)
+        else:
+            cls_cost = 0
+
+        if self.mask_cost.weight != 0:
+            # mask_pred shape = [num_query, h, w]
+            # gt_mask shape = [num_gt, h, w]
+            # mask_cost shape = [num_query, num_gt]
+            mask_cost = self.mask_cost(mask_pred, gt_mask)
+        else:
+            mask_cost = 0
+
+        if self.dice_cost.weight != 0:
+            dice_cost = self.dice_cost(mask_pred, gt_mask)
+        else:
+            dice_cost = 0
+        cost = cls_cost + mask_cost + dice_cost
+
+        # 3. do Hungarian matching on CPU using linear_sum_assignment
+        cost = cost.detach().cpu()
+
+        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+        matched_row_inds = torch.from_numpy(matched_row_inds).to(
+            mask_pred.device)
+        matched_col_inds = torch.from_numpy(matched_col_inds).to(
+            mask_pred.device)
+
+        # 4. assign backgrounds and foregrounds
+        # assign all indices to backgrounds first
+        assigned_gt_inds[:] = 0
+        # assign foregrounds based on matching results
+        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
+        return AssignResult(
+            num_gt, assigned_gt_inds, None, labels=assigned_labels)
diff --git a/mmdet/core/bbox/assigners/max_iou_assigner.py b/mmdet/core/bbox/assigners/max_iou_assigner.py
new file mode 100755
index 0000000..676421f
--- /dev/null
+++ b/mmdet/core/bbox/assigners/max_iou_assigner.py
@@ -0,0 +1,218 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from ..builder import BBOX_ASSIGNERS
+from ..iou_calculators import build_iou_calculator
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+@BBOX_ASSIGNERS.register_module()
+class MaxIoUAssigner(BaseAssigner):
+    """Assign a corresponding gt bbox or background to each bbox.
+
+    Each proposal will be assigned with `-1` (ignored), `0`, or a positive
+    integer indicating the ground truth index.
+
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    Args:
+        pos_iou_thr (float): IoU threshold for positive bboxes.
+        neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
+        min_pos_iou (float): Minimum iou for a bbox to be considered as a
+            positive bbox. Positive samples can have smaller IoU than
+            pos_iou_thr due to the 4th step (assign max IoU sample to each gt).
+            `min_pos_iou` is set to avoid assigning bboxes that have extremely
+            small iou with GT as positive samples. It brings about 0.3 mAP
+            improvements in 1x schedule but does not affect the performance of
+            3x schedule. More comparisons can be found in
+            `PR #7464 <https://github.com/open-mmlab/mmdetection/pull/7464>`_.
+        gt_max_assign_all (bool): Whether to assign all bboxes with the same
+            highest overlap with some gt to that gt.
+        ignore_iof_thr (float): IoF threshold for ignoring bboxes (if
+            `gt_bboxes_ignore` is specified). Negative values mean not
+            ignoring any bboxes.
+        ignore_wrt_candidates (bool): Whether to compute the iof between
+            `bboxes` and `gt_bboxes_ignore`, or the contrary.
+        match_low_quality (bool): Whether to allow low quality matches. This is
+            usually allowed for RPN and single stage detectors, but not allowed
+            in the second stage. Details are demonstrated in Step 4.
+        gpu_assign_thr (int): The upper bound of the number of GT for GPU
+            assign. When the number of gt is above this threshold, will assign
+            on CPU device. Negative values mean not assign on CPU.
+    """
+
+    def __init__(self,
+                 pos_iou_thr,
+                 neg_iou_thr,
+                 min_pos_iou=.0,
+                 gt_max_assign_all=True,
+                 ignore_iof_thr=-1,
+                 ignore_wrt_candidates=True,
+                 match_low_quality=True,
+                 gpu_assign_thr=-1,
+                 iou_calculator=dict(type='BboxOverlaps2D')):
+        self.pos_iou_thr = pos_iou_thr
+        self.neg_iou_thr = neg_iou_thr
+        self.min_pos_iou = min_pos_iou
+        self.gt_max_assign_all = gt_max_assign_all
+        self.ignore_iof_thr = ignore_iof_thr
+        self.ignore_wrt_candidates = ignore_wrt_candidates
+        self.gpu_assign_thr = gpu_assign_thr
+        self.match_low_quality = match_low_quality
+        self.iou_calculator = build_iou_calculator(iou_calculator)
+
+    def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None):
+        """Assign gt to bboxes.
+
+        This method assigns a gt bbox to every bbox (proposal/anchor). Each
+        bbox will be assigned with -1, 0, or a positive number. -1 means
+        ignored, 0 means negative sample, and a positive number is the index
+        (1-based) of the assigned gt. The order of the steps below matters.
+
+        1. assign every bbox to -1
+        2. assign proposals whose iou with all gts < neg_iou_thr to 0
+        3. for each bbox, if the iou with its nearest gt >= pos_iou_thr,
+           assign that gt to the bbox
+        4. if `match_low_quality`, for each gt bbox, assign its nearest
+           proposals (may be more than one) to itself
+
+        Args:
+            bboxes (Tensor): Bounding boxes to be assigned, shape(n, 4).
+            gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4).
+            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+                labelled as `ignored`, e.g., crowd boxes in COCO.
+            gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ).
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+
+        Example:
+            >>> self = MaxIoUAssigner(0.5, 0.5)
+            >>> bboxes = torch.Tensor([[0, 0, 10, 10], [10, 10, 20, 20]])
+            >>> gt_bboxes = torch.Tensor([[0, 0, 10, 9]])
+            >>> assign_result = self.assign(bboxes, gt_bboxes)
+            >>> expected_gt_inds = torch.LongTensor([1, 0])
+            >>> assert torch.all(assign_result.gt_inds == expected_gt_inds)
+        """
+        assign_on_cpu = True if (self.gpu_assign_thr > 0) and (
+            gt_bboxes.shape[0] > self.gpu_assign_thr) else False
+        # compute overlap and assign gt on CPU when number of GT is large
+        if assign_on_cpu:
+            device = bboxes.device
+            bboxes = bboxes.cpu()
+            gt_bboxes = gt_bboxes.cpu()
+            if gt_bboxes_ignore is not None:
+                gt_bboxes_ignore = gt_bboxes_ignore.cpu()
+            if gt_labels is not None:
+                gt_labels = gt_labels.cpu()
+
+        overlaps = self.iou_calculator(gt_bboxes, bboxes)
+
+        if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None
+                and gt_bboxes_ignore.numel() > 0 and bboxes.numel() > 0):
+            if self.ignore_wrt_candidates:
+                ignore_overlaps = self.iou_calculator(
+                    bboxes, gt_bboxes_ignore, mode='iof')
+                ignore_max_overlaps, _ = ignore_overlaps.max(dim=1)
+            else:
+                ignore_overlaps = self.iou_calculator(
+                    gt_bboxes_ignore, bboxes, mode='iof')
+                ignore_max_overlaps, _ = ignore_overlaps.max(dim=0)
+            overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1
+
+        assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)
+        if assign_on_cpu:
+            assign_result.gt_inds = assign_result.gt_inds.to(device)
+            assign_result.max_overlaps = assign_result.max_overlaps.to(device)
+            if assign_result.labels is not None:
+                assign_result.labels = assign_result.labels.to(device)
+        return assign_result
+
+    def assign_wrt_overlaps(self, overlaps, gt_labels=None):
+        """Assign w.r.t. the overlaps of bboxes with gts.
+
+        Args:
+            overlaps (Tensor): Overlaps between k gt_bboxes and n bboxes,
+                shape(k, n).
+            gt_labels (Tensor, optional): Labels of k gt_bboxes, shape (k, ).
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+        """
+        num_gts, num_bboxes = overlaps.size(0), overlaps.size(1)
+
+        # 1. assign -1 by default
+        assigned_gt_inds = overlaps.new_full((num_bboxes, ),
+                                             -1,
+                                             dtype=torch.long)
+
+        if num_gts == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = overlaps.new_zeros((num_bboxes, ))
+            if num_gts == 0:
+                # No truth, assign everything to background
+                assigned_gt_inds[:] = 0
+            if gt_labels is None:
+                assigned_labels = None
+            else:
+                assigned_labels = overlaps.new_full((num_bboxes, ),
+                                                    -1,
+                                                    dtype=torch.long)
+            return AssignResult(
+                num_gts,
+                assigned_gt_inds,
+                max_overlaps,
+                labels=assigned_labels)
+
+        # for each anchor, which gt best overlaps with it
+        # for each anchor, the max iou of all gts
+        max_overlaps, argmax_overlaps = overlaps.max(dim=0)
+        # for each gt, which anchor best overlaps with it
+        # for each gt, the max iou of all proposals
+        gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1)
+
+        # 2. assign negative: max IoU below the negative IoU threshold
+        # the negative inds are set to be 0
+        if isinstance(self.neg_iou_thr, float):
+            assigned_gt_inds[(max_overlaps >= 0)
+                             & (max_overlaps < self.neg_iou_thr)] = 0
+        elif isinstance(self.neg_iou_thr, tuple):
+            assert len(self.neg_iou_thr) == 2
+            assigned_gt_inds[(max_overlaps >= self.neg_iou_thr[0])
+                             & (max_overlaps < self.neg_iou_thr[1])] = 0
+
+        # 3. assign positive: above positive IoU threshold
+        pos_inds = max_overlaps >= self.pos_iou_thr
+        assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1
+
+        if self.match_low_quality:
+            # Low-quality matching will overwrite the assigned_gt_inds assigned
+            # in Step 3. Thus, the assigned gt might not be the best one for
+            # prediction.
+            # For example, if bbox A has 0.9 and 0.8 iou with GT bbox 1 & 2,
+            # bbox 1 will be assigned as the best target for bbox A in step 3.
+            # However, if GT bbox 2's gt_argmax_overlaps = A, bbox A's
+            # assigned_gt_inds will be overwritten to be bbox 2.
+            # This might be the reason that it is not used in ROI Heads.
+            for i in range(num_gts):
+                if gt_max_overlaps[i] >= self.min_pos_iou:
+                    if self.gt_max_assign_all:
+                        max_iou_inds = overlaps[i, :] == gt_max_overlaps[i]
+                        assigned_gt_inds[max_iou_inds] = i + 1
+                    else:
+                        assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1
+
+        if gt_labels is not None:
+            assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
+            pos_inds = torch.nonzero(
+                assigned_gt_inds > 0, as_tuple=False).squeeze()
+            if pos_inds.numel() > 0:
+                assigned_labels[pos_inds] = gt_labels[
+                    assigned_gt_inds[pos_inds] - 1]
+        else:
+            assigned_labels = None
+
+        return AssignResult(
+            num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
diff --git a/mmdet/core/bbox/assigners/point_assigner.py b/mmdet/core/bbox/assigners/point_assigner.py
new file mode 100755
index 0000000..b0dc224
--- /dev/null
+++ b/mmdet/core/bbox/assigners/point_assigner.py
@@ -0,0 +1,134 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from ..builder import BBOX_ASSIGNERS
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+@BBOX_ASSIGNERS.register_module()
+class PointAssigner(BaseAssigner):
+    """Assign a corresponding gt bbox or background to each point.
+
+    Each point gets `0` (negative) or the 1-based index of its assigned gt.
+
+    Args:
+        scale (int): divisor of the gt width/height when picking its level.
+        pos_num (int): max number of nearest points assigned to each gt.
+    """
+
+    def __init__(self, scale=4, pos_num=3):
+        self.scale = scale
+        self.pos_num = pos_num
+
+    def assign(self, points, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None):
+        """Assign gt to points.
+
+        This method assigns a gt bbox to every point. Each point receives
+        `0` (background) or a positive number, the 1-based index of its
+        assigned gt. If `gt_labels` is given, the per-point labels are `-1`
+        for background and the label of the assigned gt otherwise.
+        The assignment is done in following steps, the order matters.
+
+        1. assign every point to the background (0)
+        2. A point is assigned to some gt bbox if
+            (i) the point is within the k closest points to the gt bbox
+            (ii) the distance between this point and the gt is smaller than
+                other gt bboxes
+
+        Args:
+            points (Tensor): points to be assigned, shape(n, 3) while last
+                dimension stands for (x, y, stride).
+            gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4).
+            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+                labelled as `ignored`, e.g., crowd boxes in COCO.
+                NOTE: currently unused.
+            gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ).
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+        """
+        num_points = points.shape[0]
+        num_gts = gt_bboxes.shape[0]
+
+        if num_gts == 0 or num_points == 0:
+            # If no truth assign everything to the background
+            assigned_gt_inds = points.new_full((num_points, ),
+                                               0,
+                                               dtype=torch.long)
+            if gt_labels is None:
+                assigned_labels = None
+            else:
+                assigned_labels = points.new_full((num_points, ),
+                                                  -1,
+                                                  dtype=torch.long)
+            return AssignResult(
+                num_gts, assigned_gt_inds, None, labels=assigned_labels)
+
+        points_xy = points[:, :2]
+        points_stride = points[:, 2]
+        points_lvl = torch.log2(
+            points_stride).int()  # [3...,4...,5...,6...,7...]
+        lvl_min, lvl_max = points_lvl.min(), points_lvl.max()
+
+        # pick a level for every gt from its size, limited to existing levels
+        gt_bboxes_xy = (gt_bboxes[:, :2] + gt_bboxes[:, 2:]) / 2
+        gt_bboxes_wh = (gt_bboxes[:, 2:] - gt_bboxes[:, :2]).clamp(min=1e-6)
+        scale = self.scale
+        gt_bboxes_lvl = ((torch.log2(gt_bboxes_wh[:, 0] / scale) +
+                          torch.log2(gt_bboxes_wh[:, 1] / scale)) / 2).int()
+        gt_bboxes_lvl = torch.clamp(gt_bboxes_lvl, min=lvl_min, max=lvl_max)
+
+        # stores the assigned gt index of each point
+        assigned_gt_inds = points.new_zeros((num_points, ), dtype=torch.long)
+        # stores the assigned gt dist (to this point) of each point
+        assigned_gt_dist = points.new_full((num_points, ), float('inf'))
+        points_range = torch.arange(num_points, device=points.device)
+
+        for idx in range(num_gts):
+            gt_lvl = gt_bboxes_lvl[idx]
+            # get the index of points in this level
+            lvl_idx = gt_lvl == points_lvl
+            points_index = points_range[lvl_idx]
+            # get the points in this level
+            lvl_points = points_xy[lvl_idx, :]
+            # get the center point of gt
+            gt_point = gt_bboxes_xy[[idx], :]
+            # get width and height of gt
+            gt_wh = gt_bboxes_wh[[idx], :]
+            # normalized distance from gt center to every point in this level
+            points_gt_dist = ((lvl_points - gt_point) / gt_wh).norm(dim=1)
+            # find the nearest k points to gt center in this level; k is capped
+            #   so that topk cannot fail on a level with fewer points
+            k = min(self.pos_num, points_gt_dist.numel())
+            min_dist, min_dist_index = points_gt_dist.topk(k, largest=False)
+            # the index of nearest k points to gt center in this level
+            min_dist_points_index = points_index[min_dist_index]
+            # The less_than_recorded_index stores the index
+            #   of min_dist that is less then the assigned_gt_dist. Where
+            #   assigned_gt_dist stores the dist from previous assigned gt
+            #   (if exist) to each point.
+            less_than_recorded_index = min_dist < assigned_gt_dist[
+                min_dist_points_index]
+            # The min_dist_points_index stores the index of points satisfy:
+            #   (1) it is k nearest to current gt center in this level.
+            #   (2) it is closer to current gt center than other gt center.
+            min_dist_points_index = min_dist_points_index[
+                less_than_recorded_index]
+            # assign the result
+            assigned_gt_inds[min_dist_points_index] = idx + 1
+            assigned_gt_dist[min_dist_points_index] = min_dist[
+                less_than_recorded_index]
+
+        if gt_labels is not None:
+            assigned_labels = assigned_gt_inds.new_full((num_points, ), -1)
+            pos_inds = torch.nonzero(
+                assigned_gt_inds > 0, as_tuple=False).squeeze()
+            if pos_inds.numel() > 0:
+                assigned_labels[pos_inds] = gt_labels[
+                    assigned_gt_inds[pos_inds] - 1]
+        else:
+            assigned_labels = None
+
+        return AssignResult(
+            num_gts, assigned_gt_inds, None, labels=assigned_labels)
diff --git a/mmdet/core/bbox/assigners/region_assigner.py b/mmdet/core/bbox/assigners/region_assigner.py
new file mode 100755
index 0000000..1833b89
--- /dev/null
+++ b/mmdet/core/bbox/assigners/region_assigner.py
@@ -0,0 +1,222 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet.core import anchor_inside_flags
+from ..builder import BBOX_ASSIGNERS
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+def calc_region(bbox, ratio, stride, featmap_size=None):
+    """Move every edge of the box inward by ``ratio`` of its size and return
+    the rounded (x1, y1, x2, y2) region in ``bbox / stride`` coordinates."""
+    scaled = bbox / stride  # project bbox on the feature
+    keep = 1 - ratio
+    region = [
+        torch.round(keep * scaled[0] + ratio * scaled[2]),  # x1
+        torch.round(keep * scaled[1] + ratio * scaled[3]),  # y1
+        torch.round(ratio * scaled[0] + keep * scaled[2]),  # x2
+        torch.round(ratio * scaled[1] + keep * scaled[3]),  # y2
+    ]
+    if featmap_size is not None:
+        limits = (featmap_size[1], featmap_size[0]) * 2  # x, y, x, y bounds
+        region = [c.clamp(min=0, max=m) for c, m in zip(region, limits)]
+    return tuple(region)
+
+
+def anchor_ctr_inside_region_flags(anchors, stride, region):
+    """Return a boolean mask of anchors whose center lies inside ``region``."""
+    left, top, right, bottom = region
+    scaled = anchors / stride  # same units as a ``bbox / stride`` region
+    ctr_x = (scaled[:, 0] + scaled[:, 2]) * 0.5
+    ctr_y = (scaled[:, 1] + scaled[:, 3]) * 0.5
+    inside_x = (ctr_x >= left) & (ctr_x <= right)  # bounds are inclusive
+    return inside_x & (ctr_y >= top) & (ctr_y <= bottom)
+
+
+@BBOX_ASSIGNERS.register_module()
+class RegionAssigner(BaseAssigner):
+    """Assign a corresponding gt bbox or background to each bbox.
+
+    Each anchor will be assigned with `-1`, `0`, or a positive integer
+    indicating the ground truth index.
+
+    - -1: don't care
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    Args:
+        center_ratio: ratio of the region in the center of the bbox to
+            define positive sample.
+        ignore_ratio: ratio of the region to define ignore samples.
+    """
+
+    def __init__(self, center_ratio=0.2, ignore_ratio=0.5):
+        self.center_ratio = center_ratio
+        self.ignore_ratio = ignore_ratio
+
+    def assign(self,
+               mlvl_anchors,
+               mlvl_valid_flags,
+               gt_bboxes,
+               img_meta,
+               featmap_sizes,
+               anchor_scale,
+               anchor_strides,
+               gt_bboxes_ignore=None,
+               gt_labels=None,
+               allowed_border=0):
+        """Assign gt to anchors.
+
+        This method assign a gt bbox to every bbox (proposal/anchor), each bbox
+        will be assigned with -1, 0, or a positive number. -1 means don't care,
+        0 means negative sample, positive number is the index (1-based) of
+        assigned gt.
+
+        The assignment is done in following steps, and the order matters.
+
+        1. Assign every anchor to 0 (negative)
+        2. (For each gt_bboxes) Compute ignore flags based on ignore_region
+           then assign -1 to anchors w.r.t. ignore flags
+        3. (For each gt_bboxes) Compute pos flags based on center_region then
+           assign gt_bboxes to anchors w.r.t. pos flags
+        4. (For each gt_bboxes) Compute ignore flags based on adjacent anchor
+           level then assign -1 to anchors w.r.t. ignore flags
+        5. Assign anchor outside of image to -1
+
+        Args:
+            mlvl_anchors (list[Tensor]): Multi level anchors.
+            mlvl_valid_flags (list[Tensor]): Multi level valid flags.
+            gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4).
+            img_meta (dict): Meta info of image; `img_shape` is read.
+            featmap_sizes (list): (h, w) feature map size of each level.
+            anchor_scale (int): Scale of the anchor.
+            anchor_strides (list[int]): Stride of the anchors of each level,
+                in the same order as `mlvl_anchors`.
+            gt_bboxes_ignore (None): Not supported; any non-None value
+                raises NotImplementedError.
+            gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ).
+            allowed_border (int, optional): The border to allow the valid
+                anchor. Defaults to 0.
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+        """
+        if gt_bboxes_ignore is not None:
+            raise NotImplementedError
+
+        num_gts = gt_bboxes.shape[0]
+        num_bboxes = sum(x.shape[0] for x in mlvl_anchors)
+
+        if num_gts == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = gt_bboxes.new_zeros((num_bboxes, ))
+            assigned_gt_inds = gt_bboxes.new_zeros((num_bboxes, ),
+                                                   dtype=torch.long)
+            if gt_labels is None:
+                assigned_labels = None
+            else:
+                assigned_labels = gt_bboxes.new_full((num_bboxes, ),
+                                                     -1,
+                                                     dtype=torch.long)
+            return AssignResult(
+                num_gts,
+                assigned_gt_inds,
+                max_overlaps,
+                labels=assigned_labels)
+
+        num_lvls = len(mlvl_anchors)
+        r1 = (1 - self.center_ratio) / 2  # edge inset of the center region
+        r2 = (1 - self.ignore_ratio) / 2  # edge inset of the ignore region
+
+        scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
+                           (gt_bboxes[:, 3] - gt_bboxes[:, 1]))
+        min_anchor_size = scale.new_full(
+            (1, ), float(anchor_scale * anchor_strides[0]))
+        target_lvls = torch.floor(
+            torch.log2(scale) - torch.log2(min_anchor_size) + 0.5)
+        target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long()
+
+        # 1. assign 0 (negative) by default
+        mlvl_assigned_gt_inds = []
+        mlvl_ignore_flags = []
+        for lvl in range(num_lvls):
+            h, w = featmap_sizes[lvl]
+            assert h * w == mlvl_anchors[lvl].shape[0]
+            assigned_gt_inds = gt_bboxes.new_full((h * w, ),
+                                                  0,
+                                                  dtype=torch.long)
+            ignore_flags = torch.zeros_like(assigned_gt_inds, dtype=torch.bool)
+            mlvl_assigned_gt_inds.append(assigned_gt_inds)
+            mlvl_ignore_flags.append(ignore_flags)
+
+        for gt_id in range(num_gts):
+            lvl = target_lvls[gt_id].item()
+            featmap_size = featmap_sizes[lvl]
+            stride = anchor_strides[lvl]
+            anchors = mlvl_anchors[lvl]
+            gt_bbox = gt_bboxes[gt_id, :4]
+
+            # Compute regions
+            ignore_region = calc_region(gt_bbox, r2, stride, featmap_size)
+            ctr_region = calc_region(gt_bbox, r1, stride, featmap_size)
+
+            # 2. Assign -1 to ignore flags
+            ignore_flags = anchor_ctr_inside_region_flags(
+                anchors, stride, ignore_region)
+            mlvl_assigned_gt_inds[lvl][ignore_flags] = -1
+
+            # 3. Assign gt_bboxes to pos flags
+            pos_flags = anchor_ctr_inside_region_flags(anchors, stride,
+                                                       ctr_region)
+            mlvl_assigned_gt_inds[lvl][pos_flags] = gt_id + 1
+
+            # 4. Assign -1 to ignore adjacent lvl
+            if lvl > 0:
+                d_lvl = lvl - 1
+                d_anchors = mlvl_anchors[d_lvl]
+                d_featmap_size = featmap_sizes[d_lvl]
+                d_stride = anchor_strides[d_lvl]
+                d_ignore_region = calc_region(gt_bbox, r2, d_stride,
+                                              d_featmap_size)
+                ignore_flags = anchor_ctr_inside_region_flags(
+                    d_anchors, d_stride, d_ignore_region)
+                mlvl_ignore_flags[d_lvl][ignore_flags] = True
+            if lvl < num_lvls - 1:
+                u_lvl = lvl + 1
+                u_anchors = mlvl_anchors[u_lvl]
+                u_featmap_size = featmap_sizes[u_lvl]
+                u_stride = anchor_strides[u_lvl]
+                u_ignore_region = calc_region(gt_bbox, r2, u_stride,
+                                              u_featmap_size)
+                ignore_flags = anchor_ctr_inside_region_flags(
+                    u_anchors, u_stride, u_ignore_region)
+                mlvl_ignore_flags[u_lvl][ignore_flags] = True
+
+        # 4. (cont.) Assign -1 to ignore adjacent lvl (flags are bool masks)
+        for lvl in range(num_lvls):
+            ignore_flags = mlvl_ignore_flags[lvl]
+            mlvl_assigned_gt_inds[lvl][ignore_flags] = -1
+
+        # 5. Assign -1 to anchor outside of image
+        flat_assigned_gt_inds = torch.cat(mlvl_assigned_gt_inds)
+        flat_anchors = torch.cat(mlvl_anchors)
+        flat_valid_flags = torch.cat(mlvl_valid_flags)
+        assert (flat_assigned_gt_inds.shape[0] == flat_anchors.shape[0] ==
+                flat_valid_flags.shape[0])
+        inside_flags = anchor_inside_flags(flat_anchors, flat_valid_flags,
+                                           img_meta['img_shape'],
+                                           allowed_border)
+        outside_flags = ~inside_flags
+        flat_assigned_gt_inds[outside_flags] = -1
+
+        if gt_labels is not None:
+            assigned_labels = torch.zeros_like(flat_assigned_gt_inds)
+            pos_flags = flat_assigned_gt_inds > 0
+            assigned_labels[pos_flags] = gt_labels[
+                flat_assigned_gt_inds[pos_flags] - 1]
+        else:
+            assigned_labels = None
+
+        return AssignResult(
+            num_gts, flat_assigned_gt_inds, None, labels=assigned_labels)
diff --git a/mmdet/core/bbox/assigners/sim_ota_assigner.py b/mmdet/core/bbox/assigners/sim_ota_assigner.py
new file mode 100755
index 0000000..58bfef4
--- /dev/null
+++ b/mmdet/core/bbox/assigners/sim_ota_assigner.py
@@ -0,0 +1,257 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+import torch.nn.functional as F
+
+from ..builder import BBOX_ASSIGNERS
+from ..iou_calculators import bbox_overlaps
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+@BBOX_ASSIGNERS.register_module()
+class SimOTAAssigner(BaseAssigner):
+    """Computes matching between predictions and ground truth.
+
+    Args:
+        center_radius (int | float, optional): Ground truth center size
+            to judge whether a prior is in center. Default 2.5.
+        candidate_topk (int, optional): The candidate top-k which used to
+            get top-k ious to calculate dynamic-k. Default 10.
+        iou_weight (int | float, optional): The scale factor for regression
+            iou cost. Default 3.0.
+        cls_weight (int | float, optional): The scale factor for classification
+            cost. Default 1.0.
+    """
+
+    def __init__(self,
+                 center_radius=2.5,
+                 candidate_topk=10,
+                 iou_weight=3.0,
+                 cls_weight=1.0):
+        self.center_radius = center_radius
+        self.candidate_topk = candidate_topk
+        self.iou_weight = iou_weight
+        self.cls_weight = cls_weight
+
+    def assign(self,
+               pred_scores,
+               priors,
+               decoded_bboxes,
+               gt_bboxes,
+               gt_labels,
+               gt_bboxes_ignore=None,
+               eps=1e-7):
+        """Assign gt to priors using SimOTA. Any RuntimeError (assumed to be a
+        GPU OOM) makes it retry once on CPU and move the result back.
+        Args:
+            pred_scores (Tensor): Classification scores of one image,
+                a 2D-Tensor with shape [num_priors, num_classes]
+            priors (Tensor): All priors of one image, a 2D-Tensor with shape
+                [num_priors, 4] in [cx, cy, stride_w, stride_h] format.
+            decoded_bboxes (Tensor): Predicted bboxes, a 2D-Tensor with shape
+                [num_priors, 4] in [tl_x, tl_y, br_x, br_y] format.
+            gt_bboxes (Tensor): Ground truth bboxes of one image, a 2D-Tensor
+                with shape [num_gts, 4] in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (Tensor): Ground truth labels of one image, a Tensor
+                with shape [num_gts].
+            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes labelled
+                as `ignored`, e.g., crowd boxes in COCO. Currently unused.
+            eps (float): A value added to the IoU before taking its log, for
+                numerical stability. Default 1e-7.
+        Returns:
+            assign_result (obj:`AssignResult`): The assigned result.
+        """
+        try:
+            assign_result = self._assign(pred_scores, priors, decoded_bboxes,
+                                         gt_bboxes, gt_labels,
+                                         gt_bboxes_ignore, eps)
+            return assign_result
+        except RuntimeError:
+            origin_device = pred_scores.device
+            warnings.warn('OOM RuntimeError is raised due to the huge memory '
+                          'cost during label assignment. CPU mode is applied '
+                          'in this batch. If you want to avoid this issue, '
+                          'try to reduce the batch size or image size.')
+            torch.cuda.empty_cache()
+
+            pred_scores = pred_scores.cpu()
+            priors = priors.cpu()
+            decoded_bboxes = decoded_bboxes.cpu()
+            gt_bboxes = gt_bboxes.cpu().float()
+            gt_labels = gt_labels.cpu()
+
+            assign_result = self._assign(pred_scores, priors, decoded_bboxes,
+                                         gt_bboxes, gt_labels,
+                                         gt_bboxes_ignore, eps)
+            assign_result.gt_inds = assign_result.gt_inds.to(origin_device)
+            assign_result.max_overlaps = assign_result.max_overlaps.to(
+                origin_device)
+            assign_result.labels = assign_result.labels.to(origin_device)
+
+            return assign_result
+
+    def _assign(self,
+                pred_scores,
+                priors,
+                decoded_bboxes,
+                gt_bboxes,
+                gt_labels,
+                gt_bboxes_ignore=None,
+                eps=1e-7):
+        """Assign gt to priors using SimOTA.
+        Args:
+            pred_scores (Tensor): Classification scores of one image,
+                a 2D-Tensor with shape [num_priors, num_classes]
+            priors (Tensor): All priors of one image, a 2D-Tensor with shape
+                [num_priors, 4] in [cx, cy, stride_w, stride_h] format.
+            decoded_bboxes (Tensor): Predicted bboxes, a 2D-Tensor with shape
+                [num_priors, 4] in [tl_x, tl_y, br_x, br_y] format.
+            gt_bboxes (Tensor): Ground truth bboxes of one image, a 2D-Tensor
+                with shape [num_gts, 4] in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (Tensor): Ground truth labels of one image, a Tensor
+                with shape [num_gts].
+            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes labelled
+                as `ignored`, e.g., crowd boxes in COCO. Currently unused.
+            eps (float): A value added to the IoU before taking its log, for
+                numerical stability. Default 1e-7.
+        Returns:
+            :obj:`AssignResult`: The assigned result.
+        """
+        INF = 100000.0
+        num_gt = gt_bboxes.size(0)
+        num_bboxes = decoded_bboxes.size(0)
+
+        # assign 0 by default
+        assigned_gt_inds = decoded_bboxes.new_full((num_bboxes, ),
+                                                   0,
+                                                   dtype=torch.long)
+        valid_mask, is_in_boxes_and_center = self.get_in_gt_and_in_center_info(
+            priors, gt_bboxes)
+        valid_decoded_bbox = decoded_bboxes[valid_mask]
+        valid_pred_scores = pred_scores[valid_mask]
+        num_valid = valid_decoded_bbox.size(0)
+
+        if num_gt == 0 or num_bboxes == 0 or num_valid == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = decoded_bboxes.new_zeros((num_bboxes, ))
+            if num_gt == 0:
+                # No truth, assign everything to background
+                assigned_gt_inds[:] = 0
+            if gt_labels is None:
+                assigned_labels = None
+            else:
+                assigned_labels = decoded_bboxes.new_full((num_bboxes, ),
+                                                          -1,
+                                                          dtype=torch.long)
+            return AssignResult(
+                num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+
+        pairwise_ious = bbox_overlaps(valid_decoded_bbox, gt_bboxes)
+        iou_cost = -torch.log(pairwise_ious + eps)
+
+        gt_onehot_label = (
+            F.one_hot(gt_labels.to(torch.int64),
+                      pred_scores.shape[-1]).float().unsqueeze(0).repeat(
+                          num_valid, 1, 1))
+
+        valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat(1, num_gt, 1)
+        cls_cost = (
+            F.binary_cross_entropy(
+                valid_pred_scores.to(dtype=torch.float32).sqrt_(),
+                gt_onehot_label,
+                reduction='none',
+            ).sum(-1).to(dtype=valid_pred_scores.dtype))
+
+        cost_matrix = (
+            cls_cost * self.cls_weight + iou_cost * self.iou_weight +
+            (~is_in_boxes_and_center) * INF)
+
+        matched_pred_ious, matched_gt_inds = \
+            self.dynamic_k_matching(
+                cost_matrix, pairwise_ious, num_gt, valid_mask)
+
+        # convert to AssignResult format (valid_mask now marks matched priors)
+        assigned_gt_inds[valid_mask] = matched_gt_inds + 1
+        assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
+        assigned_labels[valid_mask] = gt_labels[matched_gt_inds].long()
+        max_overlaps = assigned_gt_inds.new_full((num_bboxes, ),
+                                                 -INF,
+                                                 dtype=torch.float32)
+        max_overlaps[valid_mask] = matched_pred_ious
+        return AssignResult(
+            num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+
+    def get_in_gt_and_in_center_info(self, priors, gt_bboxes):
+        """Flag priors inside any gt box or center area, and in both per gt."""
+        num_gt = gt_bboxes.size(0)
+        repeated_x = priors[:, 0].unsqueeze(1).repeat(1, num_gt)
+        repeated_y = priors[:, 1].unsqueeze(1).repeat(1, num_gt)
+        repeated_stride_x = priors[:, 2].unsqueeze(1).repeat(1, num_gt)
+        repeated_stride_y = priors[:, 3].unsqueeze(1).repeat(1, num_gt)
+
+        # is prior centers in gt bboxes, shape: [n_prior, n_gt]
+        l_ = repeated_x - gt_bboxes[:, 0]
+        t_ = repeated_y - gt_bboxes[:, 1]
+        r_ = gt_bboxes[:, 2] - repeated_x
+        b_ = gt_bboxes[:, 3] - repeated_y
+
+        deltas = torch.stack([l_, t_, r_, b_], dim=1)
+        is_in_gts = deltas.min(dim=1).values > 0
+        is_in_gts_all = is_in_gts.sum(dim=1) > 0
+
+        # is prior centers in gt centers
+        gt_cxs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0
+        gt_cys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0
+        ct_box_l = gt_cxs - self.center_radius * repeated_stride_x
+        ct_box_t = gt_cys - self.center_radius * repeated_stride_y
+        ct_box_r = gt_cxs + self.center_radius * repeated_stride_x
+        ct_box_b = gt_cys + self.center_radius * repeated_stride_y
+
+        cl_ = repeated_x - ct_box_l
+        ct_ = repeated_y - ct_box_t
+        cr_ = ct_box_r - repeated_x
+        cb_ = ct_box_b - repeated_y
+
+        ct_deltas = torch.stack([cl_, ct_, cr_, cb_], dim=1)
+        is_in_cts = ct_deltas.min(dim=1).values > 0
+        is_in_cts_all = is_in_cts.sum(dim=1) > 0
+
+        # in boxes or in centers, shape: [num_priors]
+        is_in_gts_or_centers = is_in_gts_all | is_in_cts_all
+
+        # both in boxes and centers, shape: [num_fg, num_gt]
+        is_in_boxes_and_centers = (
+            is_in_gts[is_in_gts_or_centers, :]
+            & is_in_cts[is_in_gts_or_centers, :])
+        return is_in_gts_or_centers, is_in_boxes_and_centers
+
+    def dynamic_k_matching(self, cost, pairwise_ious, num_gt, valid_mask):
+        """Match priors to gts by lowest cost, using a dynamic k per gt."""
+        matching_matrix = torch.zeros_like(cost, dtype=torch.uint8)
+        # select candidate topk ious for dynamic-k calculation
+        candidate_topk = min(self.candidate_topk, pairwise_ious.size(0))
+        topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0)
+        # calculate dynamic k for each gt
+        dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1)
+        for gt_idx in range(num_gt):
+            _, pos_idx = torch.topk(
+                cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False)
+            matching_matrix[:, gt_idx][pos_idx] = 1
+        del topk_ious, dynamic_ks, pos_idx
+        # a prior matched to several gts keeps only its lowest-cost gt
+        prior_match_gt_mask = matching_matrix.sum(1) > 1
+        if prior_match_gt_mask.sum() > 0:
+            cost_min, cost_argmin = torch.min(
+                cost[prior_match_gt_mask, :], dim=1)
+            matching_matrix[prior_match_gt_mask, :] *= 0
+            matching_matrix[prior_match_gt_mask, cost_argmin] = 1
+        # foreground mask over valid priors; valid_mask is narrowed in place
+        fg_mask_inboxes = matching_matrix.sum(1) > 0
+        valid_mask[valid_mask.clone()] = fg_mask_inboxes
+
+        matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1)
+        matched_pred_ious = (matching_matrix *
+                             pairwise_ious).sum(1)[fg_mask_inboxes]
+        return matched_pred_ious, matched_gt_inds
diff --git a/mmdet/core/bbox/assigners/task_aligned_assigner.py b/mmdet/core/bbox/assigners/task_aligned_assigner.py
new file mode 100755
index 0000000..1872de4
--- /dev/null
+++ b/mmdet/core/bbox/assigners/task_aligned_assigner.py
@@ -0,0 +1,151 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from ..builder import BBOX_ASSIGNERS
+from ..iou_calculators import build_iou_calculator
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+INF = 100000000
+
+
+@BBOX_ASSIGNERS.register_module()
+class TaskAlignedAssigner(BaseAssigner):
+    """Task aligned assigner used in the paper:
+    `TOOD: Task-aligned One-stage Object Detection.
+    <https://arxiv.org/abs/2108.07755>`_.
+
+    Assign a corresponding gt bbox or background to each predicted bbox.
+    Each bbox will be assigned with `0` or a positive integer
+    indicating the ground truth index.
+
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    Args:
+        topk (int): number of candidate bboxes selected for each gt
+        iou_calculator (dict): Config dict for iou calculator.
+            Default: dict(type='BboxOverlaps2D')
+    """
+
+    def __init__(self, topk, iou_calculator=dict(type='BboxOverlaps2D')):
+        assert topk >= 1
+        self.topk = topk
+        self.iou_calculator = build_iou_calculator(iou_calculator)
+
+    def assign(self,
+               pred_scores,
+               decode_bboxes,
+               anchors,
+               gt_bboxes,
+               gt_bboxes_ignore=None,
+               gt_labels=None,
+               alpha=1,
+               beta=6):
+        """Assign gt to bboxes.
+
+        The assignment is done in following steps
+
+        1. compute alignment metric between all bbox (bbox of all pyramid
+           levels) and gt
+        2. select top-k bbox as candidates for each gt
+        3. limit the positive sample's center in gt (because the anchor-free
+           detector only can predict positive distance)
+
+        Args:
+            pred_scores (Tensor): predicted class probability,
+                shape(n, num_classes)
+            decode_bboxes (Tensor): predicted bounding boxes, shape(n, 4)
+            anchors (Tensor): pre-defined anchors, shape(n, 4).
+            gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4).
+            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+                labelled as `ignored`, e.g., crowd boxes in COCO. Unused.
+            gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ).
+            alpha, beta (int): exponents of score / IoU in the alignment metric.
+
+        Returns:
+            :obj:`AssignResult`: The result, with `assign_metrics` attached.
+        """
+        anchors = anchors[:, :4]
+        num_gt, num_bboxes = gt_bboxes.size(0), anchors.size(0)
+        # compute alignment metric between all bbox and gt
+        overlaps = self.iou_calculator(decode_bboxes, gt_bboxes).detach()
+        bbox_scores = pred_scores[:, gt_labels].detach()
+        # assign 0 by default
+        assigned_gt_inds = anchors.new_full((num_bboxes, ),
+                                            0,
+                                            dtype=torch.long)
+        assign_metrics = anchors.new_zeros((num_bboxes, ))
+
+        if num_gt == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = anchors.new_zeros((num_bboxes, ))
+            if num_gt == 0:
+                # No gt boxes, assign everything to background
+                assigned_gt_inds[:] = 0
+            if gt_labels is None:
+                assigned_labels = None
+            else:
+                assigned_labels = anchors.new_full((num_bboxes, ),
+                                                   -1,
+                                                   dtype=torch.long)
+            assign_result = AssignResult(
+                num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+            assign_result.assign_metrics = assign_metrics
+            return assign_result
+
+        # select top-k bboxes as candidates for each gt
+        alignment_metrics = bbox_scores**alpha * overlaps**beta
+        topk = min(self.topk, alignment_metrics.size(0))
+        _, candidate_idxs = alignment_metrics.topk(topk, dim=0, largest=True)
+        candidate_metrics = alignment_metrics[candidate_idxs,
+                                              torch.arange(num_gt)]
+        is_pos = candidate_metrics > 0
+
+        # limit the positive sample's center in gt
+        anchors_cx = (anchors[:, 0] + anchors[:, 2]) / 2.0
+        anchors_cy = (anchors[:, 1] + anchors[:, 3]) / 2.0
+        for gt_idx in range(num_gt):
+            candidate_idxs[:, gt_idx] += gt_idx * num_bboxes
+        ep_anchors_cx = anchors_cx.view(1, -1).expand(
+            num_gt, num_bboxes).contiguous().view(-1)
+        ep_anchors_cy = anchors_cy.view(1, -1).expand(
+            num_gt, num_bboxes).contiguous().view(-1)
+        candidate_idxs = candidate_idxs.view(-1)
+
+        # calculate the left, top, right, bottom distance between positive
+        # bbox center and gt side
+        l_ = ep_anchors_cx[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 0]
+        t_ = ep_anchors_cy[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 1]
+        r_ = gt_bboxes[:, 2] - ep_anchors_cx[candidate_idxs].view(-1, num_gt)
+        b_ = gt_bboxes[:, 3] - ep_anchors_cy[candidate_idxs].view(-1, num_gt)
+        is_in_gts = torch.stack([l_, t_, r_, b_], dim=1).min(dim=1)[0] > 0.01
+        is_pos = is_pos & is_in_gts
+
+        # if an anchor box is assigned to multiple gts,
+        # the one with the highest iou will be selected.
+        overlaps_inf = torch.full_like(overlaps,
+                                       -INF).t().contiguous().view(-1)
+        index = candidate_idxs.view(-1)[is_pos.view(-1)]
+        overlaps_inf[index] = overlaps.t().contiguous().view(-1)[index]
+        overlaps_inf = overlaps_inf.view(num_gt, -1).t()
+
+        max_overlaps, argmax_overlaps = overlaps_inf.max(dim=1)
+        assigned_gt_inds[
+            max_overlaps != -INF] = argmax_overlaps[max_overlaps != -INF] + 1
+        assign_metrics[max_overlaps != -INF] = alignment_metrics[
+            max_overlaps != -INF, argmax_overlaps[max_overlaps != -INF]]
+
+        if gt_labels is not None:
+            assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
+            pos_inds = torch.nonzero(
+                assigned_gt_inds > 0, as_tuple=False).squeeze()
+            if pos_inds.numel() > 0:
+                assigned_labels[pos_inds] = gt_labels[
+                    assigned_gt_inds[pos_inds] - 1]
+        else:
+            assigned_labels = None
+        assign_result = AssignResult(
+            num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+        assign_result.assign_metrics = assign_metrics
+        return assign_result
diff --git a/mmdet/core/bbox/assigners/uniform_assigner.py b/mmdet/core/bbox/assigners/uniform_assigner.py
new file mode 100755
index 0000000..70294fc
--- /dev/null
+++ b/mmdet/core/bbox/assigners/uniform_assigner.py
@@ -0,0 +1,135 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from ..builder import BBOX_ASSIGNERS
+from ..iou_calculators import build_iou_calculator
+from ..transforms import bbox_xyxy_to_cxcywh
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+@BBOX_ASSIGNERS.register_module()
+class UniformAssigner(BaseAssigner):
+    """Uniform matching between the anchors and gt boxes, which balances the
+    number of positive anchors per gt box. ``gt_bboxes_ignore`` is accepted
+    by ``assign`` but is currently not used.
+
+    Args:
+        pos_ignore_thr (float): Matched anchors with IoU below it are ignored.
+        neg_ignore_thr (float): Predictions with max IoU above it are ignored.
+        match_times (int): Number of lowest-L1-cost predictions, and also of
+           anchors, matched to each gt box. Default 4.
+        iou_calculator (dict): iou_calculator config
+    """
+
+    def __init__(self,
+                 pos_ignore_thr,
+                 neg_ignore_thr,
+                 match_times=4,
+                 iou_calculator=dict(type='BboxOverlaps2D')):
+        self.match_times = match_times
+        self.pos_ignore_thr = pos_ignore_thr
+        self.neg_ignore_thr = neg_ignore_thr
+        self.iou_calculator = build_iou_calculator(iou_calculator)
+
+    def assign(self,
+               bbox_pred,
+               anchor,
+               gt_bboxes,
+               gt_bboxes_ignore=None,
+               gt_labels=None):
+        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
+
+        # 1. gt inds default to 0 (background), labels default to -1
+        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
+                                              0,
+                                              dtype=torch.long)
+        assigned_labels = bbox_pred.new_full((num_bboxes, ),
+                                             -1,
+                                             dtype=torch.long)
+        if num_gts == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            if num_gts == 0:
+                # No ground truth, assign all to background
+                assigned_gt_inds[:] = 0
+            assign_result = AssignResult(
+                num_gts, assigned_gt_inds, None, labels=assigned_labels)
+            assign_result.set_extra_property(
+                'pos_idx', bbox_pred.new_empty(0, dtype=torch.bool))
+            assign_result.set_extra_property('pos_predicted_boxes',
+                                             bbox_pred.new_empty((0, 4)))
+            assign_result.set_extra_property('target_boxes',
+                                             bbox_pred.new_empty((0, 4)))
+            return assign_result
+
+        # 2. Compute the L1 cost between boxes (in cxcywh form) and gts
+        # Note that both the anchors and the predicted boxes are used
+        cost_bbox = torch.cdist(
+            bbox_xyxy_to_cxcywh(bbox_pred),
+            bbox_xyxy_to_cxcywh(gt_bboxes),
+            p=1)
+        cost_bbox_anchors = torch.cdist(
+            bbox_xyxy_to_cxcywh(anchor), bbox_xyxy_to_cxcywh(gt_bboxes), p=1)
+
+        # We found that topk function has different results in cpu and
+        # cuda mode. In order to ensure consistency with the source code,
+        # we also use cpu mode.
+        # TODO: Check whether the performance of cpu and cuda are the same.
+        C = cost_bbox.cpu()
+        C1 = cost_bbox_anchors.cpu()
+
+        # self.match_times x n
+        index = torch.topk(
+            C,  # (num_bboxes, num_gts) L1 cost of predictions
+            k=self.match_times,
+            dim=0,
+            largest=False)[1]
+
+        # self.match_times x n
+        index1 = torch.topk(C1, k=self.match_times, dim=0, largest=False)[1]
+        # self.match_times x (2 * n), then flattened row by row
+        indexes = torch.cat((index, index1),
+                            dim=1).reshape(-1).to(bbox_pred.device)
+
+        pred_overlaps = self.iou_calculator(bbox_pred, gt_bboxes)
+        anchor_overlaps = self.iou_calculator(anchor, gt_bboxes)
+        pred_max_overlaps, _ = pred_overlaps.max(dim=1)  # max per box
+        anchor_max_overlaps, _ = anchor_overlaps.max(dim=0)  # max per gt
+
+        # 3. Ignore predicted boxes whose max IoU with any gt is too high
+        ignore_idx = pred_max_overlaps > self.neg_ignore_thr
+        assigned_gt_inds[ignore_idx] = -1
+
+        # 4. Among the matched boxes, ignore those whose anchor IoU with the
+        # matched gt is below pos_ignore_thr; the rest become positives
+        pos_gt_index = torch.arange(
+            0, C1.size(1),
+            device=bbox_pred.device).repeat(self.match_times * 2)
+        pos_ious = anchor_overlaps[indexes, pos_gt_index]
+        pos_ignore_idx = pos_ious < self.pos_ignore_thr
+
+        pos_gt_index_with_ignore = pos_gt_index + 1  # 1-based gt index
+        pos_gt_index_with_ignore[pos_ignore_idx] = -1
+        assigned_gt_inds[indexes] = pos_gt_index_with_ignore
+
+        if gt_labels is not None:
+            assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
+            pos_inds = torch.nonzero(
+                assigned_gt_inds > 0, as_tuple=False).squeeze()
+            if pos_inds.numel() > 0:
+                assigned_labels[pos_inds] = gt_labels[
+                    assigned_gt_inds[pos_inds] - 1]
+        else:
+            assigned_labels = None
+
+        assign_result = AssignResult(
+            num_gts,
+            assigned_gt_inds,
+            anchor_max_overlaps,
+            labels=assigned_labels)
+        assign_result.set_extra_property('pos_idx', ~pos_ignore_idx)
+        assign_result.set_extra_property('pos_predicted_boxes',
+                                         bbox_pred[indexes])
+        assign_result.set_extra_property('target_boxes',
+                                         gt_bboxes[pos_gt_index])
+        return assign_result
diff --git a/mmdet/core/bbox/builder.py b/mmdet/core/bbox/builder.py
new file mode 100755
index 0000000..9cfa055
--- /dev/null
+++ b/mmdet/core/bbox/builder.py
@@ -0,0 +1,21 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.utils import Registry, build_from_cfg
+
+BBOX_ASSIGNERS = Registry('bbox_assigner')  # used by build_assigner
+BBOX_SAMPLERS = Registry('bbox_sampler')  # used by build_sampler
+BBOX_CODERS = Registry('bbox_coder')  # used by build_bbox_coder
+
+
+def build_assigner(cfg, **default_args):
+    """Build a box assigner from ``cfg`` using the BBOX_ASSIGNERS registry."""
+    return build_from_cfg(cfg, BBOX_ASSIGNERS, default_args)
+
+
+def build_sampler(cfg, **default_args):
+    """Build a box sampler from ``cfg`` using the BBOX_SAMPLERS registry."""
+    return build_from_cfg(cfg, BBOX_SAMPLERS, default_args)
+
+
+def build_bbox_coder(cfg, **default_args):
+    """Build a box coder from ``cfg`` using the BBOX_CODERS registry."""
+    return build_from_cfg(cfg, BBOX_CODERS, default_args)
diff --git a/mmdet/core/bbox/coder/__init__.py b/mmdet/core/bbox/coder/__init__.py
new file mode 100755
index 0000000..e12fd64
--- /dev/null
+++ b/mmdet/core/bbox/coder/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_bbox_coder import BaseBBoxCoder
+from .bucketing_bbox_coder import BucketingBBoxCoder
+from .delta_xywh_bbox_coder import DeltaXYWHBBoxCoder
+from .distance_point_bbox_coder import DistancePointBBoxCoder
+from .legacy_delta_xywh_bbox_coder import LegacyDeltaXYWHBBoxCoder
+from .pseudo_bbox_coder import PseudoBBoxCoder
+from .tblr_bbox_coder import TBLRBBoxCoder
+from .yolo_bbox_coder import YOLOBBoxCoder
+
+__all__ = [
+    'BaseBBoxCoder', 'PseudoBBoxCoder', 'DeltaXYWHBBoxCoder',
+    'LegacyDeltaXYWHBBoxCoder', 'TBLRBBoxCoder', 'YOLOBBoxCoder',
+    'BucketingBBoxCoder', 'DistancePointBBoxCoder'
+]
diff --git a/mmdet/core/bbox/coder/base_bbox_coder.py b/mmdet/core/bbox/coder/base_bbox_coder.py
new file mode 100755
index 0000000..a7ed041
--- /dev/null
+++ b/mmdet/core/bbox/coder/base_bbox_coder.py
@@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+
+class BaseBBoxCoder(metaclass=ABCMeta):
+    """Abstract base class for bounding box coders (encode/decode pair)."""
+
+    def __init__(self, **kwargs):
+        pass  # keyword arguments are accepted and ignored
+
+    @abstractmethod
+    def encode(self, bboxes, gt_bboxes):
+        """Encode deltas between bboxes and ground truth boxes."""
+
+    @abstractmethod
+    def decode(self, bboxes, bboxes_pred):
+        """Decode the predicted bboxes according to prediction and base
+        boxes."""
diff --git a/mmdet/core/bbox/coder/bucketing_bbox_coder.py b/mmdet/core/bbox/coder/bucketing_bbox_coder.py
new file mode 100755
index 0000000..4be0ada
--- /dev/null
+++ b/mmdet/core/bbox/coder/bucketing_bbox_coder.py
@@ -0,0 +1,351 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from ..builder import BBOX_CODERS
+from ..transforms import bbox_rescale
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@BBOX_CODERS.register_module()
+class BucketingBBoxCoder(BaseBBoxCoder):
+    """Bucketing BBox Coder for Side-Aware Boundary Localization (SABL).
+
+    Boundary Localization with Bucketing and Bucketing Guided Rescoring
+    are implemented here.
+
+    Please refer to https://arxiv.org/abs/1912.04260 for more details.
+
+    Args:
+        num_buckets (int): Number of buckets.
+        scale_factor (int): Scale factor of proposals to generate buckets.
+        offset_topk (int): Topk buckets are used to generate
+             bucket fine regression targets. Defaults to 2.
+        offset_upperbound (float): Offset upperbound to generate
+             bucket fine regression targets.
+             To avoid too large offset displacements. Defaults to 1.0.
+        cls_ignore_neighbor (bool): Ignore second nearest bucket or Not.
+             Defaults to True.
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Defaults to True.
+    """
+
+    def __init__(self,
+                 num_buckets,
+                 scale_factor,
+                 offset_topk=2,
+                 offset_upperbound=1.0,
+                 cls_ignore_neighbor=True,
+                 clip_border=True):
+        super(BucketingBBoxCoder, self).__init__()
+        self.num_buckets = num_buckets
+        self.scale_factor = scale_factor
+        self.offset_topk = offset_topk
+        self.offset_upperbound = offset_upperbound
+        self.cls_ignore_neighbor = cls_ignore_neighbor
+        self.clip_border = clip_border
+
+    def encode(self, bboxes, gt_bboxes):
+        """Get bucketing estimation and fine regression targets during
+        training.
+
+        Args:
+            bboxes (torch.Tensor): source boxes, e.g., object proposals.
+            gt_bboxes (torch.Tensor): target of the transformation, e.g.,
+                ground truth boxes.
+
+        Returns:
+           encoded_bboxes(tuple[Tensor]): bucketing estimation
+            and fine regression targets and weights
+        """
+
+        assert bboxes.size(0) == gt_bboxes.size(0)
+        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
+        encoded_bboxes = bbox2bucket(bboxes, gt_bboxes, self.num_buckets,
+                                     self.scale_factor, self.offset_topk,
+                                     self.offset_upperbound,
+                                     self.cls_ignore_neighbor)
+        return encoded_bboxes
+
+    def decode(self, bboxes, pred_bboxes, max_shape=None):
+        """Apply transformation `pred_bboxes` to `bboxes`.
+        Args:
+            bboxes (torch.Tensor): Basic boxes.
+            pred_bboxes (tuple[Tensor]): Pair (cls_preds, offset_preds) of
+                bucketing estimation and fine regression predictions.
+            max_shape (tuple[int], optional): Maximum shape of boxes.
+                Defaults to None.
+
+        Returns:
+            tuple[Tensor]: Decoded boxes and their localization confidence.
+        """
+        assert len(pred_bboxes) == 2
+        cls_preds, offset_preds = pred_bboxes
+        assert cls_preds.size(0) == bboxes.size(0) and offset_preds.size(
+            0) == bboxes.size(0)
+        decoded_bboxes = bucket2bbox(bboxes, cls_preds, offset_preds,
+                                     self.num_buckets, self.scale_factor,
+                                     max_shape, self.clip_border)
+
+        return decoded_bboxes
+
+
+@mmcv.jit(coderize=True)
+def generat_buckets(proposals, num_buckets, scale_factor=1.0):
+    """Generate buckets w.r.t bucket number and scale factor of proposals.
+
+    Args:
+        proposals (Tensor): Shape (n, 4)
+        num_buckets (int): Number of buckets.
+        scale_factor (float): Scale factor to rescale proposals.
+
+    Returns:
+        tuple[Tensor]: (bucket_w, bucket_h, l_buckets, r_buckets,
+         t_buckets, d_buckets)
+
+            - bucket_w: Width of buckets on x-axis. Shape (n, ).
+            - bucket_h: Height of buckets on y-axis. Shape (n, ).
+            - l_buckets: Left buckets. Shape (n, ceil(num_buckets/2)).
+            - r_buckets: Right buckets. Shape (n, ceil(num_buckets/2)).
+            - t_buckets: Top buckets. Shape (n, ceil(num_buckets/2)).
+            - d_buckets: Down buckets. Shape (n, ceil(num_buckets/2)).
+    """
+    proposals = bbox_rescale(proposals, scale_factor)
+
+    # number of buckets in each side
+    side_num = int(np.ceil(num_buckets / 2.0))
+    pw = proposals[..., 2] - proposals[..., 0]
+    ph = proposals[..., 3] - proposals[..., 1]
+    px1 = proposals[..., 0]
+    py1 = proposals[..., 1]
+    px2 = proposals[..., 2]
+    py2 = proposals[..., 3]
+
+    bucket_w = pw / num_buckets
+    bucket_h = ph / num_buckets
+
+    # left buckets: x1 + (i + 0.5) * bucket_w for i in range(side_num)
+    l_buckets = px1[:, None] + (0.5 + torch.arange(
+        0, side_num).to(proposals).float())[None, :] * bucket_w[:, None]
+    # right buckets: x2 - (i + 0.5) * bucket_w for i in range(side_num)
+    r_buckets = px2[:, None] - (0.5 + torch.arange(
+        0, side_num).to(proposals).float())[None, :] * bucket_w[:, None]
+    # top buckets: y1 + (i + 0.5) * bucket_h for i in range(side_num)
+    t_buckets = py1[:, None] + (0.5 + torch.arange(
+        0, side_num).to(proposals).float())[None, :] * bucket_h[:, None]
+    # down buckets: y2 - (i + 0.5) * bucket_h for i in range(side_num)
+    d_buckets = py2[:, None] - (0.5 + torch.arange(
+        0, side_num).to(proposals).float())[None, :] * bucket_h[:, None]
+    return bucket_w, bucket_h, l_buckets, r_buckets, t_buckets, d_buckets
+
+
+@mmcv.jit(coderize=True)
+def bbox2bucket(proposals,
+                gt,
+                num_buckets,
+                scale_factor,
+                offset_topk=2,
+                offset_upperbound=1.0,
+                cls_ignore_neighbor=True):
+    """Generate buckets estimation and fine regression targets.
+
+    Args:
+        proposals (Tensor): Shape (n, 4)
+        gt (Tensor): Shape (n, 4)
+        num_buckets (int): Number of buckets.
+        scale_factor (float): Scale factor to rescale proposals.
+        offset_topk (int): Topk buckets are used to generate
+             bucket fine regression targets. Defaults to 2.
+        offset_upperbound (float): Offset allowance to generate
+             bucket fine regression targets.
+             To avoid too large offset displacements. Defaults to 1.0.
+        cls_ignore_neighbor (bool): Ignore second nearest bucket or Not.
+             Defaults to True.
+
+    Returns:
+        tuple[Tensor]: (offsets, offsets_weights, bucket_labels, cls_weights).
+
+            - offsets: Fine regression targets. \
+                Shape (n, num_buckets*2).
+            - offsets_weights: Fine regression weights. \
+                Shape (n, num_buckets*2).
+            - bucket_labels: Bucketing estimation labels. \
+                Shape (n, num_buckets*2).
+            - cls_weights: Bucketing estimation weights. \
+                Shape (n, num_buckets*2).
+    """
+    assert proposals.size() == gt.size()
+
+    # generate buckets
+    proposals = proposals.float()
+    gt = gt.float()
+    (bucket_w, bucket_h, l_buckets, r_buckets, t_buckets,
+     d_buckets) = generat_buckets(proposals, num_buckets, scale_factor)
+
+    gx1 = gt[..., 0]
+    gy1 = gt[..., 1]
+    gx2 = gt[..., 2]
+    gy2 = gt[..., 3]
+
+    # generate offset targets and weights
+    # signed offsets from bucket centers to gt edges, in bucket-size units
+    l_offsets = (l_buckets - gx1[:, None]) / bucket_w[:, None]
+    r_offsets = (r_buckets - gx2[:, None]) / bucket_w[:, None]
+    t_offsets = (t_buckets - gy1[:, None]) / bucket_h[:, None]
+    d_offsets = (d_buckets - gy2[:, None]) / bucket_h[:, None]
+
+    # select top-k nearest buckets
+    l_topk, l_label = l_offsets.abs().topk(
+        offset_topk, dim=1, largest=False, sorted=True)
+    r_topk, r_label = r_offsets.abs().topk(
+        offset_topk, dim=1, largest=False, sorted=True)
+    t_topk, t_label = t_offsets.abs().topk(
+        offset_topk, dim=1, largest=False, sorted=True)
+    d_topk, d_label = d_offsets.abs().topk(
+        offset_topk, dim=1, largest=False, sorted=True)
+
+    offset_l_weights = l_offsets.new_zeros(l_offsets.size())
+    offset_r_weights = r_offsets.new_zeros(r_offsets.size())
+    offset_t_weights = t_offsets.new_zeros(t_offsets.size())
+    offset_d_weights = d_offsets.new_zeros(d_offsets.size())
+    inds = torch.arange(0, proposals.size(0)).to(proposals).long()
+
+    # generate offset weights of top-k nearest buckets
+    for k in range(offset_topk):
+        if k >= 1:  # farther buckets: only if within upperbound
+            offset_l_weights[inds, l_label[:,
+                                           k]] = (l_topk[:, k] <
+                                                  offset_upperbound).float()
+            offset_r_weights[inds, r_label[:,
+                                           k]] = (r_topk[:, k] <
+                                                  offset_upperbound).float()
+            offset_t_weights[inds, t_label[:,
+                                           k]] = (t_topk[:, k] <
+                                                  offset_upperbound).float()
+            offset_d_weights[inds, d_label[:,
+                                           k]] = (d_topk[:, k] <
+                                                  offset_upperbound).float()
+        else:  # nearest bucket is always a regression target
+            offset_l_weights[inds, l_label[:, k]] = 1.0
+            offset_r_weights[inds, r_label[:, k]] = 1.0
+            offset_t_weights[inds, t_label[:, k]] = 1.0
+            offset_d_weights[inds, d_label[:, k]] = 1.0
+
+    offsets = torch.cat([l_offsets, r_offsets, t_offsets, d_offsets], dim=-1)
+    offsets_weights = torch.cat([
+        offset_l_weights, offset_r_weights, offset_t_weights, offset_d_weights
+    ],
+                                dim=-1)
+
+    # generate bucket labels and weight
+    side_num = int(np.ceil(num_buckets / 2.0))
+    labels = torch.stack(
+        [l_label[:, 0], r_label[:, 0], t_label[:, 0], d_label[:, 0]], dim=-1)
+
+    batch_size = labels.size(0)
+    bucket_labels = F.one_hot(labels.view(-1), side_num).view(batch_size,
+                                                              -1).float()
+    bucket_cls_l_weights = (l_offsets.abs() < 1).float()
+    bucket_cls_r_weights = (r_offsets.abs() < 1).float()
+    bucket_cls_t_weights = (t_offsets.abs() < 1).float()
+    bucket_cls_d_weights = (d_offsets.abs() < 1).float()
+    bucket_cls_weights = torch.cat([
+        bucket_cls_l_weights, bucket_cls_r_weights, bucket_cls_t_weights,
+        bucket_cls_d_weights
+    ],
+                                   dim=-1)
+    # zero the cls weight of non-label buckets within one bucket of the gt
+    if cls_ignore_neighbor:
+        bucket_cls_weights = (~((bucket_cls_weights == 1) &
+                                (bucket_labels == 0))).float()
+    else:
+        bucket_cls_weights[:] = 1.0
+    return offsets, offsets_weights, bucket_labels, bucket_cls_weights
+
+
+@mmcv.jit(coderize=True)
+def bucket2bbox(proposals,
+                cls_preds,
+                offset_preds,
+                num_buckets,
+                scale_factor=1.0,
+                max_shape=None,
+                clip_border=True):
+    """Apply bucketing estimation (cls preds) and fine regression (offset
+    preds) to generate det bboxes.
+
+    Args:
+        proposals (Tensor): Boxes to be transformed. Shape (n, 4)
+        cls_preds (Tensor): bucketing estimation. Shape (n, num_buckets*2).
+        offset_preds (Tensor): fine regression. Shape (n, num_buckets*2).
+        num_buckets (int): Number of buckets.
+        scale_factor (float): Scale factor to rescale proposals.
+        max_shape (tuple[int, int]): Maximum bounds for boxes. specifies (H, W)
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Defaults to True.
+
+    Returns:
+        tuple[Tensor]: (bboxes, loc_confidence).
+
+            - bboxes: predicted bboxes. Shape (n, 4)
+            - loc_confidence: localization confidence of predicted bboxes.
+                Shape (n,).
+    """
+
+    side_num = int(np.ceil(num_buckets / 2.0))
+    cls_preds = cls_preds.view(-1, side_num)  # one row per box side
+    offset_preds = offset_preds.view(-1, side_num)
+
+    scores = F.softmax(cls_preds, dim=1)
+    score_topk, score_label = scores.topk(2, dim=1, largest=True, sorted=True)
+
+    rescaled_proposals = bbox_rescale(proposals, scale_factor)
+
+    pw = rescaled_proposals[..., 2] - rescaled_proposals[..., 0]
+    ph = rescaled_proposals[..., 3] - rescaled_proposals[..., 1]
+    px1 = rescaled_proposals[..., 0]
+    py1 = rescaled_proposals[..., 1]
+    px2 = rescaled_proposals[..., 2]
+    py2 = rescaled_proposals[..., 3]
+
+    bucket_w = pw / num_buckets
+    bucket_h = ph / num_buckets
+
+    score_inds_l = score_label[0::4, 0]  # rows cycle l, r, t, d
+    score_inds_r = score_label[1::4, 0]
+    score_inds_t = score_label[2::4, 0]
+    score_inds_d = score_label[3::4, 0]
+    l_buckets = px1 + (0.5 + score_inds_l.float()) * bucket_w
+    r_buckets = px2 - (0.5 + score_inds_r.float()) * bucket_w
+    t_buckets = py1 + (0.5 + score_inds_t.float()) * bucket_h
+    d_buckets = py2 - (0.5 + score_inds_d.float()) * bucket_h
+
+    offsets = offset_preds.view(-1, 4, side_num)
+    inds = torch.arange(proposals.size(0)).to(proposals).long()
+    l_offsets = offsets[:, 0, :][inds, score_inds_l]
+    r_offsets = offsets[:, 1, :][inds, score_inds_r]
+    t_offsets = offsets[:, 2, :][inds, score_inds_t]
+    d_offsets = offsets[:, 3, :][inds, score_inds_d]
+
+    x1 = l_buckets - l_offsets * bucket_w
+    x2 = r_buckets - r_offsets * bucket_w
+    y1 = t_buckets - t_offsets * bucket_h
+    y2 = d_buckets - d_offsets * bucket_h
+
+    if clip_border and max_shape is not None:
+        x1 = x1.clamp(min=0, max=max_shape[1] - 1)
+        y1 = y1.clamp(min=0, max=max_shape[0] - 1)
+        x2 = x2.clamp(min=0, max=max_shape[1] - 1)
+        y2 = y2.clamp(min=0, max=max_shape[0] - 1)
+    bboxes = torch.cat([x1[:, None], y1[:, None], x2[:, None], y2[:, None]],
+                       dim=-1)
+
+    # bucketing guided rescoring: add the 2nd score if its bucket is adjacent
+    loc_confidence = score_topk[:, 0]
+    top2_neighbor_inds = (score_label[:, 0] - score_label[:, 1]).abs() == 1
+    loc_confidence += score_topk[:, 1] * top2_neighbor_inds.float()
+    loc_confidence = loc_confidence.view(-1, 4).mean(dim=1)  # mean of 4 sides
+
+    return bboxes, loc_confidence
diff --git a/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py b/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py
new file mode 100755
index 0000000..a7f1c62
--- /dev/null
+++ b/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py
@@ -0,0 +1,392 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import mmcv
+import numpy as np
+import torch
+
+from ..builder import BBOX_CODERS
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@BBOX_CODERS.register_module()
+class DeltaXYWHBBoxCoder(BaseBBoxCoder):
+    """Delta XYWH BBox coder.
+
+    Following the practice in `R-CNN <https://arxiv.org/abs/1311.2524>`_,
+    this coder encodes bbox (x1, y1, x2, y2) into delta (dx, dy, dw, dh) and
+    decodes delta (dx, dy, dw, dh) back to original bbox (x1, y1, x2, y2).
+
+    Args:
+        target_means (Sequence[float]): Denormalizing means of target for
+            delta coordinates
+        target_stds (Sequence[float]): Denormalizing standard deviation of
+            target for delta coordinates
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Defaults to True.
+        add_ctr_clamp (bool): Whether to add center clamp, when added, the
+            predicted box is clamped if its center is too far away from
+            the original anchor's center. Only used by YOLOF. Default False.
+        ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF.
+            Default 32.
+    """
+
+    def __init__(self,
+                 target_means=(0., 0., 0., 0.),
+                 target_stds=(1., 1., 1., 1.),
+                 clip_border=True,
+                 add_ctr_clamp=False,
+                 ctr_clamp=32):
+        super(DeltaXYWHBBoxCoder, self).__init__()
+        self.means = target_means
+        self.stds = target_stds
+        self.clip_border = clip_border
+        self.add_ctr_clamp = add_ctr_clamp
+        self.ctr_clamp = ctr_clamp
+
+    def encode(self, bboxes, gt_bboxes):
+        """Get box regression transformation deltas that can be used to
+        transform the ``bboxes`` into the ``gt_bboxes``.
+
+        Args:
+            bboxes (torch.Tensor): Source boxes, e.g., object proposals.
+            gt_bboxes (torch.Tensor): Target of the transformation, e.g.,
+                ground-truth boxes.
+
+        Returns:
+            torch.Tensor: Box transformation deltas
+        """
+
+        assert bboxes.size(0) == gt_bboxes.size(0)
+        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
+        encoded_bboxes = bbox2delta(bboxes, gt_bboxes, self.means, self.stds)
+        return encoded_bboxes
+
+    def decode(self,
+               bboxes,
+               pred_bboxes,
+               max_shape=None,
+               wh_ratio_clip=16 / 1000):
+        """Apply transformation `pred_bboxes` to `bboxes`.
+
+        Args:
+            bboxes (torch.Tensor): Basic boxes. Shape (B, N, 4) or (N, 4)
+            pred_bboxes (Tensor): Encoded offsets with respect to each roi.
+               Has shape (B, N, num_classes * 4) or (B, N, 4) or
+               (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H
+               when rois is a grid of anchors.Offset encoding follows [1]_.
+            max_shape (Sequence[int] or torch.Tensor or Sequence[
+               Sequence[int]],optional): Maximum bounds for boxes, specifies
+               (H, W, C) or (H, W). If bboxes shape is (B, N, 4), then
+               the max_shape should be a Sequence[Sequence[int]]
+               and the length of max_shape should also be B.
+            wh_ratio_clip (float, optional): The allowed ratio between
+                width and height.
+
+        Returns:
+            torch.Tensor: Decoded boxes.
+        """
+
+        assert pred_bboxes.size(0) == bboxes.size(0)
+        if pred_bboxes.ndim == 3:
+            assert pred_bboxes.size(1) == bboxes.size(1)
+
+        if pred_bboxes.ndim == 2 and not torch.onnx.is_in_onnx_export():
+            # single image decode
+            decoded_bboxes = delta2bbox(bboxes, pred_bboxes, self.means,
+                                        self.stds, max_shape, wh_ratio_clip,
+                                        self.clip_border, self.add_ctr_clamp,
+                                        self.ctr_clamp)
+        else:
+            if pred_bboxes.ndim == 3 and not torch.onnx.is_in_onnx_export():
+                warnings.warn(
+                    'DeprecationWarning: onnx_delta2bbox is deprecated '
+                    'in the case of batch decoding and non-ONNX, '
+                    'please use “delta2bbox” instead. In order to improve '
+                    'the decoding speed, the batch function will no '
+                    'longer be supported. ')
+            decoded_bboxes = onnx_delta2bbox(bboxes, pred_bboxes, self.means,
+                                             self.stds, max_shape,
+                                             wh_ratio_clip, self.clip_border,
+                                             self.add_ctr_clamp,
+                                             self.ctr_clamp)
+
+        return decoded_bboxes
+
+
+@mmcv.jit(coderize=True)
+def bbox2delta(proposals, gt, means=(0., 0., 0., 0.), stds=(1., 1., 1., 1.)):
+    """Compute deltas of proposals w.r.t. gt.
+
+    We usually compute the deltas of x, y, w, h of proposals w.r.t ground
+    truth bboxes to get regression target.
+    This is the inverse function of :func:`delta2bbox`.
+
+    Args:
+        proposals (Tensor): Boxes to be transformed, shape (N, ..., 4)
+        gt (Tensor): Gt bboxes used as regression targets, shape (N, ..., 4)
+        means (Sequence[float]): Denormalizing means for delta coordinates
+        stds (Sequence[float]): Denormalizing standard deviation for delta
+            coordinates
+
+    Returns:
+        Tensor: deltas with shape (N, ..., 4), where the last dimension
+            represents dx, dy, dw, dh.
+    """
+    assert proposals.size() == gt.size()
+
+    proposals = proposals.float()
+    gt = gt.float()
+    px = (proposals[..., 0] + proposals[..., 2]) * 0.5
+    py = (proposals[..., 1] + proposals[..., 3]) * 0.5
+    pw = proposals[..., 2] - proposals[..., 0]
+    ph = proposals[..., 3] - proposals[..., 1]
+
+    gx = (gt[..., 0] + gt[..., 2]) * 0.5
+    gy = (gt[..., 1] + gt[..., 3]) * 0.5
+    gw = gt[..., 2] - gt[..., 0]
+    gh = gt[..., 3] - gt[..., 1]
+
+    dx = (gx - px) / pw
+    dy = (gy - py) / ph
+    dw = torch.log(gw / pw)
+    dh = torch.log(gh / ph)
+    deltas = torch.stack([dx, dy, dw, dh], dim=-1)
+
+    means = deltas.new_tensor(means).unsqueeze(0)
+    stds = deltas.new_tensor(stds).unsqueeze(0)
+    deltas = deltas.sub_(means).div_(stds)  # in-place normalization
+
+    return deltas
+
+
+@mmcv.jit(coderize=True)
+def delta2bbox(rois,
+               deltas,
+               means=(0., 0., 0., 0.),
+               stds=(1., 1., 1., 1.),
+               max_shape=None,
+               wh_ratio_clip=16 / 1000,
+               clip_border=True,
+               add_ctr_clamp=False,
+               ctr_clamp=32):
+    """Apply deltas to shift/scale base boxes.
+
+    Typically the rois are anchor or proposed bounding boxes and the deltas are
+    network outputs used to shift/scale those boxes.
+    This is the inverse function of :func:`bbox2delta`.
+
+    Args:
+        rois (Tensor): Boxes to be transformed. Has shape (N, 4).
+        deltas (Tensor): Encoded offsets relative to each roi.
+            Has shape (N, num_classes * 4) or (N, 4). Note
+            N = num_base_anchors * W * H, when rois is a grid of
+            anchors. Offset encoding follows [1]_.
+        means (Sequence[float]): Denormalizing means for delta coordinates.
+            Default (0., 0., 0., 0.).
+        stds (Sequence[float]): Denormalizing standard deviation for delta
+            coordinates. Default (1., 1., 1., 1.).
+        max_shape (tuple[int, int]): Maximum bounds for boxes, specifies
+           (H, W). Default None.
+        wh_ratio_clip (float): dw and dh are clamped with the bound
+            abs(log(wh_ratio_clip)). Default 16 / 1000.
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Default True.
+        add_ctr_clamp (bool): Whether to add center clamp. When set to True,
+            the center of the prediction bounding box will be clamped to
+            avoid being too far away from the center of the anchor.
+            Only used by YOLOF. Default False.
+        ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF.
+            Default 32.
+
+    Returns:
+        Tensor: Boxes with shape (N, num_classes * 4) or (N, 4), where 4
+           represent tl_x, tl_y, br_x, br_y.
+
+    References:
+        .. [1] https://arxiv.org/abs/1311.2524
+
+    Example:
+        >>> rois = torch.Tensor([[ 0.,  0.,  1.,  1.],
+        >>>                      [ 0.,  0.,  1.,  1.],
+        >>>                      [ 0.,  0.,  1.,  1.],
+        >>>                      [ 5.,  5.,  5.,  5.]])
+        >>> deltas = torch.Tensor([[  0.,   0.,   0.,   0.],
+        >>>                        [  1.,   1.,   1.,   1.],
+        >>>                        [  0.,   0.,   2.,  -1.],
+        >>>                        [ 0.7, -1.9, -0.5,  0.3]])
+        >>> delta2bbox(rois, deltas, max_shape=(32, 32, 3))
+        tensor([[0.0000, 0.0000, 1.0000, 1.0000],
+                [0.1409, 0.1409, 2.8591, 2.8591],
+                [0.0000, 0.3161, 4.1945, 0.6839],
+                [5.0000, 5.0000, 5.0000, 5.0000]])
+    """
+    num_bboxes, num_classes = deltas.size(0), deltas.size(1) // 4
+    if num_bboxes == 0:
+        return deltas  # nothing to decode
+
+    deltas = deltas.reshape(-1, 4)
+
+    means = deltas.new_tensor(means).view(1, -1)
+    stds = deltas.new_tensor(stds).view(1, -1)
+    denorm_deltas = deltas * stds + means
+
+    dxy = denorm_deltas[:, :2]
+    dwh = denorm_deltas[:, 2:]
+
+    # Tile each roi once per class, then compute its center and size
+    rois_ = rois.repeat(1, num_classes).reshape(-1, 4)
+    pxy = ((rois_[:, :2] + rois_[:, 2:]) * 0.5)
+    pwh = (rois_[:, 2:] - rois_[:, :2])
+
+    dxy_wh = pwh * dxy
+
+    max_ratio = np.abs(np.log(wh_ratio_clip))
+    if add_ctr_clamp:
+        dxy_wh = torch.clamp(dxy_wh, max=ctr_clamp, min=-ctr_clamp)
+        dwh = torch.clamp(dwh, max=max_ratio)  # upper bound only
+    else:
+        dwh = dwh.clamp(min=-max_ratio, max=max_ratio)
+
+    gxy = pxy + dxy_wh
+    gwh = pwh * dwh.exp()
+    x1y1 = gxy - (gwh * 0.5)
+    x2y2 = gxy + (gwh * 0.5)
+    bboxes = torch.cat([x1y1, x2y2], dim=-1)
+    if clip_border and max_shape is not None:
+        bboxes[..., 0::2].clamp_(min=0, max=max_shape[1])  # x coords
+        bboxes[..., 1::2].clamp_(min=0, max=max_shape[0])  # y coords
+    bboxes = bboxes.reshape(num_bboxes, -1)
+    return bboxes
+
+
+def onnx_delta2bbox(rois,
+                    deltas,
+                    means=(0., 0., 0., 0.),
+                    stds=(1., 1., 1., 1.),
+                    max_shape=None,
+                    wh_ratio_clip=16 / 1000,
+                    clip_border=True,
+                    add_ctr_clamp=False,
+                    ctr_clamp=32):
+    """Apply deltas to shift/scale base boxes.
+
+    Typically the rois are anchor or proposed bounding boxes and the deltas are
+    network outputs used to shift/scale those boxes.
+    This is the inverse function of :func:`bbox2delta`.
+
+    Args:
+        rois (Tensor): Boxes to be transformed. Has shape (N, 4) or (B, N, 4)
+        deltas (Tensor): Encoded offsets with respect to each roi.
+            Has shape (B, N, num_classes * 4) or (B, N, 4) or
+            (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H
+            when rois is a grid of anchors. Offset encoding follows [1]_.
+        means (Sequence[float]): Denormalizing means for delta coordinates.
+            Default (0., 0., 0., 0.).
+        stds (Sequence[float]): Denormalizing standard deviation for delta
+            coordinates. Default (1., 1., 1., 1.).
+        max_shape (Sequence[int] or torch.Tensor or Sequence[
+            Sequence[int]], optional): Maximum bounds for boxes, specifies
+            (H, W, C) or (H, W). If rois shape is (B, N, 4), then
+            the max_shape should be a Sequence[Sequence[int]]
+            and the length of max_shape should also be B. Default None.
+        wh_ratio_clip (float): Maximum aspect ratio for boxes.
+            Default 16 / 1000.
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Default True.
+        add_ctr_clamp (bool): Whether to add center clamp, when added, the
+            predicted box is clamped if its center is too far away from
+            the original anchor's center. Only used by YOLOF. Default False.
+        ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF.
+            Default 32.
+
+    Returns:
+        Tensor: Boxes with shape (B, N, num_classes * 4) or (B, N, 4) or
+           (N, num_classes * 4) or (N, 4), where 4 represent
+           tl_x, tl_y, br_x, br_y.
+
+    References:
+        .. [1] https://arxiv.org/abs/1311.2524
+
+    Example:
+        >>> rois = torch.Tensor([[ 0.,  0.,  1.,  1.],
+        >>>                      [ 0.,  0.,  1.,  1.],
+        >>>                      [ 0.,  0.,  1.,  1.],
+        >>>                      [ 5.,  5.,  5.,  5.]])
+        >>> deltas = torch.Tensor([[  0.,   0.,   0.,   0.],
+        >>>                        [  1.,   1.,   1.,   1.],
+        >>>                        [  0.,   0.,   2.,  -1.],
+        >>>                        [ 0.7, -1.9, -0.5,  0.3]])
+        >>> onnx_delta2bbox(rois, deltas, max_shape=(32, 32, 3))
+        tensor([[0.0000, 0.0000, 1.0000, 1.0000],
+                [0.1409, 0.1409, 2.8591, 2.8591],
+                [0.0000, 0.3161, 4.1945, 0.6839],
+                [5.0000, 5.0000, 5.0000, 5.0000]])
+    """
+    means = deltas.new_tensor(means).view(1,
+                                          -1).repeat(1,
+                                                     deltas.size(-1) // 4)
+    stds = deltas.new_tensor(stds).view(1, -1).repeat(1, deltas.size(-1) // 4)
+    denorm_deltas = deltas * stds + means
+    dx = denorm_deltas[..., 0::4]
+    dy = denorm_deltas[..., 1::4]
+    dw = denorm_deltas[..., 2::4]
+    dh = denorm_deltas[..., 3::4]
+
+    x1, y1 = rois[..., 0], rois[..., 1]
+    x2, y2 = rois[..., 2], rois[..., 3]
+    # Compute center of each roi
+    px = ((x1 + x2) * 0.5).unsqueeze(-1).expand_as(dx)
+    py = ((y1 + y2) * 0.5).unsqueeze(-1).expand_as(dy)
+    # Compute width/height of each roi
+    pw = (x2 - x1).unsqueeze(-1).expand_as(dw)
+    ph = (y2 - y1).unsqueeze(-1).expand_as(dh)
+
+    dx_width = pw * dx
+    dy_height = ph * dy
+
+    max_ratio = np.abs(np.log(wh_ratio_clip))
+    if add_ctr_clamp:
+        dx_width = torch.clamp(dx_width, max=ctr_clamp, min=-ctr_clamp)
+        dy_height = torch.clamp(dy_height, max=ctr_clamp, min=-ctr_clamp)
+        dw = torch.clamp(dw, max=max_ratio)
+        dh = torch.clamp(dh, max=max_ratio)
+    else:
+        dw = dw.clamp(min=-max_ratio, max=max_ratio)
+        dh = dh.clamp(min=-max_ratio, max=max_ratio)
+    # Use exp(network energy) to enlarge/shrink each roi
+    gw = pw * dw.exp()
+    gh = ph * dh.exp()
+    # Use network energy to shift the center of each roi
+    gx = px + dx_width
+    gy = py + dy_height
+    # Convert center-xy/width/height to top-left, bottom-right
+    x1 = gx - gw * 0.5
+    y1 = gy - gh * 0.5
+    x2 = gx + gw * 0.5
+    y2 = gy + gh * 0.5
+
+    bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view(deltas.size())
+
+    if clip_border and max_shape is not None:
+        # clip bboxes with dynamic `min` and `max` for onnx
+        if torch.onnx.is_in_onnx_export():
+            from mmdet.core.export import dynamic_clip_for_onnx
+            x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape)
+            bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view(deltas.size())
+            return bboxes
+        if not isinstance(max_shape, torch.Tensor):
+            max_shape = x1.new_tensor(max_shape)
+        max_shape = max_shape[..., :2].type_as(x1)
+        if max_shape.ndim == 2:
+            assert bboxes.ndim == 3
+            assert max_shape.size(0) == bboxes.size(0)
+        # Tile (H, W) per column pair, flip to (W, H, ...), broadcast over N.
+        min_xy = x1.new_tensor(0)
+        max_xy = torch.cat(
+            [max_shape] * (deltas.size(-1) // 2),
+            dim=-1).flip(-1).unsqueeze(-2)
+        bboxes = torch.where(bboxes < min_xy, min_xy, bboxes)
+        bboxes = torch.where(bboxes > max_xy, max_xy, bboxes)
+
+    return bboxes
diff --git a/mmdet/core/bbox/coder/distance_point_bbox_coder.py b/mmdet/core/bbox/coder/distance_point_bbox_coder.py
new file mode 100755
index 0000000..9f308a8
--- /dev/null
+++ b/mmdet/core/bbox/coder/distance_point_bbox_coder.py
@@ -0,0 +1,63 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import BBOX_CODERS
+from ..transforms import bbox2distance, distance2bbox
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@BBOX_CODERS.register_module()
+class DistancePointBBoxCoder(BaseBBoxCoder):
+    """Distance Point BBox coder.
+
+    This coder encodes gt bboxes (x1, y1, x2, y2) into (left, top, right,
+    bottom) distances from a point and decodes them back to the original.
+
+    Args:
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Defaults to True.
+    """
+
+    def __init__(self, clip_border=True):
+        super().__init__()
+        self.clip_border = clip_border
+
+    def encode(self, points, gt_bboxes, max_dis=None, eps=0.1):
+        """Encode bounding box to distances.
+
+        Args:
+            points (Tensor): Shape (N, 2), The format is [x, y].
+            gt_bboxes (Tensor): Shape (N, 4), The format is "xyxy"
+            max_dis (float): Upper bound of the distance. Default None.
+            eps (float): a small value to ensure target < max_dis, instead <=.
+                Default 0.1.
+
+        Returns:
+            Tensor: Box transformation deltas. The shape is (N, 4).
+        """
+        assert points.size(0) == gt_bboxes.size(0)
+        assert points.size(-1) == 2
+        assert gt_bboxes.size(-1) == 4
+        return bbox2distance(points, gt_bboxes, max_dis, eps)
+
+    def decode(self, points, pred_bboxes, max_shape=None):
+        """Decode distance prediction to bounding box.
+
+        Args:
+            points (Tensor): Shape (B, N, 2) or (N, 2).
+            pred_bboxes (Tensor): Distance from the given point to 4
+                boundaries (left, top, right, bottom). Shape (B, N, 4)
+                or (N, 4)
+            max_shape (Sequence[int] or torch.Tensor or Sequence[
+                Sequence[int]], optional): Maximum bounds for boxes, specifies
+                (H, W, C) or (H, W). If points shape is (B, N, 2), then
+                the max_shape should be a Sequence[Sequence[int]],
+                and the length of max_shape should also be B.
+                Default None. Ignored when ``self.clip_border`` is False.
+        Returns:
+            Tensor: Boxes with shape (N, 4) or (B, N, 4)
+        """
+        assert points.size(0) == pred_bboxes.size(0)
+        assert points.size(-1) == 2
+        assert pred_bboxes.size(-1) == 4
+        if self.clip_border is False:
+            max_shape = None
+        return distance2bbox(points, pred_bboxes, max_shape)
diff --git a/mmdet/core/bbox/coder/legacy_delta_xywh_bbox_coder.py b/mmdet/core/bbox/coder/legacy_delta_xywh_bbox_coder.py
new file mode 100755
index 0000000..7fa348b
--- /dev/null
+++ b/mmdet/core/bbox/coder/legacy_delta_xywh_bbox_coder.py
@@ -0,0 +1,216 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import torch
+
+from ..builder import BBOX_CODERS
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@BBOX_CODERS.register_module()
+class LegacyDeltaXYWHBBoxCoder(BaseBBoxCoder):
+    """Legacy Delta XYWH BBox coder used in MMDet V1.x.
+
+    Following the practice in R-CNN [1]_, this coder encodes bbox (x1, y1, x2,
+    y2) into delta (dx, dy, dw, dh) and decodes delta (dx, dy, dw, dh)
+    back to original bbox (x1, y1, x2, y2).
+
+    Note:
+        The main difference between :class:`LegacyDeltaXYWHBBoxCoder` and
+        :class:`DeltaXYWHBBoxCoder` is whether ``+ 1`` is used during width and
+        height calculation. We suggest to only use this coder when testing with
+        MMDet V1.x models.
+
+    References:
+        .. [1] https://arxiv.org/abs/1311.2524
+
+    Args:
+        target_means (Sequence[float]): denormalizing means of target for
+            delta coordinates
+        target_stds (Sequence[float]): denormalizing standard deviation of
+            target for delta coordinates
+    """
+
+    def __init__(self,
+                 target_means=(0., 0., 0., 0.),
+                 target_stds=(1., 1., 1., 1.)):
+        super().__init__()
+        self.means = target_means
+        self.stds = target_stds
+
+    def encode(self, bboxes, gt_bboxes):
+        """Get box regression transformation deltas that can be used to
+        transform the ``bboxes`` into the ``gt_bboxes``.
+
+        Args:
+            bboxes (torch.Tensor): source boxes, e.g., object proposals.
+            gt_bboxes (torch.Tensor): target of the transformation, e.g.,
+                ground-truth boxes.
+
+        Returns:
+            torch.Tensor: Box transformation deltas
+        """
+        assert bboxes.size(0) == gt_bboxes.size(0)
+        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
+        encoded_bboxes = legacy_bbox2delta(bboxes, gt_bboxes, self.means,
+                                           self.stds)
+        return encoded_bboxes
+
+    def decode(self,
+               bboxes,
+               pred_bboxes,
+               max_shape=None,
+               wh_ratio_clip=16 / 1000):
+        """Apply transformation `pred_bboxes` to `bboxes`.
+
+        Args:
+            bboxes (torch.Tensor): Basic boxes.
+            pred_bboxes (torch.Tensor): Encoded deltas w.r.t. ``bboxes``.
+            max_shape (tuple[int], optional): Maximum shape of boxes.
+                Defaults to None.
+            wh_ratio_clip (float, optional): The allowed ratio between
+                width and height.
+
+        Returns:
+            torch.Tensor: Decoded boxes.
+        """
+        assert pred_bboxes.size(0) == bboxes.size(0)
+        decoded_bboxes = legacy_delta2bbox(bboxes, pred_bboxes, self.means,
+                                           self.stds, max_shape, wh_ratio_clip)
+
+        return decoded_bboxes
+
+
+@mmcv.jit(coderize=True)
+def legacy_bbox2delta(proposals,
+                      gt,
+                      means=(0., 0., 0., 0.),
+                      stds=(1., 1., 1., 1.)):
+    """Compute deltas of proposals w.r.t. gt in the MMDet V1.x manner.
+
+    We usually compute the deltas of x, y, w, h of proposals w.r.t ground
+    truth bboxes to get regression target.
+    This is the inverse function of `legacy_delta2bbox()`
+
+    Args:
+        proposals (Tensor): Boxes to be transformed, shape (N, ..., 4)
+        gt (Tensor): Gt bboxes to be used as base, shape (N, ..., 4)
+        means (Sequence[float]): Denormalizing means for delta coordinates
+        stds (Sequence[float]): Denormalizing standard deviation for delta
+            coordinates
+
+    Returns:
+        Tensor: deltas with shape (N, ..., 4), where the last dimension holds
+            dx, dy, dw, dh.
+    """
+    assert proposals.size() == gt.size()
+
+    proposals = proposals.float()
+    gt = gt.float()
+    px = (proposals[..., 0] + proposals[..., 2]) * 0.5
+    py = (proposals[..., 1] + proposals[..., 3]) * 0.5
+    pw = proposals[..., 2] - proposals[..., 0] + 1.0
+    ph = proposals[..., 3] - proposals[..., 1] + 1.0
+    # Same center / size decomposition (with the legacy ``+ 1``) for gt boxes
+    gx = (gt[..., 0] + gt[..., 2]) * 0.5
+    gy = (gt[..., 1] + gt[..., 3]) * 0.5
+    gw = gt[..., 2] - gt[..., 0] + 1.0
+    gh = gt[..., 3] - gt[..., 1] + 1.0
+
+    dx = (gx - px) / pw
+    dy = (gy - py) / ph
+    dw = torch.log(gw / pw)
+    dh = torch.log(gh / ph)
+    deltas = torch.stack([dx, dy, dw, dh], dim=-1)
+
+    means = deltas.new_tensor(means).unsqueeze(0)
+    stds = deltas.new_tensor(stds).unsqueeze(0)
+    deltas = deltas.sub_(means).div_(stds)
+
+    return deltas
+
+
+@mmcv.jit(coderize=True)
+def legacy_delta2bbox(rois,
+                      deltas,
+                      means=(0., 0., 0., 0.),
+                      stds=(1., 1., 1., 1.),
+                      max_shape=None,
+                      wh_ratio_clip=16 / 1000):
+    """Apply deltas to shift/scale base boxes in the MMDet V1.x manner.
+
+    Typically the rois are anchor or proposed bounding boxes and the deltas are
+    network outputs used to shift/scale those boxes.
+    This is the inverse function of `legacy_bbox2delta()`
+
+    Args:
+        rois (Tensor): Boxes to be transformed. Has shape (N, 4)
+        deltas (Tensor): Encoded offsets with respect to each roi.
+            Has shape (N, 4 * num_classes). Note N = num_anchors * W * H when
+            rois is a grid of anchors. Offset encoding follows [1]_.
+        means (Sequence[float]): Denormalizing means for delta coordinates
+        stds (Sequence[float]): Denormalizing standard deviation for delta
+            coordinates
+        max_shape (tuple[int, int]): Maximum bounds for boxes. specifies (H, W)
+        wh_ratio_clip (float): Maximum aspect ratio for boxes.
+
+    Returns:
+        Tensor: Boxes with the same shape as ``deltas``; every 4 columns
+            represent tl_x, tl_y, br_x, br_y.
+
+    References:
+        .. [1] https://arxiv.org/abs/1311.2524
+
+    Example:
+        >>> rois = torch.Tensor([[ 0.,  0.,  1.,  1.],
+        >>>                      [ 0.,  0.,  1.,  1.],
+        >>>                      [ 0.,  0.,  1.,  1.],
+        >>>                      [ 5.,  5.,  5.,  5.]])
+        >>> deltas = torch.Tensor([[  0.,   0.,   0.,   0.],
+        >>>                        [  1.,   1.,   1.,   1.],
+        >>>                        [  0.,   0.,   2.,  -1.],
+        >>>                        [ 0.7, -1.9, -0.5,  0.3]])
+        >>> legacy_delta2bbox(rois, deltas, max_shape=(32, 32))
+        tensor([[0.0000, 0.0000, 1.5000, 1.5000],
+                [0.0000, 0.0000, 5.2183, 5.2183],
+                [0.0000, 0.1321, 7.8891, 0.8679],
+                [5.3967, 2.4251, 6.0033, 3.7749]])
+    """
+    means = deltas.new_tensor(means).repeat(1, deltas.size(1) // 4)
+    stds = deltas.new_tensor(stds).repeat(1, deltas.size(1) // 4)
+    denorm_deltas = deltas * stds + means
+    dx = denorm_deltas[:, 0::4]
+    dy = denorm_deltas[:, 1::4]
+    dw = denorm_deltas[:, 2::4]
+    dh = denorm_deltas[:, 3::4]
+    max_ratio = np.abs(np.log(wh_ratio_clip))
+    dw = dw.clamp(min=-max_ratio, max=max_ratio)
+    dh = dh.clamp(min=-max_ratio, max=max_ratio)
+    # Compute center of each roi
+    px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx)
+    py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy)
+    # Compute width/height of each roi
+    pw = (rois[:, 2] - rois[:, 0] + 1.0).unsqueeze(1).expand_as(dw)
+    ph = (rois[:, 3] - rois[:, 1] + 1.0).unsqueeze(1).expand_as(dh)
+    # Use exp(network energy) to enlarge/shrink each roi
+    gw = pw * dw.exp()
+    gh = ph * dh.exp()
+    # Use network energy to shift the center of each roi
+    gx = px + pw * dx
+    gy = py + ph * dy
+    # Convert center-xy/width/height to top-left, bottom-right
+
+    # The true legacy box coder should +- 0.5 here.
+    # However, current implementation improves the performance when testing
+    # the models trained in MMDetection 1.X (~0.5 bbox AP, 0.2 mask AP)
+    x1 = gx - gw * 0.5
+    y1 = gy - gh * 0.5
+    x2 = gx + gw * 0.5
+    y2 = gy + gh * 0.5
+    if max_shape is not None:
+        x1 = x1.clamp(min=0, max=max_shape[1] - 1)
+        y1 = y1.clamp(min=0, max=max_shape[0] - 1)
+        x2 = x2.clamp(min=0, max=max_shape[1] - 1)
+        y2 = y2.clamp(min=0, max=max_shape[0] - 1)
+    bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view_as(deltas)
+    return bboxes
diff --git a/mmdet/core/bbox/coder/pseudo_bbox_coder.py b/mmdet/core/bbox/coder/pseudo_bbox_coder.py
new file mode 100755
index 0000000..fe71f36
--- /dev/null
+++ b/mmdet/core/bbox/coder/pseudo_bbox_coder.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import BBOX_CODERS
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@BBOX_CODERS.register_module()
+class PseudoBBoxCoder(BaseBBoxCoder):
+    """Pseudo bounding box coder that passes boxes through unchanged."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def encode(self, bboxes, gt_bboxes):
+        """torch.Tensor: return the given ``gt_bboxes``"""
+        return gt_bboxes
+
+    def decode(self, bboxes, pred_bboxes):
+        """torch.Tensor: return the given ``pred_bboxes``"""
+        return pred_bboxes
diff --git a/mmdet/core/bbox/coder/tblr_bbox_coder.py b/mmdet/core/bbox/coder/tblr_bbox_coder.py
new file mode 100755
index 0000000..cb42066
--- /dev/null
+++ b/mmdet/core/bbox/coder/tblr_bbox_coder.py
@@ -0,0 +1,206 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from ..builder import BBOX_CODERS
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@BBOX_CODERS.register_module()
+class TBLRBBoxCoder(BaseBBoxCoder):
+    """TBLR BBox coder.
+
+    Following the practice in `FSAF <https://arxiv.org/abs/1903.00621>`_,
+    this coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left,
+    right) and decode it back to the original.
+
+    Args:
+        normalizer (list | float): Normalization factor to be
+          divided with when coding the coordinates. If it is a list, it should
+          have length of 4 indicating normalization factor in tblr dims.
+          Otherwise it is a unified float factor for all dims. Default: 4.0
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Defaults to True.
+    """
+
+    def __init__(self, normalizer=4.0, clip_border=True):
+        super().__init__()
+        self.normalizer = normalizer
+        self.clip_border = clip_border
+
+    def encode(self, bboxes, gt_bboxes):
+        """Get box regression transformation deltas that can be used to
+        transform the ``bboxes`` into the ``gt_bboxes`` in the (top, bottom,
+        left, right) order.
+
+        Args:
+            bboxes (torch.Tensor): source boxes, e.g., object proposals.
+            gt_bboxes (torch.Tensor): target of the transformation, e.g.,
+                ground truth boxes.
+
+        Returns:
+            torch.Tensor: Box transformation deltas
+        """
+        assert bboxes.size(0) == gt_bboxes.size(0)
+        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
+        encoded_bboxes = bboxes2tblr(
+            bboxes, gt_bboxes, normalizer=self.normalizer)
+        return encoded_bboxes
+
+    def decode(self, bboxes, pred_bboxes, max_shape=None):
+        """Apply transformation `pred_bboxes` to `bboxes`.
+
+        Args:
+            bboxes (torch.Tensor): Basic boxes. Shape (B, N, 4) or (N, 4)
+            pred_bboxes (torch.Tensor): Encoded boxes with shape
+               (B, N, 4) or (N, 4)
+            max_shape (Sequence[int] or torch.Tensor or Sequence[
+               Sequence[int]], optional): Maximum bounds for boxes, specifies
+               (H, W, C) or (H, W). If bboxes shape is (B, N, 4), then
+               the max_shape should be a Sequence[Sequence[int]]
+               and the length of max_shape should also be B.
+
+        Returns:
+            torch.Tensor: Decoded boxes.
+        """
+        decoded_bboxes = tblr2bboxes(
+            bboxes,
+            pred_bboxes,
+            normalizer=self.normalizer,
+            max_shape=max_shape,
+            clip_border=self.clip_border)
+
+        return decoded_bboxes
+
+
+@mmcv.jit(coderize=True)
+def bboxes2tblr(priors, gts, normalizer=4.0, normalize_by_wh=True):
+    """Encode ground truth boxes to tblr coordinate.
+
+    It first convert the gt coordinate to tblr format,
+     (top, bottom, left, right), relative to prior box centers.
+     The tblr coordinate may be normalized by the side length of prior bboxes
+     if `normalize_by_wh` is specified as True, and it is then normalized by
+     the `normalizer` factor.
+
+    Args:
+        priors (Tensor): Prior boxes in point form
+            Shape: (num_proposals,4).
+        gts (Tensor): Coords of ground truth for each prior in point-form
+            Shape: (num_proposals, 4).
+        normalizer (Sequence[float] | float): normalization parameter of
+            encoded boxes. If it is a list, it has to have length = 4.
+            A plain int is accepted as a scalar factor too. Default: 4.0
+        normalize_by_wh (bool): Whether to normalize tblr coordinate by the
+            side length (wh) of prior bboxes.
+
+    Return:
+        encoded boxes (Tensor), Shape: (num_proposals, 4)
+    """
+
+    # Scalars (int/float) are used as-is; a sequence must hold 4 factors.
+    if not isinstance(normalizer, (int, float)):
+        normalizer = torch.tensor(normalizer, device=priors.device)
+        assert len(normalizer) == 4, 'Normalizer must have length = 4'
+    assert priors.size(0) == gts.size(0)
+    prior_centers = (priors[:, 0:2] + priors[:, 2:4]) / 2
+    xmin, ymin, xmax, ymax = gts.split(1, dim=1)
+    top = prior_centers[:, 1].unsqueeze(1) - ymin
+    bottom = ymax - prior_centers[:, 1].unsqueeze(1)
+    left = prior_centers[:, 0].unsqueeze(1) - xmin
+    right = xmax - prior_centers[:, 0].unsqueeze(1)
+    loc = torch.cat((top, bottom, left, right), dim=1)
+    if normalize_by_wh:
+        # Normalize tblr by anchor width and height
+        wh = priors[:, 2:4] - priors[:, 0:2]
+        w, h = torch.split(wh, 1, dim=1)
+        loc[:, :2] /= h  # tb is normalized by h
+        loc[:, 2:] /= w  # lr is normalized by w
+    # Normalize tblr by the given normalization factor
+    return loc / normalizer
+
+
+@mmcv.jit(coderize=True)
+def tblr2bboxes(priors,
+                tblr,
+                normalizer=4.0,
+                normalize_by_wh=True,
+                max_shape=None,
+                clip_border=True):
+    """Decode tblr outputs to prediction boxes.
+
+    The process includes 3 steps: 1) De-normalize tblr coordinates by
+    multiplying it with `normalizer`; 2) De-normalize tblr coordinates by the
+    prior bbox width and height if `normalize_by_wh` is `True`; 3) Convert
+    tblr (top, bottom, left, right) pair relative to the center of priors back
+    to (xmin, ymin, xmax, ymax) coordinate.
+
+    Args:
+        priors (Tensor): Prior boxes in point form (x0, y0, x1, y1)
+          Shape: (N,4) or (B, N, 4).
+        tblr (Tensor): Coords of network output in tblr form
+          Shape: (N, 4) or (B, N, 4).
+        normalizer (Sequence[float] | float): Normalization parameter of
+          encoded boxes. By list, it represents the normalization factors at
+          tblr dims. By float (or int), it is the unified normalization factor
+          at all dims. Default: 4.0
+        normalize_by_wh (bool): Whether the tblr coordinates have been
+          normalized by the side length (wh) of prior bboxes.
+        max_shape (Sequence[int] or torch.Tensor or Sequence[
+            Sequence[int]], optional): Maximum bounds for boxes, specifies
+            (H, W, C) or (H, W). If priors shape is (B, N, 4), then
+            the max_shape should be a Sequence[Sequence[int]]
+            and the length of max_shape should also be B.
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Defaults to True.
+
+    Return:
+        decoded boxes (Tensor): Boxes with shape (N, 4) or (B, N, 4)
+    """
+    if not isinstance(normalizer, (int, float)):
+        normalizer = torch.tensor(normalizer, device=priors.device)
+        assert len(normalizer) == 4, 'Normalizer must have length = 4'
+    assert priors.size(0) == tblr.size(0)
+    if priors.ndim == 3:
+        assert priors.size(1) == tblr.size(1)
+
+    loc_decode = tblr * normalizer
+    prior_centers = (priors[..., 0:2] + priors[..., 2:4]) / 2
+    if normalize_by_wh:
+        wh = priors[..., 2:4] - priors[..., 0:2]
+        w, h = torch.split(wh, 1, dim=-1)
+        # Inplace operation with slice would failed for exporting to ONNX
+        th = h * loc_decode[..., :2]  # tb
+        tw = w * loc_decode[..., 2:]  # lr
+        loc_decode = torch.cat([th, tw], dim=-1)
+    # Cannot be exported using onnx when loc_decode.split(1, dim=-1)
+    top, bottom, left, right = loc_decode.split((1, 1, 1, 1), dim=-1)
+    xmin = prior_centers[..., 0].unsqueeze(-1) - left
+    xmax = prior_centers[..., 0].unsqueeze(-1) + right
+    ymin = prior_centers[..., 1].unsqueeze(-1) - top
+    ymax = prior_centers[..., 1].unsqueeze(-1) + bottom
+
+    bboxes = torch.cat((xmin, ymin, xmax, ymax), dim=-1)
+
+    if clip_border and max_shape is not None:
+        # clip bboxes with dynamic `min` and `max` for onnx
+        if torch.onnx.is_in_onnx_export():
+            from mmdet.core.export import dynamic_clip_for_onnx
+            xmin, ymin, xmax, ymax = dynamic_clip_for_onnx(
+                xmin, ymin, xmax, ymax, max_shape)
+            bboxes = torch.cat([xmin, ymin, xmax, ymax], dim=-1)
+            return bboxes
+        if not isinstance(max_shape, torch.Tensor):
+            max_shape = priors.new_tensor(max_shape)
+        max_shape = max_shape[..., :2].type_as(priors)
+        if max_shape.ndim == 2:
+            assert bboxes.ndim == 3
+            assert max_shape.size(0) == bboxes.size(0)
+
+        min_xy = priors.new_tensor(0)
+        max_xy = torch.cat([max_shape, max_shape],
+                           dim=-1).flip(-1).unsqueeze(-2)
+        bboxes = torch.where(bboxes < min_xy, min_xy, bboxes)
+        bboxes = torch.where(bboxes > max_xy, max_xy, bboxes)
+
+    return bboxes
diff --git a/mmdet/core/bbox/coder/yolo_bbox_coder.py b/mmdet/core/bbox/coder/yolo_bbox_coder.py
new file mode 100755
index 0000000..2852eca
--- /dev/null
+++ b/mmdet/core/bbox/coder/yolo_bbox_coder.py
@@ -0,0 +1,83 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from ..builder import BBOX_CODERS
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@BBOX_CODERS.register_module()
+class YOLOBBoxCoder(BaseBBoxCoder):
+    """YOLO BBox coder.
+
+    Following `YOLO <https://arxiv.org/abs/1506.02640>`_, this coder divide
+    image into grids, and encode bbox (x1, y1, x2, y2) into (cx, cy, dw, dh).
+    cx, cy in [0., 1.], denotes relative center position w.r.t the center of
+    bboxes. dw, dh are the same as :obj:`DeltaXYWHBBoxCoder`.
+
+    Args:
+        eps (float): Min value of cx, cy when encoding.
+    """
+
+    def __init__(self, eps=1e-6):
+        super().__init__()
+        self.eps = eps
+
+    @mmcv.jit(coderize=True)
+    def encode(self, bboxes, gt_bboxes, stride):
+        """Get box regression transformation deltas that can be used to
+        transform the ``bboxes`` into the ``gt_bboxes``.
+
+        Args:
+            bboxes (torch.Tensor): Source boxes, e.g., anchors.
+            gt_bboxes (torch.Tensor): Target of the transformation, e.g.,
+                ground-truth boxes.
+            stride (torch.Tensor | int): Stride of bboxes.
+
+        Returns:
+            torch.Tensor: Box transformation deltas
+        """
+
+        assert bboxes.size(0) == gt_bboxes.size(0)
+        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
+        x_center_gt = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) * 0.5
+        y_center_gt = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) * 0.5
+        w_gt = gt_bboxes[..., 2] - gt_bboxes[..., 0]
+        h_gt = gt_bboxes[..., 3] - gt_bboxes[..., 1]
+        x_center = (bboxes[..., 0] + bboxes[..., 2]) * 0.5
+        y_center = (bboxes[..., 1] + bboxes[..., 3]) * 0.5
+        w = bboxes[..., 2] - bboxes[..., 0]
+        h = bboxes[..., 3] - bboxes[..., 1]
+        w_target = torch.log((w_gt / w).clamp(min=self.eps))
+        h_target = torch.log((h_gt / h).clamp(min=self.eps))
+        x_center_target = ((x_center_gt - x_center) / stride + 0.5).clamp(
+            self.eps, 1 - self.eps)
+        y_center_target = ((y_center_gt - y_center) / stride + 0.5).clamp(
+            self.eps, 1 - self.eps)
+        encoded_bboxes = torch.stack(
+            [x_center_target, y_center_target, w_target, h_target], dim=-1)
+        return encoded_bboxes
+
+    @mmcv.jit(coderize=True)
+    def decode(self, bboxes, pred_bboxes, stride):
+        """Apply transformation `pred_bboxes` to `bboxes`.
+
+        Args:
+            bboxes (torch.Tensor): Basic boxes, e.g. anchors.
+            pred_bboxes (torch.Tensor): Encoded boxes, last dimension is 4.
+            stride (torch.Tensor | int): Strides of bboxes.
+
+        Returns:
+            torch.Tensor: Decoded boxes.
+        """
+        assert pred_bboxes.size(-1) == bboxes.size(-1) == 4
+        xy_centers = (bboxes[..., :2] + bboxes[..., 2:]) * 0.5 + (
+            pred_bboxes[..., :2] - 0.5) * stride
+        whs = (bboxes[..., 2:] -
+               bboxes[..., :2]) * 0.5 * pred_bboxes[..., 2:].exp()
+        decoded_bboxes = torch.stack(
+            (xy_centers[..., 0] - whs[..., 0], xy_centers[..., 1] -
+             whs[..., 1], xy_centers[..., 0] + whs[..., 0],
+             xy_centers[..., 1] + whs[..., 1]),
+            dim=-1)
+        return decoded_bboxes
diff --git a/mmdet/core/bbox/demodata.py b/mmdet/core/bbox/demodata.py
new file mode 100755
index 0000000..eb24b34
--- /dev/null
+++ b/mmdet/core/bbox/demodata.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmdet.utils.util_random import ensure_rng
+
+
+def random_boxes(num=1, scale=1, rng=None):
+    """Simple version of ``kwimage.Boxes.random``.
+
+    Returns:
+        Tensor: shape (num, 4) in x1, y1, x2, y2 format, scaled by ``scale``.
+
+    References:
+        https://gitlab.kitware.com/computer-vision/kwimage/blob/master/kwimage/structs/boxes.py#L1390
+
+    Example:
+        >>> num = 3
+        >>> scale = 512
+        >>> rng = 0
+        >>> boxes = random_boxes(num, scale, rng)
+        >>> print(boxes)
+        tensor([[280.9925, 278.9802, 308.6148, 366.1769],
+                [216.9113, 330.6978, 224.0446, 456.5878],
+                [405.3632, 196.3221, 493.3953, 270.7942]])
+    """
+    rng = ensure_rng(rng)
+    # sample a (num, 4) float32 array; columns are paired up as x/y below
+    tlbr = rng.rand(num, 4).astype(np.float32)
+    # per-axis min/max guarantees x1 <= x2 and y1 <= y2
+    tl_x = np.minimum(tlbr[:, 0], tlbr[:, 2])
+    tl_y = np.minimum(tlbr[:, 1], tlbr[:, 3])
+    br_x = np.maximum(tlbr[:, 0], tlbr[:, 2])
+    br_y = np.maximum(tlbr[:, 1], tlbr[:, 3])
+    # write the ordered corners back in place, multiplied by ``scale``
+    tlbr[:, 0] = tl_x * scale
+    tlbr[:, 1] = tl_y * scale
+    tlbr[:, 2] = br_x * scale
+    tlbr[:, 3] = br_y * scale
+    # wrap the numpy array as a tensor
+    boxes = torch.from_numpy(tlbr)
+    return boxes
diff --git a/mmdet/core/bbox/iou_calculators/__init__.py b/mmdet/core/bbox/iou_calculators/__init__.py
new file mode 100755
index 0000000..04ba925
--- /dev/null
+++ b/mmdet/core/bbox/iou_calculators/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .builder import build_iou_calculator
+from .iou2d_calculator import BboxOverlaps2D, bbox_overlaps
+
+__all__ = ['build_iou_calculator', 'BboxOverlaps2D', 'bbox_overlaps']
diff --git a/mmdet/core/bbox/iou_calculators/builder.py b/mmdet/core/bbox/iou_calculators/builder.py
new file mode 100755
index 0000000..378ee26
--- /dev/null
+++ b/mmdet/core/bbox/iou_calculators/builder.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.utils import Registry, build_from_cfg
+
+IOU_CALCULATORS = Registry('IoU calculator')
+
+
+def build_iou_calculator(cfg, default_args=None):
+    """Build an IoU calculator from ``cfg`` using ``IOU_CALCULATORS``."""
+    return build_from_cfg(cfg, IOU_CALCULATORS, default_args)
diff --git a/mmdet/core/bbox/iou_calculators/iou2d_calculator.py b/mmdet/core/bbox/iou_calculators/iou2d_calculator.py
new file mode 100755
index 0000000..b71a555
--- /dev/null
+++ b/mmdet/core/bbox/iou_calculators/iou2d_calculator.py
@@ -0,0 +1,260 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from .builder import IOU_CALCULATORS
+
+
+def cast_tensor_type(x, scale=1., dtype=None):
+    if dtype == 'fp16':
+        # divide by ``scale`` before the fp16 cast to prevent overflows
+        x = (x / scale).half()
+    return x  # any other ``dtype`` value leaves ``x`` untouched
+
+
+def fp16_clamp(x, min=None, max=None):
+    if not x.is_cuda and x.dtype == torch.float16:
+        # cpu float16 has no clamp implementation: clamp in fp32, cast back
+        return x.float().clamp(min, max).half()
+    # CUDA tensors and non-fp16 CPU tensors are clamped directly
+    return x.clamp(min, max)
+
+
+@IOU_CALCULATORS.register_module()
+class BboxOverlaps2D:
+    """2D Overlaps (e.g. IoUs, GIoUs) Calculator."""
+
+    def __init__(self, scale=1., dtype=None):
+        self.scale = scale  # divisor applied to boxes before the fp16 cast
+        self.dtype = dtype  # 'fp16' enables the half-precision path
+
+    def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
+        """Calculate IoU between 2D bboxes.
+
+        Args:
+            bboxes1 (Tensor): bboxes have shape (m, 4) in <x1, y1, x2, y2>
+                format, or shape (m, 5) in <x1, y1, x2, y2, score> format.
+            bboxes2 (Tensor): bboxes have shape (n, 4) in <x1, y1, x2, y2>
+                format, shape (n, 5) in <x1, y1, x2, y2, score> format, or be
+                empty.
+            mode (str): "iou" (intersection over union), "iof" (intersection
+                over foreground), or "giou" (generalized intersection over
+                union).
+            is_aligned (bool, optional): If True, then m and n must be equal.
+                Default False.
+
+        Returns:
+            Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
+        """
+        assert bboxes1.size(-1) in [0, 4, 5]
+        assert bboxes2.size(-1) in [0, 4, 5]
+        if bboxes2.size(-1) == 5:
+            bboxes2 = bboxes2[..., :4]
+        if bboxes1.size(-1) == 5:
+            bboxes1 = bboxes1[..., :4]
+
+        if self.dtype == 'fp16':
+            # change tensor type to save cpu and cuda memory and keep speed
+            bboxes1 = cast_tensor_type(bboxes1, self.scale, self.dtype)
+            bboxes2 = cast_tensor_type(bboxes2, self.scale, self.dtype)
+            overlaps = bbox_overlaps(bboxes1, bboxes2, mode, is_aligned)
+            if not overlaps.is_cuda and overlaps.dtype == torch.float16:
+                # cpu results are converted back to float32
+                overlaps = overlaps.float()
+            return overlaps
+
+        return bbox_overlaps(bboxes1, bboxes2, mode, is_aligned)
+
+    def __repr__(self):
+        """str: a string describing the module"""
+        repr_str = self.__class__.__name__ + f'(' \
+            f'scale={self.scale}, dtype={self.dtype})'
+        return repr_str
+
+
+def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
+    """Calculate overlap between two sets of bboxes.
+
+    FP16 Contributed by https://github.com/open-mmlab/mmdetection/pull/4889
+    Note:
+        Assume bboxes1 is M x 4, bboxes2 is N x 4, when mode is 'iou',
+        there are some newly generated variables when calculating IOU
+        using bbox_overlaps function:
+
+        1) is_aligned is False
+            area1: M x 1
+            area2: N x 1
+            lt: M x N x 2
+            rb: M x N x 2
+            wh: M x N x 2
+            overlap: M x N x 1
+            union: M x N x 1
+            ious: M x N x 1
+
+            Total memory:
+                S = (9 x N x M + N + M) * 4 Byte,
+
+            When using FP16, we can reduce:
+                R = (9 x N x M + N + M) * 4 / 2 Byte
+                R larger than (N + M) * 4 * 2 is always true when N and M >= 1.
+                Obviously, N + M <= N * M < 3 * N * M, when N >=2 and M >=2,
+                           N + 1 < 3 * N, when N or M is 1.
+
+            Given M = 40 (ground truth), N = 400000 (three anchor boxes
+            in per grid, FPN, R-CNNs),
+                R = 275 MB (one times)
+
+            A special case (dense detection), M = 512 (ground truth),
+                R = 3516 MB = 3.43 GB
+
+            When the batch size is B, reduce:
+                B x R
+
+            Therefore, CUDA memory runs out frequently.
+
+            Experiments on GeForce RTX 2080Ti (11019 MiB):
+
+            |   dtype   |   M   |   N   |   Use    |   Real   |   Ideal   |
+            |:----:|:----:|:----:|:----:|:----:|:----:|
+            |   FP32   |   512 | 400000 | 8020 MiB |   --   |   --   |
+            |   FP16   |   512 | 400000 |   4504 MiB | 3516 MiB | 3516 MiB |
+            |   FP32   |   40 | 400000 |   1540 MiB |   --   |   --   |
+            |   FP16   |   40 | 400000 |   1264 MiB |   276MiB   | 275 MiB |
+
+        2) is_aligned is True
+            area1: N x 1
+            area2: N x 1
+            lt: N x 2
+            rb: N x 2
+            wh: N x 2
+            overlap: N x 1
+            union: N x 1
+            ious: N x 1
+
+            Total memory:
+                S = 11 x N * 4 Byte
+
+            When using FP16, we can reduce:
+                R = 11 x N * 4 / 2 Byte
+
+        The same holds for 'giou' (larger than 'iou').
+
+        Time-wise, FP16 is generally faster than FP32.
+
+        When gpu_assign_thr is not -1, it takes more time on cpu
+        but does not reduce memory.
+        Therefore, we can save half the memory and keep the speed.
+
+    If ``is_aligned`` is ``False``, then calculate the overlaps between each
+    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
+    pair of bboxes1 and bboxes2.
+
+    Args:
+        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
+        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
+            B indicates the batch dim, in shape (B1, B2, ..., Bn).
+            If ``is_aligned`` is ``True``, then m and n must be equal.
+        mode (str): "iou" (intersection over union), "iof" (intersection over
+            foreground) or "giou" (generalized intersection over union).
+            Default "iou".
+        is_aligned (bool, optional): If True, then m and n must be equal.
+            Default False.
+        eps (float, optional): Lower bound of the denominator for numerical
+            stability. Default 1e-6.
+
+    Returns:
+        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
+
+    Example:
+        >>> bboxes1 = torch.FloatTensor([
+        >>>     [0, 0, 10, 10],
+        >>>     [10, 10, 20, 20],
+        >>>     [32, 32, 38, 42],
+        >>> ])
+        >>> bboxes2 = torch.FloatTensor([
+        >>>     [0, 0, 10, 20],
+        >>>     [0, 10, 10, 19],
+        >>>     [10, 10, 20, 20],
+        >>> ])
+        >>> overlaps = bbox_overlaps(bboxes1, bboxes2)
+        >>> assert overlaps.shape == (3, 3)
+        >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True)
+        >>> assert overlaps.shape == (3, )
+
+    Example:
+        >>> empty = torch.empty(0, 4)
+        >>> nonempty = torch.FloatTensor([[0, 0, 10, 9]])
+        >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
+        >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
+        >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
+    """
+
+    assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}'
+    # Either the boxes are empty or the length of boxes' last dimension is 4
+    assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
+    assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
+
+    # Batch dim must be the same
+    # Batch dim: (B1, B2, ... Bn)
+    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
+    batch_shape = bboxes1.shape[:-2]
+
+    rows = bboxes1.size(-2)
+    cols = bboxes2.size(-2)
+    if is_aligned:
+        assert rows == cols
+    # zero boxes on either side: return a zero-element tensor of output shape
+    if rows * cols == 0:
+        if is_aligned:
+            return bboxes1.new(batch_shape + (rows, ))
+        else:
+            return bboxes1.new(batch_shape + (rows, cols))
+
+    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
+        bboxes1[..., 3] - bboxes1[..., 1])
+    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
+        bboxes2[..., 3] - bboxes2[..., 1])
+
+    if is_aligned:
+        lt = torch.max(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]
+        rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]
+
+        wh = fp16_clamp(rb - lt, min=0)
+        overlap = wh[..., 0] * wh[..., 1]
+
+        if mode in ['iou', 'giou']:
+            union = area1 + area2 - overlap
+        else:
+            union = area1
+        if mode == 'giou':
+            enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2])
+            enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:])
+    else:
+        lt = torch.max(bboxes1[..., :, None, :2],
+                       bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
+        rb = torch.min(bboxes1[..., :, None, 2:],
+                       bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]
+
+        wh = fp16_clamp(rb - lt, min=0)
+        overlap = wh[..., 0] * wh[..., 1]
+
+        if mode in ['iou', 'giou']:
+            union = area1[..., None] + area2[..., None, :] - overlap
+        else:
+            union = area1[..., None]
+        if mode == 'giou':
+            enclosed_lt = torch.min(bboxes1[..., :, None, :2],
+                                    bboxes2[..., None, :, :2])
+            enclosed_rb = torch.max(bboxes1[..., :, None, 2:],
+                                    bboxes2[..., None, :, 2:])
+    # floor the denominator at eps so the division below cannot hit zero
+    eps = union.new_tensor([eps])
+    union = torch.max(union, eps)
+    ious = overlap / union
+    if mode in ['iou', 'iof']:
+        return ious
+    # calculate gious
+    enclose_wh = fp16_clamp(enclosed_rb - enclosed_lt, min=0)
+    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
+    enclose_area = torch.max(enclose_area, eps)
+    gious = ious - (enclose_area - union) / enclose_area
+    return gious
diff --git a/mmdet/core/bbox/match_costs/__init__.py b/mmdet/core/bbox/match_costs/__init__.py
new file mode 100755
index 0000000..1b63679
--- /dev/null
+++ b/mmdet/core/bbox/match_costs/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .builder import build_match_cost
+from .match_cost import (BBoxL1Cost, ClassificationCost, CrossEntropyLossCost,
+                         DiceCost, FocalLossCost, IoUCost)
+
+__all__ = [
+    'build_match_cost', 'ClassificationCost', 'BBoxL1Cost', 'IoUCost',
+    'FocalLossCost', 'DiceCost', 'CrossEntropyLossCost'
+]
diff --git a/mmdet/core/bbox/match_costs/builder.py b/mmdet/core/bbox/match_costs/builder.py
new file mode 100755
index 0000000..ea086ad
--- /dev/null
+++ b/mmdet/core/bbox/match_costs/builder.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.utils import Registry, build_from_cfg
+
+MATCH_COST = Registry('Match Cost')
+
+
+def build_match_cost(cfg, default_args=None):
+    """Build a match cost from ``cfg`` using the ``MATCH_COST`` registry."""
+    return build_from_cfg(cfg, MATCH_COST, default_args)
diff --git a/mmdet/core/bbox/match_costs/match_cost.py b/mmdet/core/bbox/match_costs/match_cost.py
new file mode 100755
index 0000000..4342b02
--- /dev/null
+++ b/mmdet/core/bbox/match_costs/match_cost.py
@@ -0,0 +1,359 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn.functional as F
+
+from mmdet.core.bbox.iou_calculators import bbox_overlaps
+from mmdet.core.bbox.transforms import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh
+from .builder import MATCH_COST
+
+
+@MATCH_COST.register_module()
+class BBoxL1Cost:
+    """BBoxL1Cost.
+
+     Args:
+         weight (int | float, optional): loss_weight. Defaults to 1.
+         box_format (str, optional): 'xyxy' for DETR, 'xywh' for Sparse_RCNN
+
+     Examples:
+         >>> from mmdet.core.bbox.match_costs.match_cost import BBoxL1Cost
+         >>> import torch
+         >>> self = BBoxL1Cost()
+         >>> bbox_pred = torch.rand(1, 4)
+         >>> gt_bboxes= torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]])
+         >>> # bbox_pred is random, so the printed values will differ
+         >>> self(bbox_pred, gt_bboxes)
+         tensor([[1.6172, 1.6422]])
+    """
+
+    def __init__(self, weight=1., box_format='xyxy'):
+        self.weight = weight
+        assert box_format in ['xyxy', 'xywh']
+        self.box_format = box_format
+
+    def __call__(self, bbox_pred, gt_bboxes):
+        """
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (cx, cy, w, h), which are all in range [0, 1]. Shape
+                (num_query, 4).
+            gt_bboxes (Tensor): Ground truth boxes with normalized
+                coordinates (x1, y1, x2, y2). Shape (num_gt, 4).
+
+        Returns:
+            torch.Tensor: bbox_cost value with weight
+        """
+        if self.box_format == 'xywh':
+            gt_bboxes = bbox_xyxy_to_cxcywh(gt_bboxes)
+        elif self.box_format == 'xyxy':
+            bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred)
+        bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)  # pairwise L1
+        return bbox_cost * self.weight
+
+
+@MATCH_COST.register_module()
+class FocalLossCost:
+    """FocalLossCost.
+
+     Args:
+         weight (int | float, optional): loss_weight
+         alpha (int | float, optional): focal_loss alpha
+         gamma (int | float, optional): focal_loss gamma
+         eps (float, optional): default 1e-12
+         binary_input (bool, optional): Whether the input is binary,
+            default False.
+
+     Examples:
+         >>> from mmdet.core.bbox.match_costs.match_cost import FocalLossCost
+         >>> import torch
+         >>> self = FocalLossCost()
+         >>> cls_pred = torch.rand(4, 3)
+         >>> gt_labels = torch.tensor([0, 1, 2])
+         >>> # cls_pred is random, so the printed values will differ
+         >>> self(cls_pred, gt_labels)
+         tensor([[-0.3236, -0.3364, -0.2699],
+                [-0.3439, -0.3209, -0.4807],
+                [-0.4099, -0.3795, -0.2929],
+                [-0.1950, -0.1207, -0.2626]])
+    """
+
+    def __init__(self,
+                 weight=1.,
+                 alpha=0.25,
+                 gamma=2,
+                 eps=1e-12,
+                 binary_input=False):
+        self.weight = weight
+        self.alpha = alpha
+        self.gamma = gamma
+        self.eps = eps
+        self.binary_input = binary_input
+
+    def _focal_loss_cost(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits, shape
+                (num_query, num_class).
+            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+
+        Returns:
+            torch.Tensor: cls_cost value with weight
+        """
+        cls_pred = cls_pred.sigmoid()
+        neg_cost = -(1 - cls_pred + self.eps).log() * (
+            1 - self.alpha) * cls_pred.pow(self.gamma)
+        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
+            1 - cls_pred).pow(self.gamma)
+        # pick the class column of every gt label -> (num_query, num_gt)
+        cls_cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels]
+        return cls_cost * self.weight
+
+    def _mask_focal_loss_cost(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits
+                in shape (num_query, d1, ..., dn), dtype=torch.float32.
+            gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn),
+                dtype=torch.long. Labels should be binary.
+
+        Returns:
+            Tensor: Focal cost matrix with weight in shape\
+                (num_query, num_gt).
+        """
+        cls_pred = cls_pred.flatten(1)
+        gt_labels = gt_labels.flatten(1).float()
+        n = cls_pred.shape[1]
+        cls_pred = cls_pred.sigmoid()
+        neg_cost = -(1 - cls_pred + self.eps).log() * (
+            1 - self.alpha) * cls_pred.pow(self.gamma)
+        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
+            1 - cls_pred).pow(self.gamma)
+        # sum over flattened positions: pos cost where gt is 1, neg where 0
+        cls_cost = torch.einsum('nc,mc->nm', pos_cost, gt_labels) + \
+            torch.einsum('nc,mc->nm', neg_cost, (1 - gt_labels))
+        return cls_cost / n * self.weight
+
+    def __call__(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits.
+            gt_labels (Tensor): Labels.
+
+        Returns:
+            Tensor: Focal cost matrix with weight in shape\
+                (num_query, num_gt).
+        """
+        if self.binary_input:
+            return self._mask_focal_loss_cost(cls_pred, gt_labels)
+        else:
+            return self._focal_loss_cost(cls_pred, gt_labels)
+
+
+@MATCH_COST.register_module()
+class ClassificationCost:
+    """ClassificationCost (softmax based).
+
+     Args:
+         weight (int | float, optional): loss_weight. Defaults to 1.
+
+     Examples:
+         >>> from mmdet.core.bbox.match_costs.match_cost import \
+         ... ClassificationCost
+         >>> import torch
+         >>> self = ClassificationCost()
+         >>> cls_pred = torch.rand(4, 3)
+         >>> gt_labels = torch.tensor([0, 1, 2])
+         >>> # cls_pred is random, so the printed values will differ
+         >>> self(cls_pred, gt_labels)
+         tensor([[-0.3430, -0.3525, -0.3045],
+                [-0.3077, -0.2931, -0.3992],
+                [-0.3664, -0.3455, -0.2881],
+                [-0.3343, -0.2701, -0.3956]])
+    """
+
+    def __init__(self, weight=1.):
+        self.weight = weight
+
+    def __call__(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits, shape
+                (num_query, num_class).
+            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+
+        Returns:
+            torch.Tensor: cls_cost value with weight
+        """
+        # Following the official DETR repo, contrary to the loss that
+        # NLL is used, we approximate it in 1 - cls_score[gt_label].
+        # The 1 is a constant that doesn't change the matching,
+        # so it can be omitted.
+        cls_score = cls_pred.softmax(-1)
+        cls_cost = -cls_score[:, gt_labels]
+        return cls_cost * self.weight
+
+
+@MATCH_COST.register_module()
+class IoUCost:
+    """IoUCost.
+
+     Args:
+         iou_mode (str, optional): 'iou' | 'giou'. Defaults to 'giou'.
+         weight (int | float, optional): loss weight. Defaults to 1.
+
+     Examples:
+         >>> from mmdet.core.bbox.match_costs.match_cost import IoUCost
+         >>> import torch
+         >>> self = IoUCost()
+         >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]])
+         >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]])
+         >>> self(bboxes, gt_bboxes)
+         tensor([[-0.1250,  0.1667],
+                [ 0.1667, -0.5000]])
+    """
+
+    def __init__(self, iou_mode='giou', weight=1.):
+        self.weight = weight
+        self.iou_mode = iou_mode
+
+    def __call__(self, bboxes, gt_bboxes):
+        """
+        Args:
+            bboxes (Tensor): Predicted boxes with unnormalized coordinates
+                (x1, y1, x2, y2). Shape (num_query, 4).
+            gt_bboxes (Tensor): Ground truth boxes with unnormalized
+                coordinates (x1, y1, x2, y2). Shape (num_gt, 4).
+
+        Returns:
+            torch.Tensor: iou_cost value with weight
+        """
+        # overlaps: [num_bboxes, num_gt]
+        overlaps = bbox_overlaps(
+            bboxes, gt_bboxes, mode=self.iou_mode, is_aligned=False)
+        # The 1 is a constant that doesn't change the matching, so omitted.
+        iou_cost = -overlaps
+        return iou_cost * self.weight
+
+
+@MATCH_COST.register_module()
+class DiceCost:
+    """Cost of mask assignments based on dice losses.
+
+    Args:
+        weight (int | float, optional): loss_weight. Defaults to 1.
+        pred_act (bool, optional): Whether to apply sigmoid to mask_pred.
+            Defaults to False.
+        eps (float, optional): Smoothing constant. Defaults to 1e-3.
+        naive_dice (bool, optional): If True, use the naive dice loss
+            in which the power of the number in the denominator is
+            the first power. If False, use the second power that
+            is adopted by K-Net and SOLO.
+            Defaults to True.
+    """
+
+    def __init__(self, weight=1., pred_act=False, eps=1e-3, naive_dice=True):
+        self.weight = weight
+        self.pred_act = pred_act
+        self.eps = eps
+        self.naive_dice = naive_dice
+
+    def binary_mask_dice_loss(self, mask_preds, gt_masks):
+        """
+        Args:
+            mask_preds (Tensor): Mask prediction in shape (num_query, *).
+            gt_masks (Tensor): Ground truth in shape (num_gt, *)
+                store 0 or 1, 0 for negative class and 1 for
+                positive class.
+
+        Returns:
+            Tensor: Dice cost matrix in shape (num_query, num_gt).
+        """
+        mask_preds = mask_preds.flatten(1)
+        gt_masks = gt_masks.flatten(1).float()
+        numerator = 2 * torch.einsum('nc,mc->nm', mask_preds, gt_masks)
+        if self.naive_dice:
+            denominator = mask_preds.sum(-1)[:, None] + \
+                gt_masks.sum(-1)[None, :]
+        else:
+            denominator = mask_preds.pow(2).sum(1)[:, None] + \
+                gt_masks.pow(2).sum(1)[None, :]
+        loss = 1 - (numerator + self.eps) / (denominator + self.eps)
+        return loss
+
+    def __call__(self, mask_preds, gt_masks):
+        """
+        Args:
+            mask_preds (Tensor): Mask prediction logits in shape (num_query, *)
+            gt_masks (Tensor): Ground truth in shape (num_gt, *)
+
+        Returns:
+            Tensor: Dice cost matrix with weight in shape (num_query, num_gt).
+        """
+        if self.pred_act:
+            mask_preds = mask_preds.sigmoid()
+        dice_cost = self.binary_mask_dice_loss(mask_preds, gt_masks)
+        return dice_cost * self.weight
+
+
+@MATCH_COST.register_module()
+class CrossEntropyLossCost:
+    """CrossEntropyLossCost.
+
+    Args:
+        weight (int | float, optional): loss weight. Defaults to 1.
+        use_sigmoid (bool, optional): Whether the prediction uses sigmoid
+                or softmax. Only True is supported. Defaults to True.
+    Examples:
+         >>> from mmdet.core.bbox.match_costs.match_cost import \
+         ... CrossEntropyLossCost
+         >>> bce = CrossEntropyLossCost(use_sigmoid=True)
+         >>> cls_pred = torch.tensor([[7.6, 1.2], [-1.3, 10]])
+         >>> gt_labels = torch.tensor([[1, 1], [1, 0]])
+         >>> print(bce(cls_pred, gt_labels))
+    """
+
+    def __init__(self, weight=1., use_sigmoid=True):
+        assert use_sigmoid, 'use_sigmoid = False is not supported yet.'
+        self.weight = weight
+        self.use_sigmoid = use_sigmoid
+
+    def _binary_cross_entropy(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): The prediction with shape (num_query, 1, *) or
+                (num_query, *).
+            gt_labels (Tensor): The learning label of prediction with
+                shape (num_gt, *).
+
+        Returns:
+            Tensor: Cross entropy cost matrix in shape (num_query, num_gt).
+        """
+        cls_pred = cls_pred.flatten(1).float()
+        gt_labels = gt_labels.flatten(1).float()
+        n = cls_pred.shape[1]
+        pos = F.binary_cross_entropy_with_logits(
+            cls_pred, torch.ones_like(cls_pred), reduction='none')
+        neg = F.binary_cross_entropy_with_logits(
+            cls_pred, torch.zeros_like(cls_pred), reduction='none')
+        cls_cost = torch.einsum('nc,mc->nm', pos, gt_labels) + \
+            torch.einsum('nc,mc->nm', neg, 1 - gt_labels)
+        cls_cost = cls_cost / n
+
+        return cls_cost
+
+    def __call__(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits.
+            gt_labels (Tensor): Labels.
+
+        Returns:
+            Tensor: Cross entropy cost matrix with weight in
+                shape (num_query, num_gt).
+        """
+        if self.use_sigmoid:
+            cls_cost = self._binary_cross_entropy(cls_pred, gt_labels)
+        else:
+            raise NotImplementedError
+
+        return cls_cost * self.weight
diff --git a/mmdet/core/bbox/samplers/__init__.py b/mmdet/core/bbox/samplers/__init__.py
new file mode 100755
index 0000000..f58505b
--- /dev/null
+++ b/mmdet/core/bbox/samplers/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_sampler import BaseSampler
+from .combined_sampler import CombinedSampler
+from .instance_balanced_pos_sampler import InstanceBalancedPosSampler
+from .iou_balanced_neg_sampler import IoUBalancedNegSampler
+from .mask_pseudo_sampler import MaskPseudoSampler
+from .mask_sampling_result import MaskSamplingResult
+from .ohem_sampler import OHEMSampler
+from .pseudo_sampler import PseudoSampler
+from .random_sampler import RandomSampler
+from .sampling_result import SamplingResult
+from .score_hlr_sampler import ScoreHLRSampler
+
+__all__ = [
+    'BaseSampler', 'PseudoSampler', 'RandomSampler',
+    'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler',
+    'OHEMSampler', 'SamplingResult', 'ScoreHLRSampler', 'MaskPseudoSampler',
+    'MaskSamplingResult'
+]
diff --git a/mmdet/core/bbox/samplers/base_sampler.py b/mmdet/core/bbox/samplers/base_sampler.py
new file mode 100755
index 0000000..bd15c7c
--- /dev/null
+++ b/mmdet/core/bbox/samplers/base_sampler.py
@@ -0,0 +1,102 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+import torch
+
+from .sampling_result import SamplingResult
+
+
+class BaseSampler(metaclass=ABCMeta):
+    """Base class of samplers."""
+
+    def __init__(self,
+                 num,
+                 pos_fraction,
+                 neg_pos_ub=-1,
+                 add_gt_as_proposals=True,
+                 **kwargs):
+        self.num = num  # total number of samples requested per call
+        self.pos_fraction = pos_fraction  # share of ``num`` for positives
+        self.neg_pos_ub = neg_pos_ub  # max neg/pos ratio; negative disables
+        self.add_gt_as_proposals = add_gt_as_proposals
+        self.pos_sampler = self
+        self.neg_sampler = self
+
+    @abstractmethod
+    def _sample_pos(self, assign_result, num_expected, **kwargs):
+        """Sample positive samples."""
+        pass
+
+    @abstractmethod
+    def _sample_neg(self, assign_result, num_expected, **kwargs):
+        """Sample negative samples."""
+        pass
+
+    def sample(self,
+               assign_result,
+               bboxes,
+               gt_bboxes,
+               gt_labels=None,
+               **kwargs):
+        """Sample positive and negative bboxes.
+
+        This is a simple implementation of bbox sampling given candidates,
+        assigning results and ground truth bboxes.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Bbox assigning results.
+            bboxes (Tensor): Boxes to be sampled from.
+            gt_bboxes (Tensor): Ground truth bboxes.
+            gt_labels (Tensor, optional): Class labels of ground truth bboxes.
+
+        Returns:
+            :obj:`SamplingResult`: Sampling result.
+
+        Example:
+            >>> from mmdet.core.bbox import RandomSampler
+            >>> from mmdet.core.bbox import AssignResult
+            >>> from mmdet.core.bbox.demodata import ensure_rng, random_boxes
+            >>> rng = ensure_rng(None)
+            >>> assign_result = AssignResult.random(rng=rng)
+            >>> bboxes = random_boxes(assign_result.num_preds, rng=rng)
+            >>> gt_bboxes = random_boxes(assign_result.num_gts, rng=rng)
+            >>> gt_labels = None
+            >>> self = RandomSampler(num=32, pos_fraction=0.5, neg_pos_ub=-1,
+            >>>                      add_gt_as_proposals=False)
+            >>> self = self.sample(assign_result, bboxes, gt_bboxes, gt_labels)
+        """
+        if len(bboxes.shape) < 2:
+            bboxes = bboxes[None, :]
+        # keep only the first 4 columns (the box coordinates)
+        bboxes = bboxes[:, :4]
+        # gt_flags: 1 for boxes that are ground truth, 0 for the others
+        gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.uint8)
+        if self.add_gt_as_proposals and len(gt_bboxes) > 0:
+            if gt_labels is None:
+                raise ValueError(
+                    'gt_labels must be given when add_gt_as_proposals is True')
+            bboxes = torch.cat([gt_bboxes, bboxes], dim=0)
+            assign_result.add_gt_(gt_labels)
+            gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8)
+            gt_flags = torch.cat([gt_ones, gt_flags])
+        # sample positives first; negatives fill the remaining budget
+        num_expected_pos = int(self.num * self.pos_fraction)
+        pos_inds = self.pos_sampler._sample_pos(
+            assign_result, num_expected_pos, bboxes=bboxes, **kwargs)
+        # We found that sampled indices have duplicated items occasionally.
+        # (may be a bug of PyTorch)
+        pos_inds = pos_inds.unique()
+        num_sampled_pos = pos_inds.numel()
+        num_expected_neg = self.num - num_sampled_pos
+        if self.neg_pos_ub >= 0:
+            _pos = max(1, num_sampled_pos)
+            neg_upper_bound = int(self.neg_pos_ub * _pos)
+            if num_expected_neg > neg_upper_bound:
+                num_expected_neg = neg_upper_bound
+        neg_inds = self.neg_sampler._sample_neg(
+            assign_result, num_expected_neg, bboxes=bboxes, **kwargs)
+        neg_inds = neg_inds.unique()
+
+        sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
+                                         assign_result, gt_flags)
+        return sampling_result
diff --git a/mmdet/core/bbox/samplers/combined_sampler.py b/mmdet/core/bbox/samplers/combined_sampler.py
new file mode 100755
index 0000000..4f6d86f
--- /dev/null
+++ b/mmdet/core/bbox/samplers/combined_sampler.py
@@ -0,0 +1,21 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import BBOX_SAMPLERS, build_sampler
+from .base_sampler import BaseSampler
+
+
+@BBOX_SAMPLERS.register_module()
+class CombinedSampler(BaseSampler):
+    """A sampler that combines positive sampler and negative sampler."""
+
+    def __init__(self, pos_sampler, neg_sampler, **kwargs):
+        super(CombinedSampler, self).__init__(**kwargs)
+        self.pos_sampler = build_sampler(pos_sampler, **kwargs)
+        self.neg_sampler = build_sampler(neg_sampler, **kwargs)
+
+    def _sample_pos(self, **kwargs):
+        """Sample positive samples."""
+        raise NotImplementedError
+
+    def _sample_neg(self, **kwargs):
+        """Sample negative samples."""
+        raise NotImplementedError
diff --git a/mmdet/core/bbox/samplers/instance_balanced_pos_sampler.py b/mmdet/core/bbox/samplers/instance_balanced_pos_sampler.py
new file mode 100755
index 0000000..5e0d9cc
--- /dev/null
+++ b/mmdet/core/bbox/samplers/instance_balanced_pos_sampler.py
@@ -0,0 +1,56 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from ..builder import BBOX_SAMPLERS
+from .random_sampler import RandomSampler
+
+
+@BBOX_SAMPLERS.register_module()
+class InstanceBalancedPosSampler(RandomSampler):
+    """Instance balanced sampler that samples equal number of positive samples
+    for each instance."""
+
+    def _sample_pos(self, assign_result, num_expected, **kwargs):
+        """Sample positive boxes.
+
+        Args:
+            assign_result (:obj:`AssignResult`): The assigned results of boxes.
+            num_expected (int): The number of expected positive samples
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False)
+        if pos_inds.numel() != 0:
+            pos_inds = pos_inds.squeeze(1)
+        if pos_inds.numel() <= num_expected:
+            return pos_inds
+        else:
+            unique_gt_inds = assign_result.gt_inds[pos_inds].unique()
+            num_gts = len(unique_gt_inds)
+            num_per_gt = int(round(num_expected / float(num_gts)) + 1)
+            sampled_inds = []
+            for i in unique_gt_inds:
+                inds = torch.nonzero(
+                    assign_result.gt_inds == i.item(), as_tuple=False)
+                if inds.numel() != 0:
+                    inds = inds.squeeze(1)
+                else:
+                    continue
+                if len(inds) > num_per_gt:
+                    inds = self.random_choice(inds, num_per_gt)
+                sampled_inds.append(inds)
+            sampled_inds = torch.cat(sampled_inds)
+            if len(sampled_inds) < num_expected:
+                num_extra = num_expected - len(sampled_inds)
+                extra_inds = np.array(
+                    list(set(pos_inds.cpu()) - set(sampled_inds.cpu())))
+                if len(extra_inds) > num_extra:
+                    extra_inds = self.random_choice(extra_inds, num_extra)
+                extra_inds = torch.from_numpy(extra_inds).to(
+                    assign_result.gt_inds.device).long()
+                sampled_inds = torch.cat([sampled_inds, extra_inds])
+            elif len(sampled_inds) > num_expected:
+                sampled_inds = self.random_choice(sampled_inds, num_expected)
+            return sampled_inds
diff --git a/mmdet/core/bbox/samplers/iou_balanced_neg_sampler.py b/mmdet/core/bbox/samplers/iou_balanced_neg_sampler.py
new file mode 100755
index 0000000..56e2874
--- /dev/null
+++ b/mmdet/core/bbox/samplers/iou_balanced_neg_sampler.py
@@ -0,0 +1,158 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from ..builder import BBOX_SAMPLERS
+from .random_sampler import RandomSampler
+
+
+@BBOX_SAMPLERS.register_module()
+class IoUBalancedNegSampler(RandomSampler):
+    """IoU Balanced Sampling.
+
+    arXiv: https://arxiv.org/pdf/1904.02701.pdf (CVPR 2019)
+
+    Sampling proposals according to their IoU. `floor_fraction` of needed RoIs
+    are sampled from proposals whose IoU are lower than `floor_thr` randomly.
+    The others are sampled from proposals whose IoU are higher than
+    `floor_thr`. These proposals are sampled from some bins evenly, which are
+    split by `num_bins` via IoU evenly.
+
+    Args:
+        num (int): number of proposals.
+        pos_fraction (float): fraction of positive proposals.
+        floor_thr (float): threshold (minimum) IoU for IoU balanced sampling,
+            set to -1 if all using IoU balanced sampling.
+        floor_fraction (float): sampling fraction of proposals under floor_thr.
+        num_bins (int): number of bins in IoU balanced sampling.
+    """
+
+    def __init__(self,
+                 num,
+                 pos_fraction,
+                 floor_thr=-1,
+                 floor_fraction=0,
+                 num_bins=3,
+                 **kwargs):
+        super(IoUBalancedNegSampler, self).__init__(num, pos_fraction,
+                                                    **kwargs)
+        assert floor_thr >= 0 or floor_thr == -1
+        assert 0 <= floor_fraction <= 1
+        assert num_bins >= 1
+
+        self.floor_thr = floor_thr
+        self.floor_fraction = floor_fraction
+        self.num_bins = num_bins
+
+    def sample_via_interval(self, max_overlaps, full_set, num_expected):
+        """Sample according to the iou interval.
+
+        Args:
+            max_overlaps (torch.Tensor): IoU between bounding boxes and ground
+                truth boxes.
+            full_set (set(int)): A full set of indices of boxes。
+            num_expected (int): Number of expected samples。
+
+        Returns:
+            np.ndarray: Indices  of samples
+        """
+        max_iou = max_overlaps.max()
+        iou_interval = (max_iou - self.floor_thr) / self.num_bins
+        per_num_expected = int(num_expected / self.num_bins)
+
+        sampled_inds = []
+        for i in range(self.num_bins):
+            start_iou = self.floor_thr + i * iou_interval
+            end_iou = self.floor_thr + (i + 1) * iou_interval
+            tmp_set = set(
+                np.where(
+                    np.logical_and(max_overlaps >= start_iou,
+                                   max_overlaps < end_iou))[0])
+            tmp_inds = list(tmp_set & full_set)
+            if len(tmp_inds) > per_num_expected:
+                tmp_sampled_set = self.random_choice(tmp_inds,
+                                                     per_num_expected)
+            else:
+                tmp_sampled_set = np.array(tmp_inds, dtype=np.int)
+            sampled_inds.append(tmp_sampled_set)
+
+        sampled_inds = np.concatenate(sampled_inds)
+        if len(sampled_inds) < num_expected:
+            num_extra = num_expected - len(sampled_inds)
+            extra_inds = np.array(list(full_set - set(sampled_inds)))
+            if len(extra_inds) > num_extra:
+                extra_inds = self.random_choice(extra_inds, num_extra)
+            sampled_inds = np.concatenate([sampled_inds, extra_inds])
+
+        return sampled_inds
+
+    def _sample_neg(self, assign_result, num_expected, **kwargs):
+        """Sample negative boxes.
+
+        Args:
+            assign_result (:obj:`AssignResult`): The assigned results of boxes.
+            num_expected (int): The number of expected negative samples
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False)
+        if neg_inds.numel() != 0:
+            neg_inds = neg_inds.squeeze(1)
+        if len(neg_inds) <= num_expected:
+            return neg_inds
+        else:
+            max_overlaps = assign_result.max_overlaps.cpu().numpy()
+            # balance sampling for negative samples
+            neg_set = set(neg_inds.cpu().numpy())
+
+            if self.floor_thr > 0:
+                floor_set = set(
+                    np.where(
+                        np.logical_and(max_overlaps >= 0,
+                                       max_overlaps < self.floor_thr))[0])
+                iou_sampling_set = set(
+                    np.where(max_overlaps >= self.floor_thr)[0])
+            elif self.floor_thr == 0:
+                floor_set = set(np.where(max_overlaps == 0)[0])
+                iou_sampling_set = set(
+                    np.where(max_overlaps > self.floor_thr)[0])
+            else:
+                floor_set = set()
+                iou_sampling_set = set(
+                    np.where(max_overlaps > self.floor_thr)[0])
+                # for sampling interval calculation
+                self.floor_thr = 0
+
+            floor_neg_inds = list(floor_set & neg_set)
+            iou_sampling_neg_inds = list(iou_sampling_set & neg_set)
+            num_expected_iou_sampling = int(num_expected *
+                                            (1 - self.floor_fraction))
+            if len(iou_sampling_neg_inds) > num_expected_iou_sampling:
+                if self.num_bins >= 2:
+                    iou_sampled_inds = self.sample_via_interval(
+                        max_overlaps, set(iou_sampling_neg_inds),
+                        num_expected_iou_sampling)
+                else:
+                    iou_sampled_inds = self.random_choice(
+                        iou_sampling_neg_inds, num_expected_iou_sampling)
+            else:
+                iou_sampled_inds = np.array(
+                    iou_sampling_neg_inds, dtype=np.int)
+            num_expected_floor = num_expected - len(iou_sampled_inds)
+            if len(floor_neg_inds) > num_expected_floor:
+                sampled_floor_inds = self.random_choice(
+                    floor_neg_inds, num_expected_floor)
+            else:
+                sampled_floor_inds = np.array(floor_neg_inds, dtype=np.int)
+            sampled_inds = np.concatenate(
+                (sampled_floor_inds, iou_sampled_inds))
+            if len(sampled_inds) < num_expected:
+                num_extra = num_expected - len(sampled_inds)
+                extra_inds = np.array(list(neg_set - set(sampled_inds)))
+                if len(extra_inds) > num_extra:
+                    extra_inds = self.random_choice(extra_inds, num_extra)
+                sampled_inds = np.concatenate((sampled_inds, extra_inds))
+            sampled_inds = torch.from_numpy(sampled_inds).long().to(
+                assign_result.gt_inds.device)
+            return sampled_inds
diff --git a/mmdet/core/bbox/samplers/mask_pseudo_sampler.py b/mmdet/core/bbox/samplers/mask_pseudo_sampler.py
new file mode 100755
index 0000000..b5f6965
--- /dev/null
+++ b/mmdet/core/bbox/samplers/mask_pseudo_sampler.py
@@ -0,0 +1,44 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""copy from
+https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py."""
+
+import torch
+
+from mmdet.core.bbox.builder import BBOX_SAMPLERS
+from .base_sampler import BaseSampler
+from .mask_sampling_result import MaskSamplingResult
+
+
+@BBOX_SAMPLERS.register_module()
+class MaskPseudoSampler(BaseSampler):
+    """A pseudo sampler that does not do sampling actually."""
+
+    def __init__(self, **kwargs):
+        pass
+
+    def _sample_pos(self, **kwargs):
+        """Sample positive samples."""
+        raise NotImplementedError
+
+    def _sample_neg(self, **kwargs):
+        """Sample negative samples."""
+        raise NotImplementedError
+
+    def sample(self, assign_result, masks, gt_masks, **kwargs):
+        """Directly returns the positive and negative indices  of samples.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Assigned results
+            masks (torch.Tensor): Bounding boxes
+            gt_masks (torch.Tensor): Ground truth boxes
+        Returns:
+            :obj:`SamplingResult`: sampler results
+        """
+        pos_inds = torch.nonzero(
+            assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
+        neg_inds = torch.nonzero(
+            assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
+        gt_flags = masks.new_zeros(masks.shape[0], dtype=torch.uint8)
+        sampling_result = MaskSamplingResult(pos_inds, neg_inds, masks,
+                                             gt_masks, assign_result, gt_flags)
+        return sampling_result
diff --git a/mmdet/core/bbox/samplers/mask_sampling_result.py b/mmdet/core/bbox/samplers/mask_sampling_result.py
new file mode 100755
index 0000000..3d10943
--- /dev/null
+++ b/mmdet/core/bbox/samplers/mask_sampling_result.py
@@ -0,0 +1,60 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""copy from
+https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py."""
+
+import torch
+
+from .sampling_result import SamplingResult
+
+
+class MaskSamplingResult(SamplingResult):
+    """Mask sampling result."""
+
+    def __init__(self, pos_inds, neg_inds, masks, gt_masks, assign_result,
+                 gt_flags):
+        self.pos_inds = pos_inds
+        self.neg_inds = neg_inds
+        self.pos_masks = masks[pos_inds]
+        self.neg_masks = masks[neg_inds]
+        self.pos_is_gt = gt_flags[pos_inds]
+
+        self.num_gts = gt_masks.shape[0]
+        self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
+
+        if gt_masks.numel() == 0:
+            # hack for index error case
+            assert self.pos_assigned_gt_inds.numel() == 0
+            self.pos_gt_masks = torch.empty_like(gt_masks)
+        else:
+            self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :]
+
+        if assign_result.labels is not None:
+            self.pos_gt_labels = assign_result.labels[pos_inds]
+        else:
+            self.pos_gt_labels = None
+
+    @property
+    def masks(self):
+        """torch.Tensor: concatenated positive and negative boxes"""
+        return torch.cat([self.pos_masks, self.neg_masks])
+
+    def __nice__(self):
+        data = self.info.copy()
+        data['pos_masks'] = data.pop('pos_masks').shape
+        data['neg_masks'] = data.pop('neg_masks').shape
+        parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())]
+        body = '    ' + ',\n    '.join(parts)
+        return '{\n' + body + '\n}'
+
+    @property
+    def info(self):
+        """Returns a dictionary of info about the object."""
+        return {
+            'pos_inds': self.pos_inds,
+            'neg_inds': self.neg_inds,
+            'pos_masks': self.pos_masks,
+            'neg_masks': self.neg_masks,
+            'pos_is_gt': self.pos_is_gt,
+            'num_gts': self.num_gts,
+            'pos_assigned_gt_inds': self.pos_assigned_gt_inds,
+        }
diff --git a/mmdet/core/bbox/samplers/ohem_sampler.py b/mmdet/core/bbox/samplers/ohem_sampler.py
new file mode 100755
index 0000000..7eb0666
--- /dev/null
+++ b/mmdet/core/bbox/samplers/ohem_sampler.py
@@ -0,0 +1,111 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from ..builder import BBOX_SAMPLERS
+from ..transforms import bbox2roi
+from .base_sampler import BaseSampler
+
+
+@BBOX_SAMPLERS.register_module()
+class OHEMSampler(BaseSampler):
+    r"""Online Hard Example Mining Sampler described in `Training Region-based
+    Object Detectors with Online Hard Example Mining
+    <https://arxiv.org/abs/1604.03540>`_.
+    """
+
+    def __init__(self,
+                 num,
+                 pos_fraction,
+                 context,
+                 neg_pos_ub=-1,
+                 add_gt_as_proposals=True,
+                 loss_key='loss_cls',
+                 **kwargs):
+        super(OHEMSampler, self).__init__(num, pos_fraction, neg_pos_ub,
+                                          add_gt_as_proposals)
+        self.context = context
+        if not hasattr(self.context, 'num_stages'):
+            self.bbox_head = self.context.bbox_head
+        else:
+            self.bbox_head = self.context.bbox_head[self.context.current_stage]
+
+        self.loss_key = loss_key
+
+    def hard_mining(self, inds, num_expected, bboxes, labels, feats):
+        with torch.no_grad():
+            rois = bbox2roi([bboxes])
+            if not hasattr(self.context, 'num_stages'):
+                bbox_results = self.context._bbox_forward(feats, rois)
+            else:
+                bbox_results = self.context._bbox_forward(
+                    self.context.current_stage, feats, rois)
+            cls_score = bbox_results['cls_score']
+            loss = self.bbox_head.loss(
+                cls_score=cls_score,
+                bbox_pred=None,
+                rois=rois,
+                labels=labels,
+                label_weights=cls_score.new_ones(cls_score.size(0)),
+                bbox_targets=None,
+                bbox_weights=None,
+                reduction_override='none')[self.loss_key]
+            _, topk_loss_inds = loss.topk(num_expected)
+        return inds[topk_loss_inds]
+
+    def _sample_pos(self,
+                    assign_result,
+                    num_expected,
+                    bboxes=None,
+                    feats=None,
+                    **kwargs):
+        """Sample positive boxes.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Assigned results
+            num_expected (int): Number of expected positive samples
+            bboxes (torch.Tensor, optional): Boxes. Defaults to None.
+            feats (list[torch.Tensor], optional): Multi-level features.
+                Defaults to None.
+
+        Returns:
+            torch.Tensor: Indices  of positive samples
+        """
+        # Sample some hard positive samples
+        pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False)
+        if pos_inds.numel() != 0:
+            pos_inds = pos_inds.squeeze(1)
+        if pos_inds.numel() <= num_expected:
+            return pos_inds
+        else:
+            return self.hard_mining(pos_inds, num_expected, bboxes[pos_inds],
+                                    assign_result.labels[pos_inds], feats)
+
+    def _sample_neg(self,
+                    assign_result,
+                    num_expected,
+                    bboxes=None,
+                    feats=None,
+                    **kwargs):
+        """Sample negative boxes.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Assigned results
+            num_expected (int): Number of expected negative samples
+            bboxes (torch.Tensor, optional): Boxes. Defaults to None.
+            feats (list[torch.Tensor], optional): Multi-level features.
+                Defaults to None.
+
+        Returns:
+            torch.Tensor: Indices  of negative samples
+        """
+        # Sample some hard negative samples
+        neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False)
+        if neg_inds.numel() != 0:
+            neg_inds = neg_inds.squeeze(1)
+        if len(neg_inds) <= num_expected:
+            return neg_inds
+        else:
+            neg_labels = assign_result.labels.new_empty(
+                neg_inds.size(0)).fill_(self.bbox_head.num_classes)
+            return self.hard_mining(neg_inds, num_expected, bboxes[neg_inds],
+                                    neg_labels, feats)
diff --git a/mmdet/core/bbox/samplers/pseudo_sampler.py b/mmdet/core/bbox/samplers/pseudo_sampler.py
new file mode 100755
index 0000000..b5ce298
--- /dev/null
+++ b/mmdet/core/bbox/samplers/pseudo_sampler.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from ..builder import BBOX_SAMPLERS
+from .base_sampler import BaseSampler
+from .sampling_result import SamplingResult
+
+
+@BBOX_SAMPLERS.register_module()
+class PseudoSampler(BaseSampler):
+    """A pseudo sampler that does not do sampling actually."""
+
+    def __init__(self, **kwargs):
+        pass
+
+    def _sample_pos(self, **kwargs):
+        """Sample positive samples."""
+        raise NotImplementedError
+
+    def _sample_neg(self, **kwargs):
+        """Sample negative samples."""
+        raise NotImplementedError
+
+    def sample(self, assign_result, bboxes, gt_bboxes, *args, **kwargs):
+        """Directly returns the positive and negative indices  of samples.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Assigned results
+            bboxes (torch.Tensor): Bounding boxes
+            gt_bboxes (torch.Tensor): Ground truth boxes
+
+        Returns:
+            :obj:`SamplingResult`: sampler results
+        """
+        pos_inds = torch.nonzero(
+            assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
+        neg_inds = torch.nonzero(
+            assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
+        gt_flags = bboxes.new_zeros(bboxes.shape[0], dtype=torch.uint8)
+        sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
+                                         assign_result, gt_flags)
+        return sampling_result
diff --git a/mmdet/core/bbox/samplers/random_sampler.py b/mmdet/core/bbox/samplers/random_sampler.py
new file mode 100755
index 0000000..8d3effc
--- /dev/null
+++ b/mmdet/core/bbox/samplers/random_sampler.py
@@ -0,0 +1,82 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from ..builder import BBOX_SAMPLERS
+from .base_sampler import BaseSampler
+
+
+@BBOX_SAMPLERS.register_module()
+class RandomSampler(BaseSampler):
+    """Random sampler.
+
+    Args:
+        num (int): Number of samples
+        pos_fraction (float): Fraction of positive samples
+        neg_pos_ub (int, optional): Upper bound number of negative and
+            positive samples. Defaults to -1.
+        add_gt_as_proposals (bool, optional): Whether to add ground truth
+            boxes as proposals. Defaults to True.
+    """
+
+    def __init__(self,
+                 num,
+                 pos_fraction,
+                 neg_pos_ub=-1,
+                 add_gt_as_proposals=True,
+                 **kwargs):
+        from mmdet.core.bbox import demodata
+        super(RandomSampler, self).__init__(num, pos_fraction, neg_pos_ub,
+                                            add_gt_as_proposals)
+        self.rng = demodata.ensure_rng(kwargs.get('rng', None))
+
+    def random_choice(self, gallery, num):
+        """Random select some elements from the gallery.
+
+        If `gallery` is a Tensor, the returned indices will be a Tensor;
+        If `gallery` is a ndarray or list, the returned indices will be a
+        ndarray.
+
+        Args:
+            gallery (Tensor | ndarray | list): indices pool.
+            num (int): expected sample num.
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        assert len(gallery) >= num
+
+        is_tensor = isinstance(gallery, torch.Tensor)
+        if not is_tensor:
+            if torch.cuda.is_available():
+                device = torch.cuda.current_device()
+            else:
+                device = 'cpu'
+            gallery = torch.tensor(gallery, dtype=torch.long, device=device)
+        # This is a temporary fix. We can revert the following code
+        # when PyTorch fixes the abnormal return of torch.randperm.
+        # See: https://github.com/open-mmlab/mmdetection/pull/5014
+        perm = torch.randperm(gallery.numel())[:num].to(device=gallery.device)
+        rand_inds = gallery[perm]
+        if not is_tensor:
+            rand_inds = rand_inds.cpu().numpy()
+        return rand_inds
+
+    def _sample_pos(self, assign_result, num_expected, **kwargs):
+        """Randomly sample some positive samples."""
+        pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False)
+        if pos_inds.numel() != 0:
+            pos_inds = pos_inds.squeeze(1)
+        if pos_inds.numel() <= num_expected:
+            return pos_inds
+        else:
+            return self.random_choice(pos_inds, num_expected)
+
+    def _sample_neg(self, assign_result, num_expected, **kwargs):
+        """Randomly sample some negative samples."""
+        neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False)
+        if neg_inds.numel() != 0:
+            neg_inds = neg_inds.squeeze(1)
+        if len(neg_inds) <= num_expected:
+            return neg_inds
+        else:
+            return self.random_choice(neg_inds, num_expected)
diff --git a/mmdet/core/bbox/samplers/sampling_result.py b/mmdet/core/bbox/samplers/sampling_result.py
new file mode 100755
index 0000000..11a02c5
--- /dev/null
+++ b/mmdet/core/bbox/samplers/sampling_result.py
@@ -0,0 +1,153 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet.utils import util_mixins
+
+
+class SamplingResult(util_mixins.NiceRepr):
+    """Bbox sampling result.
+
+    Example:
+        >>> # xdoctest: +IGNORE_WANT
+        >>> from mmdet.core.bbox.samplers.sampling_result import *  # NOQA
+        >>> self = SamplingResult.random(rng=10)
+        >>> print(f'self = {self}')
+        self = <SamplingResult({
+            'neg_bboxes': torch.Size([12, 4]),
+            'neg_inds': tensor([ 0,  1,  2,  4,  5,  6,  7,  8,  9, 10, 11, 12]),
+            'num_gts': 4,
+            'pos_assigned_gt_inds': tensor([], dtype=torch.int64),
+            'pos_bboxes': torch.Size([0, 4]),
+            'pos_inds': tensor([], dtype=torch.int64),
+            'pos_is_gt': tensor([], dtype=torch.uint8)
+        })>
+    """
+
+    def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result,
+                 gt_flags):
+        self.pos_inds = pos_inds
+        self.neg_inds = neg_inds
+        self.pos_bboxes = bboxes[pos_inds]
+        self.neg_bboxes = bboxes[neg_inds]
+        self.pos_is_gt = gt_flags[pos_inds]
+
+        self.num_gts = gt_bboxes.shape[0]
+        self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
+
+        if gt_bboxes.numel() == 0:
+            # hack for index error case
+            assert self.pos_assigned_gt_inds.numel() == 0
+            self.pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4)
+        else:
+            if len(gt_bboxes.shape) < 2:
+                gt_bboxes = gt_bboxes.view(-1, 4)
+
+            self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds.long(), :]
+
+        if assign_result.labels is not None:
+            self.pos_gt_labels = assign_result.labels[pos_inds]
+        else:
+            self.pos_gt_labels = None
+
+    @property
+    def bboxes(self):
+        """torch.Tensor: concatenated positive and negative boxes"""
+        return torch.cat([self.pos_bboxes, self.neg_bboxes])
+
+    def to(self, device):
+        """Change the device of the data inplace.
+
+        Example:
+            >>> self = SamplingResult.random()
+            >>> print(f'self = {self.to(None)}')
+            >>> # xdoctest: +REQUIRES(--gpu)
+            >>> print(f'self = {self.to(0)}')
+        """
+        _dict = self.__dict__
+        for key, value in _dict.items():
+            if isinstance(value, torch.Tensor):
+                _dict[key] = value.to(device)
+        return self
+
+    def __nice__(self):
+        data = self.info.copy()
+        data['pos_bboxes'] = data.pop('pos_bboxes').shape
+        data['neg_bboxes'] = data.pop('neg_bboxes').shape
+        parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())]
+        body = '    ' + ',\n    '.join(parts)
+        return '{\n' + body + '\n}'
+
+    @property
+    def info(self):
+        """Returns a dictionary of info about the object."""
+        return {
+            'pos_inds': self.pos_inds,
+            'neg_inds': self.neg_inds,
+            'pos_bboxes': self.pos_bboxes,
+            'neg_bboxes': self.neg_bboxes,
+            'pos_is_gt': self.pos_is_gt,
+            'num_gts': self.num_gts,
+            'pos_assigned_gt_inds': self.pos_assigned_gt_inds,
+        }
+
+    @classmethod
+    def random(cls, rng=None, **kwargs):
+        """
+        Args:
+            rng (None | int | numpy.random.RandomState): seed or state.
+            kwargs (keyword arguments):
+                - num_preds: number of predicted boxes
+                - num_gts: number of true boxes
+                - p_ignore (float): probability of a predicted box assigned to \
+                    an ignored truth.
+                - p_assigned (float): probability of a predicted box not being \
+                    assigned.
+                - p_use_label (float | bool): with labels or not.
+
+        Returns:
+            :obj:`SamplingResult`: Randomly generated sampling result.
+
+        Example:
+            >>> from mmdet.core.bbox.samplers.sampling_result import *  # NOQA
+            >>> self = SamplingResult.random()
+            >>> print(self.__dict__)
+        """
+        from mmdet.core.bbox import demodata
+        from mmdet.core.bbox.assigners.assign_result import AssignResult
+        from mmdet.core.bbox.samplers.random_sampler import RandomSampler
+        rng = demodata.ensure_rng(rng)
+
+        # make probabilistic?
+        num = 32
+        pos_fraction = 0.5
+        neg_pos_ub = -1
+
+        assign_result = AssignResult.random(rng=rng, **kwargs)
+
+        # Note we could just compute an assignment
+        bboxes = demodata.random_boxes(assign_result.num_preds, rng=rng)
+        gt_bboxes = demodata.random_boxes(assign_result.num_gts, rng=rng)
+
+        if rng.rand() > 0.2:
+            # sometimes algorithms squeeze their data, be robust to that
+            gt_bboxes = gt_bboxes.squeeze()
+            bboxes = bboxes.squeeze()
+
+        if assign_result.labels is None:
+            gt_labels = None
+        else:
+            gt_labels = None  # todo
+
+        if gt_labels is None:
+            add_gt_as_proposals = False
+        else:
+            add_gt_as_proposals = True  # make probabilistic?
+
+        sampler = RandomSampler(
+            num,
+            pos_fraction,
+            neg_pos_ub=neg_pos_ub,
+            add_gt_as_proposals=add_gt_as_proposals,
+            rng=rng)
+        self = sampler.sample(assign_result, bboxes, gt_bboxes, gt_labels)
+        return self
diff --git a/mmdet/core/bbox/samplers/score_hlr_sampler.py b/mmdet/core/bbox/samplers/score_hlr_sampler.py
new file mode 100755
index 0000000..f4be9b8
--- /dev/null
+++ b/mmdet/core/bbox/samplers/score_hlr_sampler.py
@@ -0,0 +1,265 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.ops import nms_match
+
+from ..builder import BBOX_SAMPLERS
+from ..transforms import bbox2roi
+from .base_sampler import BaseSampler
+from .sampling_result import SamplingResult
+
+
+@BBOX_SAMPLERS.register_module()
+class ScoreHLRSampler(BaseSampler):
+    r"""Importance-based Sample Reweighting (ISR_N), described in `Prime Sample
+    Attention in Object Detection <https://arxiv.org/abs/1904.04821>`_.
+
+    Score hierarchical local rank (HLR) differs from RandomSampler in the
+    negative part. It first computes Score-HLR in a two-step way,
+    then linearly maps Score-HLR to the loss weights.
+
+    Args:
+        num (int): Total number of sampled RoIs.
+        pos_fraction (float): Fraction of positive samples.
+        context (:class:`BaseRoIHead`): RoI head that the sampler belongs to.
+        neg_pos_ub (int): Upper bound of the ratio of num negative to num
+            positive, -1 means no upper bound.
+        add_gt_as_proposals (bool): Whether to add ground truth as proposals.
+        k (float): Power of the non-linear mapping.
+        bias (float): Shift of the non-linear mapping.
+        score_thr (float): Negatives must score above this to be valid bboxes.
+        iou_thr (float): IoU threshold passed to `nms_match` for grouping.
+    """
+
+    def __init__(self,
+                 num,
+                 pos_fraction,
+                 context,
+                 neg_pos_ub=-1,
+                 add_gt_as_proposals=True,
+                 k=0.5,
+                 bias=0,
+                 score_thr=0.05,
+                 iou_thr=0.5,
+                 **kwargs):
+        """Store hyper-parameters and the head modules used for scoring."""
+        super().__init__(num, pos_fraction, neg_pos_ub, add_gt_as_proposals)
+        self.k, self.bias = k, bias
+        self.score_thr, self.iou_thr = score_thr, iou_thr
+        self.context = context
+        # A context exposing `num_stages` is indexed by its current stage.
+        if hasattr(context, 'num_stages'):
+            stage = context.current_stage
+            self.bbox_roi_extractor = context.bbox_roi_extractor[stage]
+            self.bbox_head = context.bbox_head[stage]
+        else:
+            # Otherwise the modules are taken from the context directly.
+            self.bbox_roi_extractor = context.bbox_roi_extractor
+            self.bbox_head = context.bbox_head
+            self.with_shared_head = context.with_shared_head
+            if self.with_shared_head:
+                self.shared_head = context.shared_head
+
+    @staticmethod
+    def random_choice(gallery, num):
+        """Draw ``num`` distinct elements from ``gallery`` at random.
+
+        The container kind is preserved: a Tensor pool yields a Tensor, while
+        a ndarray or list pool yields a ndarray.
+
+        Args:
+            gallery (Tensor | ndarray | list): indices pool.
+            num (int): expected sample num.
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        assert len(gallery) >= num
+
+        if isinstance(gallery, torch.Tensor):
+            order = torch.randperm(gallery.numel(), device=gallery.device)
+            return gallery[order[:num]]
+
+        # Non-tensor pools are shuffled as a long tensor (on the current CUDA
+        # device when one is available) and handed back as a ndarray.
+        if torch.cuda.is_available():
+            device = torch.cuda.current_device()
+        else:
+            device = 'cpu'
+        pool = torch.tensor(gallery, dtype=torch.long, device=device)
+        order = torch.randperm(pool.numel(), device=pool.device)
+        return pool[order[:num]].cpu().numpy()
+
+    def _sample_pos(self, assign_result, num_expected, **kwargs):
+        """Pick at most ``num_expected`` boxes whose ``gt_inds`` is positive."""
+        candidates = torch.nonzero(assign_result.gt_inds > 0).flatten()
+        # Only subsample when there are more positives than requested.
+        if candidates.numel() > num_expected:
+            return self.random_choice(candidates, num_expected)
+        return candidates
+
+    def _sample_neg(self,
+                    assign_result,
+                    num_expected,
+                    bboxes,
+                    feats=None,
+                    img_meta=None,
+                    **kwargs):
+        """Sample negative samples.
+
+        Score-HLR sampler is done in the following steps:
+        1. Take each negative's max softmax score, last class excluded, as s_i.
+        2. Filter out negative samples whose s_i <= score_thr, the left samples
+            are called valid samples.
+        3. Use NMS-Match to group valid samples that overlap heavily.
+        4. Rank the matched samples in two-steps to get Score-HLR.
+            (1) In the same group, rank samples with their scores.
+            (2) Across groups, rank samples of equal in-group rank by score.
+        5. Linearly map Score-HLR to the final label weights.
+
+        Args:
+            assign_result (:obj:`AssignResult`): result of assigner.
+            num_expected (int): Expected number of samples.
+            bboxes (Tensor): bbox to be sampled.
+            feats (Tensor): Features come from FPN.
+            img_meta (dict): Meta information dictionary.
+
+        Returns:
+            tuple: Negative indices and label weights (None if no negatives).
+        """
+        neg_inds = torch.nonzero(assign_result.gt_inds == 0).flatten()
+        num_neg = neg_inds.size(0)
+        if num_neg == 0:
+            return neg_inds, None
+        with torch.no_grad():
+            neg_bboxes = bboxes[neg_inds]
+            neg_rois = bbox2roi([neg_bboxes])
+            bbox_result = self.context._bbox_forward(feats, neg_rois)
+            cls_score, bbox_pred = bbox_result['cls_score'], bbox_result[
+                'bbox_pred']
+
+            ori_loss = self.bbox_head.loss(
+                cls_score=cls_score,
+                bbox_pred=None,
+                rois=None,
+                labels=neg_inds.new_full((num_neg, ),
+                                         self.bbox_head.num_classes),
+                label_weights=cls_score.new_ones(num_neg),
+                bbox_targets=None,
+                bbox_weights=None,
+                reduction_override='none')['loss_cls']
+
+            # split negatives by whether their max score exceeds score_thr
+            max_score, argmax_score = cls_score.softmax(-1)[:, :-1].max(-1)
+            valid_inds = (max_score > self.score_thr).nonzero().view(-1)
+            invalid_inds = (max_score <= self.score_thr).nonzero().view(-1)
+            num_valid = valid_inds.size(0)
+            num_invalid = invalid_inds.size(0)
+
+            num_expected = min(num_neg, num_expected)
+            num_hlr = min(num_valid, num_expected)
+            num_rand = num_expected - num_hlr
+            if num_valid > 0:
+                valid_rois = neg_rois[valid_inds]
+                valid_max_score = max_score[valid_inds]
+                valid_argmax_score = argmax_score[valid_inds]
+                valid_bbox_pred = bbox_pred[valid_inds]
+
+                # after the view: [num_valid, K, 4] (K presumably #classes)
+                valid_bbox_pred = valid_bbox_pred.view(
+                    valid_bbox_pred.size(0), -1, 4)
+                selected_bbox_pred = valid_bbox_pred[range(num_valid),
+                                                     valid_argmax_score]
+                pred_bboxes = self.bbox_head.bbox_coder.decode(
+                    valid_rois[:, 1:], selected_bbox_pred)
+                pred_bboxes_with_score = torch.cat(
+                    [pred_bboxes, valid_max_score[:, None]], -1)
+                group = nms_match(pred_bboxes_with_score, self.iou_thr)
+
+                # imp: importance; larger for earlier in-group rank and score
+                imp = cls_score.new_zeros(num_valid)
+                for g in group:
+                    g_score = valid_max_score[g]
+                    # g_score is assumed sorted by nms_match - TODO confirm
+                    rank = g_score.new_tensor(range(g_score.size(0)))
+                    imp[g] = num_valid - rank + g_score
+                _, imp_rank_inds = imp.sort(descending=True)
+                _, imp_rank = imp_rank_inds.sort()
+                hlr_inds = imp_rank_inds[:num_expected]
+
+                if num_rand > 0:
+                    rand_inds = torch.randperm(num_invalid)[:num_rand]
+                    select_inds = torch.cat(
+                        [valid_inds[hlr_inds], invalid_inds[rand_inds]])
+                else:
+                    select_inds = valid_inds[hlr_inds]
+
+                neg_label_weights = cls_score.new_ones(num_expected)
+
+                up_bound = max(num_expected, num_valid)
+                imp_weights = (up_bound -
+                               imp_rank[hlr_inds].float()) / up_bound
+                neg_label_weights[:num_hlr] = imp_weights
+                neg_label_weights[num_hlr:] = imp_weights.min()
+                neg_label_weights = (self.bias +
+                                     (1 - self.bias) * neg_label_weights).pow(
+                                         self.k)
+                ori_selected_loss = ori_loss[select_inds]
+                new_loss = ori_selected_loss * neg_label_weights
+                norm_ratio = ori_selected_loss.sum() / new_loss.sum()
+                neg_label_weights *= norm_ratio
+            else:
+                neg_label_weights = cls_score.new_ones(num_expected)
+                select_inds = torch.randperm(num_neg)[:num_expected]
+
+            return neg_inds[select_inds], neg_label_weights
+
+    def sample(self,
+               assign_result,
+               bboxes,
+               gt_bboxes,
+               gt_labels=None,
+               img_meta=None,
+               **kwargs):
+        """Sample positive and negative bboxes.
+
+        Positives come from ``pos_sampler``; negatives from ``neg_sampler``.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Bbox assigning results.
+            bboxes (Tensor): Boxes to be sampled from.
+            gt_bboxes (Tensor): Ground truth bboxes.
+            gt_labels (Tensor, optional): Class labels of ground truth bboxes.
+            img_meta (dict, optional): Forwarded to the negative sampler.
+
+        Returns:
+            tuple[:obj:`SamplingResult`, Tensor]: Sampling result and negative
+                label weights.
+        """
+        bboxes = bboxes[:, :4]
+
+        # Flag vector marking which candidate rows are ground-truth boxes.
+        gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.uint8)
+        if self.add_gt_as_proposals:
+            bboxes = torch.cat([gt_bboxes, bboxes], dim=0)
+            assign_result.add_gt_(gt_labels)
+            gt_flags = torch.cat([
+                bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8),
+                gt_flags
+            ])
+
+        # Positives first; their count decides the negative budget.
+        pos_quota = int(self.num * self.pos_fraction)
+        pos_inds = self.pos_sampler._sample_pos(
+            assign_result, pos_quota, bboxes=bboxes, **kwargs)
+        num_pos = pos_inds.numel()
+
+        # Fill the remaining budget with negatives, capped by neg_pos_ub.
+        neg_quota = self.num - num_pos
+        if self.neg_pos_ub >= 0:
+            neg_quota = min(neg_quota, int(self.neg_pos_ub * max(1, num_pos)))
+        neg_inds, neg_label_weights = self.neg_sampler._sample_neg(
+            assign_result, neg_quota, bboxes, img_meta=img_meta, **kwargs)
+
+        sampling_result = SamplingResult(pos_inds, neg_inds, bboxes,
+                                         gt_bboxes, assign_result, gt_flags)
+        return sampling_result, neg_label_weights
diff --git a/mmdet/core/bbox/transforms.py b/mmdet/core/bbox/transforms.py
new file mode 100755
index 0000000..6d72076
--- /dev/null
+++ b/mmdet/core/bbox/transforms.py
@@ -0,0 +1,270 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+
+def find_inside_bboxes(bboxes, img_h, img_w):
+    """Flag bboxes that lie at least partly inside the image.
+
+    Args:
+        bboxes (Tensor): Shape (N, 4).
+        img_h (int): Image height.
+        img_w (int): Image width.
+
+    Returns:
+        Tensor: Boolean mask, True for bboxes overlapping the image.
+    """
+    overlaps_x = (bboxes[:, 0] < img_w) & (bboxes[:, 2] > 0)
+    overlaps_y = (bboxes[:, 1] < img_h) & (bboxes[:, 3] > 0)
+    return overlaps_x & overlaps_y
+
+
+def bbox_flip(bboxes, img_shape, direction='horizontal'):
+    """Mirror bboxes along the x axis, the y axis, or both.
+
+    Args:
+        bboxes (Tensor): Shape (..., 4*k)
+        img_shape (tuple): Image shape.
+        direction (str): Flip direction, options are "horizontal", "vertical",
+            "diagonal". Default: "horizontal"
+
+    Returns:
+        Tensor: Flipped bboxes.
+    """
+    assert bboxes.shape[-1] % 4 == 0
+    assert direction in ['horizontal', 'vertical', 'diagonal']
+
+    flipped = bboxes.clone()
+    # A "diagonal" flip is a horizontal flip combined with a vertical one,
+    # so each axis is handled independently.
+    if direction != 'vertical':
+        # Mirror x: the new x1 comes from the old x2 and vice versa.
+        flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4]
+        flipped[..., 2::4] = img_shape[1] - bboxes[..., 0::4]
+    if direction != 'horizontal':
+        # Mirror y: the new y1 comes from the old y2 and vice versa.
+        flipped[..., 1::4] = img_shape[0] - bboxes[..., 3::4]
+        flipped[..., 3::4] = img_shape[0] - bboxes[..., 1::4]
+    return flipped
+
+
+def bbox_mapping(bboxes,
+                 img_shape,
+                 scale_factor,
+                 flip,
+                 flip_direction='horizontal'):
+    """Scale bboxes by ``scale_factor``, then flip them if ``flip`` is set."""
+    scaled = bboxes * bboxes.new_tensor(scale_factor)
+    if not flip:
+        return scaled
+    return bbox_flip(scaled, img_shape, flip_direction)
+
+
+def bbox_mapping_back(bboxes,
+                      img_shape,
+                      scale_factor,
+                      flip,
+                      flip_direction='horizontal'):
+    """Undo the test-time flip (if any), then divide by ``scale_factor``."""
+    if flip:
+        bboxes = bbox_flip(bboxes, img_shape, flip_direction)
+    rescaled = bboxes.view(-1, 4) / bboxes.new_tensor(scale_factor)
+    return rescaled.view(bboxes.shape)
+
+
+def bbox2roi(bbox_list):
+    """Stack per-image bboxes into a single batch-indexed roi tensor.
+
+    Args:
+        bbox_list (list[Tensor]): a list of bboxes corresponding to a batch
+            of images.
+
+    Returns:
+        Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2]
+    """
+
+    def _to_rois(img_id, bboxes):
+        # Images without boxes still contribute an empty (0, 5) tensor.
+        if bboxes.size(0) == 0:
+            return bboxes.new_zeros((0, 5))
+        img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
+        return torch.cat([img_inds, bboxes[:, :4]], dim=-1)
+
+    per_img = [_to_rois(i, b) for i, b in enumerate(bbox_list)]
+    return torch.cat(per_img, 0)
+
+
+def roi2bbox(rois):
+    """Split a batch-indexed roi tensor back into per-image bboxes.
+
+    Args:
+        rois (torch.Tensor): RoIs with the shape (n, 5) where the first
+            column indicates batch id of each RoI.
+
+    Returns:
+        list[torch.Tensor]: Converted boxes of corresponding rois.
+    """
+    batch_col = rois[:, 0]
+    # One entry per distinct batch id, in ascending id order.
+    # Ids with no RoI are skipped rather than given an empty tensor.
+    present_ids = torch.unique(batch_col.cpu(), sorted=True)
+    return [
+        rois[batch_col == img_id.item(), 1:] for img_id in present_ids
+    ]
+
+
+def bbox2result(bboxes, labels, num_classes):
+    """Group detections by label into one numpy array per class.
+
+    Args:
+        bboxes (torch.Tensor | np.ndarray): shape (n, 5)
+        labels (torch.Tensor | np.ndarray): shape (n, )
+        num_classes (int): class number, including background class
+
+    Returns:
+        list(ndarray): bbox results of each class
+    """
+    if bboxes.shape[0] == 0:
+        # No detections: every class gets an empty (0, 5) float32 array.
+        return [np.zeros((0, 5), dtype=np.float32) for _ in range(num_classes)]
+    if isinstance(bboxes, torch.Tensor):
+        bboxes = bboxes.detach().cpu().numpy()
+        labels = labels.detach().cpu().numpy()
+    return [bboxes[labels == cls_id, :] for cls_id in range(num_classes)]
+
+
+def distance2bbox(points, distance, max_shape=None):
+    """Decode distance prediction to bounding box.
+
+    Args:
+        points (Tensor): Shape (B, N, 2) or (N, 2).
+        distance (Tensor): Distance from the given point to 4
+            boundaries (left, top, right, bottom). Shape (B, N, 4) or (N, 4)
+        max_shape (Sequence[int] or torch.Tensor or Sequence[
+            Sequence[int]],optional): Maximum bounds for boxes, specifies
+            (H, W, C) or (H, W). If points shape is (B, N, 2), then
+            the max_shape should be a Sequence[Sequence[int]]
+            and the length of max_shape should also be B.
+
+    Returns:
+        Tensor: Boxes with shape (N, 4) or (B, N, 4)
+    """
+
+    x1 = points[..., 0] - distance[..., 0]
+    y1 = points[..., 1] - distance[..., 1]
+    x2 = points[..., 0] + distance[..., 2]
+    y2 = points[..., 1] + distance[..., 3]
+
+    bboxes = torch.stack([x1, y1, x2, y2], -1)
+
+    if max_shape is not None:
+        if bboxes.dim() == 2 and not torch.onnx.is_in_onnx_export():
+            # fast path for 2-D boxes: clamp x and y columns in place
+            bboxes[:, 0::2].clamp_(min=0, max=max_shape[1])
+            bboxes[:, 1::2].clamp_(min=0, max=max_shape[0])
+            return bboxes
+
+        # clip bboxes with dynamic `min` and `max` for onnx
+        if torch.onnx.is_in_onnx_export():
+            from mmdet.core.export import dynamic_clip_for_onnx
+            x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape)
+            bboxes = torch.stack([x1, y1, x2, y2], dim=-1)
+            return bboxes
+        if not isinstance(max_shape, torch.Tensor):
+            max_shape = x1.new_tensor(max_shape)
+        max_shape = max_shape[..., :2].type_as(x1)
+        if max_shape.ndim == 2:
+            assert bboxes.ndim == 3
+            assert max_shape.size(0) == bboxes.size(0)
+
+        min_xy = x1.new_tensor(0)
+        max_xy = torch.cat([max_shape, max_shape],
+                           dim=-1).flip(-1).unsqueeze(-2)
+        bboxes = torch.where(bboxes < min_xy, min_xy, bboxes)
+        bboxes = torch.where(bboxes > max_xy, max_xy, bboxes)
+
+    return bboxes
+
+
+def bbox2distance(points, bbox, max_dis=None, eps=0.1):
+    """Encode bounding boxes as distances from points to the four sides.
+
+    Args:
+        points (Tensor): Shape (n, 2), [x, y].
+        bbox (Tensor): Shape (n, 4), "xyxy" format
+        max_dis (float): Upper bound of the distance.
+        eps (float): a small value to ensure target < max_dis, instead <=
+
+    Returns:
+        Tensor: Distances (left, top, right, bottom), stacked on the last dim.
+    """
+    px, py = points[:, 0], points[:, 1]
+    # Order matches the stacked output: left, top, right, bottom.
+    sides = [
+        px - bbox[:, 0], py - bbox[:, 1], bbox[:, 2] - px, bbox[:, 3] - py
+    ]
+    if max_dis is not None:
+        # Clamp into [0, max_dis - eps] so targets stay strictly below max_dis.
+        upper = max_dis - eps
+        sides = [side.clamp(min=0, max=upper) for side in sides]
+    return torch.stack(sides, -1)
+
+
+def bbox_rescale(bboxes, scale_factor=1.0):
+    """Rescale bounding box w.r.t. scale_factor.
+
+    Each box keeps its center while its width and height are multiplied
+    by ``scale_factor``. For 5-column input the first column is passed
+    through unchanged.
+
+    Args:
+        bboxes (Tensor): Shape (n, 4) for bboxes or (n, 5) for rois
+        scale_factor (float): rescale factor
+
+    Returns:
+        Tensor: Rescaled bboxes.
+    """
+    is_roi = bboxes.size(1) == 5
+    coords = bboxes[:, 1:] if is_roi else bboxes
+    ctr_x = (coords[:, 0] + coords[:, 2]) * 0.5
+    ctr_y = (coords[:, 1] + coords[:, 3]) * 0.5
+    # Scale the side lengths; the centers stay where they are.
+    new_w = (coords[:, 2] - coords[:, 0]) * scale_factor
+    new_h = (coords[:, 3] - coords[:, 1]) * scale_factor
+    half_w, half_h = 0.5 * new_w, 0.5 * new_h
+    # Column order of the result: [x1, y1, x2, y2].
+    columns = [
+        ctr_x - half_w, ctr_y - half_h, ctr_x + half_w, ctr_y + half_h
+    ]
+    if is_roi:
+        # Put the untouched first column back in front.
+        columns.insert(0, bboxes[:, 0])
+    return torch.stack(columns, dim=-1)
+
+
+def bbox_cxcywh_to_xyxy(bbox):
+    """Turn (cx, cy, w, h) boxes into corner form (x1, y1, x2, y2).
+
+    Args:
+        bbox (Tensor): Shape (n, 4) for bboxes.
+
+    Returns:
+        Tensor: Converted bboxes.
+    """
+    cx, cy, w, h = bbox.split((1, 1, 1, 1), dim=-1)
+    half_w, half_h = 0.5 * w, 0.5 * h
+    return torch.cat([cx - half_w, cy - half_h, cx + half_w, cy + half_h], -1)
+
+
+def bbox_xyxy_to_cxcywh(bbox):
+    """Turn corner-form (x1, y1, x2, y2) boxes into (cx, cy, w, h).
+
+    Args:
+        bbox (Tensor): Shape (n, 4) for bboxes.
+
+    Returns:
+        Tensor: Converted bboxes.
+    """
+    x1, y1, x2, y2 = bbox.split((1, 1, 1, 1), dim=-1)
+    center_x, center_y = (x1 + x2) / 2, (y1 + y2) / 2
+    return torch.cat([center_x, center_y, x2 - x1, y2 - y1], dim=-1)
diff --git a/mmdet/core/data_structures/__init__.py b/mmdet/core/data_structures/__init__.py
new file mode 100755
index 0000000..11ab96c
--- /dev/null
+++ b/mmdet/core/data_structures/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .general_data import GeneralData
+from .instance_data import InstanceData
+
+__all__ = ['GeneralData', 'InstanceData']
diff --git a/mmdet/core/data_structures/general_data.py b/mmdet/core/data_structures/general_data.py
new file mode 100755
index 0000000..978fdfd
--- /dev/null
+++ b/mmdet/core/data_structures/general_data.py
@@ -0,0 +1,336 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import numpy as np
+import torch
+
+from mmdet.utils.util_mixins import NiceRepr
+
+
+class GeneralData(NiceRepr):
+    """A general data structure of OpenMMlab.
+
+    A data structure that stores the meta information,
+    the annotations of the images or the model predictions,
+    which can be used in communication between components.
+
+    The attributes in `GeneralData` are divided into two parts,
+    the `meta_info_fields` and the `data_fields` respectively.
+
+        - `meta_info_fields`: Usually contains the
+          information about the image such as filename,
+          image_shape, pad_shape, etc. All attributes in
+          it are immutable once set,
+          but the user can add new meta information with
+          `set_meta_info` function, all information can be accessed
+          with methods `meta_info_keys`, `meta_info_values`,
+          `meta_info_items`.
+
+        - `data_fields`: Annotations or model predictions are
+          stored. The attributes can be accessed or modified by
+          dict-like or object-like operations, such as
+          `.` , `[]`, `in`, `del`, `pop(str)` `get(str)`, `keys()`,
+          `values()`, `items()`. Users can also apply tensor-like methods
+          to all :obj:`torch.Tensor` in the `data_fields`,
+          such as `.cuda()`, `.cpu()`, `.npu()`, `.mlu()`, `.to()`,
+          `.detach()`, `.numpy()`
+
+    Args:
+        meta_info (dict, optional): A dict contains the meta information
+            of single image. such as `img_shape`, `scale_factor`, etc.
+            Default: None.
+        data (dict, optional): A dict contains annotations of single image or
+            model predictions. Default: None.
+
+    Examples:
+        >>> from mmdet.core import GeneralData
+        >>> img_meta = dict(img_shape=(800, 1196, 3), pad_shape=(800, 1216, 3))
+        >>> instance_data = GeneralData(meta_info=img_meta)
+        >>> 'img_shape' in instance_data
+        True
+        >>> instance_data.det_labels = torch.LongTensor([0, 1, 2, 3])
+        >>> instance_data["det_scores"] = torch.Tensor([0.01, 0.1, 0.2, 0.3])
+        >>> print(instance_data)
+        <GeneralData(
+
+          META INFORMATION
+        img_shape: (800, 1196, 3)
+        pad_shape: (800, 1216, 3)
+
+          DATA FIELDS
+        shape of det_labels: torch.Size([4])
+        shape of det_scores: torch.Size([4])
+
+        ) at 0x7f84acd10f90>
+        >>> instance_data.det_scores
+        tensor([0.0100, 0.1000, 0.2000, 0.3000])
+        >>> instance_data.det_labels
+        tensor([0, 1, 2, 3])
+        >>> instance_data['det_labels']
+        tensor([0, 1, 2, 3])
+        >>> 'det_labels' in instance_data
+        True
+        >>> instance_data.img_shape
+        (800, 1196, 3)
+        >>> 'det_scores' in instance_data
+        True
+        >>> del instance_data.det_scores
+        >>> 'det_scores' in instance_data
+        False
+        >>> det_labels = instance_data.pop('det_labels', None)
+        >>> det_labels
+        tensor([0, 1, 2, 3])
+        >>> 'det_labels' in instance_data
+        False
+    """
+
+    def __init__(self, meta_info=None, data=None):
+        """Create empty field registries, then load the given dicts."""
+        # The registries must exist before any other attribute is assigned.
+        self._meta_info_fields = set()
+        self._data_fields = set()
+        if meta_info is not None:
+            self.set_meta_info(meta_info=meta_info)
+        if data is not None:
+            self.set_data(data)
+
+    def set_meta_info(self, meta_info):
+        """Register meta information; existing keys may not change value.
+
+        Args:
+            meta_info (dict): A dict contains the meta information
+                of image. such as `img_shape`, `scale_factor`, etc.
+
+        Raises:
+            KeyError: If a key is already set to a different value.
+        """
+        assert isinstance(meta_info,
+                          dict), f'meta should be a `dict` but get {meta_info}'
+
+        # Deep-copy so later changes to the caller's dict do not leak in.
+        meta = copy.deepcopy(meta_info)
+        for k, v in meta.items():
+            if k not in self._meta_info_fields:
+                self._meta_info_fields.add(k)
+                self.__dict__[k] = v
+                continue
+
+            # Re-setting a key is only accepted with an unchanged value.
+            ori_value = getattr(self, k)
+            if isinstance(ori_value, (torch.Tensor, np.ndarray)):
+                unchanged = (ori_value == v).all()
+            else:
+                unchanged = ori_value == v
+            if not unchanged:
+                raise KeyError(
+                    f'img_meta_info {k} has been set as '
+                    f'{getattr(self, k)} before, which is immutable ')
+
+    def set_data(self, data):
+        """Set every item of ``data`` as a data field.
+
+        Args:
+            data (dict): A dict contains annotations of image or
+                model predictions.
+        """
+        assert isinstance(data,
+                          dict), f'data should be a `dict` but get {data}'
+        for k, v in data.items():
+            setattr(self, k, v)
+
+    def new(self, meta_info=None, data=None):
+        """Create an instance of the same class sharing this meta info.
+
+        Args:
+            meta_info (dict, optional): Extra meta information. Default: None.
+            data (dict, optional): Data fields to set. Default: None.
+
+        Returns:
+            A new instance of ``self.__class__``.
+        """
+        fresh = self.__class__()
+        fresh.set_meta_info(dict(self.meta_info_items()))
+        if meta_info is not None:
+            fresh.set_meta_info(meta_info)
+        if data is not None:
+            fresh.set_data(data)
+        return fresh
+
+    def keys(self):
+        """
+        Returns:
+            list: A fresh list holding every key in data_fields.
+        """
+        return list(self._data_fields)
+
+    def meta_info_keys(self):
+        """
+        Returns:
+            list: A fresh list holding every key in meta_info_fields.
+        """
+        return list(self._meta_info_fields)
+
+    def values(self):
+        """
+        Returns:
+            list: The value of each data field, in ``keys()`` order.
+        """
+        return [getattr(self, name) for name in self.keys()]
+
+    def meta_info_values(self):
+        """
+        Returns:
+            list: The value of each meta field, in ``meta_info_keys()`` order.
+        """
+        return [getattr(self, name) for name in self.meta_info_keys()]
+
+    def items(self):
+        """Lazily yield ``(key, value)`` for every data field."""
+        yield from ((name, getattr(self, name)) for name in self.keys())
+
+    def meta_info_items(self):
+        """Lazily yield ``(key, value)`` for every meta info field."""
+        yield from ((n, getattr(self, n)) for n in self.meta_info_keys())
+
+    def __setattr__(self, name, val):
+        """Set a data field; registries and meta info cannot be overwritten."""
+        if name in ('_meta_info_fields', '_data_fields'):
+            # Each of the two registries may be assigned only once.
+            if hasattr(self, name):
+                raise AttributeError(
+                    f'{name} has been used as a '
+                    f'private attribute, which is immutable. ')
+            super().__setattr__(name, val)
+            return
+        if name in self._meta_info_fields:
+            raise AttributeError(f'`{name}` is used in meta information, '
+                                 f'which is immutable')
+        self._data_fields.add(name)
+        super().__setattr__(name, val)
+
+    def __delattr__(self, item):
+        """Delete a data field; registries and meta info are protected."""
+        if item in ('_meta_info_fields', '_data_fields'):
+            raise AttributeError(f'{item} has been used as a '
+                                 f'private attribute, which is immutable. ')
+        if item in self._meta_info_fields:
+            raise KeyError(f'{item} is used in meta information, '
+                           f'which is immutable.')
+        # Remove the attribute first so that a missing name raises before
+        # the registry is touched.
+        super().__delattr__(item)
+        self._data_fields.discard(item)
+
+    # dict-like methods
+    __setitem__ = __setattr__
+    __delitem__ = __delattr__
+
+    def __getitem__(self, name: str):
+        return getattr(self, name)
+
+    def get(self, *args):
+        assert len(args) < 3, '`get` get more than 2 arguments'
+        return vars(self).get(*args)
+
+    def pop(self, *args):
+        """Pop data field ``args[0]``; ``args[1]`` is an optional default."""
+        assert len(args) < 3, '`pop` get more than 2 arguments'
+        name = args[0]
+        if name in self._meta_info_fields:
+            raise KeyError(f'{name} is a key in meta information, '
+                           f'which is immutable')
+
+        if name in self._data_fields:
+            self._data_fields.remove(name)
+            return self.__dict__.pop(*args)
+
+        # Unknown key: fall back to the default when one was supplied.
+        if len(args) == 2:
+            return args[1]
+        raise KeyError(f'{args[0]}')
+
+    def __contains__(self, item):
+        # A name is "contained" if it is a data field or a meta info field.
+        return item in self._data_fields or item in self._meta_info_fields
+
+    # Tensor-like methods
+    def to(self, *args, **kwargs):
+        """Return a new instance where values having ``to`` are converted."""
+        out = self.new()
+        for name, value in self.items():
+            # Values without a ``to`` method are carried over unchanged.
+            can_convert = hasattr(value, 'to')
+            out[name] = value.to(*args, **kwargs) if can_convert else value
+        return out
+
+    # Tensor-like methods
+    def cpu(self):
+        """Return a new instance with ``cpu()`` applied to every tensor."""
+        out = self.new()
+        for name, value in self.items():
+            # Non-tensor values are carried over unchanged.
+            is_tensor = isinstance(value, torch.Tensor)
+            out[name] = value.cpu() if is_tensor else value
+        return out
+
+    # Tensor-like methods
+    def npu(self):
+        """Return a new instance with ``npu()`` applied to every tensor."""
+        out = self.new()
+        for name, value in self.items():
+            # Non-tensor values are carried over unchanged.
+            is_tensor = isinstance(value, torch.Tensor)
+            out[name] = value.npu() if is_tensor else value
+        return out
+
+    # Tensor-like methods
+    def mlu(self):
+        """Return a new instance with ``mlu()`` applied to every tensor."""
+        out = self.new()
+        for name, value in self.items():
+            # Non-tensor values are carried over unchanged.
+            is_tensor = isinstance(value, torch.Tensor)
+            out[name] = value.mlu() if is_tensor else value
+        return out
+
+    # Tensor-like methods
+    def cuda(self):
+        """Return a new instance with ``cuda()`` applied to every tensor."""
+        out = self.new()
+        for name, value in self.items():
+            # Non-tensor values are carried over unchanged.
+            is_tensor = isinstance(value, torch.Tensor)
+            out[name] = value.cuda() if is_tensor else value
+        return out
+
+    # Tensor-like methods
+    def detach(self):
+        """Return a new instance with ``detach()`` applied to every tensor."""
+        out = self.new()
+        for name, value in self.items():
+            # Non-tensor values are carried over unchanged.
+            is_tensor = isinstance(value, torch.Tensor)
+            out[name] = value.detach() if is_tensor else value
+        return out
+
+    # Tensor-like methods
+    def numpy(self):
+        """Return a new instance with every tensor converted to a ndarray."""
+        out = self.new()
+        for name, value in self.items():
+            # Tensors are detached and moved to CPU before the conversion.
+            if isinstance(value, torch.Tensor):
+                value = value.detach().cpu().numpy()
+            out[name] = value
+        return out
+
+    def __nice__(self):
+        """Build the text listing meta information and data fields."""
+        parts = ['\n \n  META INFORMATION \n']
+        parts.extend(f'{k}: {v} \n' for k, v in self.meta_info_items())
+        parts.append('\n   DATA FIELDS \n')
+        for k, v in self.items():
+            if isinstance(v, (torch.Tensor, np.ndarray)):
+                parts.append(f'shape of {k}: {v.shape} \n')
+            else:
+                parts.append(f'{k}: {v} \n')
+        return ''.join(parts) + '\n'
diff --git a/mmdet/core/data_structures/instance_data.py b/mmdet/core/data_structures/instance_data.py
new file mode 100755
index 0000000..eef2065
--- /dev/null
+++ b/mmdet/core/data_structures/instance_data.py
@@ -0,0 +1,188 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import itertools
+
+import numpy as np
+import torch
+
+from .general_data import GeneralData
+
+
+class InstanceData(GeneralData):
+    """Data structure for instance-level annotations or predictions.
+
+    Subclass of :class:`GeneralData`. All values in `data_fields`
+    should have the same length. This design refers to
+    https://github.com/facebookresearch/detectron2/blob/master/detectron2/structures/instances.py # noqa E501
+
+    Examples:
+        >>> from mmdet.core import InstanceData
+        >>> import numpy as np
+        >>> img_meta = dict(img_shape=(800, 1196, 3), pad_shape=(800, 1216, 3))
+        >>> results = InstanceData(img_meta)
+        >>> 'img_shape' in results
+        True
+        >>> results.det_labels = torch.LongTensor([0, 1, 2, 3])
+        >>> results["det_scores"] = torch.Tensor([0.01, 0.7, 0.6, 0.3])
+        >>> results["det_masks"] = np.zeros((4, 2, 2))
+        >>> len(results)
+        4
+        >>> print(results)
+        <InstanceData(
+
+            META INFORMATION
+        pad_shape: (800, 1216, 3)
+        img_shape: (800, 1196, 3)
+
+            DATA FIELDS
+        shape of det_labels: torch.Size([4])
+        shape of det_masks: (4, 2, 2)
+        shape of det_scores: torch.Size([4])
+
+        ) at 0x7fe26b5ca990>
+        >>> sorted_results = results[results.det_scores.sort().indices]
+        >>> sorted_results.det_scores
+        tensor([0.0100, 0.3000, 0.6000, 0.7000])
+        >>> sorted_results.det_labels
+        tensor([0, 3, 2, 1])
+        >>> print(results[results.det_scores > 0.5])
+        <InstanceData(
+
+            META INFORMATION
+        pad_shape: (800, 1216, 3)
+        img_shape: (800, 1196, 3)
+
+            DATA FIELDS
+        shape of det_labels: torch.Size([2])
+        shape of det_masks: (2, 2, 2)
+        shape of det_scores: torch.Size([2])
+
+        ) at 0x7fe26b6d7790>
+        >>> results[results.det_scores > 0.5].det_labels
+        tensor([1, 2])
+        >>> results[results.det_scores > 0.5].det_scores
+        tensor([0.7000, 0.6000])
+    """
+
+    def __setattr__(self, name, value):
+        """Set a data field after checking its type and its length.
+
+        The two private bookkeeping attributes may only be set once.
+        Any other value must be a tensor, ndarray or list whose length
+        matches the fields that are already stored.
+        """
+        if name in ('_meta_info_fields', '_data_fields'):
+            if hasattr(self, name):
+                raise AttributeError(
+                    f'{name} has been used as a '
+                    f'private attribute, which is immutable. ')
+            super().__setattr__(name, value)
+        else:
+            assert isinstance(value, (torch.Tensor, np.ndarray, list)), \
+                f'Can not set {type(value)}, only support' \
+                f' {(torch.Tensor, np.ndarray, list)}'
+            if self._data_fields:
+                assert len(value) == len(self), \
+                    f'the length of values {len(value)} is not ' \
+                    f'consistent with the length of this ' \
+                    f':obj:`InstanceData` {len(self)} '
+            super().__setattr__(name, value)
+
+    def __getitem__(self, item):
+        """Fetch one field by name, or index every field along dim 0.
+
+        Args:
+            item (str, int, obj:`slice`, obj:`torch.Tensor`): A field
+                name, or an index applied to the first dimension of
+                every field. An index tensor must be 1-D with dtype
+                ``torch.long`` or ``torch.bool``; any device is accepted.
+
+        Returns:
+            obj:`InstanceData`: The selected instances. When ``item`` is
+            a str, the value stored under that name is returned instead.
+        """
+        assert len(self), ' This is a empty instance'
+
+        is_tensor = isinstance(item, torch.Tensor)
+        # Test the dtype rather than the tensor class: a CUDA tensor is
+        # not an instance of `torch.LongTensor` / `torch.BoolTensor`.
+        is_mask = is_tensor and item.dtype == torch.bool
+        assert isinstance(item, (str, slice, int)) or is_mask or (
+            is_tensor and item.dtype == torch.long)
+
+        if isinstance(item, str):
+            return getattr(self, item)
+
+        if isinstance(item, int):
+            if item >= len(self) or item < -len(self):
+                raise IndexError(f'Index {item} out of range!')
+            # a one-element slice (instead of a scalar index) keeps the
+            # first dimension of every field
+            item = slice(item, None, len(self))
+
+        new_data = self.new()
+        if not is_tensor:
+            # item is a slice
+            for k, v in self.items():
+                new_data[k] = v[item]
+            return new_data
+
+        assert item.dim() == 1, 'Only support to get the' \
+                                ' values along the first dimension.'
+        if is_mask:
+            assert len(item) == len(self), \
+                f'The shape of the input(BoolTensor)) {len(item)} ' \
+                ' does not match the shape of the indexed tensor ' \
+                f'in results_filed {len(self)} at first dimension. '
+
+        # Lists are indexed element by element, so a mask is first
+        # converted into the positions of its True entries.
+        positions = torch.nonzero(item).view(-1) if is_mask else item
+        for k, v in self.items():
+            if isinstance(v, torch.Tensor):
+                new_data[k] = v[item]
+            elif isinstance(v, np.ndarray):
+                # the index is moved to the CPU and converted for numpy
+                new_data[k] = v[item.cpu().numpy()]
+            elif isinstance(v, list):
+                new_data[k] = [v[index] for index in positions]
+        return new_data
+
+    @staticmethod
+    def cat(instances_list):
+        """Concat the predictions of all :obj:`InstanceData` in the list.
+
+        Args:
+            instances_list (list[:obj:`InstanceData`]): A non-empty list
+                of :obj:`InstanceData` that share the same data fields.
+
+        Returns:
+            obj:`InstanceData`: Built from ``instances_list[0].new()``; a
+            one-element list returns that element itself, not a copy.
+        """
+        assert all(
+            isinstance(results, InstanceData) for results in instances_list)
+        assert len(instances_list) > 0
+        if len(instances_list) == 1:
+            return instances_list[0]
+        new_data = instances_list[0].new()
+        for k in instances_list[0]._data_fields:
+            values = [results[k] for results in instances_list]
+            v0 = values[0]
+            if isinstance(v0, torch.Tensor):
+                values = torch.cat(values, dim=0)
+            elif isinstance(v0, np.ndarray):
+                values = np.concatenate(values, axis=0)
+            elif isinstance(v0, list):
+                values = list(itertools.chain.from_iterable(values))
+            else:
+                raise ValueError(
+                    f'Can not concat the {k} which is a {type(v0)}')
+            new_data[k] = values
+        return new_data
+
+    def __len__(self):
+        """Return the length of the first data field."""
+        if not self._data_fields:
+            raise AssertionError('This is an empty `InstanceData`.')
+        # `__setattr__` keeps all fields equally long, so one is enough
+        return len(next(iter(self.values())))
diff --git a/mmdet/core/evaluation/__init__.py b/mmdet/core/evaluation/__init__.py
new file mode 100755
index 0000000..2b488a7
--- /dev/null
+++ b/mmdet/core/evaluation/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .class_names import (cityscapes_classes, coco_classes, dataset_aliases,
+                          get_classes, imagenet_det_classes,
+                          imagenet_vid_classes, objects365v1_classes,
+                          objects365v2_classes, oid_challenge_classes,
+                          oid_v6_classes, voc_classes)
+from .eval_hooks import DistEvalHook, EvalHook
+from .mean_ap import average_precision, eval_map, print_map_summary
+from .panoptic_utils import INSTANCE_OFFSET
+from .recall import (eval_recalls, plot_iou_recall, plot_num_recall,
+                     print_recall_summary)
+
+__all__ = [
+    'voc_classes', 'imagenet_det_classes', 'imagenet_vid_classes',
+    'coco_classes', 'cityscapes_classes', 'dataset_aliases', 'get_classes',
+    'DistEvalHook', 'EvalHook', 'average_precision', 'eval_map',
+    'print_map_summary', 'eval_recalls', 'print_recall_summary',
+    'plot_num_recall', 'plot_iou_recall', 'oid_v6_classes',
+    'oid_challenge_classes', 'objects365v1_classes', 'objects365v2_classes',
+    'INSTANCE_OFFSET'
+]
diff --git a/mmdet/core/evaluation/bbox_overlaps.py b/mmdet/core/evaluation/bbox_overlaps.py
new file mode 100755
index 0000000..5d6eb82
--- /dev/null
+++ b/mmdet/core/evaluation/bbox_overlaps.py
@@ -0,0 +1,65 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+
+
+def bbox_overlaps(bboxes1,
+                  bboxes2,
+                  mode='iou',
+                  eps=1e-6,
+                  use_legacy_coordinate=False):
+    """Compute the pairwise overlap ratio between two sets of boxes.
+
+    Args:
+        bboxes1 (ndarray): Shape (n, 4)
+        bboxes2 (ndarray): Shape (k, 4)
+        mode (str): 'iou' (intersection over union) or 'iof' (intersection
+            over foreground, i.e. over the area of the `bboxes1` box)
+        eps (float): Lower bound applied to the denominator before the
+            division. Default: 1e-6.
+        use_legacy_coordinate (bool): Whether to use coordinate system in
+            mmdet v1.x. which means width, height should be
+            calculated as 'x2 - x1 + 1' and 'y2 - y1 + 1' respectively.
+            Note when function is used in `VOCDataset`, it should be
+            True to align with the official implementation
+            `http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCdevkit_18-May-2011.tar`
+            Default: False.
+
+    Returns:
+        ious (ndarray): Shape (n, k)
+    """
+
+    assert mode in ['iou', 'iof']
+    # legacy (mmdet v1.x) width / height are one unit larger
+    offset = 1. if use_legacy_coordinate else 0.
+    bboxes1 = bboxes1.astype(np.float32)
+    bboxes2 = bboxes2.astype(np.float32)
+    num1, num2 = bboxes1.shape[0], bboxes2.shape[0]
+    if num1 * num2 == 0:
+        return np.zeros((num1, num2), dtype=np.float32)
+    # The Python loop below runs over the smaller set; when the sets are
+    # swapped for that purpose the result is transposed back at the end.
+    swapped = num1 > num2
+    if swapped:
+        bboxes1, bboxes2 = bboxes2, bboxes1
+    area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (
+        bboxes1[:, 3] - bboxes1[:, 1] + offset)
+    area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (
+        bboxes2[:, 3] - bboxes2[:, 1] + offset)
+    ious = np.zeros((bboxes1.shape[0], bboxes2.shape[0]), dtype=np.float32)
+    # one result row per box of the (possibly swapped) first set
+    for i, box in enumerate(bboxes1):
+        left = np.maximum(box[0], bboxes2[:, 0])
+        top = np.maximum(box[1], bboxes2[:, 1])
+        right = np.minimum(box[2], bboxes2[:, 2])
+        bottom = np.minimum(box[3], bboxes2[:, 3])
+        overlap = np.maximum(right - left + offset, 0) * np.maximum(
+            bottom - top + offset, 0)
+        if mode == 'iou':
+            denom = area1[i] + area2 - overlap
+        elif swapped:
+            # 'iof' always divides by the area of the caller's `bboxes1`
+            denom = area2
+        else:
+            denom = area1[i]
+        ious[i, :] = overlap / np.maximum(denom, eps)
+    return ious.T if swapped else ious
diff --git a/mmdet/core/evaluation/class_names.py b/mmdet/core/evaluation/class_names.py
new file mode 100755
index 0000000..c015c5d
--- /dev/null
+++ b/mmdet/core/evaluation/class_names.py
@@ -0,0 +1,476 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+
+
+def wider_face_classes():  # the only class name is 'face'
+    return ['face']
+
+
+def voc_classes():  # 20 class names; a new list is built per call
+    return [
+        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat',
+        'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person',
+        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
+    ]
+
+
+def imagenet_det_classes():  # class names as str; new list per call
+    return [
+        'accordion', 'airplane', 'ant', 'antelope', 'apple', 'armadillo',
+        'artichoke', 'axe', 'baby_bed', 'backpack', 'bagel', 'balance_beam',
+        'banana', 'band_aid', 'banjo', 'baseball', 'basketball', 'bathing_cap',
+        'beaker', 'bear', 'bee', 'bell_pepper', 'bench', 'bicycle', 'binder',
+        'bird', 'bookshelf', 'bow_tie', 'bow', 'bowl', 'brassiere', 'burrito',
+        'bus', 'butterfly', 'camel', 'can_opener', 'car', 'cart', 'cattle',
+        'cello', 'centipede', 'chain_saw', 'chair', 'chime', 'cocktail_shaker',
+        'coffee_maker', 'computer_keyboard', 'computer_mouse', 'corkscrew',
+        'cream', 'croquet_ball', 'crutch', 'cucumber', 'cup_or_mug', 'diaper',
+        'digital_clock', 'dishwasher', 'dog', 'domestic_cat', 'dragonfly',
+        'drum', 'dumbbell', 'electric_fan', 'elephant', 'face_powder', 'fig',
+        'filing_cabinet', 'flower_pot', 'flute', 'fox', 'french_horn', 'frog',
+        'frying_pan', 'giant_panda', 'goldfish', 'golf_ball', 'golfcart',
+        'guacamole', 'guitar', 'hair_dryer', 'hair_spray', 'hamburger',
+        'hammer', 'hamster', 'harmonica', 'harp', 'hat_with_a_wide_brim',
+        'head_cabbage', 'helmet', 'hippopotamus', 'horizontal_bar', 'horse',
+        'hotdog', 'iPod', 'isopod', 'jellyfish', 'koala_bear', 'ladle',
+        'ladybug', 'lamp', 'laptop', 'lemon', 'lion', 'lipstick', 'lizard',
+        'lobster', 'maillot', 'maraca', 'microphone', 'microwave', 'milk_can',
+        'miniskirt', 'monkey', 'motorcycle', 'mushroom', 'nail', 'neck_brace',
+        'oboe', 'orange', 'otter', 'pencil_box', 'pencil_sharpener', 'perfume',
+        'person', 'piano', 'pineapple', 'ping-pong_ball', 'pitcher', 'pizza',
+        'plastic_bag', 'plate_rack', 'pomegranate', 'popsicle', 'porcupine',
+        'power_drill', 'pretzel', 'printer', 'puck', 'punching_bag', 'purse',
+        'rabbit', 'racket', 'ray', 'red_panda', 'refrigerator',
+        'remote_control', 'rubber_eraser', 'rugby_ball', 'ruler',
+        'salt_or_pepper_shaker', 'saxophone', 'scorpion', 'screwdriver',
+        'seal', 'sheep', 'ski', 'skunk', 'snail', 'snake', 'snowmobile',
+        'snowplow', 'soap_dispenser', 'soccer_ball', 'sofa', 'spatula',
+        'squirrel', 'starfish', 'stethoscope', 'stove', 'strainer',
+        'strawberry', 'stretcher', 'sunglasses', 'swimming_trunks', 'swine',
+        'syringe', 'table', 'tape_player', 'tennis_ball', 'tick', 'tie',
+        'tiger', 'toaster', 'traffic_light', 'train', 'trombone', 'trumpet',
+        'turtle', 'tv_or_monitor', 'unicycle', 'vacuum', 'violin',
+        'volleyball', 'waffle_iron', 'washer', 'water_bottle', 'watercraft',
+        'whale', 'wine_bottle', 'zebra'
+    ]
+
+
+def imagenet_vid_classes():  # 30 class names; new list per call
+    return [
+        'airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car',
+        'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', 'giant_panda',
+        'hamster', 'horse', 'lion', 'lizard', 'monkey', 'motorcycle', 'rabbit',
+        'red_panda', 'sheep', 'snake', 'squirrel', 'tiger', 'train', 'turtle',
+        'watercraft', 'whale', 'zebra'
+    ]
+
+
+def coco_classes():  # 80 class names; a new list is built per call
+    return [
+        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+        'truck', 'boat', 'traffic_light', 'fire_hydrant', 'stop_sign',
+        'parking_meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
+        'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
+        'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
+        'sports_ball', 'kite', 'baseball_bat', 'baseball_glove', 'skateboard',
+        'surfboard', 'tennis_racket', 'bottle', 'wine_glass', 'cup', 'fork',
+        'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
+        'broccoli', 'carrot', 'hot_dog', 'pizza', 'donut', 'cake', 'chair',
+        'couch', 'potted_plant', 'bed', 'dining_table', 'toilet', 'tv',
+        'laptop', 'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave',
+        'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+        'scissors', 'teddy_bear', 'hair_drier', 'toothbrush'
+    ]
+
+
+def cityscapes_classes():  # 8 class names; a new list is built per call
+    return [
+        'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
+        'bicycle'
+    ]
+
+
+def oid_challenge_classes():  # NOTE: 'Mouse' and 'Bench' occur twice below
+    return [
+        'Footwear', 'Jeans', 'House', 'Tree', 'Woman', 'Man', 'Land vehicle',
+        'Person', 'Wheel', 'Bus', 'Human face', 'Bird', 'Dress', 'Girl',
+        'Vehicle', 'Building', 'Cat', 'Car', 'Belt', 'Elephant', 'Dessert',
+        'Butterfly', 'Train', 'Guitar', 'Poster', 'Book', 'Boy', 'Bee',
+        'Flower', 'Window', 'Hat', 'Human head', 'Dog', 'Human arm', 'Drink',
+        'Human mouth', 'Human hair', 'Human nose', 'Human hand', 'Table',
+        'Marine invertebrates', 'Fish', 'Sculpture', 'Rose', 'Street light',
+        'Glasses', 'Fountain', 'Skyscraper', 'Swimwear', 'Brassiere', 'Drum',
+        'Duck', 'Countertop', 'Furniture', 'Ball', 'Human leg', 'Boat',
+        'Balloon', 'Bicycle helmet', 'Goggles', 'Door', 'Human eye', 'Shirt',
+        'Toy', 'Teddy bear', 'Pasta', 'Tomato', 'Human ear',
+        'Vehicle registration plate', 'Microphone', 'Musical keyboard',
+        'Tower', 'Houseplant', 'Flowerpot', 'Fruit', 'Vegetable',
+        'Musical instrument', 'Suit', 'Motorcycle', 'Bagel', 'French fries',
+        'Hamburger', 'Chair', 'Salt and pepper shakers', 'Snail', 'Airplane',
+        'Horse', 'Laptop', 'Computer keyboard', 'Football helmet', 'Cocktail',
+        'Juice', 'Tie', 'Computer monitor', 'Human beard', 'Bottle',
+        'Saxophone', 'Lemon', 'Mouse', 'Sock', 'Cowboy hat', 'Sun hat',
+        'Football', 'Porch', 'Sunglasses', 'Lobster', 'Crab', 'Picture frame',
+        'Van', 'Crocodile', 'Surfboard', 'Shorts', 'Helicopter', 'Helmet',
+        'Sports uniform', 'Taxi', 'Swan', 'Goose', 'Coat', 'Jacket', 'Handbag',
+        'Flag', 'Skateboard', 'Television', 'Tire', 'Spoon', 'Palm tree',
+        'Stairs', 'Salad', 'Castle', 'Oven', 'Microwave oven', 'Wine',
+        'Ceiling fan', 'Mechanical fan', 'Cattle', 'Truck', 'Box', 'Ambulance',
+        'Desk', 'Wine glass', 'Reptile', 'Tank', 'Traffic light', 'Billboard',
+        'Tent', 'Insect', 'Spider', 'Treadmill', 'Cupboard', 'Shelf',
+        'Seat belt', 'Human foot', 'Bicycle', 'Bicycle wheel', 'Couch',
+        'Bookcase', 'Fedora', 'Backpack', 'Bench', 'Oyster',
+        'Moths and butterflies', 'Lavender', 'Waffle', 'Fork', 'Animal',
+        'Accordion', 'Mobile phone', 'Plate', 'Coffee cup', 'Saucer',
+        'Platter', 'Dagger', 'Knife', 'Bull', 'Tortoise', 'Sea turtle', 'Deer',
+        'Weapon', 'Apple', 'Ski', 'Taco', 'Traffic sign', 'Beer', 'Necklace',
+        'Sunflower', 'Piano', 'Organ', 'Harpsichord', 'Bed', 'Cabinetry',
+        'Nightstand', 'Curtain', 'Chest of drawers', 'Drawer', 'Parrot',
+        'Sandal', 'High heels', 'Tableware', 'Cart', 'Mushroom', 'Kite',
+        'Missile', 'Seafood', 'Camera', 'Paper towel', 'Toilet paper',
+        'Sombrero', 'Radish', 'Lighthouse', 'Segway', 'Pig', 'Watercraft',
+        'Golf cart', 'studio couch', 'Dolphin', 'Whale', 'Earrings', 'Otter',
+        'Sea lion', 'Whiteboard', 'Monkey', 'Gondola', 'Zebra',
+        'Baseball glove', 'Scarf', 'Adhesive tape', 'Trousers', 'Scoreboard',
+        'Lily', 'Carnivore', 'Power plugs and sockets', 'Office building',
+        'Sandwich', 'Swimming pool', 'Headphones', 'Tin can', 'Crown', 'Doll',
+        'Cake', 'Frog', 'Beetle', 'Ant', 'Gas stove', 'Canoe', 'Falcon',
+        'Blue jay', 'Egg', 'Fire hydrant', 'Raccoon', 'Muffin', 'Wall clock',
+        'Coffee', 'Mug', 'Tea', 'Bear', 'Waste container', 'Home appliance',
+        'Candle', 'Lion', 'Mirror', 'Starfish', 'Marine mammal', 'Wheelchair',
+        'Umbrella', 'Alpaca', 'Violin', 'Cello', 'Brown bear', 'Canary', 'Bat',
+        'Ruler', 'Plastic bag', 'Penguin', 'Watermelon', 'Harbor seal', 'Pen',
+        'Pumpkin', 'Harp', 'Kitchen appliance', 'Roller skates', 'Bust',
+        'Coffee table', 'Tennis ball', 'Tennis racket', 'Ladder', 'Boot',
+        'Bowl', 'Stop sign', 'Volleyball', 'Eagle', 'Paddle', 'Chicken',
+        'Skull', 'Lamp', 'Beehive', 'Maple', 'Sink', 'Goldfish', 'Tripod',
+        'Coconut', 'Bidet', 'Tap', 'Bathroom cabinet', 'Toilet',
+        'Filing cabinet', 'Pretzel', 'Table tennis racket', 'Bronze sculpture',
+        'Rocket', 'Mouse', 'Hamster', 'Lizard', 'Lifejacket', 'Goat',
+        'Washing machine', 'Trumpet', 'Horn', 'Trombone', 'Sheep',
+        'Tablet computer', 'Pillow', 'Kitchen & dining room table',
+        'Parachute', 'Raven', 'Glove', 'Loveseat', 'Christmas tree',
+        'Shellfish', 'Rifle', 'Shotgun', 'Sushi', 'Sparrow', 'Bread',
+        'Toaster', 'Watch', 'Asparagus', 'Artichoke', 'Suitcase', 'Antelope',
+        'Broccoli', 'Ice cream', 'Racket', 'Banana', 'Cookie', 'Cucumber',
+        'Dragonfly', 'Lynx', 'Caterpillar', 'Light bulb', 'Office supplies',
+        'Miniskirt', 'Skirt', 'Fireplace', 'Potato', 'Light switch',
+        'Croissant', 'Cabbage', 'Ladybug', 'Handgun', 'Luggage and bags',
+        'Window blind', 'Snowboard', 'Baseball bat', 'Digital clock',
+        'Serving tray', 'Infant bed', 'Sofa bed', 'Guacamole', 'Fox', 'Pizza',
+        'Snowplow', 'Jet ski', 'Refrigerator', 'Lantern', 'Convenience store',
+        'Sword', 'Rugby ball', 'Owl', 'Ostrich', 'Pancake', 'Strawberry',
+        'Carrot', 'Tart', 'Dice', 'Turkey', 'Rabbit', 'Invertebrate', 'Vase',
+        'Stool', 'Swim cap', 'Shower', 'Clock', 'Jellyfish', 'Aircraft',
+        'Chopsticks', 'Orange', 'Snake', 'Sewing machine', 'Kangaroo', 'Mixer',
+        'Food processor', 'Shrimp', 'Towel', 'Porcupine', 'Jaguar', 'Cannon',
+        'Limousine', 'Mule', 'Squirrel', 'Kitchen knife', 'Tiara', 'Tiger',
+        'Bow and arrow', 'Candy', 'Rhinoceros', 'Shark', 'Cricket ball',
+        'Doughnut', 'Plumbing fixture', 'Camel', 'Polar bear', 'Coin',
+        'Printer', 'Blender', 'Giraffe', 'Billiard table', 'Kettle',
+        'Dinosaur', 'Pineapple', 'Zucchini', 'Jug', 'Barge', 'Teapot',
+        'Golf ball', 'Binoculars', 'Scissors', 'Hot dog', 'Door handle',
+        'Seahorse', 'Bathtub', 'Leopard', 'Centipede', 'Grapefruit', 'Snowman',
+        'Cheetah', 'Alarm clock', 'Grape', 'Wrench', 'Wok', 'Bell pepper',
+        'Cake stand', 'Barrel', 'Woodpecker', 'Flute', 'Corded phone',
+        'Willow', 'Punching bag', 'Pomegranate', 'Telephone', 'Pear',
+        'Common fig', 'Bench', 'Wood-burning stove', 'Burrito', 'Nail',
+        'Turtle', 'Submarine sandwich', 'Drinking straw', 'Peach', 'Popcorn',
+        'Frying pan', 'Picnic basket', 'Honeycomb', 'Envelope', 'Mango',
+        'Cutting board', 'Pitcher', 'Stationary bicycle', 'Dumbbell',
+        'Personal care', 'Dog bed', 'Snowmobile', 'Oboe', 'Briefcase',
+        'Squash', 'Tick', 'Slow cooker', 'Coffeemaker', 'Measuring cup',
+        'Crutch', 'Stretcher', 'Screwdriver', 'Flashlight', 'Spatula',
+        'Pressure cooker', 'Ring binder', 'Beaker', 'Torch', 'Winter melon'
+    ]
+
+
+def oid_v6_classes():  # class names as str; a new list is built per call
+    return [
+        'Tortoise', 'Container', 'Magpie', 'Sea turtle', 'Football',
+        'Ambulance', 'Ladder', 'Toothbrush', 'Syringe', 'Sink', 'Toy',
+        'Organ (Musical Instrument)', 'Cassette deck', 'Apple', 'Human eye',
+        'Cosmetics', 'Paddle', 'Snowman', 'Beer', 'Chopsticks', 'Human beard',
+        'Bird', 'Parking meter', 'Traffic light', 'Croissant', 'Cucumber',
+        'Radish', 'Towel', 'Doll', 'Skull', 'Washing machine', 'Glove', 'Tick',
+        'Belt', 'Sunglasses', 'Banjo', 'Cart', 'Ball', 'Backpack', 'Bicycle',
+        'Home appliance', 'Centipede', 'Boat', 'Surfboard', 'Boot',
+        'Headphones', 'Hot dog', 'Shorts', 'Fast food', 'Bus', 'Boy',
+        'Screwdriver', 'Bicycle wheel', 'Barge', 'Laptop', 'Miniskirt',
+        'Drill (Tool)', 'Dress', 'Bear', 'Waffle', 'Pancake', 'Brown bear',
+        'Woodpecker', 'Blue jay', 'Pretzel', 'Bagel', 'Tower', 'Teapot',
+        'Person', 'Bow and arrow', 'Swimwear', 'Beehive', 'Brassiere', 'Bee',
+        'Bat (Animal)', 'Starfish', 'Popcorn', 'Burrito', 'Chainsaw',
+        'Balloon', 'Wrench', 'Tent', 'Vehicle registration plate', 'Lantern',
+        'Toaster', 'Flashlight', 'Billboard', 'Tiara', 'Limousine', 'Necklace',
+        'Carnivore', 'Scissors', 'Stairs', 'Computer keyboard', 'Printer',
+        'Traffic sign', 'Chair', 'Shirt', 'Poster', 'Cheese', 'Sock',
+        'Fire hydrant', 'Land vehicle', 'Earrings', 'Tie', 'Watercraft',
+        'Cabinetry', 'Suitcase', 'Muffin', 'Bidet', 'Snack', 'Snowmobile',
+        'Clock', 'Medical equipment', 'Cattle', 'Cello', 'Jet ski', 'Camel',
+        'Coat', 'Suit', 'Desk', 'Cat', 'Bronze sculpture', 'Juice', 'Gondola',
+        'Beetle', 'Cannon', 'Computer mouse', 'Cookie', 'Office building',
+        'Fountain', 'Coin', 'Calculator', 'Cocktail', 'Computer monitor',
+        'Box', 'Stapler', 'Christmas tree', 'Cowboy hat', 'Hiking equipment',
+        'Studio couch', 'Drum', 'Dessert', 'Wine rack', 'Drink', 'Zucchini',
+        'Ladle', 'Human mouth', 'Dairy Product', 'Dice', 'Oven', 'Dinosaur',
+        'Ratchet (Device)', 'Couch', 'Cricket ball', 'Winter melon', 'Spatula',
+        'Whiteboard', 'Pencil sharpener', 'Door', 'Hat', 'Shower', 'Eraser',
+        'Fedora', 'Guacamole', 'Dagger', 'Scarf', 'Dolphin', 'Sombrero',
+        'Tin can', 'Mug', 'Tap', 'Harbor seal', 'Stretcher', 'Can opener',
+        'Goggles', 'Human body', 'Roller skates', 'Coffee cup',
+        'Cutting board', 'Blender', 'Plumbing fixture', 'Stop sign',
+        'Office supplies', 'Volleyball (Ball)', 'Vase', 'Slow cooker',
+        'Wardrobe', 'Coffee', 'Whisk', 'Paper towel', 'Personal care', 'Food',
+        'Sun hat', 'Tree house', 'Flying disc', 'Skirt', 'Gas stove',
+        'Salt and pepper shakers', 'Mechanical fan', 'Face powder', 'Fax',
+        'Fruit', 'French fries', 'Nightstand', 'Barrel', 'Kite', 'Tart',
+        'Treadmill', 'Fox', 'Flag', 'French horn', 'Window blind',
+        'Human foot', 'Golf cart', 'Jacket', 'Egg (Food)', 'Street light',
+        'Guitar', 'Pillow', 'Human leg', 'Isopod', 'Grape', 'Human ear',
+        'Power plugs and sockets', 'Panda', 'Giraffe', 'Woman', 'Door handle',
+        'Rhinoceros', 'Bathtub', 'Goldfish', 'Houseplant', 'Goat',
+        'Baseball bat', 'Baseball glove', 'Mixing bowl',
+        'Marine invertebrates', 'Kitchen utensil', 'Light switch', 'House',
+        'Horse', 'Stationary bicycle', 'Hammer', 'Ceiling fan', 'Sofa bed',
+        'Adhesive tape', 'Harp', 'Sandal', 'Bicycle helmet', 'Saucer',
+        'Harpsichord', 'Human hair', 'Heater', 'Harmonica', 'Hamster',
+        'Curtain', 'Bed', 'Kettle', 'Fireplace', 'Scale', 'Drinking straw',
+        'Insect', 'Hair dryer', 'Kitchenware', 'Indoor rower', 'Invertebrate',
+        'Food processor', 'Bookcase', 'Refrigerator', 'Wood-burning stove',
+        'Punching bag', 'Common fig', 'Cocktail shaker', 'Jaguar (Animal)',
+        'Golf ball', 'Fashion accessory', 'Alarm clock', 'Filing cabinet',
+        'Artichoke', 'Table', 'Tableware', 'Kangaroo', 'Koala', 'Knife',
+        'Bottle', 'Bottle opener', 'Lynx', 'Lavender (Plant)', 'Lighthouse',
+        'Dumbbell', 'Human head', 'Bowl', 'Humidifier', 'Porch', 'Lizard',
+        'Billiard table', 'Mammal', 'Mouse', 'Motorcycle',
+        'Musical instrument', 'Swim cap', 'Frying pan', 'Snowplow',
+        'Bathroom cabinet', 'Missile', 'Bust', 'Man', 'Waffle iron', 'Milk',
+        'Ring binder', 'Plate', 'Mobile phone', 'Baked goods', 'Mushroom',
+        'Crutch', 'Pitcher (Container)', 'Mirror', 'Personal flotation device',
+        'Table tennis racket', 'Pencil case', 'Musical keyboard', 'Scoreboard',
+        'Briefcase', 'Kitchen knife', 'Nail (Construction)', 'Tennis ball',
+        'Plastic bag', 'Oboe', 'Chest of drawers', 'Ostrich', 'Piano', 'Girl',
+        'Plant', 'Potato', 'Hair spray', 'Sports equipment', 'Pasta',
+        'Penguin', 'Pumpkin', 'Pear', 'Infant bed', 'Polar bear', 'Mixer',
+        'Cupboard', 'Jacuzzi', 'Pizza', 'Digital clock', 'Pig', 'Reptile',
+        'Rifle', 'Lipstick', 'Skateboard', 'Raven', 'High heels', 'Red panda',
+        'Rose', 'Rabbit', 'Sculpture', 'Saxophone', 'Shotgun', 'Seafood',
+        'Submarine sandwich', 'Snowboard', 'Sword', 'Picture frame', 'Sushi',
+        'Loveseat', 'Ski', 'Squirrel', 'Tripod', 'Stethoscope', 'Submarine',
+        'Scorpion', 'Segway', 'Training bench', 'Snake', 'Coffee table',
+        'Skyscraper', 'Sheep', 'Television', 'Trombone', 'Tea', 'Tank', 'Taco',
+        'Telephone', 'Torch', 'Tiger', 'Strawberry', 'Trumpet', 'Tree',
+        'Tomato', 'Train', 'Tool', 'Picnic basket', 'Cooking spray',
+        'Trousers', 'Bowling equipment', 'Football helmet', 'Truck',
+        'Measuring cup', 'Coffeemaker', 'Violin', 'Vehicle', 'Handbag',
+        'Paper cutter', 'Wine', 'Weapon', 'Wheel', 'Worm', 'Wok', 'Whale',
+        'Zebra', 'Auto part', 'Jug', 'Pizza cutter', 'Cream', 'Monkey', 'Lion',
+        'Bread', 'Platter', 'Chicken', 'Eagle', 'Helicopter', 'Owl', 'Duck',
+        'Turtle', 'Hippopotamus', 'Crocodile', 'Toilet', 'Toilet paper',
+        'Squid', 'Clothing', 'Footwear', 'Lemon', 'Spider', 'Deer', 'Frog',
+        'Banana', 'Rocket', 'Wine glass', 'Countertop', 'Tablet computer',
+        'Waste container', 'Swimming pool', 'Dog', 'Book', 'Elephant', 'Shark',
+        'Candle', 'Leopard', 'Axe', 'Hand dryer', 'Soap dispenser',
+        'Porcupine', 'Flower', 'Canary', 'Cheetah', 'Palm tree', 'Hamburger',
+        'Maple', 'Building', 'Fish', 'Lobster', 'Garden Asparagus',
+        'Furniture', 'Hedgehog', 'Airplane', 'Spoon', 'Otter', 'Bull',
+        'Oyster', 'Horizontal bar', 'Convenience store', 'Bomb', 'Bench',
+        'Ice cream', 'Caterpillar', 'Butterfly', 'Parachute', 'Orange',
+        'Antelope', 'Beaker', 'Moths and butterflies', 'Window', 'Closet',
+        'Castle', 'Jellyfish', 'Goose', 'Mule', 'Swan', 'Peach', 'Coconut',
+        'Seat belt', 'Raccoon', 'Chisel', 'Fork', 'Lamp', 'Camera',
+        'Squash (Plant)', 'Racket', 'Human face', 'Human arm', 'Vegetable',
+        'Diaper', 'Unicycle', 'Falcon', 'Chime', 'Snail', 'Shellfish',
+        'Cabbage', 'Carrot', 'Mango', 'Jeans', 'Flowerpot', 'Pineapple',
+        'Drawer', 'Stool', 'Envelope', 'Cake', 'Dragonfly', 'Common sunflower',
+        'Microwave oven', 'Honeycomb', 'Marine mammal', 'Sea lion', 'Ladybug',
+        'Shelf', 'Watch', 'Candy', 'Salad', 'Parrot', 'Handgun', 'Sparrow',
+        'Van', 'Grinder', 'Spice rack', 'Light bulb', 'Corded phone',
+        'Sports uniform', 'Tennis racket', 'Wall clock', 'Serving tray',
+        'Kitchen & dining room table', 'Dog bed', 'Cake stand',
+        'Cat furniture', 'Bathroom accessory', 'Facial tissue holder',
+        'Pressure cooker', 'Kitchen appliance', 'Tire', 'Ruler',
+        'Luggage and bags', 'Microphone', 'Broccoli', 'Umbrella', 'Pastry',
+        'Grapefruit', 'Band-aid', 'Animal', 'Bell pepper', 'Turkey', 'Lily',
+        'Pomegranate', 'Doughnut', 'Glasses', 'Human nose', 'Pen', 'Ant',
+        'Car', 'Aircraft', 'Human hand', 'Skunk', 'Teddy bear', 'Watermelon',
+        'Cantaloupe', 'Dishwasher', 'Flute', 'Balance beam', 'Sandwich',
+        'Shrimp', 'Sewing machine', 'Binoculars', 'Rays and skates', 'Ipod',
+        'Accordion', 'Willow', 'Crab', 'Crown', 'Seahorse', 'Perfume',
+        'Alpaca', 'Taxi', 'Canoe', 'Remote control', 'Wheelchair',
+        'Rugby ball', 'Armadillo', 'Maracas', 'Helmet'
+    ]
+
+
+def objects365v1_classes():  # returns the Objects365 v1 class-name strings
+    return [
+        'person', 'sneakers', 'chair', 'hat', 'lamp', 'bottle',
+        'cabinet/shelf', 'cup', 'car', 'glasses', 'picture/frame', 'desk',
+        'handbag', 'street lights', 'book', 'plate', 'helmet', 'leather shoes',
+        'pillow', 'glove', 'potted plant', 'bracelet', 'flower', 'tv',
+        'storage box', 'vase', 'bench', 'wine glass', 'boots', 'bowl',
+        'dining table', 'umbrella', 'boat', 'flag', 'speaker', 'trash bin/can',
+        'stool', 'backpack', 'couch', 'belt', 'carpet', 'basket',
+        'towel/napkin', 'slippers', 'barrel/bucket', 'coffee table', 'suv',
+        'toy', 'tie', 'bed', 'traffic light', 'pen/pencil', 'microphone',
+        'sandals', 'canned', 'necklace', 'mirror', 'faucet', 'bicycle',
+        'bread', 'high heels', 'ring', 'van', 'watch', 'sink', 'horse', 'fish',
+        'apple', 'camera', 'candle', 'teddy bear', 'cake', 'motorcycle',
+        'wild bird', 'laptop', 'knife', 'traffic sign', 'cell phone', 'paddle',
+        'truck', 'cow', 'power outlet', 'clock', 'drum', 'fork', 'bus',
+        'hanger', 'nightstand', 'pot/pan', 'sheep', 'guitar', 'traffic cone',
+        'tea pot', 'keyboard', 'tripod', 'hockey', 'fan', 'dog', 'spoon',
+        'blackboard/whiteboard', 'balloon', 'air conditioner', 'cymbal',
+        'mouse', 'telephone', 'pickup truck', 'orange', 'banana', 'airplane',
+        'luggage', 'skis', 'soccer', 'trolley', 'oven', 'remote',
+        'baseball glove', 'paper towel', 'refrigerator', 'train', 'tomato',
+        'machinery vehicle', 'tent', 'shampoo/shower gel', 'head phone',
+        'lantern', 'donut', 'cleaning products', 'sailboat', 'tangerine',
+        'pizza', 'kite', 'computer box', 'elephant', 'toiletries', 'gas stove',
+        'broccoli', 'toilet', 'stroller', 'shovel', 'baseball bat',
+        'microwave', 'skateboard', 'surfboard', 'surveillance camera', 'gun',
+        'life saver', 'cat', 'lemon', 'liquid soap', 'zebra', 'duck',
+        'sports car', 'giraffe', 'pumpkin', 'piano', 'stop sign', 'radiator',
+        'converter', 'tissue ', 'carrot', 'washing machine', 'vent', 'cookies',
+        'cutting/chopping board', 'tennis racket', 'candy',
+        'skating and skiing shoes', 'scissors', 'folder', 'baseball',
+        'strawberry', 'bow tie', 'pigeon', 'pepper', 'coffee machine',
+        'bathtub', 'snowboard', 'suitcase', 'grapes', 'ladder', 'pear',
+        'american football', 'basketball', 'potato', 'paint brush', 'printer',
+        'billiards', 'fire hydrant', 'goose', 'projector', 'sausage',
+        'fire extinguisher', 'extension cord', 'facial mask', 'tennis ball',
+        'chopsticks', 'electronic stove and gas stove', 'pie', 'frisbee',
+        'kettle', 'hamburger', 'golf club', 'cucumber', 'clutch', 'blender',
+        'tong', 'slide', 'hot dog', 'toothbrush', 'facial cleanser', 'mango',
+        'deer', 'egg', 'violin', 'marker', 'ship', 'chicken', 'onion',
+        'ice cream', 'tape', 'wheelchair', 'plum', 'bar soap', 'scale',
+        'watermelon', 'cabbage', 'router/modem', 'golf ball', 'pine apple',
+        'crane', 'fire truck', 'peach', 'cello', 'notepaper', 'tricycle',
+        'toaster', 'helicopter', 'green beans', 'brush', 'carriage', 'cigar',
+        'earphone', 'penguin', 'hurdle', 'swing', 'radio', 'CD',
+        'parking meter', 'swan', 'garlic', 'french fries', 'horn', 'avocado',
+        'saxophone', 'trumpet', 'sandwich', 'cue', 'kiwi fruit', 'bear',
+        'fishing rod', 'cherry', 'tablet', 'green vegetables', 'nuts', 'corn',
+        'key', 'screwdriver', 'globe', 'broom', 'pliers', 'volleyball',
+        'hammer', 'eggplant', 'trophy', 'dates', 'board eraser', 'rice',
+        'tape measure/ruler', 'dumbbell', 'hamimelon', 'stapler', 'camel',
+        'lettuce', 'goldfish', 'meat balls', 'medal', 'toothpaste', 'antelope',
+        'shrimp', 'rickshaw', 'trombone', 'pomegranate', 'coconut',
+        'jellyfish', 'mushroom', 'calculator', 'treadmill', 'butterfly',
+        'egg tart', 'cheese', 'pig', 'pomelo', 'race car', 'rice cooker',
+        'tuba', 'crosswalk sign', 'papaya', 'hair drier', 'green onion',
+        'chips', 'dolphin', 'sushi', 'urinal', 'donkey', 'electric drill',
+        'spring rolls', 'tortoise/turtle', 'parrot', 'flute', 'measuring cup',
+        'shark', 'steak', 'poker card', 'binoculars', 'llama', 'radish',
+        'noodles', 'yak', 'mop', 'crab', 'microscope', 'barbell', 'bread/bun',
+        'baozi', 'lion', 'red cabbage', 'polar bear', 'lighter', 'seal',
+        'mangosteen', 'comb', 'eraser', 'pitaya', 'scallop', 'pencil case',
+        'saw', 'table tennis paddle', 'okra', 'starfish', 'eagle', 'monkey',
+        'durian', 'game board', 'rabbit', 'french horn', 'ambulance',
+        'asparagus', 'hoverboard', 'pasta', 'target', 'hotair balloon',
+        'chainsaw', 'lobster', 'iron', 'flashlight'
+    ]
+
+
+def objects365v2_classes():  # returns the Objects365 v2 class-name strings
+    return [  # NOTE(review): odd spellings presumably match the annotations
+        'Person', 'Sneakers', 'Chair', 'Other Shoes', 'Hat', 'Car', 'Lamp',
+        'Glasses', 'Bottle', 'Desk', 'Cup', 'Street Lights', 'Cabinet/shelf',
+        'Handbag/Satchel', 'Bracelet', 'Plate', 'Picture/Frame', 'Helmet',
+        'Book', 'Gloves', 'Storage box', 'Boat', 'Leather Shoes', 'Flower',
+        'Bench', 'Potted Plant', 'Bowl/Basin', 'Flag', 'Pillow', 'Boots',
+        'Vase', 'Microphone', 'Necklace', 'Ring', 'SUV', 'Wine Glass', 'Belt',
+        'Moniter/TV', 'Backpack', 'Umbrella', 'Traffic Light', 'Speaker',
+        'Watch', 'Tie', 'Trash bin Can', 'Slippers', 'Bicycle', 'Stool',
+        'Barrel/bucket', 'Van', 'Couch', 'Sandals', 'Bakset', 'Drum',
+        'Pen/Pencil', 'Bus', 'Wild Bird', 'High Heels', 'Motorcycle', 'Guitar',
+        'Carpet', 'Cell Phone', 'Bread', 'Camera', 'Canned', 'Truck',
+        'Traffic cone', 'Cymbal', 'Lifesaver', 'Towel', 'Stuffed Toy',
+        'Candle', 'Sailboat', 'Laptop', 'Awning', 'Bed', 'Faucet', 'Tent',
+        'Horse', 'Mirror', 'Power outlet', 'Sink', 'Apple', 'Air Conditioner',
+        'Knife', 'Hockey Stick', 'Paddle', 'Pickup Truck', 'Fork',
+        'Traffic Sign', 'Ballon', 'Tripod', 'Dog', 'Spoon', 'Clock', 'Pot',
+        'Cow', 'Cake', 'Dinning Table', 'Sheep', 'Hanger',
+        'Blackboard/Whiteboard', 'Napkin', 'Other Fish', 'Orange/Tangerine',
+        'Toiletry', 'Keyboard', 'Tomato', 'Lantern', 'Machinery Vehicle',
+        'Fan', 'Green Vegetables', 'Banana', 'Baseball Glove', 'Airplane',
+        'Mouse', 'Train', 'Pumpkin', 'Soccer', 'Skiboard', 'Luggage',
+        'Nightstand', 'Tea pot', 'Telephone', 'Trolley', 'Head Phone',
+        'Sports Car', 'Stop Sign', 'Dessert', 'Scooter', 'Stroller', 'Crane',
+        'Remote', 'Refrigerator', 'Oven', 'Lemon', 'Duck', 'Baseball Bat',
+        'Surveillance Camera', 'Cat', 'Jug', 'Broccoli', 'Piano', 'Pizza',
+        'Elephant', 'Skateboard', 'Surfboard', 'Gun',
+        'Skating and Skiing shoes', 'Gas stove', 'Donut', 'Bow Tie', 'Carrot',
+        'Toilet', 'Kite', 'Strawberry', 'Other Balls', 'Shovel', 'Pepper',
+        'Computer Box', 'Toilet Paper', 'Cleaning Products', 'Chopsticks',
+        'Microwave', 'Pigeon', 'Baseball', 'Cutting/chopping Board',
+        'Coffee Table', 'Side Table', 'Scissors', 'Marker', 'Pie', 'Ladder',
+        'Snowboard', 'Cookies', 'Radiator', 'Fire Hydrant', 'Basketball',
+        'Zebra', 'Grape', 'Giraffe', 'Potato', 'Sausage', 'Tricycle', 'Violin',
+        'Egg', 'Fire Extinguisher', 'Candy', 'Fire Truck', 'Billards',
+        'Converter', 'Bathtub', 'Wheelchair', 'Golf Club', 'Briefcase',
+        'Cucumber', 'Cigar/Cigarette ', 'Paint Brush', 'Pear', 'Heavy Truck',
+        'Hamburger', 'Extractor', 'Extention Cord', 'Tong', 'Tennis Racket',
+        'Folder', 'American Football', 'earphone', 'Mask', 'Kettle', 'Tennis',
+        'Ship', 'Swing', 'Coffee Machine', 'Slide', 'Carriage', 'Onion',
+        'Green beans', 'Projector', 'Frisbee',
+        'Washing Machine/Drying Machine', 'Chicken', 'Printer', 'Watermelon',
+        'Saxophone', 'Tissue', 'Toothbrush', 'Ice cream', 'Hotair ballon',
+        'Cello', 'French Fries', 'Scale', 'Trophy', 'Cabbage', 'Hot dog',
+        'Blender', 'Peach', 'Rice', 'Wallet/Purse', 'Volleyball', 'Deer',
+        'Goose', 'Tape', 'Tablet', 'Cosmetics', 'Trumpet', 'Pineapple',
+        'Golf Ball', 'Ambulance', 'Parking meter', 'Mango', 'Key', 'Hurdle',
+        'Fishing Rod', 'Medal', 'Flute', 'Brush', 'Penguin', 'Megaphone',
+        'Corn', 'Lettuce', 'Garlic', 'Swan', 'Helicopter', 'Green Onion',
+        'Sandwich', 'Nuts', 'Speed Limit Sign', 'Induction Cooker', 'Broom',
+        'Trombone', 'Plum', 'Rickshaw', 'Goldfish', 'Kiwi fruit',
+        'Router/modem', 'Poker Card', 'Toaster', 'Shrimp', 'Sushi', 'Cheese',
+        'Notepaper', 'Cherry', 'Pliers', 'CD', 'Pasta', 'Hammer', 'Cue',
+        'Avocado', 'Hamimelon', 'Flask', 'Mushroon', 'Screwdriver', 'Soap',
+        'Recorder', 'Bear', 'Eggplant', 'Board Eraser', 'Coconut',
+        'Tape Measur/ Ruler', 'Pig', 'Showerhead', 'Globe', 'Chips', 'Steak',
+        'Crosswalk Sign', 'Stapler', 'Campel', 'Formula 1 ', 'Pomegranate',
+        'Dishwasher', 'Crab', 'Hoverboard', 'Meat ball', 'Rice Cooker', 'Tuba',
+        'Calculator', 'Papaya', 'Antelope', 'Parrot', 'Seal', 'Buttefly',
+        'Dumbbell', 'Donkey', 'Lion', 'Urinal', 'Dolphin', 'Electric Drill',
+        'Hair Dryer', 'Egg tart', 'Jellyfish', 'Treadmill', 'Lighter',
+        'Grapefruit', 'Game board', 'Mop', 'Radish', 'Baozi', 'Target',
+        'French', 'Spring Rolls', 'Monkey', 'Rabbit', 'Pencil Case', 'Yak',
+        'Red Cabbage', 'Binoculars', 'Asparagus', 'Barbell', 'Scallop',
+        'Noddles', 'Comb', 'Dumpling', 'Oyster', 'Table Teniis paddle',
+        'Cosmetics Brush/Eyeliner Pencil', 'Chainsaw', 'Eraser', 'Lobster',
+        'Durian', 'Okra', 'Lipstick', 'Cosmetics Mirror', 'Curling',
+        'Table Tennis '
+    ]
+
+
+dataset_aliases = {  # canonical dataset name -> accepted alias strings
+    'voc': ['voc', 'pascal_voc', 'voc07', 'voc12'],
+    'imagenet_det': ['det', 'imagenet_det', 'ilsvrc_det'],
+    'imagenet_vid': ['vid', 'imagenet_vid', 'ilsvrc_vid'],
+    'coco': ['coco', 'mscoco', 'ms_coco'],
+    'wider_face': ['WIDERFaceDataset', 'wider_face', 'WIDERFace'],
+    'cityscapes': ['cityscapes'],
+    'oid_challenge': ['oid_challenge', 'openimages_challenge'],
+    'oid_v6': ['oid_v6', 'openimages_v6'],
+    'objects365v1': ['objects365v1', 'obj365v1'],
+    'objects365v2': ['objects365v2', 'obj365v2']
+}
+
+
+def get_classes(dataset):
+    """Get class names of a dataset.
+
+    Args:
+        dataset (str): Dataset name or alias listed in ``dataset_aliases``.
+
+    Returns:
+        The value returned by the matching ``<name>_classes()`` function.
+    """
+    if not mmcv.is_str(dataset):
+        raise TypeError(f'dataset must a str, but got {type(dataset)}')
+    for name, aliases in dataset_aliases.items():
+        if dataset in aliases:
+            return globals()[f'{name}_classes']()  # plain lookup, no eval
+    raise ValueError(f'Unrecognized dataset: {dataset}')
diff --git a/mmdet/core/evaluation/eval_hooks.py b/mmdet/core/evaluation/eval_hooks.py
new file mode 100755
index 0000000..98856c1
--- /dev/null
+++ b/mmdet/core/evaluation/eval_hooks.py
@@ -0,0 +1,140 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import bisect
+import os.path as osp
+
+import mmcv
+import torch.distributed as dist
+from mmcv.runner import DistEvalHook as BaseDistEvalHook
+from mmcv.runner import EvalHook as BaseEvalHook
+from torch.nn.modules.batchnorm import _BatchNorm
+
+
+def _calc_dynamic_intervals(start_interval, dynamic_interval_list):
+    """Split ``[(milestone, interval), ...]`` into two parallel lists.
+
+    Milestone 0 paired with ``start_interval`` is placed in front.
+    """
+    assert mmcv.is_list_of(dynamic_interval_list, tuple)
+
+    milestones = [0] + [pair[0] for pair in dynamic_interval_list]
+    intervals = [start_interval] + [pair[1] for pair in dynamic_interval_list]
+    return milestones, intervals
+
+
+class EvalHook(BaseEvalHook):
+    """Evaluation hook whose interval can change at given milestones."""
+
+    def __init__(self, *args, dynamic_intervals=None, **kwargs):
+        super(EvalHook, self).__init__(*args, **kwargs)
+        self.latest_results = None
+        self.use_dynamic_intervals = dynamic_intervals is not None
+        if self.use_dynamic_intervals:
+            self.dynamic_milestones, self.dynamic_intervals = \
+                _calc_dynamic_intervals(self.interval, dynamic_intervals)
+
+    def _decide_interval(self, runner):
+        """Pick the interval of the last milestone <= progress + 1."""
+        if not self.use_dynamic_intervals:
+            return
+        progress = runner.epoch if self.by_epoch else runner.iter
+        step = bisect.bisect(self.dynamic_milestones, (progress + 1))
+        self.interval = self.dynamic_intervals[step - 1]
+
+    def before_train_epoch(self, runner):
+        """Update the interval, then defer to the parent hook."""
+        self._decide_interval(runner)
+        super().before_train_epoch(runner)
+
+    def before_train_iter(self, runner):
+        """Update the interval, then defer to the parent hook."""
+        self._decide_interval(runner)
+        super().before_train_iter(runner)
+
+    def _do_evaluate(self, runner):
+        """Run single-GPU evaluation and save the best ckpt if requested."""
+        if not self._should_evaluate(runner):
+            return
+
+        from mmdet.apis import single_gpu_test
+
+        # Kept on ``self.latest_results`` so MMDetWandbHook can log them.
+        results = single_gpu_test(runner.model, self.dataloader, show=False)
+        self.latest_results = results
+        runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
+        key_score = self.evaluate(runner, results)
+        # ``None`` means there is no score; a falsy score of 0 is valid.
+        if self.save_best and key_score is not None:
+            self._save_ckpt(runner, key_score)
+
+
+# Note: Considering that MMCV's EvalHook updated its interface in V1.3.16,
+# in order to avoid strong version dependency, we did not directly
+# inherit EvalHook but BaseDistEvalHook.
+class DistEvalHook(BaseDistEvalHook):
+    """Distributed evaluation hook with milestone-based intervals."""
+
+    def __init__(self, *args, dynamic_intervals=None, **kwargs):
+        super(DistEvalHook, self).__init__(*args, **kwargs)
+        self.latest_results = None
+
+        self.use_dynamic_intervals = dynamic_intervals is not None
+        if self.use_dynamic_intervals:
+            self.dynamic_milestones, self.dynamic_intervals = \
+                _calc_dynamic_intervals(self.interval, dynamic_intervals)
+
+    def _decide_interval(self, runner):
+        """Pick the interval of the last milestone <= progress + 1."""
+        if not self.use_dynamic_intervals:
+            return
+        progress = runner.epoch if self.by_epoch else runner.iter
+        step = bisect.bisect(self.dynamic_milestones, (progress + 1))
+        self.interval = self.dynamic_intervals[step - 1]
+
+    def before_train_epoch(self, runner):
+        """Update the interval, then defer to the parent hook."""
+        self._decide_interval(runner)
+        super().before_train_epoch(runner)
+
+    def before_train_iter(self, runner):
+        """Update the interval, then defer to the parent hook."""
+        self._decide_interval(runner)
+        super().before_train_iter(runner)
+
+    def _do_evaluate(self, runner):
+        """Run multi-GPU evaluation; rank 0 may save the best ckpt."""
+        # Synchronization of BatchNorm's buffer (running_mean
+        # and running_var) is not supported in the DDP of pytorch,
+        # which may cause the inconsistent performance of models in
+        # different ranks, so we broadcast BatchNorm's buffers
+        # of rank 0 to other ranks to avoid this.
+        if self.broadcast_bn_buffer:
+            for _, module in runner.model.named_modules():
+                if isinstance(module,
+                              _BatchNorm) and module.track_running_stats:
+                    dist.broadcast(module.running_var, 0)
+                    dist.broadcast(module.running_mean, 0)
+
+        if not self._should_evaluate(runner):
+            return
+
+        tmpdir = self.tmpdir
+        if tmpdir is None:
+            tmpdir = osp.join(runner.work_dir, '.eval_hook')
+
+        from mmdet.apis import multi_gpu_test
+
+        # Results are kept on ``self.latest_results`` so that other hooks
+        # such as MMDetWandbHook can access and log them.
+        results = multi_gpu_test(
+            runner.model,
+            self.dataloader,
+            tmpdir=tmpdir,
+            gpu_collect=self.gpu_collect)
+        self.latest_results = results
+        if runner.rank == 0:
+            print('\n')
+            runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
+            key_score = self.evaluate(runner, results)
+            # ``None`` means there is no score; a falsy score of 0 is valid.
+            if self.save_best and key_score is not None:
+                self._save_ckpt(runner, key_score)
diff --git a/mmdet/core/evaluation/mean_ap.py b/mmdet/core/evaluation/mean_ap.py
new file mode 100755
index 0000000..9568912
--- /dev/null
+++ b/mmdet/core/evaluation/mean_ap.py
@@ -0,0 +1,782 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from multiprocessing import Pool
+
+import mmcv
+import numpy as np
+from mmcv.utils import print_log
+from terminaltables import AsciiTable
+
+from .bbox_overlaps import bbox_overlaps
+from .class_names import get_classes
+
+
+def average_precision(recalls, precisions, mode='area'):
+    """Compute average precision for one or several scales.
+
+    Args:
+        recalls (ndarray): shape (num_scales, num_dets) or (num_dets, )
+        precisions (ndarray): shape (num_scales, num_dets) or (num_dets, )
+        mode (str): 'area' integrates the precision-recall curve after
+            making precision non-increasing in recall; '11points' averages
+            the best precision reachable at recalls [0, 0.1, ..., 1]
+
+    Returns:
+        float or ndarray: AP per scale, or a scalar for 1-D inputs
+    """
+    # a single curve is handled as one scale and unwrapped on return
+    single_curve = recalls.ndim == 1
+    if single_curve:
+        recalls = recalls[np.newaxis, :]
+        precisions = precisions[np.newaxis, :]
+    assert recalls.shape == precisions.shape and recalls.ndim == 2
+    num_scales = recalls.shape[0]
+    ap = np.zeros(num_scales, dtype=np.float32)
+    if mode == 'area':
+        pad_zero = np.zeros((num_scales, 1), dtype=recalls.dtype)
+        pad_one = np.ones((num_scales, 1), dtype=recalls.dtype)
+        mrec = np.hstack((pad_zero, recalls, pad_one))
+        mpre = np.hstack((pad_zero, precisions, pad_zero))
+        # replace each precision by the maximum found at or to its right
+        mpre = np.maximum.accumulate(mpre[:, ::-1], axis=1)[:, ::-1]
+        for row in range(num_scales):
+            # only columns where recall changes add area
+            steps = np.flatnonzero(mrec[row, 1:] != mrec[row, :-1])
+            widths = mrec[row, steps + 1] - mrec[row, steps]
+            ap[row] = np.sum(widths * mpre[row, steps + 1])
+    elif mode == '11points':
+        thresholds = np.arange(0, 1 + 1e-3, 0.1)
+        for row in range(num_scales):
+            for thr in thresholds:
+                # precisions of the detections whose recall reaches thr
+                reached = precisions[row, recalls[row, :] >= thr]
+                ap[row] += reached.max() if reached.size > 0 else 0
+        ap /= 11
+    else:
+        raise ValueError(
+            'Unrecognized mode, only "area" and "11points" are supported')
+    return ap[0] if single_curve else ap
+
+
+def tpfp_imagenet(det_bboxes,
+                  gt_bboxes,
+                  gt_bboxes_ignore=None,
+                  default_iou_thr=0.5,
+                  area_ranges=None,
+                  use_legacy_coordinate=False,
+                  **kwargs):
+    """Check if detected bboxes are true positive or false positive.
+
+    Args:
+        det_bboxes (ndarray): Detected bboxes of this image, of shape (m, 5).
+        gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4).
+        gt_bboxes_ignore (ndarray | None): Ignored gt bboxes of this image,
+            of shape (k, 4). ``None`` means no ignored gts. Default: None
+        default_iou_thr (float): IoU threshold to be considered as matched for
+            medium and large bboxes (small ones have special rules).
+            Default: 0.5.
+        area_ranges (list[tuple] | None): Range of bbox areas to be evaluated,
+            in the format [(min1, max1), (min2, max2), ...]. Default: None.
+        use_legacy_coordinate (bool): Whether to use coordinate system in
+            mmdet v1.x. which means width, height should be
+            calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
+            Default: False.
+
+    Returns:
+        tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of
+        each array is (num_scales, m).
+    """
+
+    extra_length = 1. if use_legacy_coordinate else 0.
+    if gt_bboxes_ignore is None:
+        gt_bboxes_ignore = np.zeros((0, 4), dtype=gt_bboxes.dtype)
+
+    # an indicator of ignored gts
+    gt_ignore_inds = np.concatenate(
+        (np.zeros(gt_bboxes.shape[0],
+                  dtype=bool), np.ones(gt_bboxes_ignore.shape[0], dtype=bool)))
+    # stack gt_bboxes and gt_bboxes_ignore for convenience
+    gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore))
+
+    num_dets = det_bboxes.shape[0]
+    num_gts = gt_bboxes.shape[0]
+    if area_ranges is None:
+        area_ranges = [(None, None)]
+    num_scales = len(area_ranges)
+    # tp and fp are of shape (num_scales, num_dets), each row is tp or fp
+    # of a certain scale.
+    tp = np.zeros((num_scales, num_dets), dtype=np.float32)
+    fp = np.zeros((num_scales, num_dets), dtype=np.float32)
+    if gt_bboxes.shape[0] == 0:
+        if area_ranges == [(None, None)]:
+            fp[...] = 1
+        else:
+            det_areas = (
+                det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * (
+                    det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length)
+            for i, (min_area, max_area) in enumerate(area_ranges):
+                fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
+        return tp, fp
+    # NOTE(review): gt boxes are shifted by -1 for the IoU only - confirm why
+    ious = bbox_overlaps(
+        det_bboxes, gt_bboxes - 1, use_legacy_coordinate=use_legacy_coordinate)
+    gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + extra_length
+    gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + extra_length
+    iou_thrs = np.minimum((gt_w * gt_h) / ((gt_w + 10.0) * (gt_h + 10.0)),
+                          default_iou_thr)
+    # sort all detections by scores in descending order
+    sort_inds = np.argsort(-det_bboxes[:, -1])
+    for k, (min_area, max_area) in enumerate(area_ranges):
+        gt_covered = np.zeros(num_gts, dtype=bool)
+        # if no area range is specified, gt_area_ignore is all False
+        if min_area is None:
+            gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool)
+        else:
+            gt_areas = gt_w * gt_h
+            gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
+        for i in sort_inds:
+            max_iou = -1
+            matched_gt = -1
+            # find best overlapped available gt
+            for j in range(num_gts):
+                # different from PASCAL VOC: allow finding other gts if the
+                # best overlapped ones are already matched by other det bboxes
+                if gt_covered[j]:
+                    continue
+                elif ious[i, j] >= iou_thrs[j] and ious[i, j] > max_iou:
+                    max_iou = ious[i, j]
+                    matched_gt = j
+            # there are 4 cases for a det bbox:
+            # 1. it matches a gt, tp = 1, fp = 0
+            # 2. it matches an ignored gt, tp = 0, fp = 0
+            # 3. it matches no gt and within area range, tp = 0, fp = 1
+            # 4. it matches no gt but is beyond area range, tp = 0, fp = 0
+            if matched_gt >= 0:
+                gt_covered[matched_gt] = 1
+                if not (gt_ignore_inds[matched_gt]
+                        or gt_area_ignore[matched_gt]):
+                    tp[k, i] = 1
+            elif min_area is None:
+                fp[k, i] = 1
+            else:
+                bbox = det_bboxes[i, :4]
+                area = (bbox[2] - bbox[0] + extra_length) * (
+                    bbox[3] - bbox[1] + extra_length)
+                if area >= min_area and area < max_area:
+                    fp[k, i] = 1
+    return tp, fp
+
+
+def tpfp_default(det_bboxes,
+                 gt_bboxes,
+                 gt_bboxes_ignore=None,
+                 iou_thr=0.5,
+                 area_ranges=None,
+                 use_legacy_coordinate=False,
+                 **kwargs):
+    """Check if detected bboxes are true positive or false positive.
+
+    Args:
+        det_bboxes (ndarray): Detected bboxes of this image, of shape (m, 5).
+        gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4).
+        gt_bboxes_ignore (ndarray | None): Ignored gt bboxes of this image,
+            of shape (k, 4). ``None`` means no ignored gts. Default: None
+        iou_thr (float): IoU threshold to be considered as matched.
+            Default: 0.5.
+        area_ranges (list[tuple] | None): Range of bbox areas to be
+            evaluated, in the format [(min1, max1), (min2, max2), ...].
+            Default: None.
+        use_legacy_coordinate (bool): Whether to use coordinate system in
+            mmdet v1.x. which means width, height should be
+            calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
+            Default: False.
+
+    Returns:
+        tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of
+        each array is (num_scales, m).
+    """
+
+    extra_length = 1. if use_legacy_coordinate else 0.
+    # no ignored gts were given: fall back to an empty (0, 4) array
+    if gt_bboxes_ignore is None:
+        gt_bboxes_ignore = np.zeros((0, 4), dtype=gt_bboxes.dtype)
+
+    # an indicator of ignored gts
+    gt_ignore_inds = np.concatenate(
+        (np.zeros(gt_bboxes.shape[0],
+                  dtype=bool), np.ones(gt_bboxes_ignore.shape[0], dtype=bool)))
+    # stack gt_bboxes and gt_bboxes_ignore for convenience
+    gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore))
+
+    num_dets = det_bboxes.shape[0]
+    num_gts = gt_bboxes.shape[0]
+    if area_ranges is None:
+        area_ranges = [(None, None)]
+    num_scales = len(area_ranges)
+    # tp and fp are of shape (num_scales, num_dets), each row is tp or fp of
+    # a certain scale
+    tp = np.zeros((num_scales, num_dets), dtype=np.float32)
+    fp = np.zeros((num_scales, num_dets), dtype=np.float32)
+
+    # if there is no gt bboxes in this image, then all det bboxes
+    # within area range are false positives
+    if gt_bboxes.shape[0] == 0:
+        if area_ranges == [(None, None)]:
+            fp[...] = 1
+        else:
+            det_areas = (
+                det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * (
+                    det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length)
+            for i, (min_area, max_area) in enumerate(area_ranges):
+                fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
+        return tp, fp
+
+    ious = bbox_overlaps(
+        det_bboxes, gt_bboxes, use_legacy_coordinate=use_legacy_coordinate)
+    # for each det, the max iou with all gts
+    ious_max = ious.max(axis=1)
+    # for each det, which gt overlaps most with it
+    ious_argmax = ious.argmax(axis=1)
+    # sort all dets in descending order by scores
+    sort_inds = np.argsort(-det_bboxes[:, -1])
+    for k, (min_area, max_area) in enumerate(area_ranges):
+        gt_covered = np.zeros(num_gts, dtype=bool)
+        # if no area range is specified, gt_area_ignore is all False
+        if min_area is None:
+            gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool)
+        else:
+            gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0] + extra_length) * (
+                gt_bboxes[:, 3] - gt_bboxes[:, 1] + extra_length)
+            gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
+        for i in sort_inds:
+            if ious_max[i] >= iou_thr:
+                matched_gt = ious_argmax[i]
+                if not (gt_ignore_inds[matched_gt]
+                        or gt_area_ignore[matched_gt]):
+                    if not gt_covered[matched_gt]:
+                        gt_covered[matched_gt] = True
+                        tp[k, i] = 1
+                    else:
+                        fp[k, i] = 1
+                # otherwise ignore this detected bbox, tp = 0, fp = 0
+            elif min_area is None:
+                fp[k, i] = 1
+            else:
+                bbox = det_bboxes[i, :4]
+                area = (bbox[2] - bbox[0] + extra_length) * (
+                    bbox[3] - bbox[1] + extra_length)
+                if area >= min_area and area < max_area:
+                    fp[k, i] = 1
+    return tp, fp
+
+
+def tpfp_openimages(det_bboxes,
+                    gt_bboxes,
+                    gt_bboxes_ignore=None,
+                    iou_thr=0.5,
+                    area_ranges=None,
+                    use_legacy_coordinate=False,
+                    gt_bboxes_group_of=None,
+                    use_group_of=True,
+                    ioa_thr=0.5,
+                    **kwargs):
+    """Check if detected bboxes are true positive or false positive.
+
+    Args:
+        det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5).
+        gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4).
+        gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image,
+            of shape (k, 4). Default: None
+        iou_thr (float): IoU threshold to be considered as matched.
+            Default: 0.5.
+        area_ranges (list[tuple] | None): Range of bbox areas to be
+            evaluated, in the format [(min1, max1), (min2, max2), ...].
+            Default: None.
+        use_legacy_coordinate (bool): Whether to use coordinate system in
+            mmdet v1.x. which means width, height should be
+            calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
+            Default: False.
+        gt_bboxes_group_of (ndarray): GT group_of of this image, of shape
+            (k, 1). Default: None
+        use_group_of (bool): Whether to use group of when calculate TP and FP,
+            which only used in OpenImages evaluation. Default: True.
+        ioa_thr (float | None): IoA threshold to be considered as matched,
+            which only used in OpenImages evaluation. Default: 0.5.
+
+    Returns:
+        tuple[np.ndarray]: Returns a tuple (tp, fp, det_bboxes), where
+        (tp, fp) whose elements are 0 and 1. The shape of each array is
+        (num_scales, m). (det_bboxes) whose will filter those are not
+        matched by group of gts when processing Open Images evaluation.
+        The shape is (num_scales, m).
+    """
+
+    if not use_legacy_coordinate:
+        extra_length = 0.
+    else:
+        extra_length = 1.
+
+    # an indicator of ignored gts
+    gt_ignore_inds = np.concatenate(
+        (np.zeros(gt_bboxes.shape[0],
+                  dtype=bool), np.ones(gt_bboxes_ignore.shape[0], dtype=bool)))
+    # stack gt_bboxes and gt_bboxes_ignore for convenience
+    gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore))
+
+    num_dets = det_bboxes.shape[0]
+    num_gts = gt_bboxes.shape[0]
+    if area_ranges is None:
+        area_ranges = [(None, None)]
+    num_scales = len(area_ranges)
+    # tp and fp are of shape (num_scales, num_gts), each row is tp or fp of
+    # a certain scale
+    tp = np.zeros((num_scales, num_dets), dtype=np.float32)
+    fp = np.zeros((num_scales, num_dets), dtype=np.float32)
+
+    # if there is no gt bboxes in this image, then all det bboxes
+    # within area range are false positives
+    if gt_bboxes.shape[0] == 0:
+        if area_ranges == [(None, None)]:
+            fp[...] = 1
+        else:
+            det_areas = (
+                det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * (
+                    det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length)
+            for i, (min_area, max_area) in enumerate(area_ranges):
+                fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
+        return tp, fp, det_bboxes
+
+    if gt_bboxes_group_of is not None and use_group_of:
+        # if handle group-of boxes, divided gt boxes into two parts:
+        # non-group-of and group-of.Then calculate ious and ioas through
+        # non-group-of group-of gts respectively. This only used in
+        # OpenImages evaluation.
+        assert gt_bboxes_group_of.shape[0] == gt_bboxes.shape[0]
+        non_group_gt_bboxes = gt_bboxes[~gt_bboxes_group_of]
+        group_gt_bboxes = gt_bboxes[gt_bboxes_group_of]
+        num_gts_group = group_gt_bboxes.shape[0]
+        ious = bbox_overlaps(det_bboxes, non_group_gt_bboxes)
+        ioas = bbox_overlaps(det_bboxes, group_gt_bboxes, mode='iof')
+    else:
+        # if not consider group-of boxes, only calculate ious through gt boxes
+        ious = bbox_overlaps(
+            det_bboxes, gt_bboxes, use_legacy_coordinate=use_legacy_coordinate)
+        ioas = None
+
+    if ious.shape[1] > 0:
+        # for each det, the max iou with all gts
+        ious_max = ious.max(axis=1)
+        # for each det, which gt overlaps most with it
+        ious_argmax = ious.argmax(axis=1)
+        # sort all dets in descending order by scores
+        sort_inds = np.argsort(-det_bboxes[:, -1])
+        for k, (min_area, max_area) in enumerate(area_ranges):
+            gt_covered = np.zeros(num_gts, dtype=bool)
+            # if no area range is specified, gt_area_ignore is all False
+            if min_area is None:
+                gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool)
+            else:
+                gt_areas = (
+                    gt_bboxes[:, 2] - gt_bboxes[:, 0] + extra_length) * (
+                        gt_bboxes[:, 3] - gt_bboxes[:, 1] + extra_length)
+                gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
+            for i in sort_inds:
+                if ious_max[i] >= iou_thr:
+                    matched_gt = ious_argmax[i]
+                    if not (gt_ignore_inds[matched_gt]
+                            or gt_area_ignore[matched_gt]):
+                        if not gt_covered[matched_gt]:
+                            gt_covered[matched_gt] = True
+                            tp[k, i] = 1
+                        else:
+                            fp[k, i] = 1
+                    # otherwise ignore this detected bbox, tp = 0, fp = 0
+                elif min_area is None:
+                    fp[k, i] = 1
+                else:
+                    bbox = det_bboxes[i, :4]
+                    area = (bbox[2] - bbox[0] + extra_length) * (
+                        bbox[3] - bbox[1] + extra_length)
+                    if area >= min_area and area < max_area:
+                        fp[k, i] = 1
+    else:
+        # if there is no no-group-of gt bboxes in this image,
+        # then all det bboxes within area range are false positives.
+        # Only used in OpenImages evaluation.
+        if area_ranges == [(None, None)]:
+            fp[...] = 1
+        else:
+            det_areas = (
+                det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * (
+                    det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length)
+            for i, (min_area, max_area) in enumerate(area_ranges):
+                fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
+
+    if ioas is None or ioas.shape[1] <= 0:
+        return tp, fp, det_bboxes
+    else:
+        # The evaluation of group-of TP and FP are done in two stages:
+        # 1. All detections are first matched to non group-of boxes; true
+        #    positives are determined.
+        # 2. Detections that are determined as false positives are matched
+        #    against group-of boxes and calculated group-of TP and FP.
+        # Only used in OpenImages evaluation.
+        det_bboxes_group = np.zeros(
+            (num_scales, ioas.shape[1], det_bboxes.shape[1]), dtype=float)
+        match_group_of = np.zeros((num_scales, num_dets), dtype=bool)
+        tp_group = np.zeros((num_scales, num_gts_group), dtype=np.float32)
+        ioas_max = ioas.max(axis=1)
+        # for each det, which gt overlaps most with it
+        ioas_argmax = ioas.argmax(axis=1)
+        # sort all dets in descending order by scores
+        sort_inds = np.argsort(-det_bboxes[:, -1])
+        for k, (min_area, max_area) in enumerate(area_ranges):
+            box_is_covered = tp[k]
+            # if no area range is specified, gt_area_ignore is all False
+            if min_area is None:
+                gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool)
+            else:
+                gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * (
+                    gt_bboxes[:, 3] - gt_bboxes[:, 1])
+                gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
+            for i in sort_inds:
+                matched_gt = ioas_argmax[i]
+                if not box_is_covered[i]:
+                    if ioas_max[i] >= ioa_thr:
+                        if not (gt_ignore_inds[matched_gt]
+                                or gt_area_ignore[matched_gt]):
+                            if not tp_group[k, matched_gt]:
+                                tp_group[k, matched_gt] = 1
+                                match_group_of[k, i] = True
+                            else:
+                                match_group_of[k, i] = True
+
+                            if det_bboxes_group[k, matched_gt, -1] < \
+                                    det_bboxes[i, -1]:
+                                det_bboxes_group[k, matched_gt] = \
+                                    det_bboxes[i]
+
+        fp_group = (tp_group <= 0).astype(float)
+        tps = []
+        fps = []
+        # concatenate tp, fp, and det-boxes which not matched group of
+        # gt boxes and tp_group, fp_group, and det_bboxes_group which
+        # matched group of boxes respectively.
+        for i in range(num_scales):
+            tps.append(
+                np.concatenate((tp[i][~match_group_of[i]], tp_group[i])))
+            fps.append(
+                np.concatenate((fp[i][~match_group_of[i]], fp_group[i])))
+            det_bboxes = np.concatenate(
+                (det_bboxes[~match_group_of[i]], det_bboxes_group[i]))
+
+        tp = np.vstack(tps)
+        fp = np.vstack(fps)
+        return tp, fp, det_bboxes
+
+
+def get_cls_results(det_results, annotations, class_id):
+    """Collect per-image detections and ground truths of one class.
+
+    Args:
+        det_results (list[list]): Same as `eval_map()`.
+        annotations (list[dict]): Same as `eval_map()`.
+        class_id (int): ID of a specific class.
+
+    Returns:
+        tuple[list[np.ndarray]]: detected bboxes, gt bboxes and ignored gt
+            bboxes, each list holding one array per image.
+    """
+    dets_per_img = [per_img[class_id] for per_img in det_results]
+    gts_per_img = []
+    ignored_per_img = []
+    for anno in annotations:
+        cls_mask = anno['labels'] == class_id
+        gts_per_img.append(anno['bboxes'][cls_mask, :])
+        ignore_labels = anno.get('labels_ignore', None)
+        if ignore_labels is None:
+            ignored = np.empty((0, 4), dtype=np.float32)
+        else:
+            ignored = anno['bboxes_ignore'][ignore_labels == class_id, :]
+        ignored_per_img.append(ignored)
+    return dets_per_img, gts_per_img, ignored_per_img
+
+
+def get_cls_group_ofs(annotations, class_id):
+    """Collect the per-image `gt_is_group_ofs` flags of one class.
+
+    Args:
+        annotations (list[dict]): Same as `eval_map()`.
+        class_id (int): ID of a specific class.
+
+    Returns:
+        list[np.ndarray]: per-image flags; empty (0, 1) bool array if absent.
+    """
+    flags_per_img = []
+    for anno in annotations:
+        cls_mask = anno['labels'] == class_id
+        group_flags = anno.get('gt_is_group_ofs', None)
+        if group_flags is None:
+            flags_per_img.append(np.empty((0, 1), dtype=bool))
+        else:
+            flags_per_img.append(group_flags[cls_mask])
+    return flags_per_img
+
+
+def eval_map(det_results,
+             annotations,
+             scale_ranges=None,
+             iou_thr=0.5,
+             ioa_thr=None,
+             dataset=None,
+             logger=None,
+             tpfp_fn=None,
+             nproc=4,
+             use_legacy_coordinate=False,
+             use_group_of=False):
+    """Evaluate mAP of a dataset.
+
+    Args:
+        det_results (list[list]): [[cls1_det, cls2_det, ...], ...].
+            The outer list indicates images, and the inner list indicates
+            per-class detected bboxes.
+        annotations (list[dict]): Ground truth annotations where each item of
+            the list indicates an image. Keys of annotations are:
+
+            - `bboxes`: numpy array of shape (n, 4)
+            - `labels`: numpy array of shape (n, )
+            - `bboxes_ignore` (optional): numpy array of shape (k, 4)
+            - `labels_ignore` (optional): numpy array of shape (k, )
+        scale_ranges (list[tuple] | None): Range of scales to be evaluated,
+            in the format [(min1, max1), (min2, max2), ...]. A range of
+            (32, 64) means the area range between (32**2, 64**2).
+            Default: None.
+        iou_thr (float): IoU threshold to be considered as matched.
+            Default: 0.5.
+        ioa_thr (float | None): IoA threshold to be considered as matched,
+            which only used in OpenImages evaluation. Default: None.
+        dataset (list[str] | str | None): Dataset name or dataset classes,
+            there are minor differences in metrics for different datasets, e.g.
+            "voc07", "imagenet_det", etc. Default: None.
+        logger (logging.Logger | str | None): The way to print the mAP
+            summary. See `mmcv.utils.print_log()` for details. Default: None.
+        tpfp_fn (callable | None): The function used to determine true/
+            false positives. If None, it is chosen from `dataset`:
+            :func:`tpfp_imagenet` for 'det'/'vid', :func:`tpfp_openimages`
+            for 'oid_challenge'/'oid_v6' or when `use_group_of` is True,
+            and :func:`tpfp_default` otherwise. Default None.
+        nproc (int): Processes used for computing TP and FP.
+            Default: 4.
+        use_legacy_coordinate (bool): Whether to use coordinate system in
+            mmdet v1.x. which means width, height should be
+            calculated as `x2 - x1 + 1` and `y2 - y1 + 1` respectively.
+            Default: False.
+        use_group_of (bool): Whether to use group of when calculate TP and FP,
+            which only used in OpenImages evaluation. Default: False.
+
+    Returns:
+        tuple: (mAP, [dict, dict, ...])
+    """
+    assert len(det_results) == len(annotations)
+    if not use_legacy_coordinate:
+        extra_length = 0.
+    else:
+        extra_length = 1.
+
+    num_imgs = len(det_results)
+    num_scales = len(scale_ranges) if scale_ranges is not None else 1
+    num_classes = len(det_results[0])  # positive class num
+    area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges]
+                   if scale_ranges is not None else None)
+
+    # A worker pool is only created when there is more than one image;
+    # a single image is evaluated directly in this process.
+    if num_imgs > 1:
+        assert nproc > 0, 'nproc must be at least one.'
+        nproc = min(nproc, num_imgs)
+        pool = Pool(nproc)
+
+    eval_results = []
+    for i in range(num_classes):
+        # get gt and det bboxes of this class
+        cls_dets, cls_gts, cls_gts_ignore = get_cls_results(
+            det_results, annotations, i)
+        # choose proper function according to datasets to compute tp and fp
+        if tpfp_fn is None:
+            if dataset in ['det', 'vid']:
+                tpfp_fn = tpfp_imagenet
+            elif dataset in ['oid_challenge', 'oid_v6'] \
+                    or use_group_of is True:
+                tpfp_fn = tpfp_openimages
+            else:
+                tpfp_fn = tpfp_default
+        if not callable(tpfp_fn):
+            raise ValueError(
+                f'tpfp_fn has to be a function or None, but got {tpfp_fn}')
+
+        if num_imgs > 1:
+            # compute tp and fp for each image with multiple processes
+            args = []
+            if use_group_of:
+                # used in Open Images Dataset evaluation
+                gt_group_ofs = get_cls_group_ofs(annotations, i)
+                args.append(gt_group_ofs)
+                args.append([use_group_of for _ in range(num_imgs)])
+            if ioa_thr is not None:
+                args.append([ioa_thr for _ in range(num_imgs)])
+
+            tpfp = pool.starmap(
+                tpfp_fn,
+                zip(cls_dets, cls_gts, cls_gts_ignore,
+                    [iou_thr for _ in range(num_imgs)],
+                    [area_ranges for _ in range(num_imgs)],
+                    [use_legacy_coordinate for _ in range(num_imgs)], *args))
+        else:
+            tpfp = tpfp_fn(
+                cls_dets[0],
+                cls_gts[0],
+                cls_gts_ignore[0],
+                iou_thr,
+                area_ranges,
+                use_legacy_coordinate,
+                gt_bboxes_group_of=(get_cls_group_ofs(annotations, i)[0]
+                                    if use_group_of else None),
+                use_group_of=use_group_of,
+                ioa_thr=ioa_thr)
+            tpfp = [tpfp]
+
+        if use_group_of:
+            tp, fp, cls_dets = tuple(zip(*tpfp))
+        else:
+            tp, fp = tuple(zip(*tpfp))
+        # calculate gt number of each scale
+        # ignored gts or gts beyond the specific scale are not counted
+        num_gts = np.zeros(num_scales, dtype=int)
+        for j, bbox in enumerate(cls_gts):
+            if area_ranges is None:
+                num_gts[0] += bbox.shape[0]
+            else:
+                gt_areas = (bbox[:, 2] - bbox[:, 0] + extra_length) * (
+                    bbox[:, 3] - bbox[:, 1] + extra_length)
+                for k, (min_area, max_area) in enumerate(area_ranges):
+                    num_gts[k] += np.sum((gt_areas >= min_area)
+                                         & (gt_areas < max_area))
+        # sort all det bboxes by score, also sort tp and fp
+        cls_dets = np.vstack(cls_dets)
+        num_dets = cls_dets.shape[0]
+        sort_inds = np.argsort(-cls_dets[:, -1])
+        tp = np.hstack(tp)[:, sort_inds]
+        fp = np.hstack(fp)[:, sort_inds]
+        # calculate recall and precision with tp and fp
+        tp = np.cumsum(tp, axis=1)
+        fp = np.cumsum(fp, axis=1)
+        eps = np.finfo(np.float32).eps
+        recalls = tp / np.maximum(num_gts[:, np.newaxis], eps)
+        precisions = tp / np.maximum((tp + fp), eps)
+        # calculate AP
+        if scale_ranges is None:
+            recalls = recalls[0, :]
+            precisions = precisions[0, :]
+            num_gts = num_gts.item()
+        mode = 'area' if dataset != 'voc07' else '11points'
+        ap = average_precision(recalls, precisions, mode)
+        eval_results.append({
+            'num_gts': num_gts,
+            'num_dets': num_dets,
+            'recall': recalls,
+            'precision': precisions,
+            'ap': ap
+        })
+
+    if num_imgs > 1:
+        pool.close()
+
+    if scale_ranges is not None:
+        # shape (num_classes, num_scales)
+        all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results])
+        all_num_gts = np.vstack(
+            [cls_result['num_gts'] for cls_result in eval_results])
+        mean_ap = []
+        for i in range(num_scales):
+            if np.any(all_num_gts[:, i] > 0):
+                mean_ap.append(all_ap[all_num_gts[:, i] > 0, i].mean())
+            else:
+                mean_ap.append(0.0)
+    else:
+        aps = []
+        for cls_result in eval_results:
+            if cls_result['num_gts'] > 0:
+                aps.append(cls_result['ap'])
+        mean_ap = np.array(aps).mean().item() if aps else 0.0
+    # NOTE: the squared `area_ranges` are what the summary prints as ranges
+    print_map_summary(
+        mean_ap, eval_results, dataset, area_ranges, logger=logger)
+
+    return mean_ap, eval_results
+
+
+def print_map_summary(mean_ap,
+                      results,
+                      dataset=None,
+                      scale_ranges=None,
+                      logger=None):
+    """Print mAP and results of each class.
+
+    A table will be printed to show the gts/dets/recall/AP of each class and
+    the mAP.
+
+    Args:
+        mean_ap (float | list[float]): Calculated from `eval_map()`.
+        results (list[dict]): Calculated from `eval_map()`.
+        dataset (list[str] | str | None): Dataset name or dataset classes.
+        scale_ranges (list[tuple] | None): Range of scales to be evaluated.
+        logger (logging.Logger | str | None): The way to print the mAP
+            summary. See `mmcv.utils.print_log()` for details. Default: None.
+    """
+
+    if logger == 'silent':
+        return
+
+    if isinstance(results[0]['ap'], np.ndarray):
+        num_scales = len(results[0]['ap'])
+    else:
+        num_scales = 1
+
+    if scale_ranges is not None:
+        assert len(scale_ranges) == num_scales
+
+    num_classes = len(results)
+    # per-scale, per-class columns of the summary table
+    recalls = np.zeros((num_scales, num_classes), dtype=np.float32)
+    aps = np.zeros((num_scales, num_classes), dtype=np.float32)
+    num_gts = np.zeros((num_scales, num_classes), dtype=int)
+    for i, cls_result in enumerate(results):
+        if cls_result['recall'].size > 0:
+            recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1]
+        aps[:, i] = cls_result['ap']
+        num_gts[:, i] = cls_result['num_gts']
+
+    if dataset is None:
+        label_names = [str(i) for i in range(num_classes)]
+    elif mmcv.is_str(dataset):
+        label_names = get_classes(dataset)
+    else:
+        label_names = dataset
+
+    if not isinstance(mean_ap, list):
+        mean_ap = [mean_ap]
+
+    header = ['class', 'gts', 'dets', 'recall', 'ap']
+    for i in range(num_scales):
+        if scale_ranges is not None:
+            print_log(f'Scale range {scale_ranges[i]}', logger=logger)
+        table_data = [header]
+        for j in range(num_classes):
+            row_data = [
+                label_names[j], num_gts[i, j], results[j]['num_dets'],
+                f'{recalls[i, j]:.3f}', f'{aps[i, j]:.3f}'
+            ]
+            table_data.append(row_data)
+        table_data.append(['mAP', '', '', '', f'{mean_ap[i]:.3f}'])
+        table = AsciiTable(table_data)
+        table.inner_footing_row_border = True
+        print_log('\n' + table.table, logger=logger)
diff --git a/mmdet/core/evaluation/panoptic_utils.py b/mmdet/core/evaluation/panoptic_utils.py
new file mode 100755
index 0000000..10c9ad9
--- /dev/null
+++ b/mmdet/core/evaluation/panoptic_utils.py
@@ -0,0 +1,6 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# A custom value to distinguish instance ID and category ID; it needs to
+# be greater than the number of categories.
+# For a pixel in the panoptic result map:
+#   pan_id = ins_id * INSTANCE_OFFSET + cat_id
+INSTANCE_OFFSET = 1000
diff --git a/mmdet/core/evaluation/recall.py b/mmdet/core/evaluation/recall.py
new file mode 100755
index 0000000..82b3c90
--- /dev/null
+++ b/mmdet/core/evaluation/recall.py
@@ -0,0 +1,197 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections.abc import Sequence
+
+import numpy as np
+from mmcv.utils import print_log
+from terminaltables import AsciiTable
+
+from .bbox_overlaps import bbox_overlaps
+
+
+def _recalls(all_ious, proposal_nums, thrs):
+    """Compute recall for each (proposal num, IoU thr) by greedy matching."""
+    img_num = all_ious.shape[0]
+    total_gt_num = sum([ious.shape[0] for ious in all_ious])
+
+    _ious = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32)
+    for k, proposal_num in enumerate(proposal_nums):
+        tmp_ious = np.zeros(0)
+        for i in range(img_num):
+            ious = all_ious[i][:, :proposal_num].copy()  # first N columns
+            gt_ious = np.zeros((ious.shape[0]))
+            if ious.size == 0:
+                tmp_ious = np.hstack((tmp_ious, gt_ious))
+                continue
+            for j in range(ious.shape[0]):
+                gt_max_overlaps = ious.argmax(axis=1)
+                max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps]
+                gt_idx = max_ious.argmax()
+                gt_ious[j] = max_ious[gt_idx]
+                box_idx = gt_max_overlaps[gt_idx]
+                ious[gt_idx, :] = -1  # retire the matched row
+                ious[:, box_idx] = -1  # retire the matched column
+            tmp_ious = np.hstack((tmp_ious, gt_ious))
+        _ious[k, :] = tmp_ious
+
+    _ious = np.fliplr(np.sort(_ious, axis=1))
+    recalls = np.zeros((proposal_nums.size, thrs.size))
+    for i, thr in enumerate(thrs):
+        recalls[:, i] = (_ious >= thr).sum(axis=1) / float(total_gt_num)
+
+    return recalls
+
+
+def set_recall_param(proposal_nums, iou_thrs):
+    """Normalize `proposal_nums` and `iou_thrs` to numpy arrays.
+
+    Sequences, a bare int `proposal_nums` and a bare float `iou_thrs` become
+    arrays; `iou_thrs=None` gives `[0.5]`; anything else passes through.
+    """
+    nums = proposal_nums
+    if isinstance(nums, Sequence):
+        nums = np.array(nums)
+    elif isinstance(nums, int):
+        nums = np.array([nums])
+    thrs = iou_thrs
+    if thrs is None:
+        thrs = np.array([0.5])
+    elif isinstance(thrs, Sequence):
+        thrs = np.array(thrs)
+    elif isinstance(thrs, float):
+        thrs = np.array([thrs])
+    return nums, thrs
+
+
+def eval_recalls(gts,
+                 proposals,
+                 proposal_nums=None,
+                 iou_thrs=0.5,
+                 logger=None,
+                 use_legacy_coordinate=False):
+    """Calculate recalls.
+
+    Args:
+        gts (list[ndarray]): a list of arrays of shape (n, 4)
+        proposals (list[ndarray]): a list of arrays of shape (k, 4) or (k, 5)
+        proposal_nums (int | Sequence[int]): Top N proposals to be evaluated.
+        iou_thrs (float | Sequence[float]): IoU thresholds. Default: 0.5.
+        logger (logging.Logger | str | None): The way to print the recall
+            summary. See `mmcv.utils.print_log()` for details. Default: None.
+        use_legacy_coordinate (bool): Whether use coordinate system
+            in mmdet v1.x. "1" was added to both height and width
+            which means w, h should be
+            computed as `x2 - x1 + 1` and `y2 - y1 + 1`. Default: False.
+
+    Returns:
+        ndarray: recalls of different ious and proposal nums, with shape
+            (len(proposal_nums), len(iou_thrs)).
+    """
+
+    img_num = len(gts)
+    assert img_num == len(proposals)
+    proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs)
+    # one IoU matrix per image; an object array keeps ragged shapes intact
+    all_ious = np.empty(img_num, dtype=object)
+    for i in range(img_num):
+        if proposals[i].ndim == 2 and proposals[i].shape[1] == 5:
+            scores = proposals[i][:, 4]
+            sort_idx = np.argsort(scores)[::-1]
+            img_proposal = proposals[i][sort_idx, :]
+        else:
+            img_proposal = proposals[i]
+        prop_num = min(img_proposal.shape[0], proposal_nums[-1])
+        if gts[i] is None or gts[i].shape[0] == 0:
+            ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32)
+        else:
+            ious = bbox_overlaps(
+                gts[i],
+                img_proposal[:prop_num, :4],
+                use_legacy_coordinate=use_legacy_coordinate)
+        all_ious[i] = ious
+    recalls = _recalls(all_ious, proposal_nums, iou_thrs)
+
+    print_recall_summary(recalls, proposal_nums, iou_thrs, logger=logger)
+    return recalls
+
+
+def print_recall_summary(recalls,
+                         proposal_nums,
+                         iou_thrs,
+                         row_idxs=None,
+                         col_idxs=None,
+                         logger=None):
+    """Print recalls in a table.
+
+    Args:
+        recalls (ndarray): calculated from `eval_recalls`
+        proposal_nums (ndarray or list): top N proposals
+        iou_thrs (ndarray or list): iou thresholds
+        row_idxs (ndarray): which rows(proposal nums) to print
+        col_idxs (ndarray): which cols(iou thresholds) to print
+        logger (logging.Logger | str | None): The way to print the recall
+            summary. See `mmcv.utils.print_log()` for details. Default: None.
+    """
+    proposal_nums = np.array(proposal_nums, dtype=np.int32)
+    iou_thrs = np.array(iou_thrs)
+    if row_idxs is None:
+        row_idxs = np.arange(proposal_nums.size)  # default: every row
+    if col_idxs is None:
+        col_idxs = np.arange(iou_thrs.size)  # default: every column
+    row_header = [''] + iou_thrs[col_idxs].tolist()
+    table_data = [row_header]
+    for i, num in enumerate(proposal_nums[row_idxs]):
+        row = [f'{val:.3f}' for val in recalls[row_idxs[i], col_idxs].tolist()]
+        row.insert(0, num)
+        table_data.append(row)
+    table = AsciiTable(table_data)
+    print_log('\n' + table.table, logger=logger)
+
+
+def plot_num_recall(recalls, proposal_nums):
+    """Plot Proposal_num-Recalls curve.
+
+    Args:
+        recalls(ndarray or list): shape (k,)
+        proposal_nums(ndarray or list): same shape as `recalls`
+    """
+    # work on plain lists so that ndarray and list inputs behave the same
+    if isinstance(proposal_nums, np.ndarray):
+        _proposal_nums = proposal_nums.tolist()
+    else:
+        _proposal_nums = list(proposal_nums)
+    if isinstance(recalls, np.ndarray):
+        _recalls = recalls.tolist()
+    else:
+        _recalls = list(recalls)
+    import matplotlib.pyplot as plt  # deferred: only needed for plotting
+    f = plt.figure()
+    plt.plot([0] + _proposal_nums, [0] + _recalls)
+    plt.xlabel('Proposal num')
+    plt.ylabel('Recall')
+    plt.axis([0, np.max(_proposal_nums), 0, 1])
+    f.show()
+
+
+def plot_iou_recall(recalls, iou_thrs):
+    """Plot IoU-Recalls curve.
+
+    Args:
+        recalls(ndarray or list): shape (k,)
+        iou_thrs(ndarray or list): same shape as `recalls`
+    """
+    # work on plain lists so that ndarray and list inputs behave the same
+    if isinstance(iou_thrs, np.ndarray):
+        _iou_thrs = iou_thrs.tolist()
+    else:
+        _iou_thrs = list(iou_thrs)
+    if isinstance(recalls, np.ndarray):
+        _recalls = recalls.tolist()
+    else:
+        _recalls = list(recalls)
+    import matplotlib.pyplot as plt  # deferred: only needed for plotting
+    f = plt.figure()
+    plt.plot(_iou_thrs + [1.0], _recalls + [0.])
+    plt.xlabel('IoU')
+    plt.ylabel('Recall')
+    plt.axis([np.min(_iou_thrs), 1, 0, 1])
+    f.show()
diff --git a/mmdet/core/export/__init__.py b/mmdet/core/export/__init__.py
new file mode 100755
index 0000000..a8179c9
--- /dev/null
+++ b/mmdet/core/export/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .onnx_helper import (add_dummy_nms_for_onnx, dynamic_clip_for_onnx,
+                          get_k_for_topk)
+from .pytorch2onnx import (build_model_from_cfg,
+                           generate_inputs_and_wrap_model,
+                           preprocess_example_input)
+
+__all__ = [
+    'build_model_from_cfg', 'generate_inputs_and_wrap_model',
+    'preprocess_example_input', 'get_k_for_topk', 'add_dummy_nms_for_onnx',
+    'dynamic_clip_for_onnx'
+]
diff --git a/mmdet/core/export/model_wrappers.py b/mmdet/core/export/model_wrappers.py
new file mode 100755
index 0000000..c7be2df
--- /dev/null
+++ b/mmdet/core/export/model_wrappers.py
@@ -0,0 +1,183 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import warnings
+
+import numpy as np
+import torch
+
+from mmdet.core import bbox2result
+from mmdet.models import BaseDetector
+
+
+class DeployBaseDetector(BaseDetector):
+    """Base wrapper that post-processes outputs of a deployed detector."""
+
+    def __init__(self, class_names, device_id):
+        super(DeployBaseDetector, self).__init__()
+        self.CLASSES = class_names
+        self.device_id = device_id
+
+    def simple_test(self, img, img_metas, **kwargs):
+        raise NotImplementedError('This method is not implemented.')
+
+    def aug_test(self, imgs, img_metas, **kwargs):
+        raise NotImplementedError('This method is not implemented.')
+
+    def extract_feat(self, imgs):
+        raise NotImplementedError('This method is not implemented.')
+
+    def forward_train(self, imgs, img_metas, **kwargs):
+        raise NotImplementedError('This method is not implemented.')
+
+    def val_step(self, data, optimizer):
+        raise NotImplementedError('This method is not implemented.')
+
+    def train_step(self, data, optimizer):
+        raise NotImplementedError('This method is not implemented.')
+
+    def forward_test(self, *, img, img_metas, **kwargs):
+        raise NotImplementedError('This method is not implemented.')
+
+    def async_simple_test(self, img, img_metas, **kwargs):
+        raise NotImplementedError('This method is not implemented.')
+
+    def forward(self, img, img_metas, return_loss=True, **kwargs):
+        outputs = self.forward_test(img, img_metas, **kwargs)
+        batch_dets, batch_labels = outputs[:2]
+        batch_masks = outputs[2] if len(outputs) == 3 else None
+        batch_size = img[0].shape[0]
+        img_metas = img_metas[0]
+        results = []
+        rescale = kwargs.get('rescale', True)
+        for i in range(batch_size):
+            dets, labels = batch_dets[i], batch_labels[i]
+            if rescale:
+                scale_factor = img_metas[i]['scale_factor']
+                # a 4-element factor is reshaped to (1, 4) to divide the boxes
+                if isinstance(scale_factor, (list, tuple, np.ndarray)):
+                    assert len(scale_factor) == 4
+                    scale_factor = np.array(scale_factor)[None, :]  # [1,4]
+                dets[:, :4] /= scale_factor
+
+            if 'border' in img_metas[i]:
+                # 'border' (used when exporting CornerNet and CentripetalNet
+                # to onnx) holds the top-left offsets of the padded image;
+                # subtract them, then zero every non-positive coordinate
+                x_off = img_metas[i]['border'][2]
+                y_off = img_metas[i]['border'][0]
+                dets[:, [0, 2]] -= x_off
+                dets[:, [1, 3]] -= y_off
+                dets[:, :4] *= (dets[:, :4] > 0).astype(dets.dtype)
+
+            dets_results = bbox2result(dets, labels, len(self.CLASSES))
+
+            if batch_masks is not None:
+                masks = batch_masks[i]
+                img_h, img_w = img_metas[i]['img_shape'][:2]
+                ori_h, ori_w = img_metas[i]['ori_shape'][:2]
+                masks = masks[:, :img_h, :img_w]
+                if rescale:
+                    masks = masks.astype(np.float32)
+                    masks = torch.from_numpy(masks)
+                    masks = torch.nn.functional.interpolate(
+                        masks.unsqueeze(0), size=(ori_h, ori_w))
+                    masks = masks.squeeze(0).detach().numpy()
+                if masks.dtype != bool:
+                    masks = masks >= 0.5
+                segms_results = [[] for _ in range(len(self.CLASSES))]
+                for j in range(len(dets)):
+                    segms_results[labels[j]].append(masks[j])
+                results.append((dets_results, segms_results))
+            else:
+                results.append(dets_results)
+        return results
+
+
+class ONNXRuntimeDetector(DeployBaseDetector):
+    """Wrapper for detector's inference with ONNXRuntime."""
+
+    def __init__(self, onnx_file, class_names, device_id):
+        super(ONNXRuntimeDetector, self).__init__(class_names, device_id)
+        import onnxruntime as ort
+
+        # look up mmcv's custom-op library; stays '' when it is unavailable
+        ort_custom_op_path = ''
+        try:
+            from mmcv.ops import get_onnxruntime_op_path
+            ort_custom_op_path = get_onnxruntime_op_path()
+        except (ImportError, ModuleNotFoundError):
+            warnings.warn('If input model has custom op from mmcv, \
+                you may have to build mmcv with ONNXRuntime from source.')
+        session_options = ort.SessionOptions()
+        # register custom op for onnxruntime
+        if osp.exists(ort_custom_op_path):
+            session_options.register_custom_ops_library(ort_custom_op_path)
+        sess = ort.InferenceSession(onnx_file, session_options)
+        providers = ['CPUExecutionProvider']
+        options = [{}]
+        is_cuda_available = ort.get_device() == 'GPU'
+        if is_cuda_available:
+            providers.insert(0, 'CUDAExecutionProvider')
+            options.insert(0, {'device_id': device_id})
+
+        sess.set_providers(providers, options)
+
+        self.sess = sess
+        self.io_binding = sess.io_binding()
+        self.output_names = [_.name for _ in sess.get_outputs()]
+        self.is_cuda_available = is_cuda_available
+
+    def forward_test(self, imgs, img_metas, **kwargs):
+        input_data = imgs[0]
+        # bind the input buffer; it is moved to CPU when CUDA is not used
+        device_type = 'cuda' if self.is_cuda_available else 'cpu'
+        if not self.is_cuda_available:
+            input_data = input_data.cpu()
+        self.io_binding.bind_input(
+            name='input',
+            device_type=device_type,
+            device_id=self.device_id,
+            element_type=np.float32,
+            shape=input_data.shape,
+            buffer_ptr=input_data.data_ptr())
+
+        for name in self.output_names:
+            self.io_binding.bind_output(name)
+        # run session to get outputs
+        self.sess.run_with_iobinding(self.io_binding)
+        ort_outputs = self.io_binding.copy_outputs_to_cpu()
+        return ort_outputs
+
+
+class TensorRTDetector(DeployBaseDetector):
+    """Wrapper for detector's inference with TensorRT."""
+
+    def __init__(self, engine_file, class_names, device_id, output_names=None):
+        super(TensorRTDetector, self).__init__(class_names, device_id)
+        # the argument is ignored; warn only when a caller actually passes it
+        if output_names is not None:
+            warnings.warn('`output_names` is deprecated and will be removed '
+                          'in future releases.')
+        from mmcv.tensorrt import TRTWraper, load_tensorrt_plugin
+        try:
+            load_tensorrt_plugin()
+        except (ImportError, ModuleNotFoundError):
+            warnings.warn('If input model has custom op from mmcv, \
+                you may have to build mmcv with TensorRT from source.')
+        output_names = ['dets', 'labels']
+        model = TRTWraper(engine_file, ['input'], output_names)
+        # if TensorRT has totally 4 inputs/outputs, then
+        # the detector should have `mask` output.
+        with_masks = len(model.engine) == 4
+        if with_masks:
+            model.output_names = output_names + ['masks']
+        self.model = model
+        self.with_masks = with_masks
+
+    def forward_test(self, imgs, img_metas, **kwargs):
+        input_data = imgs[0].contiguous()
+        with torch.cuda.device(self.device_id), torch.no_grad():
+            outputs = self.model({'input': input_data})
+            outputs = [outputs[name] for name in self.model.output_names]
+        outputs = [out.detach().cpu().numpy() for out in outputs]
+        return outputs
diff --git a/mmdet/core/export/onnx_helper.py b/mmdet/core/export/onnx_helper.py
new file mode 100755
index 0000000..9f6b9a0
--- /dev/null
+++ b/mmdet/core/export/onnx_helper.py
@@ -0,0 +1,223 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+
+import torch
+
+
+def dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape):
+    """Clip box coordinates to the image extent while exporting to ONNX.
+
+    ``torch.clamp`` cannot take dynamic `min` and `max`, so every coordinate
+    is divided by the matching image side, clamped to [0, 1] and then
+    multiplied back.
+
+    Args:
+        x1 (Tensor): The x1 for bounding boxes.
+        y1 (Tensor): The y1 for bounding boxes.
+        x2 (Tensor): The x2 for bounding boxes.
+        y2 (Tensor): The y2 for bounding boxes.
+        max_shape (Tensor): The (H, W) of original image. Anything that is
+            not a ``torch.Tensor`` fails the assertion below.
+
+    Returns:
+        tuple(Tensor): The clipped x1, y1, x2, y2.
+    """
+    assert isinstance(
+        max_shape,
+        torch.Tensor), '`max_shape` should be tensor of (h,w) for onnx'
+
+    # x coordinates pair with the width (index 1), y with the height (index 0)
+    coords = [x1, y1, x2, y2]
+    sides = [1, 0, 1, 0]
+
+    # scale by 1/max_shape
+    coords = [coord / max_shape[side] for coord, side in zip(coords, sides)]
+
+    # clamp [0, 1]
+    coords = [torch.clamp(coord, 0, 1) for coord in coords]
+
+    # scale back
+    coords = [coord * max_shape[side] for coord, side in zip(coords, sides)]
+
+    x1, y1, x2, y2 = coords
+    return x1, y1, x2, y2
+
+
+def get_k_for_topk(k, size):
+    """Get k of TopK for onnx exporting.
+
+    The K of TopK in TensorRT should not be a Tensor, while in ONNX Runtime
+    it could be a Tensor. Due to dynamic shape feature, we have to decide
+    whether to do TopK and what K it should be while exporting to ONNX.
+    A returned K of -1 means the TopK operation does not have to be done.
+
+    While exporting, the backend is read from the ``ONNX_BACKEND``
+    environment variable; ``'MMCVTensorRT'`` selects the TensorRT rule.
+
+    Args:
+        k (int or Tensor): The set k value for nms from config file.
+        size (int or Tensor): The number of elements of TopK's input
+            tensor.
+
+    Returns:
+        int or Tensor: The final K for TopK, or -1 to skip TopK.
+    """
+    skip_topk = -1
+    # a non-positive k or an empty input never needs TopK
+    if k <= 0 or size <= 0:
+        return skip_topk
+
+    if not torch.onnx.is_in_onnx_export():
+        # eager execution: TopK is only useful when it drops something
+        return k if k < size else skip_topk
+
+    if os.environ.get('ONNX_BACKEND') == 'MMCVTensorRT':
+        # TensorRT does not support dynamic K with TopK op
+        return k if 0 < k < size else skip_topk
+
+    # Always keep topk op for dynamic input in onnx for ONNX Runtime
+    return torch.where(k < size, k, size)
+
+
+def add_dummy_nms_for_onnx(boxes,
+                           scores,
+                           max_output_boxes_per_class=1000,
+                           iou_threshold=0.5,
+                           score_threshold=0.05,
+                           pre_top_k=-1,
+                           after_top_k=-1,
+                           labels=None):
+    """Create a dummy onnx::NonMaxSuppression op while exporting to ONNX.
+
+    This function helps exporting to onnx with batch and multiclass NMS op.
+    It only supports class-agnostic detection results. That is, the scores
+    are of shape (N, num_boxes, num_classes) and the boxes are of shape
+    (N, num_boxes, 4).
+
+    Args:
+        boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4]
+        scores (Tensor): The detection scores of shape
+            [N, num_boxes, num_classes]
+        max_output_boxes_per_class (int): Maximum number of output
+            boxes per class of nms. Defaults to 1000.
+        iou_threshold (float): IOU threshold of nms. Defaults to 0.5
+        score_threshold (float): score threshold of nms.
+            Defaults to 0.05.
+        pre_top_k (int): Number of top K boxes to keep before nms.
+            Defaults to -1.
+        after_top_k (int): Number of top K boxes to keep after nms.
+            Defaults to -1.
+        labels (Tensor, optional): If not None, explicit labels would be used.
+            Otherwise, labels would be automatically generated using
+            num_classes. Defaults to None.
+
+    Returns:
+        tuple[Tensor, Tensor]: dets of shape [N, num_det, 5]
+            and class labels of shape [N, num_det].
+    """
+    max_output_boxes_per_class = torch.LongTensor([max_output_boxes_per_class])
+    iou_threshold = torch.tensor([iou_threshold], dtype=torch.float32)
+    score_threshold = torch.tensor([score_threshold], dtype=torch.float32)
+    batch_size = scores.shape[0]
+    num_class = scores.shape[2]
+
+    nms_pre = torch.tensor(pre_top_k, device=scores.device, dtype=torch.long)
+    nms_pre = get_k_for_topk(nms_pre, boxes.shape[1])
+
+    if nms_pre > 0:
+        max_scores, _ = scores.max(-1)
+        _, topk_inds = max_scores.topk(nms_pre)
+        batch_inds = torch.arange(batch_size).view(
+            -1, 1).expand_as(topk_inds).long()
+        # Avoid onnx2tensorrt issue in https://github.com/NVIDIA/TensorRT/issues/1134 # noqa: E501
+        transformed_inds = boxes.shape[1] * batch_inds + topk_inds
+        boxes = boxes.reshape(-1, 4)[transformed_inds, :].reshape(
+            batch_size, -1, 4)
+        scores = scores.reshape(-1, num_class)[transformed_inds, :].reshape(
+            batch_size, -1, num_class)
+        if labels is not None:
+            labels = labels.reshape(-1, 1)[transformed_inds].reshape(
+                batch_size, -1)
+
+    scores = scores.permute(0, 2, 1)
+    num_box = boxes.shape[1]
+    # NOTE(review): this only saves the tracing state; nothing disables it
+    state = torch._C._get_tracing_state()
+    # dummy indices of nms's output
+    num_fake_det = 2
+    batch_inds = torch.randint(batch_size, (num_fake_det, 1))
+    cls_inds = torch.randint(num_class, (num_fake_det, 1))
+    box_inds = torch.randint(num_box, (num_fake_det, 1))
+    indices = torch.cat([batch_inds, cls_inds, box_inds], dim=1)
+    output = indices
+    setattr(DummyONNXNMSop, 'output', output)
+
+    # restore the tracing state saved above
+    torch._C._set_tracing_state(state)
+    selected_indices = DummyONNXNMSop.apply(boxes, scores,
+                                            max_output_boxes_per_class,
+                                            iou_threshold, score_threshold)
+
+    batch_inds, cls_inds = selected_indices[:, 0], selected_indices[:, 1]
+    box_inds = selected_indices[:, 2]
+    if labels is None:
+        labels = torch.arange(num_class, dtype=torch.long).to(scores.device)
+        labels = labels.view(1, num_class, 1).expand_as(scores)
+    scores = scores.reshape(-1, 1)
+    boxes = boxes.reshape(batch_size, -1).repeat(1, num_class).reshape(-1, 4)
+    pos_inds = (num_class * batch_inds + cls_inds) * num_box + box_inds
+    mask = scores.new_zeros(scores.shape)
+    # Avoid onnx2tensorrt issue in https://github.com/NVIDIA/TensorRT/issues/1134 # noqa: E501
+    # PyTorch style code: mask[batch_inds, box_inds] += 1
+    mask[pos_inds, :] += 1
+    scores = scores * mask
+    boxes = boxes * mask
+
+    scores = scores.reshape(batch_size, -1)
+    boxes = boxes.reshape(batch_size, -1, 4)
+    labels = labels.reshape(batch_size, -1)
+
+    nms_after = torch.tensor(
+        after_top_k, device=scores.device, dtype=torch.long)
+    nms_after = get_k_for_topk(nms_after, num_box * num_class)
+
+    if nms_after > 0:
+        _, topk_inds = scores.topk(nms_after)
+        batch_inds = torch.arange(batch_size).view(-1, 1).expand_as(topk_inds)
+        # Avoid onnx2tensorrt issue in https://github.com/NVIDIA/TensorRT/issues/1134 # noqa: E501
+        transformed_inds = scores.shape[1] * batch_inds + topk_inds
+        scores = scores.reshape(-1, 1)[transformed_inds, :].reshape(
+            batch_size, -1)
+        boxes = boxes.reshape(-1, 4)[transformed_inds, :].reshape(
+            batch_size, -1, 4)
+        labels = labels.reshape(-1, 1)[transformed_inds, :].reshape(
+            batch_size, -1)
+
+    scores = scores.unsqueeze(2)
+    dets = torch.cat([boxes, scores], dim=2)
+    return dets, labels
+
+
+class DummyONNXNMSop(torch.autograd.Function):
+    """Placeholder op that exports as ``onnx::NonMaxSuppression``.
+
+    This class is only for creating onnx::NonMaxSuppression. ``forward``
+    ignores its inputs and returns the class attribute ``output``, so that
+    attribute has to be assigned before ``apply`` is called. No suppression
+    is computed on the PyTorch side.
+    """
+
+    @staticmethod
+    def forward(ctx, boxes, scores, max_output_boxes_per_class, iou_threshold,
+                score_threshold):
+        """Return the pre-assigned ``DummyONNXNMSop.output`` untouched."""
+        dummy_indices = DummyONNXNMSop.output
+        return dummy_indices
+
+    @staticmethod
+    def symbolic(g, boxes, scores, max_output_boxes_per_class, iou_threshold,
+                 score_threshold):
+        """Emit one single-output ``NonMaxSuppression`` node into ``g``."""
+        nms_inputs = (boxes, scores, max_output_boxes_per_class,
+                      iou_threshold, score_threshold)
+        return g.op('NonMaxSuppression', *nms_inputs, outputs=1)
diff --git a/mmdet/core/export/pytorch2onnx.py b/mmdet/core/export/pytorch2onnx.py
new file mode 100755
index 0000000..b8261ee
--- /dev/null
+++ b/mmdet/core/export/pytorch2onnx.py
@@ -0,0 +1,159 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from functools import partial
+
+import mmcv
+import numpy as np
+import torch
+from mmcv.runner import load_checkpoint
+
+
+def generate_inputs_and_wrap_model(config_path,
+                                   checkpoint_path,
+                                   input_config,
+                                   cfg_options=None):
+    """Build the model and an example input ready for ONNX export.
+
+    The ONNX export API only accepts args, and all inputs should be
+    torch.Tensor or corresponding types (such as tuple of tensor), so this
+    function has to be called before exporting. It will:
+
+    1. generate the inputs which are used to execute the model.
+    2. wrap the model's forward function.
+
+    For example, the MMDet models' forward function has a parameter
+    ``return_loss:bool``. We want it to be False, but the export API
+    supports neither bool type nor kwargs, so the forward method is replaced
+    like ``model.forward = partial(model.forward, return_loss=False)``.
+
+    Args:
+        config_path (str): the OpenMMLab config for the model we want to
+            export to ONNX
+        checkpoint_path (str): Path to the corresponding checkpoint
+        input_config (dict): the exact data in this dict depends on the
+            framework. For MMSeg, we can just declare the input shape,
+            and generate the dummy data accordingly. However, for MMDet,
+            we may pass the real img path, or the NMS will return None
+            as there is no legal bbox.
+        cfg_options (dict, optional): forwarded to ``build_model_from_cfg``.
+
+    Returns:
+        tuple: (model, tensor_data) wrapped model which can be called by
+            ``model(*tensor_data)`` and a list of inputs which are used to
+            execute the model while exporting.
+    """
+    detector = build_model_from_cfg(
+        config_path, checkpoint_path, cfg_options=cfg_options)
+    example_img, example_meta = preprocess_example_input(input_config)
+    # bind the non-tensor arguments so only the image tensor stays positional
+    detector.forward = partial(
+        detector.forward, img_metas=[[example_meta]], return_loss=False)
+
+    # pytorch has some bug in pytorch1.3, we have to fix it
+    # by replacing these existing op
+    opset_version = 11
+    # put the import within the function thus it will not cause import error
+    # when not using this function
+    try:
+        from mmcv.onnx.symbolic import register_extra_symbolics
+    except ModuleNotFoundError:
+        raise NotImplementedError('please update mmcv to version>=v1.0.4')
+    register_extra_symbolics(opset_version)
+
+    return detector, [example_img]
+
+
+def build_model_from_cfg(config_path, checkpoint_path, cfg_options=None):
+    """Build a detector from a config file and load its checkpoint on CPU.
+
+    Args:
+        config_path (str): the OpenMMLab config for the model we want to
+            export to ONNX
+        checkpoint_path (str): Path to the corresponding checkpoint
+        cfg_options (dict, optional): Overrides merged into the loaded
+            config. Defaults to None.
+
+    Returns:
+        torch.nn.Module: the built model, on CPU and in eval mode
+    """
+    from mmdet.models import build_detector
+
+    cfg = mmcv.Config.fromfile(config_path)
+    if cfg_options is not None:
+        cfg.merge_from_dict(cfg_options)
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+    cfg.model.pretrained = None
+    cfg.data.test.test_mode = True
+    cfg.model.train_cfg = None
+
+    detector = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+    ckpt = load_checkpoint(detector, checkpoint_path, map_location='cpu')
+    if 'CLASSES' not in ckpt.get('meta', {}):
+        from mmdet.datasets import DATASETS
+        dataset = DATASETS.get(cfg.data.test['type'])
+        assert (dataset is not None)
+        detector.CLASSES = dataset.CLASSES
+    else:
+        detector.CLASSES = ckpt['meta']['CLASSES']
+    detector.cpu().eval()
+    return detector
+
+
+def preprocess_example_input(input_config):
+    """Prepare an example input image for ``generate_inputs_and_wrap_model``.
+
+    The image at ``input_path`` is read, resized to the (W, H) taken from
+    ``input_shape``, optionally normalized and converted to a float tensor
+    with a leading batch dimension.
+
+    Args:
+        input_config (dict): customized config describing the example input.
+            Must contain ``input_path`` and ``input_shape``; may contain
+            ``normalize_cfg`` with ``mean``, ``std`` and optional ``to_rgb``.
+
+    Returns:
+        tuple: (one_img, one_meta), tensor of the example input image and
+            meta information for the example input image.
+
+    Examples:
+        >>> from mmdet.core.export import preprocess_example_input
+        >>> input_config = {
+        >>>         'input_shape': (1,3,224,224),
+        >>>         'input_path': 'demo/demo.jpg',
+        >>>         'normalize_cfg': {
+        >>>             'mean': (123.675, 116.28, 103.53),
+        >>>             'std': (58.395, 57.12, 57.375)
+        >>>             }
+        >>>         }
+        >>> one_img, one_meta = preprocess_example_input(input_config)
+        >>> print(one_img.shape)
+        torch.Size([1, 3, 224, 224])
+        >>> print(sorted(one_meta))
+        ['filename', 'flip', 'flip_direction', 'img_shape', 'ori_shape',
+        'pad_shape', 'scale_factor', 'show_img']
+    """
+    input_path = input_config['input_path']
+    input_shape = input_config['input_shape']
+    # target size: the trailing two dims of input_shape, reversed
+    resized = mmcv.imresize(mmcv.imread(input_path), input_shape[2:][::-1])
+    show_img = resized.copy()
+    img_array = resized
+    if 'normalize_cfg' in input_config:
+        norm_cfg = input_config['normalize_cfg']
+        img_array = mmcv.imnormalize(
+            img_array,
+            np.array(norm_cfg['mean'], dtype=np.float32),
+            np.array(norm_cfg['std'], dtype=np.float32),
+            to_rgb=norm_cfg.get('to_rgb', True))
+    chw = torch.from_numpy(img_array.transpose(2, 0, 1))
+    one_img = chw.unsqueeze(0).float().requires_grad_(True)
+
+    # the three meta shapes are all reported as (H, W, C)
+    _, channels, height, width = input_shape
+    hwc_shape = (height, width, channels)
+    one_meta = dict(
+        img_shape=hwc_shape, ori_shape=hwc_shape, pad_shape=hwc_shape,
+        filename='<demo>.png', scale_factor=np.ones(4, dtype=np.float32),
+        flip=False, show_img=show_img, flip_direction=None)
+
+    return one_img, one_meta
diff --git a/mmdet/core/hook/__init__.py b/mmdet/core/hook/__init__.py
new file mode 100755
index 0000000..7b9ac9f
--- /dev/null
+++ b/mmdet/core/hook/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .checkloss_hook import CheckInvalidLossHook
+from .ema import ExpMomentumEMAHook, LinearMomentumEMAHook
+from .memory_profiler_hook import MemoryProfilerHook
+from .set_epoch_info_hook import SetEpochInfoHook
+from .sync_norm_hook import SyncNormHook
+from .sync_random_size_hook import SyncRandomSizeHook
+from .wandblogger_hook import MMDetWandbHook
+from .yolox_lrupdater_hook import YOLOXLrUpdaterHook
+from .yolox_mode_switch_hook import YOLOXModeSwitchHook
+
+__all__ = [
+    'SyncRandomSizeHook', 'YOLOXModeSwitchHook', 'SyncNormHook',
+    'ExpMomentumEMAHook', 'LinearMomentumEMAHook', 'YOLOXLrUpdaterHook',
+    'CheckInvalidLossHook', 'SetEpochInfoHook', 'MemoryProfilerHook',
+    'MMDetWandbHook'
+]
diff --git a/mmdet/core/hook/checkloss_hook.py b/mmdet/core/hook/checkloss_hook.py
new file mode 100755
index 0000000..d147ea8
--- /dev/null
+++ b/mmdet/core/hook/checkloss_hook.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.runner.hooks import HOOKS, Hook
+
+
+@HOOKS.register_module()
+class CheckInvalidLossHook(Hook):
+    """Check invalid loss hook.
+
+    This hook will regularly check whether the loss is valid during training.
+
+    Args:
+        interval (int): Checking interval (every k iterations). Default: 50.
+    """
+
+    def __init__(self, interval=50):
+        self.interval = interval
+
+    def after_train_iter(self, runner):
+        """Raise ``AssertionError`` when the loss is NaN or infinite."""
+        if not self.every_n_iters(runner, self.interval):
+            return
+        if not torch.isfinite(runner.outputs['loss']):
+            # Fail fast instead of blocking the job in an interactive debugger.
+            runner.logger.info('loss become infinite or NaN!')
+            raise AssertionError('loss become infinite or NaN!')
diff --git a/mmdet/core/hook/ema.py b/mmdet/core/hook/ema.py
new file mode 100755
index 0000000..ff7bfba
--- /dev/null
+++ b/mmdet/core/hook/ema.py
@@ -0,0 +1,130 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+from mmcv.parallel import is_module_wrapper
+from mmcv.runner.hooks import HOOKS, Hook
+
+
+class BaseEMAHook(Hook):
+    """Exponential Moving Average Hook.
+
+    Use Exponential Moving Average on all parameters of model in training
+    process. Every parameter has an ema backup, which is updated by the
+    formula below. EMAHook takes priority over EvalHook and CheckpointHook.
+    Note, the original model parameters are saved in ema field after train.
+
+    Args:
+        momentum (float): The momentum used for updating ema parameter.
+            Ema's parameter are updated with the formula:
+           `ema_param = (1-momentum) * ema_param + momentum * cur_param`.
+            Defaults to 0.0002.
+        skip_buffers (bool): Whether to skip the model buffers, such as
+            batchnorm running stats (running_mean, running_var), it does not
+            perform the ema operation. Default to False.
+        interval (int): Update ema parameter every interval iteration.
+            Defaults to 1.
+        resume_from (str, optional): The checkpoint path. Defaults to None.
+        momentum_fun (func, optional): The function to change momentum
+            during early iteration (also warmup) to help early training.
+            It uses `momentum` as a constant. Defaults to None.
+    """
+
+    def __init__(self,
+                 momentum=0.0002,
+                 interval=1,
+                 skip_buffers=False,
+                 resume_from=None,
+                 momentum_fun=None):
+        assert 0 < momentum < 1
+        self.momentum = momentum
+        self.momentum_fun = momentum_fun
+        self.interval = interval
+        self.skip_buffers = skip_buffers
+        self.checkpoint = resume_from
+
+    def before_run(self, runner):
+        """Register an EMA copy of every tracked tensor as a model buffer.
+
+        Optionally resumes the runner from ``resume_from`` afterwards.
+        """
+        net = runner.model.module if is_module_wrapper(
+            runner.model) else runner.model
+        tracked = (dict(net.named_parameters())
+                   if self.skip_buffers else net.state_dict())
+        self.param_ema_buffer = {}
+        self.model_parameters = tracked
+        for param_name, tensor in tracked.items():
+            # "." is not allowed in module's buffer name
+            ema_name = 'ema_' + param_name.replace('.', '_')
+            self.param_ema_buffer[param_name] = ema_name
+            net.register_buffer(ema_name, tensor.data.clone())
+        # name -> buffer lookup used by the update and swap methods
+        self.model_buffers = dict(net.named_buffers())
+        if self.checkpoint is not None:
+            runner.resume(self.checkpoint)
+
+    def get_momentum(self, runner):
+        schedule = self.momentum_fun
+        return schedule(runner.iter) if schedule else self.momentum
+
+    def after_train_iter(self, runner):
+        """Update the EMA buffers every ``self.interval`` iterations."""
+        if (runner.iter + 1) % self.interval:
+            return
+        momentum = self.get_momentum(runner)
+        keep = 1 - momentum
+        for name, current in self.model_parameters.items():
+            # exclude num_tracking
+            if not current.dtype.is_floating_point:
+                continue
+            ema = self.model_buffers[self.param_ema_buffer[name]]
+            ema.mul_(keep).add_(current.data, alpha=momentum)
+
+    def after_train_epoch(self, runner):
+        """Swap the ema backup into the model at the end of an epoch, i.e.
+        before the EvalHook."""
+        self._swap_ema_parameters()
+
+    def before_train_epoch(self, runner):
+        """Swap the model's own parameters back in before the next epoch,
+        i.e. after last epoch's EvalHook."""
+        self._swap_ema_parameters()
+
+    def _swap_ema_parameters(self):
+        """Exchange each tracked tensor's data with its EMA buffer's data."""
+        for name, live in self.model_parameters.items():
+            ema = self.model_buffers[self.param_ema_buffer[name]]
+            stash = live.data.clone()
+            live.data.copy_(ema.data)
+            ema.data.copy_(stash)
+
+
+@HOOKS.register_module()
+class ExpMomentumEMAHook(BaseEMAHook):
+    """EMA hook whose momentum decays exponentially towards ``momentum``.
+
+    Args:
+        total_iter (int): Divisor of the iteration count inside the
+            exponential momentum decay. Defaults to 2000.
+    """
+
+    def __init__(self, total_iter=2000, **kwargs):
+        super().__init__(**kwargs)
+        self.momentum_fun = lambda cur_iter: self.momentum + (
+            1 - self.momentum) * math.exp(-(1 + cur_iter) / total_iter)
+
+
+@HOOKS.register_module()
+class LinearMomentumEMAHook(BaseEMAHook):
+    """EMAHook using linear momentum strategy.
+
+    Args:
+        warm_up (int): Offset added to the iteration count in the
+            ``(1 + iter) / (warm_up + iter)`` momentum cap. Defaults to 100.
+    """
+
+    def __init__(self, warm_up=100, **kwargs):
+        super().__init__(**kwargs)
+        self.momentum_fun = lambda step: min(
+            self.momentum**self.interval, (1 + step) / (warm_up + step))
diff --git a/mmdet/core/hook/ignoreinvalidloss_hook.py b/mmdet/core/hook/ignoreinvalidloss_hook.py
new file mode 100755
index 0000000..844ae31
--- /dev/null
+++ b/mmdet/core/hook/ignoreinvalidloss_hook.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.runner.hooks import HOOKS, Hook
+from torch import autograd
+
+@HOOKS.register_module()
+class IgnoreInvalidLossHook(Hook):
+    """Run backward and the optimizer step under autograd anomaly detection.
+
+    After every training iteration the loss is back-propagated inside
+    ``autograd.detect_anomaly()`` and the optimizer is stepped.
+
+    NOTE(review): ``allreduce_grads`` is not imported in this module, and
+    ``coalesce``, ``bucket_size_mb``, ``grad_clip`` and ``clip_grads`` are
+    not defined by this class; the hook takes no ``interval`` argument.
+    """
+
+    def after_train_iter(self, runner):
+        if self.every_n_iters(runner, 1):
+            with autograd.detect_anomaly():
+                runner.optimizer.zero_grad()
+                runner.outputs['loss'].backward()
+                allreduce_grads(runner.model, self.coalesce, self.bucket_size_mb)
+                if self.grad_clip is not None:
+                    self.clip_grads(runner.model.parameters())
+                runner.optimizer.step()
diff --git a/mmdet/core/hook/memory_profiler_hook.py b/mmdet/core/hook/memory_profiler_hook.py
new file mode 100755
index 0000000..a473061
--- /dev/null
+++ b/mmdet/core/hook/memory_profiler_hook.py
@@ -0,0 +1,55 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.runner.hooks import HOOKS, Hook
+
+
+@HOOKS.register_module()
+class MemoryProfilerHook(Hook):
+    """Hook that periodically logs virtual memory, swap memory and the
+    memory of the current process.
+
+    Both ``psutil`` and ``memory_profiler`` must be importable; otherwise
+    ``__init__`` raises ``ImportError``.
+
+    Args:
+        interval (int): Checking interval (every k iterations).
+            Default: 50.
+    """
+
+    def __init__(self, interval=50):
+        try:
+            from psutil import swap_memory, virtual_memory
+        except ImportError:
+            raise ImportError('psutil is not installed, please install it by: '
+                              'pip install psutil')
+        self._swap_memory = swap_memory
+        self._virtual_memory = virtual_memory
+
+        try:
+            from memory_profiler import memory_usage
+        except ImportError:
+            raise ImportError(
+                'memory_profiler is not installed, please install it by: '
+                'pip install memory_profiler')
+        self._memory_usage = memory_usage
+        self.interval = interval
+
+    def after_iter(self, runner):
+        """Log the memory figures every ``self.interval`` iterations."""
+        if not self.every_n_iters(runner, self.interval):
+            return
+        # in Byte
+        vm = self._virtual_memory()
+        swap = self._swap_memory()
+        # in MB
+        process_mb = self._memory_usage()[0]
+        mb = 1024 * 1024
+        free_swap_mb = round((swap.total - swap.used) / mb)
+        runner.logger.info(
+            'Memory information '
+            f'available_memory: {round(vm.available / mb)} MB, '
+            f'used_memory: {round(vm.used / mb)} MB, '
+            f'memory_utilization: {vm.percent} %, '
+            f'available_swap_memory: {free_swap_mb} MB, '
+            f'used_swap_memory: {round(swap.used / mb)} MB, '
+            f'swap_memory_utilization: {swap.percent} %, '
+            f'current_process_memory: {round(process_mb)} MB')
diff --git a/mmdet/core/hook/set_epoch_info_hook.py b/mmdet/core/hook/set_epoch_info_hook.py
new file mode 100755
index 0000000..c2b134c
--- /dev/null
+++ b/mmdet/core/hook/set_epoch_info_hook.py
@@ -0,0 +1,15 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.parallel import is_module_wrapper
+from mmcv.runner import HOOKS, Hook
+
+
+@HOOKS.register_module()
+class SetEpochInfoHook(Hook):
+    """Set runner's epoch information to the model."""
+
+    def before_train_epoch(self, runner):
+        """Pass ``runner.epoch`` to ``set_epoch`` of the unwrapped model."""
+        current_epoch = runner.epoch
+        wrapped = runner.model
+        target = wrapped.module if is_module_wrapper(wrapped) else wrapped
+        target.set_epoch(current_epoch)
diff --git a/mmdet/core/hook/sync_norm_hook.py b/mmdet/core/hook/sync_norm_hook.py
new file mode 100755
index 0000000..82931ce
--- /dev/null
+++ b/mmdet/core/hook/sync_norm_hook.py
@@ -0,0 +1,52 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import OrderedDict
+
+from mmcv.runner import get_dist_info
+from mmcv.runner.hooks import HOOKS, Hook
+from torch import nn
+
+from ..utils.dist_utils import all_reduce_dict
+
+
+def get_norm_states(module):
+    """Collect the state_dict entries of every ``_NormBase`` submodule."""
+    return OrderedDict(
+        ('.'.join([prefix, key]), tensor)
+        for prefix, child in module.named_modules()
+        if isinstance(child, nn.modules.batchnorm._NormBase)
+        for key, tensor in child.state_dict().items())
+
+
+@HOOKS.register_module()
+class SyncNormHook(Hook):
+    """Synchronize Norm states after training epoch, currently used in YOLOX.
+
+    Args:
+        num_last_epochs (int): The number of latter epochs in the end of the
+            training to switch to synchronizing norm interval. Default: 15.
+        interval (int): Synchronizing norm interval. Default: 1.
+    """
+
+    def __init__(self, num_last_epochs=15, interval=1):
+        self.num_last_epochs = num_last_epochs
+        self.interval = interval
+
+    def before_train_epoch(self, runner):
+        """Switch to synchronizing every epoch for the last epochs."""
+        switch_epoch = runner.max_epochs - self.num_last_epochs
+        if runner.epoch + 1 == switch_epoch:
+            self.interval = 1
+
+    def after_train_epoch(self, runner):
+        """All-reduce the norm states (``op='mean'``) and load them back."""
+        if (runner.epoch + 1) % self.interval:
+            return
+        _, world_size = get_dist_info()
+        if world_size == 1:
+            return
+        module = runner.model
+        norm_states = get_norm_states(module)
+        if not norm_states:
+            return
+        averaged = all_reduce_dict(norm_states, op='mean')
+        module.load_state_dict(averaged, strict=False)
diff --git a/mmdet/core/hook/sync_random_size_hook.py b/mmdet/core/hook/sync_random_size_hook.py
new file mode 100755
index 0000000..6d7e96c
--- /dev/null
+++ b/mmdet/core/hook/sync_random_size_hook.py
@@ -0,0 +1,72 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import random
+import warnings
+
+import torch
+from mmcv.runner import get_dist_info
+from mmcv.runner.hooks import HOOKS, Hook
+from torch import distributed as dist
+
+
+@HOOKS.register_module()
+class SyncRandomSizeHook(Hook):
+    """Change and synchronize the random image size across ranks.
+
+    SyncRandomSizeHook is deprecated, please use Resize pipeline to achieve
+    similar functions. Such as `dict(type='Resize', img_scale=[(448, 448),
+    (832, 832)], multiscale_mode='range', keep_ratio=True)`.
+
+    Note: Due to the multi-process dataloader, its behavior is different
+    from YOLOX's official implementation, the official is to change the
+    size every fixed iteration interval and what we achieved is a fixed
+    epoch interval.
+
+    Args:
+        ratio_range (tuple[int]): Random ratio range. It will be multiplied
+            by 32, and then change the dataset output image size.
+            Default: (14, 26).
+        img_scale (tuple[int]): Size of input image. Default: (640, 640).
+        interval (int): The epoch interval of change image size. Default: 1.
+        device (torch.device | str): Device of the size tensor. Default: 'cuda'.
+    """
+
+    def __init__(self,
+                 ratio_range=(14, 26),
+                 img_scale=(640, 640),
+                 interval=1,
+                 device='cuda'):
+        warnings.warn('DeprecationWarning: SyncRandomSizeHook is deprecated. '
+                      'Please use Resize pipeline to achieve similar '
+                      'functions. Due to the multi-process dataloader, '
+                      'its behavior is different from YOLOX\'s official '
+                      'implementation, the official is to change the size '
+                      'every fixed iteration interval and what we achieved '
+                      'is a fixed epoch interval.')
+        self.rank, world_size = get_dist_info()
+        self.is_distributed = world_size > 1
+        self.device = device
+        self.interval = interval
+        self.img_scale = img_scale
+        self.ratio_range = ratio_range
+
+    def after_train_epoch(self, runner):
+        """Change the dataset output image size."""
+        if self.ratio_range is None:
+            return
+        if (runner.epoch + 1) % self.interval != 0:
+            return
+        # Due to DDP and DP get the device behavior inconsistent,
+        # so we did not get the device from runner.model.
+        size_tensor = torch.LongTensor(2).to(self.device)
+
+        if self.rank == 0:
+            aspect = self.img_scale[1] * 1. / self.img_scale[0]
+            ratio = random.randint(*self.ratio_range)
+            size_tensor[0] = int(32 * ratio)
+            size_tensor[1] = 32 * int(ratio * aspect)
+
+        if self.is_distributed:
+            dist.barrier()
+            dist.broadcast(size_tensor, 0)
+        new_scale = (size_tensor[0].item(), size_tensor[1].item())
+        runner.data_loader.dataset.update_dynamic_scale(new_scale)
diff --git a/mmdet/core/hook/wandblogger_hook.py b/mmdet/core/hook/wandblogger_hook.py
new file mode 100755
index 0000000..7bf252f
--- /dev/null
+++ b/mmdet/core/hook/wandblogger_hook.py
@@ -0,0 +1,593 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import importlib
+import os.path as osp
+import sys
+import warnings
+
+import mmcv
+import numpy as np
+import pycocotools.mask as mask_util
+from mmcv.runner import HOOKS
+from mmcv.runner.dist_utils import master_only
+from mmcv.runner.hooks.checkpoint import CheckpointHook
+from mmcv.runner.hooks.logger.wandb import WandbLoggerHook
+from mmcv.utils import digit_version
+
+from mmdet.core import DistEvalHook, EvalHook
+from mmdet.core.mask.structures import polygon_to_bitmap
+
+
+@HOOKS.register_module()
+class MMDetWandbHook(WandbLoggerHook):
+    """Enhanced Wandb logger hook for MMDetection.
+
+    Compared with the :class:`mmcv.runner.WandbLoggerHook`, this hook can not
+    only automatically log all the metrics but also log the following extra
+    information - saves model checkpoints as W&B Artifact, and
+    logs model prediction as interactive W&B Tables.
+
+    - Metrics: The MMDetWandbHook will automatically log training
+        and validation metrics along with system metrics (CPU/GPU).
+
+    - Checkpointing: If `log_checkpoint` is True, the checkpoint saved at
+        every checkpoint interval will be saved as W&B Artifacts.
+        This depends on the :class:`mmcv.runner.CheckpointHook` whose priority
+        is higher than this hook. Please refer to
+        https://docs.wandb.ai/guides/artifacts/model-versioning
+        to learn more about model versioning with W&B Artifacts.
+
+    - Checkpoint Metadata: If evaluation results are available for a given
+        checkpoint artifact, it will have a metadata associated with it.
+        The metadata contains the evaluation metrics computed on validation
+        data with that checkpoint along with the current epoch. It depends
+        on `EvalHook` whose priority is higher than MMDetWandbHook.
+
+    - Evaluation: At every evaluation interval, the `MMDetWandbHook` logs the
+        model prediction as interactive W&B Tables. The number of samples
+        logged is given by `num_eval_images`. Currently, the `MMDetWandbHook`
+        logs the predicted bounding boxes along with the ground truth at every
+        evaluation interval. This depends on the `EvalHook` whose priority is
+        higher than `MMDetWandbHook`. Also note that the data is logged once
+        and subsequent evaluation tables use a reference to the logged data
+        to save memory usage. Please refer to
+        https://docs.wandb.ai/guides/data-vis to learn more about W&B Tables.
+
+    For more details check out W&B's MMDetection docs:
+    https://docs.wandb.ai/guides/integrations/mmdetection
+
+    ```
+    Example:
+        log_config = dict(
+            ...
+            hooks=[
+                ...,
+                dict(type='MMDetWandbHook',
+                     init_kwargs={
+                         'entity': "YOUR_ENTITY",
+                         'project': "YOUR_PROJECT_NAME"
+                     },
+                     interval=50,
+                     log_checkpoint=True,
+                     log_checkpoint_metadata=True,
+                     num_eval_images=100,
+                     bbox_score_thr=0.3)
+            ])
+    ```
+
+    Args:
+        init_kwargs (dict): A dict passed to wandb.init to initialize
+            a W&B run. Please refer to https://docs.wandb.ai/ref/python/init
+            for possible key-value pairs.
+        interval (int): Logging interval (every k iterations). Defaults to 50.
+        log_checkpoint (bool): Save the checkpoint at every checkpoint interval
+            as W&B Artifacts. Use this for model versioning where each version
+            is a checkpoint. Defaults to False.
+        log_checkpoint_metadata (bool): Log the evaluation metrics computed
+            on the validation data with the checkpoint, along with current
+            epoch as a metadata to that checkpoint. Only takes effect when
+            `log_checkpoint` is True. Defaults to False.
+        num_eval_images (int): The number of validation images to be logged.
+            If zero, the evaluation won't be logged. Defaults to 100.
+        bbox_score_thr (float): Threshold for bounding box scores.
+            Defaults to 0.3.
+    """
+
+    def __init__(self,
+                 init_kwargs=None,
+                 interval=50,
+                 log_checkpoint=False,
+                 log_checkpoint_metadata=False,
+                 num_eval_images=100,
+                 bbox_score_thr=0.3,
+                 **kwargs):
+        super().__init__(init_kwargs, interval, **kwargs)
+        # Both hooks are looked up among the runner's hooks in `before_run`.
+        self.ckpt_hook: CheckpointHook = None
+        self.eval_hook: EvalHook = None
+        self.bbox_score_thr = bbox_score_thr
+        self.num_eval_images = num_eval_images
+        self.log_evaluation = num_eval_images > 0
+        self.log_checkpoint = log_checkpoint
+        self.log_checkpoint_metadata = (
+            log_checkpoint and log_checkpoint_metadata)
+
+    def import_wandb(self):
+        """Import wandb, warn about old versions and bind it to the hook."""
+        try:
+            import wandb
+            from wandb import init  # noqa
+        except ImportError:
+            raise ImportError(
+                'Please run "pip install "wandb>=0.12.10"" to install wandb')
+        else:
+            # Versions below 0.12.10 raise a ResourceWarning on wandb.log:
+            # https://github.com/wandb/client/issues/2837
+            if digit_version(wandb.__version__) < digit_version('0.12.10'):
+                warnings.warn(
+                    f'The current wandb {wandb.__version__} is '
+                    f'lower than v0.12.10 will cause ResourceWarning '
+                    f'when calling wandb.log, Please run '
+                    f'"pip install --upgrade wandb"')
+        self.wandb = wandb
+
+    @master_only
+    def before_run(self, runner):
+        """Set up W&B logging: save config, locate hooks, log ground truth."""
+        super(MMDetWandbHook, self).before_run(runner)
+
+        # Save and Log config.
+        if runner.meta is not None and runner.meta.get('exp_name') is not None:
+            src_cfg_path = osp.join(runner.work_dir, runner.meta['exp_name'])
+            if osp.exists(src_cfg_path):
+                self.wandb.save(src_cfg_path, base_path=runner.work_dir)
+                self._update_wandb_config(runner)
+        else:
+            runner.logger.warning('No meta information found in the runner. ')
+
+        # Inspect CheckpointHook and EvalHook
+        for hook in runner.hooks:
+            if isinstance(hook, CheckpointHook):
+                self.ckpt_hook = hook
+            if isinstance(hook, (EvalHook, DistEvalHook)):
+                self.eval_hook = hook
+
+        # Check conditions to log checkpoint
+        if self.log_checkpoint:
+            if self.ckpt_hook is None:
+                self.log_checkpoint = False
+                self.log_checkpoint_metadata = False
+                runner.logger.warning(
+                    'To log checkpoint in MMDetWandbHook, `CheckpointHook` is'
+                    ' required, please check hooks in the runner.')
+            else:
+                self.ckpt_interval = self.ckpt_hook.interval
+
+        # Check conditions to log evaluation
+        if self.log_evaluation or self.log_checkpoint_metadata:
+            if self.eval_hook is None:
+                self.log_evaluation = False
+                self.log_checkpoint_metadata = False
+                runner.logger.warning(
+                    'To log evaluation or checkpoint metadata in '
+                    'MMDetWandbHook, `EvalHook` or `DistEvalHook` in mmdet '
+                    'is required, please check whether the validation '
+                    'is enabled.')
+            else:
+                self.eval_interval = self.eval_hook.interval
+                self.val_dataset = self.eval_hook.dataloader.dataset
+                # Determine the number of samples to be logged.
+                if self.num_eval_images > len(self.val_dataset):
+                    # Warn before clamping so the requested count is reported.
+                    runner.logger.warning(
+                        f'The num_eval_images ({self.num_eval_images}) is '
+                        'greater than the total number of validation samples '
+                        f'({len(self.val_dataset)}). The complete validation '
+                        'dataset will be logged.')
+                    self.num_eval_images = len(self.val_dataset)
+
+        # Check conditions to log checkpoint metadata
+        if self.log_checkpoint_metadata:
+            assert self.ckpt_interval % self.eval_interval == 0, \
+                'To log checkpoint metadata in MMDetWandbHook, the interval ' \
+                f'of checkpoint saving ({self.ckpt_interval}) should be ' \
+                'divisible by the interval of evaluation ' \
+                f'({self.eval_interval}).'
+
+        # Initialize evaluation table
+        if self.log_evaluation:
+            # Initialize data table
+            self._init_data_table()
+            # Add data to the data table
+            self._add_ground_truth(runner)
+            # Log ground truth data
+            self._log_data_table()
+
+    @master_only
+    def after_train_epoch(self, runner):
+        """Log the checkpoint artifact and prediction table after an epoch."""
+        super(MMDetWandbHook, self).after_train_epoch(runner)
+
+        if not self.by_epoch:
+            return
+
+        # Log checkpoint and metadata, but only when `log_checkpoint` is on.
+        if self.log_checkpoint and (
+                self.every_n_epochs(runner, self.ckpt_interval) or
+                (self.ckpt_hook.save_last and self.is_last_epoch(runner))):
+            metadata = None
+            if self.log_checkpoint_metadata and self.eval_hook:
+                metadata = {
+                    'epoch': runner.epoch + 1,
+                    **self._get_eval_results()
+                }
+            aliases = [f'epoch_{runner.epoch + 1}', 'latest']
+            model_path = osp.join(self.ckpt_hook.out_dir,
+                                  f'epoch_{runner.epoch + 1}.pth')
+            self._log_ckpt_as_artifact(model_path, aliases, metadata)
+
+        # Save prediction table
+        if self.log_evaluation and self.eval_hook._should_evaluate(runner):
+            results = self.eval_hook.latest_results
+            # Initialize evaluation table
+            self._init_pred_table()
+            # Log predictions
+            self._log_predictions(results)
+            # Log the table
+            self._log_eval_table(runner.epoch + 1)
+
+    # for the reason of this double-layered structure, refer to
+    # https://github.com/open-mmlab/mmdetection/issues/8145#issuecomment-1345343076
+    def after_train_iter(self, runner):
+        """Run the parent hook; add the W&B extras unless mode is 'train'."""
+        if self.get_mode(runner) == 'train':
+            # An ugly patch. The iter-based eval hook will call the
+            # `after_train_iter` method of all logger hooks before evaluation;
+            # check the mode before calling super, which clears the log_buffer.
+            return super(MMDetWandbHook, self).after_train_iter(runner)
+        else:
+            super(MMDetWandbHook, self).after_train_iter(runner)
+        self._after_train_iter(runner)
+
+    @master_only
+    def _after_train_iter(self, runner):
+        """Log the checkpoint artifact and prediction table after an iter."""
+        if self.by_epoch:
+            return
+
+        # Save checkpoint and metadata, but only when `log_checkpoint` is on.
+        if self.log_checkpoint and (
+                self.every_n_iters(runner, self.ckpt_interval) or
+                (self.ckpt_hook.save_last and self.is_last_iter(runner))):
+            metadata = None
+            if self.log_checkpoint_metadata and self.eval_hook:
+                metadata = {
+                    'iter': runner.iter + 1,
+                    **self._get_eval_results()
+                }
+            aliases = [f'iter_{runner.iter + 1}', 'latest']
+            model_path = osp.join(self.ckpt_hook.out_dir,
+                                  f'iter_{runner.iter + 1}.pth')
+            self._log_ckpt_as_artifact(model_path, aliases, metadata)
+
+        # Save prediction table
+        if self.log_evaluation and self.eval_hook._should_evaluate(runner):
+            results = self.eval_hook.latest_results
+            # Initialize evaluation table
+            self._init_pred_table()
+            # Log predictions
+            self._log_predictions(results)
+            # Log the table
+            self._log_eval_table(runner.iter + 1)
+
+    @master_only
+    def after_run(self, runner):
+        self.wandb.finish()  # mark the W&B run as finished
+
+    def _update_wandb_config(self, runner):
+        """Push the variables of the saved config file to the W&B config."""
+        # Make the config file in work_dir importable as a module.
+        sys.path.append(runner.work_dir)
+        module_name = runner.meta['exp_name'][:-3]
+        cfg_module = importlib.import_module(module_name)
+        # Collect every module attribute that is not a dunder name.
+        self.wandb.config.update({
+            name: getattr(cfg_module, name)
+            for name in dir(cfg_module) if not name.startswith('__')
+        })
+
+    def _log_ckpt_as_artifact(self, model_path, aliases, metadata=None):
+        """Upload a checkpoint file as a W&B model Artifact.
+
+        Args:
+            model_path (str): Path of the checkpoint file to upload.
+            aliases (list): Aliases attached to the logged artifact.
+            metadata (dict, optional): Metadata stored with the artifact.
+        """
+        name = f'run_{self.wandb.run.id}_model'
+        ckpt = self.wandb.Artifact(name, type='model', metadata=metadata)
+        ckpt.add_file(model_path)
+        self.wandb.log_artifact(ckpt, aliases=aliases)
+
+    def _get_eval_results(self):
+        """Evaluate the eval hook's latest results on the validation set."""
+        hook = self.eval_hook
+        # Metrics are returned to the caller; the logger is set to 'silent'.
+        return self.val_dataset.evaluate(
+            hook.latest_results, logger='silent', **hook.eval_kwargs)
+
+    def _init_data_table(self):
+        """Create the empty W&B Table that holds the validation data."""
+        self.data_table = self.wandb.Table(
+            columns=['image_name', 'image'])
+
+    def _init_pred_table(self):
+        """Create the empty W&B Table that holds the model predictions."""
+        self.eval_table = self.wandb.Table(
+            columns=['image_name', 'ground_truth', 'prediction'])
+
+    def _add_ground_truth(self, runner):
+        """Fill the data table with sampled val images and their GT labels."""
+        # Get image loading pipeline
+        from mmdet.datasets.pipelines import LoadImageFromFile
+        img_loader = None
+        for t in self.val_dataset.pipeline.transforms:
+            if isinstance(t, LoadImageFromFile):
+                img_loader = t
+
+        if img_loader is None:
+            self.log_evaluation = False
+            runner.logger.warning(
+                'LoadImageFromFile is required to add images '
+                'to W&B Tables.')
+            return
+
+        # Select the images to be logged.
+        self.eval_image_indexs = np.arange(len(self.val_dataset))
+        # Private fixed-seed RNG: same images each run, global RNG untouched.
+        np.random.RandomState(42).shuffle(self.eval_image_indexs)
+        self.eval_image_indexs = self.eval_image_indexs[:self.num_eval_images]
+
+        CLASSES = self.val_dataset.CLASSES
+        self.class_id_to_label = {
+            class_id + 1: name
+            for class_id, name in enumerate(CLASSES)
+        }
+        self.class_set = self.wandb.Classes([{
+            'id': class_id,
+            'name': name
+        } for class_id, name in self.class_id_to_label.items()])
+
+        img_prefix = self.val_dataset.img_prefix
+
+        for idx in self.eval_image_indexs:
+            img_info = self.val_dataset.data_infos[idx]
+            image_name = img_info.get('filename', f'img_{idx}')
+            img_height, img_width = img_info['height'], img_info['width']
+
+            img_meta = img_loader(
+                dict(img_info=img_info, img_prefix=img_prefix))
+
+            # Get image and convert from BGR to RGB
+            image = mmcv.bgr2rgb(img_meta['img'])
+
+            data_ann = self.val_dataset.get_ann_info(idx)
+            bboxes = data_ann['bboxes']
+            labels = data_ann['labels']
+            masks = data_ann.get('masks', None)
+
+            # Get dict of bounding boxes to be logged.
+            assert len(bboxes) == len(labels)
+            wandb_boxes = self._get_wandb_bboxes(bboxes, labels)
+
+            # Get dict of masks to be logged.
+            if masks is not None:
+                wandb_masks = self._get_wandb_masks(
+                    masks,
+                    labels,
+                    is_poly_mask=True,
+                    height=img_height,
+                    width=img_width)
+            else:
+                wandb_masks = None
+            # TODO: Panoramic segmentation visualization.
+
+            # Log a row to the data table.
+            self.data_table.add_data(
+                image_name,
+                self.wandb.Image(
+                    image,
+                    boxes=wandb_boxes,
+                    masks=wandb_masks,
+                    classes=self.class_set))
+
+    def _log_predictions(self, results):
+        """Add each sampled image's predictions as a row of the eval table."""
+        table_idxs = self.data_table_ref.get_index()
+        assert len(table_idxs) == len(self.eval_image_indexs)
+
+        for ndx, eval_image_index in enumerate(self.eval_image_indexs):
+            # Results are indexed by dataset position; a tuple holds segms too.
+            result = results[eval_image_index]
+            if isinstance(result, tuple):
+                bbox_result, segm_result = result
+                if isinstance(segm_result, tuple):
+                    segm_result = segm_result[0]  # ms rcnn
+            else:
+                bbox_result, segm_result = result, None
+            assert len(bbox_result) == len(self.class_id_to_label)
+
+            # Stack per-class boxes; the label of a box is its class index.
+            bboxes = np.vstack(bbox_result)
+            labels = [
+                np.full(bbox.shape[0], i, dtype=np.int32)
+                for i, bbox in enumerate(bbox_result)
+            ]
+            labels = np.concatenate(labels)
+
+            # Get segmentation mask if available.
+            segms = None
+            if segm_result is not None and len(labels) > 0:
+                segms = mmcv.concat_list(segm_result)
+                segms = mask_util.decode(segms)
+                segms = segms.transpose(2, 0, 1)
+                assert len(segms) == len(labels)
+            # TODO: Panoramic segmentation visualization.
+
+            # Remove bounding boxes and masks with score lower than threshold.
+            if self.bbox_score_thr > 0:
+                assert bboxes is not None and bboxes.shape[1] == 5
+                scores = bboxes[:, -1]
+                inds = scores > self.bbox_score_thr
+                bboxes = bboxes[inds, :]
+                labels = labels[inds]
+                if segms is not None:
+                    segms = segms[inds, ...]
+
+            # Get dicts of bounding boxes and masks to be logged.
+            wandb_boxes = self._get_wandb_bboxes(bboxes, labels, log_gt=False)
+            if segms is not None:
+                wandb_masks = self._get_wandb_masks(segms, labels)
+            else:
+                wandb_masks = None
+
+            # Log a row to the eval table.
+            self.eval_table.add_data(
+                self.data_table_ref.data[ndx][0],
+                self.data_table_ref.data[ndx][1],
+                self.wandb.Image(
+                    self.data_table_ref.data[ndx][1],
+                    boxes=wandb_boxes,
+                    masks=wandb_masks,
+                    classes=self.class_set))
+
+    def _get_wandb_bboxes(self, bboxes, labels, log_gt=True):
+        """Get the structured dict for logging bounding boxes to W&B.
+
+        Args:
+            bboxes (list): Boxes in (minX, minY, maxX, maxY) format; a fifth
+                element, if present, is used as the confidence score.
+            labels (list[int]): Zero-based label ids, one per box.
+            log_gt (bool): Whether to log ground truth or prediction boxes.
+
+        Returns:
+            dict: Box data keyed by 'ground_truth' or 'predictions'.
+        """
+        wandb_boxes = {}
+
+        box_data = []
+        for bbox, label in zip(bboxes, labels):
+            if not isinstance(label, int):
+                label = int(label)
+            label = label + 1  # class_id_to_label keys are one-based
+
+            if len(bbox) == 5:
+                confidence = float(bbox[4])
+                class_name = self.class_id_to_label[label]
+                box_caption = f'{class_name} {confidence:.2f}'
+            else:
+                box_caption = str(self.class_id_to_label[label])
+
+            position = dict(
+                minX=int(bbox[0]),
+                minY=int(bbox[1]),
+                maxX=int(bbox[2]),
+                maxY=int(bbox[3]))
+
+            box_data.append({
+                'position': position,
+                'class_id': label,
+                'box_caption': box_caption,
+                'domain': 'pixel'
+            })
+
+        wandb_bbox_dict = {
+            'box_data': box_data,
+            'class_labels': self.class_id_to_label
+        }
+
+        if log_gt:
+            wandb_boxes['ground_truth'] = wandb_bbox_dict
+        else:
+            wandb_boxes['predictions'] = wandb_bbox_dict
+
+        return wandb_boxes
+
+    def _get_wandb_masks(self,
+                         masks,
+                         labels,
+                         is_poly_mask=False,
+                         height=None,
+                         width=None):
+        """Get the structured dict for logging masks to W&B.
+
+        Args:
+            masks (list): List of masks, one per label.
+            labels (list[int]): Zero-based label ids.
+            is_poly_mask (bool): Whether the masks are polygons that must be
+                converted to bitmaps. This is true for CocoDataset.
+            height (int): Height of the image.
+            width (int): Width of the image.
+
+        Returns:
+            dict: One composite mask entry per class name.
+        """
+        mask_label_dict = dict()
+        for mask, label in zip(masks, labels):
+            label = label + 1  # class_id_to_label keys are one-based
+            # Get bitmap mask from polygon.
+            if is_poly_mask:
+                if height is not None and width is not None:
+                    mask = polygon_to_bitmap(mask, height, width)
+            # Create composite masks for each class.
+            if label not in mask_label_dict.keys():
+                mask_label_dict[label] = mask
+            else:
+                mask_label_dict[label] = np.logical_or(mask_label_dict[label],
+                                                       mask)
+
+        wandb_masks = dict()
+        for key, value in mask_label_dict.items():
+            # Paint every foreground pixel with the one-based class id.
+            value = value.astype(np.uint8)
+            value[value > 0] = key
+
+            # Create dict of masks for logging.
+            class_name = self.class_id_to_label[key]
+            wandb_masks[class_name] = {
+                'mask_data': value,
+                'class_labels': self.class_id_to_label
+            }
+
+        return wandb_masks
+
+    def _log_data_table(self):
+        """Register the validation data table as a W&B dataset artifact.
+
+        When the run is online the artifact is used by the run and the table
+        is fetched back from it, so that evaluation tables can reference the
+        already uploaded images. Offline, the local table is referenced.
+        """
+        val_artifact = self.wandb.Artifact('val', type='dataset')
+        val_artifact.add(self.data_table, 'val_data')
+
+        if self.wandb.run.offline:
+            self.data_table_ref = self.data_table
+        else:
+            self.wandb.run.use_artifact(val_artifact)
+            val_artifact.wait()
+            self.data_table_ref = val_artifact.get('val_data')
+
+    def _log_eval_table(self, idx):
+        """Upload the current prediction table as a W&B evaluation artifact.
+
+        Every call logs a new version aliased 'latest' plus the epoch or
+        iteration number, so models from different intervals can be compared.
+
+        Args:
+            idx (int): Epoch or iteration number used in the alias.
+        """
+        alias = f'epoch_{idx}' if self.by_epoch else f'iter_{idx}'
+        name = f'run_{self.wandb.run.id}_pred'
+        art = self.wandb.Artifact(name, type='evaluation')
+        art.add(self.eval_table, 'eval_data')
+        self.wandb.run.log_artifact(art, aliases=['latest', alias])
diff --git a/mmdet/core/hook/yolox_lrupdater_hook.py b/mmdet/core/hook/yolox_lrupdater_hook.py
new file mode 100755
index 0000000..ecb028e
--- /dev/null
+++ b/mmdet/core/hook/yolox_lrupdater_hook.py
@@ -0,0 +1,67 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.runner.hooks import HOOKS
+from mmcv.runner.hooks.lr_updater import (CosineAnnealingLrUpdaterHook,
+                                          annealing_cos)
+
+
+@HOOKS.register_module()
+class YOLOXLrUpdaterHook(CosineAnnealingLrUpdaterHook):
+    """YOLOX learning rate scheme.
+
+    There are two main differences between YOLOXLrUpdaterHook
+    and CosineAnnealingLrUpdaterHook.
+
+       1. Once the training progress reaches `max_progress - last_iter`
+           (see `get_lr`), a fixed learning rate will be used
+       2. The exp warmup scheme is different from LrUpdaterHook in MMCV
+
+    Args:
+        num_last_epochs (int): The number of epochs with a fixed learning rate
+           before the end of the training.
+    """
+
+    def __init__(self, num_last_epochs, **kwargs):
+        self.num_last_epochs = num_last_epochs
+        super(YOLOXLrUpdaterHook, self).__init__(**kwargs)
+
+    def get_warmup_lr(self, cur_iters):
+        """Scale the base lr(s) by the quadratic warmup factor."""
+        def _get_warmup_lr(cur_iters, regular_lr):
+            # 'exp' warmup: the factor grows quadratically with the iteration
+            k = self.warmup_ratio * pow(
+                (cur_iters + 1) / float(self.warmup_iters), 2)
+            warmup_lr = [_lr * k for _lr in regular_lr]
+            return warmup_lr
+
+        if isinstance(self.base_lr, dict):
+            lr_groups = {}
+            for key, base_lr in self.base_lr.items():
+                lr_groups[key] = _get_warmup_lr(cur_iters, base_lr)
+            return lr_groups
+        else:
+            return _get_warmup_lr(cur_iters, self.base_lr)
+
+    def get_lr(self, runner, base_lr):
+        """Return cosine-annealed lr, or the target lr in the final phase."""
+        last_iter = len(runner.data_loader) * self.num_last_epochs
+        if self.by_epoch:
+            progress = runner.epoch  # NOTE(review): last_iter is in iters
+            max_progress = runner.max_epochs
+        else:
+            progress = runner.iter
+            max_progress = runner.max_iters
+
+        progress += 1
+
+        if self.min_lr_ratio is not None:
+            target_lr = base_lr * self.min_lr_ratio
+        else:
+            target_lr = self.min_lr
+
+        if progress >= max_progress - last_iter:
+            # fixed learning rate
+            return target_lr
+        else:
+            return annealing_cos(
+                base_lr, target_lr, (progress - self.warmup_iters) /
+                (max_progress - self.warmup_iters - last_iter))
diff --git a/mmdet/core/hook/yolox_mode_switch_hook.py b/mmdet/core/hook/yolox_mode_switch_hook.py
new file mode 100755
index 0000000..10834e6
--- /dev/null
+++ b/mmdet/core/hook/yolox_mode_switch_hook.py
@@ -0,0 +1,52 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.parallel import is_module_wrapper
+from mmcv.runner.hooks import HOOKS, Hook
+
+
+@HOOKS.register_module()
+class YOLOXModeSwitchHook(Hook):
+    """Switch the mode of YOLOX during training.
+
+    This hook turns off the mosaic and mixup data augmentation and switches
+    to use L1 loss in bbox_head.
+
+    Args:
+        num_last_epochs (int): The number of epochs at the end of the
+            training in which the data augmentation is closed and L1 loss
+            is used. Default: 15.
+        skip_type_keys (list[str], optional): Type names of the pipeline
+            transforms to skip. Default: ('Mosaic', 'RandomAffine', 'MixUp')
+    """
+
+    def __init__(self,
+                 num_last_epochs=15,
+                 skip_type_keys=('Mosaic', 'RandomAffine', 'MixUp')):
+        self.num_last_epochs = num_last_epochs
+        self.skip_type_keys = skip_type_keys
+        self._restart_dataloader = False
+
+    def before_train_epoch(self, runner):
+        """Close mosaic and mixup augmentation and switch to use L1 loss."""
+        epoch = runner.epoch
+        train_loader = runner.data_loader
+        model = runner.model
+        if is_module_wrapper(model):
+            model = model.module
+        if (epoch + 1) == runner.max_epochs - self.num_last_epochs:
+            runner.logger.info('No mosaic and mixup aug now!')
+            # The dataset pipeline cannot be updated when persistent_workers
+            # is True, so we need to force the dataloader's worker processes
+            # to restart. This is a very hacky approach.
+            train_loader.dataset.update_skip_type_keys(self.skip_type_keys)
+            if hasattr(train_loader, 'persistent_workers'
+                       ) and train_loader.persistent_workers is True:
+                train_loader._DataLoader__initialized = False
+                train_loader._iterator = None
+                self._restart_dataloader = True
+            runner.logger.info('Add additional L1 loss now!')
+            model.bbox_head.use_l1 = True
+        else:
+            # Once the restart is complete, we need to restore
+            # the initialization flag.
+            if self._restart_dataloader:
+                train_loader._DataLoader__initialized = True
diff --git a/mmdet/core/mask/__init__.py b/mmdet/core/mask/__init__.py
new file mode 100755
index 0000000..644a9b1
--- /dev/null
+++ b/mmdet/core/mask/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .mask_target import mask_target
+from .structures import BaseInstanceMasks, BitmapMasks, PolygonMasks
+from .utils import encode_mask_results, mask2bbox, split_combined_polys
+
+__all__ = [
+    'split_combined_polys', 'mask_target', 'BaseInstanceMasks', 'BitmapMasks',
+    'PolygonMasks', 'encode_mask_results', 'mask2bbox'
+]
diff --git a/mmdet/core/mask/mask_target.py b/mmdet/core/mask/mask_target.py
new file mode 100755
index 0000000..273e767
--- /dev/null
+++ b/mmdet/core/mask/mask_target.py
@@ -0,0 +1,127 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from torch.nn.modules.utils import _pair
+
+
+def mask_target(pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list,
+                cfg):
+    """Compute mask target for positive proposals in multiple images.
+
+    Args:
+        pos_proposals_list (list[Tensor]): Positive proposals in multiple
+            images.
+        pos_assigned_gt_inds_list (list[Tensor]): Assigned GT indices for each
+            positive proposal.
+        gt_masks_list (list[:obj:`BaseInstanceMasks`]): Ground truth masks of
+            each image.
+        cfg (dict): Config dict that specifies the mask size.
+
+    Returns:
+        Tensor: Mask targets of all images, concatenated along dim 0.
+
+    Example:
+        >>> import mmcv
+        >>> import mmdet
+        >>> from mmdet.core.mask import BitmapMasks
+        >>> from mmdet.core.mask.mask_target import *
+        >>> H, W = 17, 18
+        >>> cfg = mmcv.Config({'mask_size': (13, 14)})
+        >>> rng = np.random.RandomState(0)
+        >>> # Positive proposals (tl_x, tl_y, br_x, br_y) for each image
+        >>> pos_proposals_list = [
+        >>>     torch.Tensor([
+        >>>         [ 7.2425,  5.5929, 13.9414, 14.9541],
+        >>>         [ 7.3241,  3.6170, 16.3850, 15.3102],
+        >>>     ]),
+        >>>     torch.Tensor([
+        >>>         [ 4.8448, 6.4010, 7.0314, 9.7681],
+        >>>         [ 5.9790, 2.6989, 7.4416, 4.8580],
+        >>>         [ 0.0000, 0.0000, 0.1398, 9.8232],
+        >>>     ]),
+        >>> ]
+        >>> # Index of the assigned GT mask for each proposal in each image
+        >>> pos_assigned_gt_inds_list = [
+        >>>     torch.LongTensor([7, 0]),
+        >>>     torch.LongTensor([5, 4, 1]),
+        >>> ]
+        >>> # Ground truth mask for each true object for each image
+        >>> gt_masks_list = [
+        >>>     BitmapMasks(rng.rand(8, H, W), height=H, width=W),
+        >>>     BitmapMasks(rng.rand(6, H, W), height=H, width=W),
+        >>> ]
+        >>> mask_targets = mask_target(
+        >>>     pos_proposals_list, pos_assigned_gt_inds_list,
+        >>>     gt_masks_list, cfg)
+        >>> assert mask_targets.shape == (5,) + cfg['mask_size']
+    """
+    mask_targets = [
+        mask_target_single(proposals, gt_inds, gt_masks, cfg)
+        for proposals, gt_inds, gt_masks in zip(
+            pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list)
+    ]
+    # With no images there is nothing to concatenate: return the empty list.
+    return torch.cat(mask_targets) if mask_targets else mask_targets
+
+
+def mask_target_single(pos_proposals, pos_assigned_gt_inds, gt_masks, cfg):
+    """Compute mask target for each positive proposal in the image.
+
+    Args:
+        pos_proposals (Tensor): Positive proposals.
+        pos_assigned_gt_inds (Tensor): Assigned GT inds of positive proposals.
+        gt_masks (:obj:`BaseInstanceMasks`): Bitmap or Polygon GT masks.
+        cfg (dict): Config dict that indicates the mask size.
+
+    Returns:
+        Tensor: Mask target of each positive proposal in the image.
+
+    Example:
+        >>> import mmcv
+        >>> import mmdet
+        >>> from mmdet.core.mask import BitmapMasks
+        >>> from mmdet.core.mask.mask_target import *  # NOQA
+        >>> H, W = 32, 32
+        >>> cfg = mmcv.Config({'mask_size': (7, 11)})
+        >>> rng = np.random.RandomState(0)
+        >>> # Masks for each ground truth box (relative to the image)
+        >>> gt_masks_data = rng.rand(3, H, W)
+        >>> gt_masks = BitmapMasks(gt_masks_data, height=H, width=W)
+        >>> # Predicted positive boxes in one image
+        >>> pos_proposals = torch.FloatTensor([
+        >>>     [ 16.2,   5.5, 19.9, 20.9],
+        >>>     [ 17.3,  13.6, 19.3, 19.3],
+        >>>     [ 14.8,  16.4, 17.0, 23.7],
+        >>>     [  0.0,   0.0, 16.0, 16.0],
+        >>>     [  4.0,   0.0, 20.0, 16.0],
+        >>> ])
+        >>> # For each predicted proposal, its assignment to a gt mask
+        >>> pos_assigned_gt_inds = torch.LongTensor([0, 1, 2, 1, 1])
+        >>> mask_targets = mask_target_single(
+        >>>     pos_proposals, pos_assigned_gt_inds, gt_masks, cfg)
+        >>> assert mask_targets.shape == (5,) + cfg['mask_size']
+    """
+    device = pos_proposals.device
+    mask_size = _pair(cfg.mask_size)
+    binarize = not cfg.get('soft_mask_target', False)
+    num_pos = pos_proposals.size(0)
+    if num_pos > 0:
+        # Copy: numpy() aliases CPU tensors and the clipping below is in place.
+        proposals_np = pos_proposals.cpu().numpy().copy()
+        maxh, maxw = gt_masks.height, gt_masks.width
+        proposals_np[:, [0, 2]] = np.clip(proposals_np[:, [0, 2]], 0, maxw)
+        proposals_np[:, [1, 3]] = np.clip(proposals_np[:, [1, 3]], 0, maxh)
+        pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy()
+
+        mask_targets = gt_masks.crop_and_resize(
+            proposals_np,
+            mask_size,
+            device=device,
+            inds=pos_assigned_gt_inds,
+            binarize=binarize).to_ndarray()
+
+        mask_targets = torch.from_numpy(mask_targets).float().to(device)
+    else:
+        mask_targets = pos_proposals.new_zeros((0, ) + mask_size)
+
+    return mask_targets
diff --git a/mmdet/core/mask/structures.py b/mmdet/core/mask/structures.py
new file mode 100755
index 0000000..7e730dc
--- /dev/null
+++ b/mmdet/core/mask/structures.py
@@ -0,0 +1,1102 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+import cv2
+import mmcv
+import numpy as np
+import pycocotools.mask as maskUtils
+import torch
+from mmcv.ops.roi_align import roi_align
+
+
+class BaseInstanceMasks(metaclass=ABCMeta):
+    """Abstract interface shared by the instance-mask containers."""
+
+    @abstractmethod
+    def rescale(self, scale, interpolation='nearest'):
+        """Rescale masks as large as possible while keeping the aspect ratio.
+        For details, refer to :func:`mmcv.imrescale`.
+
+        Args:
+            scale (tuple[int]): The maximum size (h, w) of rescaled mask.
+            interpolation (str): Same as :func:`mmcv.imrescale`.
+
+        Returns:
+            BaseInstanceMasks: The rescaled masks.
+        """
+
+    @abstractmethod
+    def resize(self, out_shape, interpolation='nearest'):
+        """Resize masks to the given out_shape.
+
+        Args:
+            out_shape (tuple[int]): Target (h, w) of resized mask.
+            interpolation (str): See :func:`mmcv.imresize`.
+
+        Returns:
+            BaseInstanceMasks: The resized masks.
+        """
+
+    @abstractmethod
+    def flip(self, flip_direction='horizontal'):
+        """Flip masks along the given direction.
+
+        Args:
+            flip_direction (str): Either 'horizontal' or 'vertical'.
+
+        Returns:
+            BaseInstanceMasks: The flipped masks.
+        """
+
+    @abstractmethod
+    def pad(self, out_shape, pad_val):
+        """Pad masks to the given size of (h, w).
+
+        Args:
+            out_shape (tuple[int]): Target (h, w) of padded mask.
+            pad_val (int): The padded value.
+
+        Returns:
+            BaseInstanceMasks: The padded masks.
+        """
+
+    @abstractmethod
+    def crop(self, bbox):
+        """Crop each mask by the given bbox.
+
+        Args:
+            bbox (ndarray): Bbox in format [x1, y1, x2, y2], shape (4, ).
+
+        Return:
+            BaseInstanceMasks: The cropped masks.
+        """
+
+    @abstractmethod
+    def crop_and_resize(self,
+                        bboxes,
+                        out_shape,
+                        inds,
+                        device,
+                        interpolation='bilinear',
+                        binarize=True):
+        """Crop and resize masks by the given bboxes.
+
+        This function is mainly used in mask targets computation.
+        It first aligns masks to bboxes via ``inds``, then crops each mask by
+        its assigned bbox and resizes it to ``out_shape`` (mask_h, mask_w).
+
+        Args:
+            bboxes (Tensor | ndarray): Bboxes as [x1, y1, x2, y2], shape (N, 4)
+            out_shape (tuple[int]): Target (h, w) of resized mask
+            inds (ndarray): Indexes to assign masks to each bbox,
+                shape (N,) and values should be between [0, num_masks - 1].
+            device (str): Device of bboxes
+            interpolation (str): See `mmcv.imresize`
+            binarize (bool): If True, fractional values are rounded to 0 or 1
+                after the resize operation. If False and unsupported, an error
+                will be raised. Defaults to True.
+
+        Return:
+            BaseInstanceMasks: the cropped and resized masks.
+        """
+
+    @abstractmethod
+    def expand(self, expanded_h, expanded_w, top, left):
+        """See :class:`Expand`."""
+
+    @property
+    @abstractmethod
+    def areas(self):
+        """ndarray: areas of each instance."""
+
+    @abstractmethod
+    def to_ndarray(self):
+        """Convert masks to the format of ndarray.
+
+        Return:
+            ndarray: Converted masks in the format of ndarray.
+        """
+
+    @abstractmethod
+    def to_tensor(self, dtype, device):
+        """Convert masks to the format of Tensor.
+
+        Args:
+            dtype (str): Dtype of converted mask.
+            device (torch.device): Device of converted masks.
+
+        Returns:
+            Tensor: Converted masks in the format of Tensor.
+        """
+
+    @abstractmethod
+    def translate(self,
+                  out_shape,
+                  offset,
+                  direction='horizontal',
+                  fill_val=0,
+                  interpolation='bilinear'):
+        """Translate the masks.
+
+        Args:
+            out_shape (tuple[int]): Shape for output mask, format (h, w).
+            offset (int | float): The offset for translate.
+            direction (str): The translate direction, either "horizontal"
+                or "vertical".
+            fill_val (int | float): Border value. Default 0.
+            interpolation (str): Same as :func:`mmcv.imtranslate`.
+
+        Returns:
+            Translated masks.
+        """
+
+    def shear(self,
+              out_shape,
+              magnitude,
+              direction='horizontal',
+              border_value=0,
+              interpolation='bilinear'):
+        """Shear the masks.
+
+        Args:
+            out_shape (tuple[int]): Shape for output mask, format (h, w).
+            magnitude (int | float): The magnitude used for shear.
+            direction (str): The shear direction, either "horizontal"
+                or "vertical".
+            border_value (int | tuple[int]): Value used in case of a
+                constant border. Default 0.
+            interpolation (str): Same as in :func:`mmcv.imshear`.
+
+        Returns:
+            Sheared masks. NOTE: not abstract here; the base body returns None.
+        """
+
+    @abstractmethod
+    def rotate(self, out_shape, angle, center=None, scale=1.0, fill_val=0):
+        """Rotate the masks.
+
+        Args:
+            out_shape (tuple[int]): Shape for output mask, format (h, w).
+            angle (int | float): Rotation angle in degrees. Positive values
+                mean counter-clockwise rotation.
+            center (tuple[float], optional): Center point (w, h) of the
+                rotation in source image. If not specified, the center of
+                the image will be used.
+            scale (int | float): Isotropic scale factor.
+            fill_val (int | float): Border value. Default 0 for masks.
+
+        Returns:
+            Rotated masks.
+        """
+
+
+class BitmapMasks(BaseInstanceMasks):
+    """This class represents masks in the form of bitmaps.
+
+    Args:
+        masks (ndarray): ndarray of masks in shape (N, H, W), where N is
+            the number of objects. A list of (H, W) arrays is also accepted.
+        height (int): height of masks
+        width (int): width of masks
+
+    Example:
+        >>> from mmdet.core.mask.structures import *  # NOQA
+        >>> num_masks, H, W = 3, 32, 32
+        >>> rng = np.random.RandomState(0)
+        >>> masks = (rng.rand(num_masks, H, W) > 0.1).astype(np.int64)
+        >>> self = BitmapMasks(masks, height=H, width=W)
+
+        >>> # demo crop_and_resize
+        >>> num_boxes = 5
+        >>> bboxes = np.array([[0, 0, 30, 10.0]] * num_boxes)
+        >>> out_shape = (14, 14)
+        >>> inds = torch.randint(0, len(self), size=(num_boxes,))
+        >>> device = 'cpu'
+        >>> interpolation = 'bilinear'
+        >>> new = self.crop_and_resize(
+        ...     bboxes, out_shape, inds, device, interpolation)
+        >>> assert len(new) == num_boxes
+        >>> assert (new.height, new.width) == out_shape
+    """
+
+    def __init__(self, masks, height, width):
+        self.height = height
+        self.width = width
+        if len(masks) == 0:  # no instances: store an empty (0, H, W) array
+            self.masks = np.empty((0, self.height, self.width), dtype=np.uint8)
+        else:
+            assert isinstance(masks, (list, np.ndarray))
+            if isinstance(masks, list):
+                assert isinstance(masks[0], np.ndarray)
+                assert masks[0].ndim == 2  # (H, W)
+            else:
+                assert masks.ndim == 3  # (N, H, W)
+
+            self.masks = np.stack(masks).reshape(-1, height, width)
+            assert self.masks.shape[1] == self.height
+            assert self.masks.shape[2] == self.width
+
+    def __getitem__(self, index):
+        """Select masks and wrap them in a new container.
+
+        Args:
+            index (int | ndarray): Indices in the format of integer or ndarray.
+
+        Returns:
+            :obj:`BitmapMasks`: The selected masks, always shaped (K, H, W).
+        """
+        shape = (-1, self.height, self.width)
+        return BitmapMasks(self.masks[index].reshape(shape), *shape[1:])
+
+    def __iter__(self):
+        return iter(self.masks)  # yields one (H, W) array per mask
+
+    def __repr__(self):
+        """Summarise the container as ``Class(num_masks, height, width)``."""
+        return (f'{self.__class__.__name__}('
+                f'num_masks={len(self.masks)}, '
+                f'height={self.height}, '
+                f'width={self.width})')
+
+    def __len__(self):
+        """Return the number of masks (length of the first array axis)."""
+        return len(self.masks)
+
+    def rescale(self, scale, interpolation='nearest'):
+        """Rescale all masks, keeping the aspect ratio (see base class)."""
+        if len(self.masks) > 0:
+            scaled = np.stack([
+                mmcv.imrescale(m, scale, interpolation=interpolation)
+                for m in self.masks
+            ])
+        else:
+            # No masks to resample: only work out the target (h, w).
+            out_w, out_h = mmcv.rescale_size((self.width, self.height), scale)
+            scaled = np.empty((0, out_h, out_w), dtype=np.uint8)
+        return BitmapMasks(scaled, *scaled.shape[1:])
+
+    def resize(self, out_shape, interpolation='nearest'):
+        """Resize every mask to ``out_shape``, given as (h, w)."""
+        if len(self.masks) > 0:
+            size_wh = out_shape[::-1]  # reversed: imresize is handed (w, h)
+            resized = np.stack([
+                mmcv.imresize(m, size_wh, interpolation=interpolation)
+                for m in self.masks
+            ])
+        else:
+            resized = np.empty((0, *out_shape), dtype=np.uint8)
+        return BitmapMasks(resized, *out_shape)
+
+    def flip(self, flip_direction='horizontal'):
+        """Flip every mask horizontally, vertically or diagonally."""
+        assert flip_direction in ('horizontal', 'vertical', 'diagonal')
+
+        flipped = self.masks  # nothing to flip when there are no masks
+        if len(flipped) > 0:
+            flipped = np.stack([
+                mmcv.imflip(m, direction=flip_direction)
+                for m in flipped
+            ])
+        # The stored height and width are passed through unchanged.
+        return BitmapMasks(flipped, self.height, self.width)
+
+    def pad(self, out_shape, pad_val=0):
+        """Pad every mask to ``out_shape`` (h, w), filling with ``pad_val``."""
+        if len(self.masks) > 0:
+            padded = np.stack([
+                mmcv.impad(m, shape=out_shape, pad_val=pad_val)
+                for m in self.masks
+            ])
+        else:
+            padded = np.empty((0, *out_shape), dtype=np.uint8)
+        return BitmapMasks(padded, *out_shape)
+
+    def crop(self, bbox):
+        """Crop all masks to one [x1, y1, x2, y2] box, clipped to the canvas."""
+        assert isinstance(bbox, np.ndarray)
+        assert bbox.ndim == 1
+
+        # clip the box to the mask canvas, on a copy of the caller's array
+        bbox = bbox.copy()
+        bbox[0::2] = np.clip(bbox[0::2], 0, self.width)
+        bbox[1::2] = np.clip(bbox[1::2], 0, self.height)
+        x1, y1, x2, y2 = bbox
+        w = np.maximum(x2 - x1, 1)  # crop is at least one pixel wide
+        h = np.maximum(y2 - y1, 1)  # ... and at least one pixel high
+
+        if len(self.masks) == 0:
+            cropped_masks = np.empty((0, h, w), dtype=np.uint8)
+        else:
+            cropped_masks = self.masks[:, y1:y1 + h, x1:x1 + w]
+        return BitmapMasks(cropped_masks, h, w)
+
+    def crop_and_resize(self,
+                        bboxes,
+                        out_shape,
+                        inds,
+                        device='cpu',
+                        interpolation='bilinear',
+                        binarize=True):
+        """Crop each assigned mask by its bbox and resample via roi_align."""
+        if len(self.masks) == 0:
+            empty_masks = np.empty((0, *out_shape), dtype=np.uint8)
+            return BitmapMasks(empty_masks, *out_shape)
+
+        # convert ndarray bboxes / inds to tensors on the target device
+        if isinstance(bboxes, np.ndarray):
+            bboxes = torch.from_numpy(bboxes).to(device=device)
+        if isinstance(inds, np.ndarray):
+            inds = torch.from_numpy(inds).to(device=device)
+
+        num_bbox = bboxes.shape[0]
+        fake_inds = torch.arange(
+            num_bbox, device=device).to(dtype=bboxes.dtype)[:, None]
+        rois = torch.cat([fake_inds, bboxes], dim=1)  # Nx5: (row, x1..y2)
+        rois = rois.to(device=device)
+        if num_bbox > 0:
+            gt_masks_th = torch.from_numpy(self.masks).to(device).index_select(
+                0, inds).to(dtype=rois.dtype)
+            targets = roi_align(gt_masks_th[:, None, :, :], rois, out_shape,
+                                1.0, 0, 'avg', True).squeeze(1)
+            if binarize:
+                resized_masks = (targets >= 0.5).cpu().numpy()  # bool mask
+            else:
+                resized_masks = targets.cpu().numpy()  # soft (float) targets
+        else:
+            resized_masks = []  # stored as an empty (0, h, w) array
+        return BitmapMasks(resized_masks, *out_shape)
+
+    def expand(self, expanded_h, expanded_w, top, left):
+        """Paste the masks at (top, left) onto a zero-filled canvas."""
+        canvas_shape = (len(self), expanded_h, expanded_w)
+        if len(self.masks) == 0:
+            canvas = np.empty(canvas_shape, dtype=np.uint8)
+        else:
+            canvas = np.zeros(canvas_shape, dtype=np.uint8)
+            rows = slice(top, top + self.height)
+            cols = slice(left, left + self.width)
+            canvas[:, rows, cols] = self.masks
+        return BitmapMasks(canvas, expanded_h, expanded_w)
+
+    def translate(self,
+                  out_shape,
+                  offset,
+                  direction='horizontal',
+                  fill_val=0,
+                  interpolation='bilinear'):
+        """Translate the BitmapMasks.
+
+        Args:
+            out_shape (tuple[int]): Shape for output mask, format (h, w).
+            offset (int | float): The offset for translate.
+            direction (str): The translate direction, either "horizontal"
+                or "vertical".
+            fill_val (int | float): Border value. Default 0 for masks.
+            interpolation (str): Same as :func:`mmcv.imtranslate`.
+
+        Returns:
+            BitmapMasks: Translated BitmapMasks.
+
+        Example:
+            >>> from mmdet.core.mask.structures import BitmapMasks
+            >>> self = BitmapMasks.random(dtype=np.uint8)
+            >>> out_shape = (32, 32)
+            >>> offset = 4
+            >>> direction = 'horizontal'
+            >>> fill_val = 0
+            >>> interpolation = 'bilinear'
+            >>> # Note, There seem to be issues when:
+            >>> # * out_shape is different than self's shape
+            >>> # * the mask dtype is not supported by cv2.AffineWarp
+            >>> new = self.translate(out_shape, offset, direction, fill_val,
+            >>>                      interpolation)
+            >>> assert len(new) == len(self)
+            >>> assert (new.height, new.width) == out_shape
+        """
+        if len(self.masks) == 0:
+            return BitmapMasks(
+                np.empty((0, *out_shape), dtype=np.uint8), *out_shape)
+        # Masks are moved to the last axis, (H, W, N), for the mmcv call.
+        shifted = mmcv.imtranslate(
+            self.masks.transpose((1, 2, 0)),
+            offset,
+            direction,
+            border_value=fill_val,
+            interpolation=interpolation)
+        if shifted.ndim == 2:
+            shifted = shifted[:, :, None]  # single mask: restore channel axis
+        shifted = shifted.transpose((2, 0, 1)).astype(self.masks.dtype)
+        return BitmapMasks(shifted, *out_shape)
+
+    def shear(self,
+              out_shape,
+              magnitude,
+              direction='horizontal',
+              border_value=0,
+              interpolation='bilinear'):
+        """Shear the BitmapMasks.
+
+        Args:
+            out_shape (tuple[int]): Shape for output mask, format (h, w).
+            magnitude (int | float): The magnitude used for shear.
+            direction (str): The shear direction, either "horizontal"
+                or "vertical".
+            border_value (int | tuple[int]): Value used in case of a
+                constant border.
+            interpolation (str): Same as in :func:`mmcv.imshear`.
+
+        Returns:
+            BitmapMasks: The sheared masks.
+        """
+        if len(self.masks) == 0:
+            return BitmapMasks(
+                np.empty((0, *out_shape), dtype=np.uint8), *out_shape)
+        # Masks are moved to the last axis, (H, W, N), for the mmcv call.
+        skewed = mmcv.imshear(
+            self.masks.transpose((1, 2, 0)),
+            magnitude,
+            direction,
+            border_value=border_value,
+            interpolation=interpolation)
+        if skewed.ndim == 2:
+            skewed = skewed[:, :, None]  # single mask: restore channel axis
+        skewed = skewed.transpose((2, 0, 1)).astype(self.masks.dtype)
+        return BitmapMasks(skewed, *out_shape)
+
+    def rotate(self, out_shape, angle, center=None, scale=1.0, fill_val=0):
+        """Rotate the BitmapMasks.
+
+        Args:
+            out_shape (tuple[int]): Shape for output mask, format (h, w).
+            angle (int | float): Rotation angle in degrees. Positive values
+                mean counter-clockwise rotation.
+            center (tuple[float], optional): Center point (w, h) of the
+                rotation in source image. If not specified, the center of
+                the image will be used.
+            scale (int | float): Isotropic scale factor.
+            fill_val (int | float): Border value. Default 0 for masks.
+
+        Returns:
+            BitmapMasks: Rotated BitmapMasks.
+        """
+        src_dtype = self.masks.dtype
+        if len(self.masks) == 0:
+            return BitmapMasks(np.empty((0, *out_shape), dtype=src_dtype),
+                               *out_shape)
+        turned = mmcv.imrotate(
+            self.masks.transpose((1, 2, 0)),
+            angle,
+            center=center,
+            scale=scale,
+            border_value=fill_val)
+        if turned.ndim == 2:
+            # a single mask comes back as (h, w); make it (h, w, 1)
+            turned = turned[:, :, None]
+        turned = turned.transpose((2, 0, 1)).astype(src_dtype)
+        return BitmapMasks(turned, *out_shape)
+
+    @property
+    def areas(self):
+        """ndarray: Per-mask sum of pixel values (pixel count if binary)."""
+        return self.masks.sum(axis=(1, 2))
+
+    def to_ndarray(self):
+        """Return the underlying mask array itself (no copy is made)."""
+        return self.masks
+
+    def to_tensor(self, dtype, device):
+        """Return the masks as a new tensor of the given dtype and device."""
+        return torch.tensor(self.masks, dtype=dtype, device=device)
+
+    @classmethod
+    def random(cls,
+               num_masks=3,
+               height=32,
+               width=32,
+               dtype=np.uint8,
+               rng=None):
+        """Build random bitmap masks for demos and tests.
+
+        Example:
+            >>> from mmdet.core.mask.structures import BitmapMasks
+            >>> self = BitmapMasks.random()
+            >>> print('self = {}'.format(self))
+            self = BitmapMasks(num_masks=3, height=32, width=32)
+        """
+        from mmdet.utils.util_random import ensure_rng
+        generator = ensure_rng(rng)
+        # a pixel is set whenever its random draw exceeds 0.1
+        noise = generator.rand(num_masks, height, width)
+        return cls((noise > 0.1).astype(dtype), height=height, width=width)
+
+    def get_bboxes(self):
+        """Return the tight (x1, y1, x2, y2) box of each mask, shape (N, 4)."""
+        boxes = np.zeros((len(self), 4), dtype=np.float32)
+        col_hit = self.masks.any(axis=1)  # (N, W): columns with foreground
+        row_hit = self.masks.any(axis=2)  # (N, H): rows with foreground
+        for box, cols, rows in zip(boxes, col_hit, row_hit):
+            xs = np.flatnonzero(cols)
+            ys = np.flatnonzero(rows)
+            if len(xs) > 0 and len(ys) > 0:
+                # x2 / y2 get +1 so that the right and bottom edges of the
+                # instance are fully enclosed by the box; empty masks keep
+                # their all-zero box.
+                box[:] = (xs[0], ys[0], xs[-1] + 1, ys[-1] + 1)
+        return boxes
+
+
+class PolygonMasks(BaseInstanceMasks):
+    """This class represents masks in the form of polygons.
+
+    Polygons is a list of three levels. The first level of the list
+    corresponds to objects, the second level to the polys that compose the
+    object, the third level to the poly coordinates
+
+    Args:
+        masks (list[list[ndarray]]): The first level of the list
+            corresponds to objects, the second level to the polys that
+            compose the object, the third level to the poly coordinates
+        height (int): height of masks
+        width (int): width of masks
+
+    Example:
+        >>> from mmdet.core.mask.structures import *  # NOQA
+        >>> masks = [
+        >>>     [ np.array([0, 0, 10, 0, 10, 10., 0, 10, 0, 0]) ]
+        >>> ]
+        >>> height, width = 16, 16
+        >>> self = PolygonMasks(masks, height, width)
+
+        >>> # demo translate
+        >>> new = self.translate((16, 16), 4., direction='horizontal')
+        >>> assert np.all(new.masks[0][0][1::2] == masks[0][0][1::2])
+        >>> assert np.all(new.masks[0][0][0::2] == masks[0][0][0::2] + 4)
+
+        >>> # demo crop_and_resize
+        >>> num_boxes = 3
+        >>> bboxes = np.array([[0, 0, 30, 10.0]] * num_boxes)
+        >>> out_shape = (16, 16)
+        >>> inds = torch.randint(0, len(self), size=(num_boxes,))
+        >>> device = 'cpu'
+        >>> interpolation = 'bilinear'
+        >>> new = self.crop_and_resize(
+        ...     bboxes, out_shape, inds, device, interpolation)
+        >>> assert len(new) == num_boxes
+        >>> assert (new.height, new.width) == out_shape
+    """
+
+    def __init__(self, masks, height, width):
+        assert isinstance(masks, list)
+        if len(masks) > 0:  # only the first object / polygon is type-checked
+            assert isinstance(masks[0], list)
+            assert isinstance(masks[0][0], np.ndarray)
+
+        self.height = height
+        self.width = width
+        self.masks = masks  # stored by reference, not copied
+
+    def __getitem__(self, index):
+        """Index the polygon masks.
+
+        Args:
+            index (int | slice | ndarray | List): The indices.
+
+        Returns:
+            :obj:`PolygonMasks`: The indexed polygon masks.
+        """
+        if isinstance(index, np.ndarray):
+            index = index.tolist()
+        if isinstance(index, list):
+            masks = [self.masks[i] for i in index]
+        else:
+            try:
+                masks = self.masks[index]
+            except Exception as err:  # keep the real failure as the cause
+                raise ValueError(f'Unsupported input of type {type(index)} '
+                                 'for indexing!') from err
+        if len(masks) and isinstance(masks[0], np.ndarray):
+            masks = [masks]  # ensure a list of three levels
+        return PolygonMasks(masks, self.height, self.width)
+
+    def __iter__(self):
+        return iter(self.masks)  # yields each object's list of polygons
+
+    def __repr__(self):
+        """Summarise the container as ``Class(num_masks, height, width)``."""
+        return (f'{self.__class__.__name__}('
+                f'num_masks={len(self.masks)}, '
+                f'height={self.height}, '
+                f'width={self.width})')
+
+    def __len__(self):
+        """Return the number of objects (length of the outer list)."""
+        return len(self.masks)
+
+    def rescale(self, scale, interpolation=None):
+        """Rescale the polygons; ``interpolation`` is accepted but unused."""
+        # rescale_size is given (w, h) and its result is unpacked as (w, h)
+        new_w, new_h = mmcv.rescale_size((self.width, self.height), scale)
+        if len(self.masks) > 0:
+            return self.resize((new_h, new_w))
+        # nothing to scale: only the canvas size changes
+        return PolygonMasks([], new_h, new_w)
+
+    def resize(self, out_shape, interpolation=None):
+        """Scale polygon coordinates to ``out_shape`` (h, w).
+
+        ``interpolation`` is accepted for interface parity but is unused.
+        """
+        if len(self.masks) == 0:
+            return PolygonMasks([], *out_shape)
+        sy = out_shape[0] / self.height
+        sx = out_shape[1] / self.width
+        scaled_objs = []
+        for polys in self.masks:
+            scaled = [poly.copy() for poly in polys]
+            for poly in scaled:
+                # even slots hold x, odd slots hold y
+                poly[0::2] = poly[0::2] * sx
+                poly[1::2] = poly[1::2] * sy
+            scaled_objs.append(scaled)
+        return PolygonMasks(scaled_objs, *out_shape)
+
+    def flip(self, flip_direction='horizontal'):
+        """Mirror polygon coordinates horizontally, vertically or both.
+
+        The canvas size is unchanged; the input polygons are not modified.
+        """
+        assert flip_direction in ('horizontal', 'vertical', 'diagonal')
+        if len(self.masks) == 0:
+            return PolygonMasks([], self.height, self.width)
+        # 'diagonal' mirrors both axes; otherwise just the named one.
+        mirror_x = flip_direction != 'vertical'
+        mirror_y = flip_direction != 'horizontal'
+        flipped_objs = []
+        for polys in self.masks:
+            flipped_polys = []
+            for poly in polys:
+                poly = poly.copy()
+                if mirror_x:
+                    poly[0::2] = self.width - poly[0::2]
+                if mirror_y:
+                    poly[1::2] = self.height - poly[1::2]
+                flipped_polys.append(poly)
+            flipped_objs.append(flipped_polys)
+        return PolygonMasks(flipped_objs, self.height, self.width)
+
+    def crop(self, bbox):
+        """Shift all polygons into the frame of one [x1, y1, x2, y2] box."""
+        assert isinstance(bbox, np.ndarray)
+        assert bbox.ndim == 1
+
+        # clip the box to the mask canvas, on a copy of the caller's array
+        bbox = bbox.copy()
+        bbox[0::2] = np.clip(bbox[0::2], 0, self.width)
+        bbox[1::2] = np.clip(bbox[1::2], 0, self.height)
+        x1, y1, x2, y2 = bbox
+        w = np.maximum(x2 - x1, 1)  # crop is at least one pixel wide
+        h = np.maximum(y2 - y1, 1)  # ... and at least one pixel high
+
+        if len(self.masks) == 0:
+            cropped_masks = PolygonMasks([], h, w)
+        else:
+            cropped_masks = []
+            for poly_per_obj in self.masks:
+                cropped_poly_per_obj = []
+                for p in poly_per_obj:
+                    # points are only shifted; pycocotools will clip them
+                    p = p.copy()
+                    p[0::2] = p[0::2] - bbox[0]
+                    p[1::2] = p[1::2] - bbox[1]
+                    cropped_poly_per_obj.append(p)
+                cropped_masks.append(cropped_poly_per_obj)
+            cropped_masks = PolygonMasks(cropped_masks, h, w)
+        return cropped_masks
+
+    def pad(self, out_shape, pad_val=0):
+        """Padding has no effect on polygons; only the canvas size changes."""
+        return PolygonMasks(self.masks, *out_shape)
+
+    def expand(self, *args, **kwargs):
+        """Not implemented for polygons. TODO: Add expand for polygon."""
+        raise NotImplementedError
+
+    def crop_and_resize(self,
+                        bboxes,
+                        out_shape,
+                        inds,
+                        device='cpu',
+                        interpolation='bilinear',
+                        binarize=True):
+        """Crop each box and rescale it to ``out_shape``.
+
+        See :func:`BaseInstanceMasks.crop_and_resize`. ``device`` and
+        ``interpolation`` are accepted but not used by this implementation.
+        """
+        target_h, target_w = out_shape
+        if len(self.masks) == 0:
+            return PolygonMasks([], target_h, target_w)
+
+        if not binarize:
+            raise ValueError('Polygons are always binary, '
+                             'setting binarize=False is unsupported')
+        outputs = []
+        for box_idx in range(len(bboxes)):
+            instance = self.masks[inds[box_idx]]
+            box = bboxes[box_idx, :]
+            left, top, right, bottom = box
+            box_w = np.maximum(right - left, 1)
+            box_h = np.maximum(bottom - top, 1)
+            # the 0.1 floor keeps the scale factors from growing too large
+            scale_y = target_h / max(box_h, 0.1)
+            scale_x = target_w / max(box_w, 0.1)
+            rescaled = []
+            for poly in instance:
+                poly = poly.copy()
+                # crop (pycocotools will clip the boundary), then resize
+                poly[0::2] = poly[0::2] - box[0]
+                poly[1::2] = poly[1::2] - box[1]
+                poly[0::2] = poly[0::2] * scale_x
+                poly[1::2] = poly[1::2] * scale_y
+                rescaled.append(poly)
+            outputs.append(rescaled)
+        return PolygonMasks(outputs, *out_shape)
+
+    def translate(self,
+                  out_shape,
+                  offset,
+                  direction='horizontal',
+                  fill_val=None,
+                  interpolation=None):
+        """Translate the PolygonMasks, clipping vertices to ``out_shape``.
+
+        Example:
+            >>> self = PolygonMasks.random(dtype=np.int64)
+            >>> out_shape = (self.height, self.width)
+            >>> new = self.translate(out_shape, 4., direction='horizontal')
+            >>> assert np.all(new.masks[0][0][1::2] == self.masks[0][0][1::2])
+            >>> assert np.all(new.masks[0][0][0::2] == self.masks[0][0][0::2] + 4)  # noqa: E501
+        """
+        assert fill_val is None or fill_val == 0, 'Here fill_val is not '\
+            f'used, and defaultly should be None or 0. got {fill_val}.'
+        if len(self.masks) == 0:
+            translated_masks = PolygonMasks([], *out_shape)
+        else:
+            translated_masks = []
+            for poly_per_obj in self.masks:
+                translated_poly_per_obj = []
+                for p in poly_per_obj:
+                    p = p.copy()  # leave the stored polygon untouched
+                    if direction == 'horizontal':
+                        p[0::2] = np.clip(p[0::2] + offset, 0, out_shape[1])
+                    elif direction == 'vertical':  # any other value: no shift
+                        p[1::2] = np.clip(p[1::2] + offset, 0, out_shape[0])
+                    translated_poly_per_obj.append(p)
+                translated_masks.append(translated_poly_per_obj)
+            translated_masks = PolygonMasks(translated_masks, *out_shape)
+        return translated_masks
+
+    def shear(self,
+              out_shape,
+              magnitude,
+              direction='horizontal',
+              border_value=0,
+              interpolation='bilinear'):
+        """Shear polygon vertices; see :func:`BaseInstanceMasks.shear`."""
+        if len(self.masks) == 0:
+            sheared_masks = PolygonMasks([], *out_shape)
+        else:
+            sheared_masks = []
+            if direction == 'horizontal':
+                shear_matrix = np.stack([[1, magnitude],
+                                         [0, 1]]).astype(np.float32)
+            elif direction == 'vertical':  # NOTE(review): no else branch,
+                shear_matrix = np.stack([[1, 0], [magnitude,  # so other values
+                                                  1]]).astype(np.float32)  # fail
+            for poly_per_obj in self.masks:
+                sheared_poly = []
+                for p in poly_per_obj:
+                    p = np.stack([p[0::2], p[1::2]], axis=0)  # [2, n]
+                    new_coords = np.matmul(shear_matrix, p)  # [2, n]
+                    new_coords[0, :] = np.clip(new_coords[0, :], 0,
+                                               out_shape[1])
+                    new_coords[1, :] = np.clip(new_coords[1, :], 0,
+                                               out_shape[0])
+                    sheared_poly.append(
+                        new_coords.transpose((1, 0)).reshape(-1))
+                sheared_masks.append(sheared_poly)
+            sheared_masks = PolygonMasks(sheared_masks, *out_shape)
+        return sheared_masks
+
+    def rotate(self, out_shape, angle, center=None, scale=1.0, fill_val=0):
+        """See :func:`BaseInstanceMasks.rotate`.
+
+        ``center=None`` rotates about the mask centre, like ``mmcv.imrotate``.
+        """
+        if len(self.masks) == 0:
+            return PolygonMasks([], *out_shape)
+        if center is None:
+            # cv2.getRotationMatrix2D cannot take None as the centre
+            center = ((self.width - 1) * 0.5, (self.height - 1) * 0.5)
+        rotate_matrix = cv2.getRotationMatrix2D(center, -angle, scale)
+        rotated_masks = []
+        for poly_per_obj in self.masks:
+            rotated_poly = []
+            for p in poly_per_obj:
+                coords = np.stack([p[0::2], p[1::2]], axis=1)  # [n, 2]
+                coords = np.concatenate(
+                    (coords, np.ones((coords.shape[0], 1), coords.dtype)),
+                    axis=1)  # homogeneous [x, y, 1], shape [n, 3]
+                rotated_coords = np.matmul(rotate_matrix[None, :, :],
+                                           coords[:, :, None])[..., 0]  # [n, 2]
+                rotated_coords[:, 0] = np.clip(rotated_coords[:, 0], 0,
+                                               out_shape[1])
+                rotated_coords[:, 1] = np.clip(rotated_coords[:, 1], 0,
+                                               out_shape[0])
+                rotated_poly.append(rotated_coords.reshape(-1))
+            rotated_masks.append(rotated_poly)
+        return PolygonMasks(rotated_masks, *out_shape)
+
+    def to_bitmap(self):
+        """Convert polygon masks to bitmap masks of the same height/width."""
+        bitmap_masks = self.to_ndarray()
+        return BitmapMasks(bitmap_masks, self.height, self.width)
+
+    @property
+    def areas(self):
+        """Per-instance polygon areas (shoelace formula).
+
+        Adapted from `detectron2
+        <https://github.com/facebookresearch/detectron2/blob/ffff8acc35ea88ad1cb1806ab0f00b4c1c5dbfd9/detectron2/structures/masks.py#L387>`_.
+        Each instance area is the sum over its polygon components.
+
+        Return:
+            ndarray: areas of each instance
+        """  # noqa: W501
+        per_instance = [
+            sum(
+                self._polygon_area(poly[0::2], poly[1::2])
+                for poly in instance)
+            for instance in self.masks
+        ]
+        return np.asarray(per_instance)
+
+    def _polygon_area(self, x, y):
+        """Area of one polygon component via the shoelace formula.
+
+        Reference:
+        https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates
+
+        Args:
+            x (ndarray): x coordinates of the component
+            y (ndarray): y coordinates of the component
+
+        Return:
+            float: the area of the component
+        """  # noqa: 501
+        cross_sum = np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1))
+        return 0.5 * np.abs(cross_sum)
+
+    def to_ndarray(self):
+        """Convert every instance to a bitmap and stack the results."""
+        if len(self.masks) == 0:
+            return np.empty((0, self.height, self.width), dtype=np.uint8)
+        rasters = [
+            polygon_to_bitmap(instance, self.height, self.width)
+            for instance in self.masks
+        ]
+        return np.stack(rasters)
+
+    def to_tensor(self, dtype, device):
+        """See :func:`BaseInstanceMasks.to_tensor`."""
+        if len(self.masks) == 0:
+            empty_shape = (0, self.height, self.width)
+            return torch.empty(empty_shape, dtype=dtype, device=device)
+        # rasterise first, then hand the array to torch
+        as_array = self.to_ndarray()
+        return torch.tensor(as_array, dtype=dtype, device=device)
+
+    @classmethod
+    def random(cls,
+               num_masks=3,
+               height=32,
+               width=32,
+               n_verts=5,
+               dtype=np.float32,
+               rng=None):
+        """Generate random polygon masks for demo / testing purposes.
+
+        Adapted from [1]_
+
+        References:
+            .. [1] https://gitlab.kitware.com/computer-vision/kwimage/-/blob/928cae35ca8/kwimage/structs/polygon.py#L379  # noqa: E501
+
+        Example:
+            >>> from mmdet.core.mask.structures import PolygonMasks
+            >>> self = PolygonMasks.random()
+            >>> print('self = {}'.format(self))
+        """
+        from mmdet.utils.util_random import ensure_rng
+        rng = ensure_rng(rng)
+
+        def _gen_polygon(n, irregularity, spikeyness):
+            """Creates the polygon by sampling points on a circle around the
+            centre.  Random noise is added by varying the angular spacing
+            between sequential points, and by varying the radial distance of
+            each point from the centre.
+
+            Based on original code by Mike Ounsworth
+
+            Args:
+                n (int): number of vertices
+                irregularity (float): [0,1] indicating how much variance there
+                    is in the angular spacing of vertices. [0,1] will map to
+                    [0, 2pi/numberOfVerts]
+                spikeyness (float): [0,1] indicating how much variance there is
+                    in each vertex from the circle of radius aveRadius. [0,1]
+                    will map to [0, aveRadius]
+
+            Returns:
+                ndarray: an (n, 2) array of vertices inside the unit square.
+            """
+            from scipy.stats import truncnorm  # local import, used only here
+
+            # Generate around the unit circle
+            cx, cy = (0.0, 0.0)
+            radius = 1
+
+            tau = np.pi * 2
+
+            irregularity = np.clip(irregularity, 0, 1) * 2 * np.pi / n
+            spikeyness = np.clip(spikeyness, 1e-9, 1)  # keeps std non-zero
+
+            # generate n angle steps
+            lower = (tau / n) - irregularity
+            upper = (tau / n) + irregularity
+            angle_steps = rng.uniform(lower, upper, n)
+
+            # normalize the steps so that point 0 and point n+1 are the same
+            k = angle_steps.sum() / (2 * np.pi)
+            angles = (angle_steps / k).cumsum() + rng.uniform(0, tau)
+
+            # Convert high and low values to be wrt the standard normal range
+            # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.truncnorm.html
+            low = 0
+            high = 2 * radius
+            mean = radius
+            std = spikeyness
+            a = (low - mean) / std
+            b = (high - mean) / std
+            tnorm = truncnorm(a=a, b=b, loc=mean, scale=std)
+
+            # now generate the points
+            radii = tnorm.rvs(n, random_state=rng)
+            x_pts = cx + radii * np.cos(angles)
+            y_pts = cy + radii * np.sin(angles)
+
+            points = np.hstack([x_pts[:, None], y_pts[:, None]])
+
+            # Scale to 0-1 space
+            points = points - points.min(axis=0)
+            points = points / points.max(axis=0)
+
+            # Randomly shrink (factor in [0.2, 1.0)) and place within 0-1 space
+            points = points * (rng.rand() * .8 + .2)
+            min_pt = points.min(axis=0)
+            max_pt = points.max(axis=0)
+
+            high = (1 - max_pt)
+            low = (0 - min_pt)
+            offset = (rng.rand(2) * (high - low)) + low
+            points = points + offset
+            return points
+
+        def _order_vertices(verts):
+            """
+            References:
+                https://stackoverflow.com/questions/1709283/how-can-i-sort-a-coordinate-list-for-a-rectangle-counterclockwise
+            """
+            mlat = verts.T[0].sum() / len(verts)  # mean of column 0
+            mlng = verts.T[1].sum() / len(verts)  # mean of column 1
+
+            tau = np.pi * 2
+            angle = (np.arctan2(mlat - verts.T[0], verts.T[1] - mlng) +
+                     tau) % tau
+            sortx = angle.argsort()  # sort vertices by angle around the mean
+            verts = verts.take(sortx, axis=0)
+            return verts
+
+        # Generate a random exterior for each requested mask
+        masks = []
+        for _ in range(num_masks):
+            exterior = _order_vertices(_gen_polygon(n_verts, 0.9, 0.9))
+            exterior = (exterior * [(width, height)]).astype(dtype)
+            masks.append([exterior.ravel()])  # one polygon per instance
+
+        self = cls(masks, height, width)
+        return self
+
+    def get_bboxes(self):
+        """Return the (N, 4) float32 boxes enclosing each instance."""
+        boxes = np.zeros((len(self), 4), dtype=np.float32)
+        # start the running minimum at a value larger than any coordinate
+        upper = np.array([self.width * 2, self.height * 2],
+                         dtype=np.float32)
+        for row, instance in enumerate(self.masks):
+            lo = upper
+            hi = np.zeros(2, dtype=np.float32)
+            for poly in instance:
+                pts = np.array(poly).reshape(-1, 2).astype(np.float32)
+                lo = np.minimum(lo, np.min(pts, axis=0))
+                hi = np.maximum(hi, np.max(pts, axis=0))
+            boxes[row, :2] = lo
+            boxes[row, 2:] = hi
+
+        return boxes
+
+
+def polygon_to_bitmap(polygons, height, width):
+    """Rasterise the polygons of one instance into a single bitmap.
+
+    Args:
+        polygons (list[ndarray]): masks in polygon representation
+        height (int): mask height
+        width (int): mask width
+
+    Return:
+        ndarray: the converted masks in bitmap representation
+    """
+    # encode each polygon as RLE, merge them, then decode to a bool array
+    merged_rle = maskUtils.merge(
+        maskUtils.frPyObjects(polygons, height, width))
+    return maskUtils.decode(merged_rle).astype(bool)
+
+
+def bitmap_to_polygon(bitmap):
+    """Convert masks from the form of bitmaps to polygons.
+
+    Args:
+        bitmap (ndarray): masks in bitmap representation.
+
+    Return:
+        list[ndarray]: the converted mask in polygon representation.
+        bool: whether the mask has holes.
+    """
+    mask_u8 = np.ascontiguousarray(bitmap).astype(np.uint8)
+    # cv2.RETR_CCOMP builds a two-level hierarchy: external boundaries of
+    #   the components on top, hole boundaries on the second level; a
+    #   contour nested inside a hole is put back at the top level.
+    # cv2.CHAIN_APPROX_NONE stores absolutely all the contour points.
+    found = cv2.findContours(mask_u8, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
+    # the last two entries are used, whatever the length of the result tuple
+    raw_contours, hierarchy = found[-2], found[-1]
+    if hierarchy is None:
+        return [], False
+    # hierarchy[i] has 4 elements: the indexes of the next, previous,
+    # parent, or nested contours, with -1 where there is no such contour.
+    # Column 3 is the one inspected here: any entry >= 0 is treated as
+    # evidence of a hole.
+    linked_ids = hierarchy.reshape(-1, 4)[:, 3]
+    with_hole = (linked_ids >= 0).any()
+    polygons = [c.reshape(-1, 2) for c in raw_contours]
+    return polygons, with_hole
diff --git a/mmdet/core/mask/utils.py b/mmdet/core/mask/utils.py
new file mode 100755
index 0000000..90544b3
--- /dev/null
+++ b/mmdet/core/mask/utils.py
@@ -0,0 +1,89 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import pycocotools.mask as mask_util
+import torch
+
+
+def split_combined_polys(polys, poly_lens, polys_per_mask):
+    """Undo the per-image concatenation of polygon masks.
+
+    A mask is a list of polys and a poly is a 1-D array.  In the dataset
+    all masks of an image are concatenated into a single 1-D tensor; this
+    function slices that tensor back into the nested representation.
+
+    Args:
+        polys (list): a list (length = image num) of 1-D tensors
+        poly_lens (list): a list (length = image num) of poly length
+        polys_per_mask (list): a list (length = image num) of poly number
+            of each mask
+
+    Returns:
+        list: a list (length = image num) of list (length = mask num) of \
+            list (length = poly num) of numpy array.
+    """
+    per_image = []
+    for idx, flat in enumerate(polys):
+        lengths = poly_lens[idx].tolist()
+        counts = polys_per_mask[idx].tolist()
+
+        # first cut the flat tensor into polys, then group polys into masks
+        pieces = mmcv.slice_list(flat, lengths)
+        grouped = mmcv.slice_list(pieces, counts)
+        per_image.append(grouped)
+    return per_image
+
+
+# TODO: move this function to more proper place
+def encode_mask_results(mask_results):
+    """Encode bitmap mask to RLE code.
+
+    Args:
+        mask_results (list | tuple[list]): bitmap mask results.
+            In mask scoring rcnn, mask_results is a tuple of (segm_results,
+            segm_cls_score).
+
+    Returns:
+        list | tuple: RLE encoded mask.
+    """
+    with_scores = isinstance(mask_results, tuple)  # mask scoring
+    if with_scores:
+        cls_segms, cls_mask_scores = mask_results
+    else:
+        cls_segms = mask_results
+
+    def _to_rle(segm):
+        # the encoder is fed a Fortran-ordered uint8 array of shape (h, w, 1)
+        packed = np.array(segm[:, :, np.newaxis], order='F', dtype='uint8')
+        return mask_util.encode(packed)[0]
+
+    # one list of RLE-encoded masks per class
+    encoded = [[_to_rle(segm) for segm in cls_segms[i]]
+               for i in range(len(cls_segms))]
+    if with_scores:
+        return encoded, cls_mask_scores
+    return encoded
+
+
+def mask2bbox(masks):
+    """Obtain tight bounding boxes of binary masks.
+
+    Args:
+        masks (Tensor): Binary mask of shape (n, h, w).
+
+    Returns:
+        Tensor: Bboxes with shape (n, 4) of \
+            positive region in binary mask.
+    """
+    num_masks = masks.shape[0]
+    boxes = masks.new_zeros((num_masks, 4), dtype=torch.float32)
+    cols_hit = torch.any(masks, dim=1)
+    rows_hit = torch.any(masks, dim=2)
+    for k in range(num_masks):
+        xs = torch.where(cols_hit[k, :])[0]
+        ys = torch.where(rows_hit[k, :])[0]
+        if len(xs) == 0 or len(ys) == 0:
+            continue  # empty mask keeps its all-zero box
+        corners = [xs[0], ys[0], xs[-1] + 1, ys[-1] + 1]
+        boxes[k, :] = boxes.new_tensor(corners)
+    return boxes
diff --git a/mmdet/core/optimizers/__init__.py b/mmdet/core/optimizers/__init__.py
new file mode 100755
index 0000000..e867d07
--- /dev/null
+++ b/mmdet/core/optimizers/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .builder import OPTIMIZER_BUILDERS, build_optimizer
+from .layer_decay_optimizer_constructor import \
+    LearningRateDecayOptimizerConstructor
+
+__all__ = [
+    'LearningRateDecayOptimizerConstructor', 'OPTIMIZER_BUILDERS',
+    'build_optimizer'
+]
diff --git a/mmdet/core/optimizers/builder.py b/mmdet/core/optimizers/builder.py
new file mode 100755
index 0000000..406dd9b
--- /dev/null
+++ b/mmdet/core/optimizers/builder.py
@@ -0,0 +1,33 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+from mmcv.runner.optimizer import OPTIMIZER_BUILDERS as MMCV_OPTIMIZER_BUILDERS
+from mmcv.utils import Registry, build_from_cfg
+
+OPTIMIZER_BUILDERS = Registry(
+    'optimizer builder', parent=MMCV_OPTIMIZER_BUILDERS)
+
+
+def build_optimizer_constructor(cfg):
+    """Build the constructor from the local registry, else from mmcv's."""
+    constructor_type = cfg.get('type')
+    # the project-level registry takes precedence over the mmcv one
+    for registry in (OPTIMIZER_BUILDERS, MMCV_OPTIMIZER_BUILDERS):
+        if constructor_type in registry:
+            return build_from_cfg(cfg, registry)
+    raise KeyError(f'{constructor_type} is not registered '
+                   'in the optimizer builder registry.')
+
+
+def build_optimizer(model, cfg):
+    """Build an optimizer for ``model`` from ``cfg``.
+
+    ``cfg`` is deep-copied, so popping keys does not alter the caller's dict.
+    """
+    remaining = copy.deepcopy(cfg)
+    constructor_cfg = dict(
+        type=remaining.pop('constructor', 'DefaultOptimizerConstructor'),
+        paramwise_cfg=remaining.pop('paramwise_cfg', None))
+    # whatever is left after the two pops configures the optimizer itself
+    constructor_cfg['optimizer_cfg'] = remaining
+    return build_optimizer_constructor(constructor_cfg)(model)
diff --git a/mmdet/core/optimizers/layer_decay_optimizer_constructor.py b/mmdet/core/optimizers/layer_decay_optimizer_constructor.py
new file mode 100755
index 0000000..1bc3469
--- /dev/null
+++ b/mmdet/core/optimizers/layer_decay_optimizer_constructor.py
@@ -0,0 +1,154 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+
+from mmcv.runner import DefaultOptimizerConstructor, get_dist_info
+
+from mmdet.utils import get_root_logger
+from .builder import OPTIMIZER_BUILDERS
+
+
+def get_layer_id_for_convnext(var_name, max_layer_id):
+    """Get the layer id to set the different learning rates in ``layer_wise``
+    decay_type.
+
+    Args:
+        var_name (str): The key of the model.
+        max_layer_id (int): Maximum layer id.
+
+    Returns:
+        int: The id number corresponding to different learning rate in
+        ``LearningRateDecayOptimizerConstructor``.
+
+    Raises:
+        ValueError: If a ``downsample_layers`` / ``stages`` key names a
+            stage index other than 0-3.
+    """
+
+    if var_name in ('backbone.cls_token', 'backbone.mask_token',
+                    'backbone.pos_embed'):
+        return 0
+
+    is_downsample = var_name.startswith('backbone.downsample_layers')
+    if not (is_downsample or var_name.startswith('backbone.stages')):
+        # every other parameter gets the id right after ``max_layer_id``
+        return max_layer_id + 1
+
+    parts = var_name.split('.')
+    stage_id = int(parts[2])
+    if is_downsample:
+        # one fixed id per downsample layer
+        stage_to_layer = {0: 0, 1: 2, 2: 3, 3: max_layer_id}
+    else:
+        block_id = int(parts[3])
+        # stage 2 advances by one id for every three blocks
+        stage_to_layer = {0: 1, 1: 2, 2: 3 + block_id // 3, 3: max_layer_id}
+
+    if stage_id not in stage_to_layer:
+        # this case used to die with an UnboundLocalError on ``layer_id``
+        raise ValueError(
+            f'Unsupported ConvNeXt stage {stage_id} in "{var_name}"')
+    return stage_to_layer[stage_id]
+
+
+def get_stage_id_for_convnext(var_name, max_stage_id):
+    """Get the stage id to set the different learning rates in ``stage_wise``
+    decay_type.
+
+    Args:
+        var_name (str): The key of the model.
+        max_stage_id (int): Maximum stage id.
+
+    Returns:
+        int: The id number corresponding to different learning rate in
+        ``LearningRateDecayOptimizerConstructor``.
+    """
+
+    embedding_keys = ('backbone.cls_token', 'backbone.mask_token',
+                      'backbone.pos_embed')
+    # embeddings and downsample layers share the first stage id
+    if (var_name in embedding_keys
+            or var_name.startswith('backbone.downsample_layers')):
+        return 0
+    if var_name.startswith('backbone.stages'):
+        return int(var_name.split('.')[2]) + 1
+    # every remaining parameter
+    return max_stage_id - 1
+
+
+@OPTIMIZER_BUILDERS.register_module()
+class LearningRateDecayOptimizerConstructor(DefaultOptimizerConstructor):
+    # Builds param groups whose learning rate is scaled per layer / stage id.
+    # Note: Currently, this optimizer constructor is built for ConvNeXt.
+
+    def add_params(self, params, module, **kwargs):
+        """Add all parameters of module to the params list.
+
+        The parameters of the given module will be added to the list of param
+        groups, with specific rules defined by paramwise_cfg.
+
+        Args:
+            params (list[dict]): A list of param groups, it will be modified
+                in place.
+            module (nn.Module): The module to be added.
+        """
+        logger = get_root_logger()
+
+        parameter_groups = {}
+        logger.info(f'self.paramwise_cfg is {self.paramwise_cfg}')
+        num_layers = self.paramwise_cfg.get('num_layers') + 2
+        decay_rate = self.paramwise_cfg.get('decay_rate')
+        decay_type = self.paramwise_cfg.get('decay_type', 'layer_wise')
+        logger.info('Build LearningRateDecayOptimizerConstructor  '
+                    f'{decay_type} {decay_rate} - {num_layers}')
+        weight_decay = self.base_wd
+        for name, param in module.named_parameters():
+            if not param.requires_grad:
+                continue  # frozen weights
+            if len(param.shape) == 1 or name.endswith('.bias') or name in (
+                    'pos_embed', 'cls_token'):
+                group_name = 'no_decay'  # 1-D params, biases, embeddings
+                this_weight_decay = 0.
+            else:
+                group_name = 'decay'
+                this_weight_decay = weight_decay
+            if 'layer_wise' in decay_type:
+                if 'ConvNeXt' in module.backbone.__class__.__name__:
+                    layer_id = get_layer_id_for_convnext(
+                        name, self.paramwise_cfg.get('num_layers'))
+                    logger.info(f'set param {name} as id {layer_id}')
+                else:
+                    raise NotImplementedError()
+            elif decay_type == 'stage_wise':  # NOTE(review): no else branch;
+                if 'ConvNeXt' in module.backbone.__class__.__name__:
+                    layer_id = get_stage_id_for_convnext(name, num_layers)
+                    logger.info(f'set param {name} as id {layer_id}')
+                else:
+                    raise NotImplementedError()
+            group_name = f'layer_{layer_id}_{group_name}'  # layer_id may be unset
+
+            if group_name not in parameter_groups:
+                scale = decay_rate**(num_layers - layer_id - 1)  # smaller id, smaller lr if decay_rate < 1
+
+                parameter_groups[group_name] = {
+                    'weight_decay': this_weight_decay,
+                    'params': [],
+                    'param_names': [],
+                    'lr_scale': scale,
+                    'group_name': group_name,
+                    'lr': scale * self.base_lr,
+                }
+
+            parameter_groups[group_name]['params'].append(param)
+            parameter_groups[group_name]['param_names'].append(name)
+        rank, _ = get_dist_info()
+        if rank == 0:  # log the grouping once, on rank 0 only
+            to_display = {}
+            for key in parameter_groups:
+                to_display[key] = {
+                    'param_names': parameter_groups[key]['param_names'],
+                    'lr_scale': parameter_groups[key]['lr_scale'],
+                    'lr': parameter_groups[key]['lr'],
+                    'weight_decay': parameter_groups[key]['weight_decay'],
+                }
+            logger.info(f'Param groups = {json.dumps(to_display, indent=2)}')
+        params.extend(parameter_groups.values())
diff --git a/mmdet/core/post_processing/__init__.py b/mmdet/core/post_processing/__init__.py
new file mode 100755
index 0000000..00376bd
--- /dev/null
+++ b/mmdet/core/post_processing/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .bbox_nms import fast_nms, multiclass_nms
+from .matrix_nms import mask_matrix_nms
+from .merge_augs import (merge_aug_bboxes, merge_aug_masks,
+                         merge_aug_proposals, merge_aug_scores)
+
+__all__ = [
+    'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes',
+    'merge_aug_scores', 'merge_aug_masks', 'mask_matrix_nms', 'fast_nms'
+]
diff --git a/mmdet/core/post_processing/bbox_nms.py b/mmdet/core/post_processing/bbox_nms.py
new file mode 100755
index 0000000..4fcf57b
--- /dev/null
+++ b/mmdet/core/post_processing/bbox_nms.py
@@ -0,0 +1,171 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.ops.nms import batched_nms
+
+from mmdet.core.bbox.iou_calculators import bbox_overlaps
+
+
+def multiclass_nms(multi_bboxes,
+                   multi_scores,
+                   score_thr,
+                   nms_cfg,
+                   max_num=-1,
+                   score_factors=None,
+                   return_inds=False):
+    """NMS for multi-class bboxes.
+
+    Args:
+        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
+        multi_scores (Tensor): shape (n, #class+1), where the last column
+            contains scores of the background class, but this will be ignored.
+        score_thr (float): bbox threshold, bboxes with scores lower than it
+            will not be considered.
+        nms_cfg (dict): a dict that contains the arguments of nms operations
+        max_num (int, optional): if there are more than max_num bboxes after
+            NMS, only top max_num will be kept. Default to -1.
+        score_factors (Tensor, optional): The factors multiplied to scores
+            before applying NMS. Default to None.
+        return_inds (bool, optional): Whether return the indices of kept
+            bboxes. Default to False.
+
+    Returns:
+        tuple: (dets, labels, indices (optional)), tensors of shape (k, 5),
+            (k), and (k). Dets are boxes with scores. Labels are 0-based.
+    """
+    num_classes = multi_scores.size(1) - 1
+    # exclude background category
+    if multi_bboxes.shape[1] > 4:
+        bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
+    else:
+        bboxes = multi_bboxes[:, None].expand(
+            multi_scores.size(0), num_classes, 4)
+
+    scores = multi_scores[:, :-1]
+
+    labels = torch.arange(num_classes, dtype=torch.long, device=scores.device)
+    labels = labels.view(1, -1).expand_as(scores)
+
+    bboxes = bboxes.reshape(-1, 4)  # flatten to one row per (box, class) pair
+    scores = scores.reshape(-1)
+    labels = labels.reshape(-1)
+
+    if not torch.onnx.is_in_onnx_export():
+        # NonZero not supported  in TensorRT
+        # remove low scoring boxes
+        valid_mask = scores > score_thr
+    # multiply score_factor after threshold to preserve more bboxes, improve
+    # mAP by 1% for YOLOv3
+    if score_factors is not None:
+        # expand the shape to match original shape of score
+        score_factors = score_factors.view(-1, 1).expand(
+            multi_scores.size(0), num_classes)
+        score_factors = score_factors.reshape(-1)
+        scores = scores * score_factors
+
+    if not torch.onnx.is_in_onnx_export():
+        # NonZero not supported  in TensorRT
+        inds = valid_mask.nonzero(as_tuple=False).squeeze(1)
+        bboxes, scores, labels = bboxes[inds], scores[inds], labels[inds]
+    else:
+        # TensorRT NMS plugin has invalid output filled with -1
+        # add dummy data to make detection output correct.
+        bboxes = torch.cat([bboxes, bboxes.new_zeros(1, 4)], dim=0)
+        scores = torch.cat([scores, scores.new_zeros(1)], dim=0)
+        labels = torch.cat([labels, labels.new_zeros(1)], dim=0)
+
+    if bboxes.numel() == 0:
+        if torch.onnx.is_in_onnx_export():
+            raise RuntimeError('[ONNX Error] Can not record NMS '
+                               'as it has not been executed this time')
+        dets = torch.cat([bboxes, scores[:, None]], -1)
+        if return_inds:
+            return dets, labels, inds
+        else:
+            return dets, labels
+
+    dets, keep = batched_nms(bboxes, scores, labels, nms_cfg)
+
+    if max_num > 0:
+        dets = dets[:max_num]
+        keep = keep[:max_num]
+
+    if return_inds:  # NOTE(review): inds is never set during ONNX export
+        return dets, labels[keep], inds[keep]
+    else:
+        return dets, labels[keep]
+
+
+def fast_nms(multi_bboxes,
+             multi_scores,
+             multi_coeffs,
+             score_thr,
+             iou_thr,
+             top_k,
+             max_num=-1):
+    """Fast NMS in `YOLACT <https://arxiv.org/abs/1904.02689>`_.
+
+    Fast NMS allows already-removed detections to suppress other detections so
+    that every instance can be decided to be kept or discarded in parallel,
+    which is not possible in traditional NMS. This relaxation allows us to
+    implement Fast NMS entirely in standard GPU-accelerated matrix operations.
+
+    Args:
+        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
+        multi_scores (Tensor): shape (n, #class+1), where the last column
+            contains scores of the background class, but this will be ignored.
+        multi_coeffs (Tensor): shape (n, #class*coeffs_dim).
+        score_thr (float): bbox threshold, bboxes with scores lower than it
+            will not be considered.
+        iou_thr (float): IoU threshold to be considered as conflicted.
+        top_k (int): if there are more than top_k bboxes before NMS,
+            only top top_k will be kept.
+        max_num (int): if there are more than max_num bboxes after NMS,
+            only top max_num will be kept. If -1, keep all the bboxes.
+            Default: -1.
+
+    Returns:
+        tuple: (dets, labels, coefficients), tensors of shape (k, 5), (k,),
+            and (k, coeffs_dim). Dets are boxes with scores.
+            Labels are 0-based.
+    """
+
+    scores = multi_scores[:, :-1].t()  # [#class, n]
+    scores, idx = scores.sort(1, descending=True)
+
+    idx = idx[:, :top_k].contiguous()
+    scores = scores[:, :top_k]  # [#class, topk]
+    num_classes, num_dets = idx.size()
+    boxes = multi_bboxes[idx.view(-1), :].view(num_classes, num_dets, 4)
+    coeffs = multi_coeffs[idx.view(-1), :].view(num_classes, num_dets, -1)
+
+    iou = bbox_overlaps(boxes, boxes)  # [#class, topk, topk]
+    iou.triu_(diagonal=1)  # keep only overlaps with higher-scored boxes
+    iou_max, _ = iou.max(dim=1)
+
+    # Now just filter out the ones higher than the threshold
+    keep = iou_max <= iou_thr
+
+    # Second thresholding introduces 0.2 mAP gain at negligible time cost
+    keep *= scores > score_thr
+
+    # Assign each kept detection to its corresponding class
+    classes = torch.arange(
+        num_classes, device=boxes.device)[:, None].expand_as(keep)
+    classes = classes[keep]  # boolean indexing flattens to 1-D
+
+    boxes = boxes[keep]
+    coeffs = coeffs[keep]
+    scores = scores[keep]
+
+    # Only keep the top max_num highest scores across all classes
+    scores, idx = scores.sort(0, descending=True)
+    if max_num > 0:
+        idx = idx[:max_num]
+        scores = scores[:max_num]
+
+    classes = classes[idx]
+    boxes = boxes[idx]
+    coeffs = coeffs[idx]
+
+    cls_dets = torch.cat([boxes, scores[:, None]], dim=1)
+    return cls_dets, classes, coeffs
diff --git a/mmdet/core/post_processing/matrix_nms.py b/mmdet/core/post_processing/matrix_nms.py
new file mode 100755
index 0000000..9dc8c4f
--- /dev/null
+++ b/mmdet/core/post_processing/matrix_nms.py
@@ -0,0 +1,121 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+
+def mask_matrix_nms(masks,
+                    labels,
+                    scores,
+                    filter_thr=-1,
+                    nms_pre=-1,
+                    max_num=-1,
+                    kernel='gaussian',
+                    sigma=2.0,
+                    mask_area=None):
+    """Matrix NMS for multi-class masks.
+
+    Args:
+        masks (Tensor): Has shape (num_instances, h, w)
+        labels (Tensor): Labels of corresponding masks,
+            has shape (num_instances,).
+        scores (Tensor): Mask scores of corresponding masks,
+            has shape (num_instances,).
+        filter_thr (float): Score threshold to filter the masks
+            after matrix nms. Default: -1, which means do not
+            use filter_thr.
+        nms_pre (int): The max number of instances to do the matrix nms.
+            Default: -1, which means do not use nms_pre.
+        max_num (int, optional): If there are more than max_num masks after
+            matrix nms, only top max_num are kept. Default: -1, which means
+            do not use max_num.
+        kernel (str): 'linear' or 'gaussian'.
+        sigma (float): Factor of the gaussian kernel exp(-sigma * iou**2).
+        mask_area (Tensor): Area of each mask; masks.sum((1, 2)) if None.
+
+    Returns:
+        tuple(Tensor): Processed mask results.
+
+            - scores (Tensor): Updated scores, has shape (n,).
+            - labels (Tensor): Remained labels, has shape (n,).
+            - masks (Tensor): Remained masks, has shape (n, h, w).
+            - keep_inds (Tensor): Indices of the remaining masks
+                in the input masks, has shape (n,).
+    """
+    assert len(labels) == len(masks) == len(scores)
+    if len(labels) == 0:
+        return scores.new_zeros(0), labels.new_zeros(0), masks.new_zeros(
+            0, *masks.shape[-2:]), labels.new_zeros(0)
+    if mask_area is None:
+        mask_area = masks.sum((1, 2)).float()
+    else:
+        assert len(masks) == len(mask_area)
+
+    # sort by descending score and keep top nms_pre
+    scores, sort_inds = torch.sort(scores, descending=True)
+
+    keep_inds = sort_inds
+    if nms_pre > 0 and len(sort_inds) > nms_pre:
+        sort_inds = sort_inds[:nms_pre]
+        keep_inds = keep_inds[:nms_pre]
+        scores = scores[:nms_pre]
+    masks = masks[sort_inds]
+    mask_area = mask_area[sort_inds]
+    labels = labels[sort_inds]
+
+    num_masks = len(labels)
+    flatten_masks = masks.reshape(num_masks, -1).float()
+    # Pairwise intersections: matrix product of the flattened masks.
+    inter_matrix = torch.mm(flatten_masks, flatten_masks.transpose(1, 0))
+    expanded_mask_area = mask_area.expand(num_masks, num_masks)
+    # Upper triangle iou matrix (entry [i, j] is kept only for i < j).
+    iou_matrix = (inter_matrix /
+                  (expanded_mask_area + expanded_mask_area.transpose(1, 0) -
+                   inter_matrix)).triu(diagonal=1)
+    # label_specific matrix.
+    expanded_labels = labels.expand(num_masks, num_masks)
+    # Upper triangle label matrix: True where two masks share a label.
+    label_matrix = (expanded_labels == expanded_labels.transpose(
+        1, 0)).triu(diagonal=1)
+
+    # IoU compensation: column-wise max IoU over higher-scored same-label masks
+    compensate_iou, _ = (iou_matrix * label_matrix).max(0)
+    compensate_iou = compensate_iou.expand(num_masks,
+                                           num_masks).transpose(1, 0)
+
+    # IoU decay: same-label IoU of every (higher-scored, lower-scored) pair
+    decay_iou = iou_matrix * label_matrix
+
+    # Calculate the decay_coefficient (column-wise minimum over all rows)
+    if kernel == 'gaussian':
+        decay_matrix = torch.exp(-1 * sigma * (decay_iou**2))
+        compensate_matrix = torch.exp(-1 * sigma * (compensate_iou**2))
+        decay_coefficient, _ = (decay_matrix / compensate_matrix).min(0)
+    elif kernel == 'linear':
+        decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
+        decay_coefficient, _ = decay_matrix.min(0)
+    else:
+        raise NotImplementedError(
+            f'{kernel} kernel is not supported in matrix nms!')
+    # update the score.
+    scores = scores * decay_coefficient
+
+    if filter_thr > 0:
+        keep = scores >= filter_thr
+        keep_inds = keep_inds[keep]
+        if not keep.any():
+            return scores.new_zeros(0), labels.new_zeros(0), masks.new_zeros(
+                0, *masks.shape[-2:]), labels.new_zeros(0)
+        masks = masks[keep]
+        scores = scores[keep]
+        labels = labels[keep]
+
+    # re-sort by the decayed scores and keep top max_num
+    scores, sort_inds = torch.sort(scores, descending=True)
+    keep_inds = keep_inds[sort_inds]
+    if max_num > 0 and len(sort_inds) > max_num:
+        sort_inds = sort_inds[:max_num]
+        keep_inds = keep_inds[:max_num]
+        scores = scores[:max_num]
+    masks = masks[sort_inds]
+    labels = labels[sort_inds]
+
+    return scores, labels, masks, keep_inds
diff --git a/mmdet/core/post_processing/merge_augs.py b/mmdet/core/post_processing/merge_augs.py
new file mode 100755
index 0000000..2ac4603
--- /dev/null
+++ b/mmdet/core/post_processing/merge_augs.py
@@ -0,0 +1,154 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+
+import numpy as np
+import torch
+from mmcv import ConfigDict
+from mmcv.ops import nms
+
+from ..bbox import bbox_mapping_back
+
+
+def merge_aug_proposals(aug_proposals, img_metas, cfg):
+    """Merge augmented proposals (multiscale, flip, etc.)
+
+    Args:
+        aug_proposals (list[Tensor]): proposals from different testing
+            schemes, shape (n, 5). Note that they are not rescaled to the
+            original image size.
+        img_metas (list[dict]): list of image info dict where each dict has:
+            'img_shape', 'scale_factor', 'flip', 'flip_direction', and may
+            also contain 'filename', 'ori_shape', 'pad_shape', and
+            'img_norm_cfg'. For details on the values of these keys see
+            `mmdet/datasets/pipelines/formatting.py:Collect`.
+        cfg (dict): rpn test config. A deep copy is taken before the
+            deprecated keys are translated, so the caller's cfg is untouched.
+
+    Returns:
+        Tensor: shape (n, 5), proposals (box + score) mapped back to the
+            original image scale, sorted by descending score.
+    """
+
+    cfg = copy.deepcopy(cfg)
+
+    # deprecate arguments warning
+    if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg:
+        warnings.warn(
+            'In rpn_proposal or test_cfg, '
+            'nms_thr has been moved to a dict named nms as '
+            'iou_threshold, max_num has been renamed as max_per_img, '
+            'name of original arguments and the way to specify '
+            'iou_threshold of NMS will be deprecated.')
+    if 'nms' not in cfg:
+        cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr))
+    if 'max_num' in cfg:
+        if 'max_per_img' in cfg:
+            assert cfg.max_num == cfg.max_per_img, f'You set max_num and ' \
+                f'max_per_img at the same time, but get {cfg.max_num} ' \
+                f'and {cfg.max_per_img} respectively. ' \
+                f'Please delete max_num which will be deprecated.'
+        else:
+            cfg.max_per_img = cfg.max_num
+    if 'nms_thr' in cfg:
+        assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set ' \
+            f'iou_threshold in nms and ' \
+            f'nms_thr at the same time, but get ' \
+            f'{cfg.nms.iou_threshold} and {cfg.nms_thr}' \
+            f' respectively. Please delete the nms_thr ' \
+            f'which will be deprecated.'
+
+    recovered_proposals = []
+    for proposals, img_info in zip(aug_proposals, img_metas):
+        img_shape = img_info['img_shape']
+        scale_factor = img_info['scale_factor']
+        flip = img_info['flip']
+        flip_direction = img_info['flip_direction']
+        _proposals = proposals.clone()
+        _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape,
+                                              scale_factor, flip,
+                                              flip_direction)
+        recovered_proposals.append(_proposals)
+    aug_proposals = torch.cat(recovered_proposals, dim=0)
+    merged_proposals, _ = nms(aug_proposals[:, :4].contiguous(),
+                              aug_proposals[:, -1].contiguous(),
+                              cfg.nms.iou_threshold)
+    scores = merged_proposals[:, 4]
+    _, order = scores.sort(0, descending=True)
+    num = min(cfg.max_per_img, merged_proposals.shape[0])
+    order = order[:num]
+    merged_proposals = merged_proposals[order, :]
+    return merged_proposals
+
+
+def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg):
+    """Merge augmented detection bboxes and scores.
+
+    The boxes are mapped back with ``bbox_mapping_back`` and then averaged.
+
+    Args:
+        aug_bboxes (list[Tensor]): shape (n, 4*#class)
+        aug_scores (list[Tensor] or None): shape (n, #class)
+        img_metas (list[list[dict]]): meta info of each augmentation; the
+            first dict of each entry is read.
+        rcnn_test_cfg (dict): rcnn test config (not read here).
+
+    Returns:
+        tuple: (bboxes, scores), or bboxes alone if aug_scores is None.
+    """
+    recovered = []
+    for aug_bbox, metas in zip(aug_bboxes, img_metas):
+        meta = metas[0]
+        recovered.append(
+            bbox_mapping_back(aug_bbox, meta['img_shape'],
+                              meta['scale_factor'], meta['flip'],
+                              meta['flip_direction']))
+    merged_bboxes = torch.stack(recovered).mean(dim=0)
+    if aug_scores is None:
+        return merged_bboxes
+    merged_scores = torch.stack(aug_scores).mean(dim=0)
+    return merged_bboxes, merged_scores
+
+
+def merge_aug_scores(aug_scores):
+    """Average augmented bbox scores over the augmentations."""
+    if not isinstance(aug_scores[0], torch.Tensor):
+        return np.mean(aug_scores, axis=0)
+    stacked_scores = torch.stack(aug_scores)
+    return stacked_scores.mean(dim=0)
+
+
+def merge_aug_masks(aug_masks, img_metas, rcnn_test_cfg, weights=None):
+    """Merge augmented mask prediction.
+
+    Args:
+        aug_masks (list[ndarray]): shape (n, #class, h, w)
+        img_metas (list[list[dict]]): meta info of each augmentation; the
+            'flip' and 'flip_direction' keys of the first dict are read.
+        rcnn_test_cfg (dict): rcnn test config (not read by this function).
+        weights (list | None): optional weights for a weighted average.
+
+    Returns:
+        ndarray: masks averaged over augmentations after flips are undone.
+    """
+    unflipped = []
+    for aug_mask, metas in zip(aug_masks, img_metas):
+        meta = metas[0]
+        # Reverse the axes that were flipped at test time before averaging.
+        if meta['flip']:
+            direction = meta['flip_direction']
+            if direction == 'horizontal':
+                aug_mask = aug_mask[:, :, :, ::-1]
+            elif direction == 'vertical':
+                aug_mask = aug_mask[:, :, ::-1, :]
+            elif direction == 'diagonal':
+                aug_mask = aug_mask[:, :, ::-1, ::-1]
+            else:
+                raise ValueError(
+                    f"Invalid flipping direction '{direction}'")
+        unflipped.append(aug_mask)
+
+    if weights is not None:
+        return np.average(
+            np.array(unflipped), axis=0, weights=np.array(weights))
+    return np.mean(unflipped, axis=0)
diff --git a/mmdet/core/utils/__init__.py b/mmdet/core/utils/__init__.py
new file mode 100755
index 0000000..3f0d070
--- /dev/null
+++ b/mmdet/core/utils/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .dist_utils import (DistOptimizerHook, all_reduce_dict, allreduce_grads,
+                         reduce_mean, sync_random_seed)
+from .misc import (center_of_mass, filter_scores_and_topk, flip_tensor,
+                   generate_coordinate, mask2ndarray, multi_apply,
+                   select_single_mlvl, unmap)
+
+__all__ = [
+    'allreduce_grads', 'DistOptimizerHook', 'reduce_mean', 'multi_apply',
+    'unmap', 'mask2ndarray', 'flip_tensor', 'all_reduce_dict',
+    'center_of_mass', 'generate_coordinate', 'select_single_mlvl',
+    'filter_scores_and_topk', 'sync_random_seed'
+]
diff --git a/mmdet/core/utils/dist_utils.py b/mmdet/core/utils/dist_utils.py
new file mode 100755
index 0000000..8760774
--- /dev/null
+++ b/mmdet/core/utils/dist_utils.py
@@ -0,0 +1,193 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import functools
+import pickle
+import warnings
+from collections import OrderedDict
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from mmcv.runner import OptimizerHook, get_dist_info
+from torch._utils import (_flatten_dense_tensors, _take_tensors,
+                          _unflatten_dense_tensors)
+
+
+def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
+    """All-reduce ``tensors`` bucket by bucket and average them in place.
+
+    A positive ``bucket_size_mb`` buckets via ``_take_tensors``; else by type.
+    """
+    if bucket_size_mb > 0:
+        buckets = _take_tensors(tensors, bucket_size_mb * 1024 * 1024)
+    else:
+        by_type = OrderedDict()
+        for item in tensors:
+            by_type.setdefault(item.type(), []).append(item)
+        buckets = by_type.values()
+
+    for bucket in buckets:
+        flat = _flatten_dense_tensors(bucket)
+        dist.all_reduce(flat)
+        flat.div_(world_size)
+        restored = _unflatten_dense_tensors(flat, bucket)
+        for target, synced in zip(bucket, restored):
+            target.copy_(synced)
+
+
+def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
+    """Average the gradients of ``params`` over all ranks, in place.
+
+    Args:
+        params (list[torch.Parameters]): List of parameters of a model
+        coalesce (bool, optional): Whether to all-reduce the gradients in
+            flattened buckets instead of one by one. Defaults to True.
+        bucket_size_mb (int, optional): Size of bucket, the unit is MB.
+            Defaults to -1.
+    """
+    grad_tensors = [
+        p.grad.data for p in params
+        if p.requires_grad and p.grad is not None
+    ]
+    num_ranks = dist.get_world_size()
+    if not coalesce:
+        for grad in grad_tensors:
+            dist.all_reduce(grad.div_(num_ranks))
+        return
+    _allreduce_coalesced(grad_tensors, num_ranks, bucket_size_mb)
+
+
+class DistOptimizerHook(OptimizerHook):
+    """Deprecated optimizer hook for distributed training."""
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn('"DistOptimizerHook" is deprecated, please switch to '
+                      '"mmcv.runner.OptimizerHook".')
+        super().__init__(*args, **kwargs)
+
+
+def reduce_mean(tensor):
+    """Obtain the mean of tensor on different GPUs."""
+    if dist.is_available() and dist.is_initialized():
+        averaged = tensor.clone().div_(dist.get_world_size())
+        dist.all_reduce(averaged, op=dist.ReduceOp.SUM)
+        return averaged
+    return tensor
+
+
+def obj2tensor(pyobj, device='cuda'):
+    """Pickle a python object and pack the bytes into a byte tensor."""
+    pickled = torch.ByteStorage.from_buffer(pickle.dumps(pyobj))
+    return torch.ByteTensor(pickled).to(device=device)
+
+
+def tensor2obj(tensor):
+    """Unpickle the object held in a byte tensor (trusted input only)."""
+    return pickle.loads(tensor.cpu().numpy().tobytes())
+
+
+@functools.lru_cache()
+def _get_global_gloo_group():
+    """Return a cached process group containing all the ranks.
+
+    A new gloo group if the backend is nccl, otherwise the world group."""
+    if dist.get_backend() != 'nccl':
+        return dist.group.WORLD
+    return dist.new_group(backend='gloo')
+
+
+def all_reduce_dict(py_dict, op='sum', group=None, to_float=True):
+    """Apply all reduce function for python dict object.
+
+    The code is modified from https://github.com/Megvii-
+    BaseDetection/YOLOX/blob/main/yolox/utils/allreduce_norm.py.
+
+    NOTE: make sure that py_dict in different ranks has the same keys and
+    the values should be in the same shape. Currently only supports
+    nccl backend.
+
+    Args:
+        py_dict (dict): Dict to be applied all reduce op.
+        op (str): Operator, could be 'sum' or 'mean'. Default: 'sum'
+        group (:obj:`torch.distributed.group`, optional): Deprecated and
+            not used by this function. Default: None.
+        to_float (bool): Whether to convert all values of dict to float.
+            Default: True.
+
+    Returns:
+        dict: reduced dict, an OrderedDict if py_dict is an OrderedDict.
+    """
+    warnings.warn(
+        '`group` is deprecated. Currently only supports NCCL backend.')
+    _, world_size = get_dist_info()
+    if world_size == 1:
+        return py_dict
+
+    # Plain dicts get their key order from rank 0 so all ranks agree on it.
+    py_key = list(py_dict.keys())
+    if not isinstance(py_dict, OrderedDict):
+        py_key_tensor = obj2tensor(py_key)
+        dist.broadcast(py_key_tensor, src=0)
+        py_key = tensor2obj(py_key_tensor)
+
+    tensor_shapes = [py_dict[k].shape for k in py_key]
+    tensor_numels = [py_dict[k].numel() for k in py_key]
+
+    if to_float:
+        warnings.warn('Note: the "to_float" is True, you need to '
+                      'ensure that the behavior is reasonable.')
+        flatten_tensor = torch.cat(
+            [py_dict[k].flatten().float() for k in py_key])
+    else:
+        flatten_tensor = torch.cat([py_dict[k].flatten() for k in py_key])
+
+    dist.all_reduce(flatten_tensor, op=dist.ReduceOp.SUM)
+    if op == 'mean':
+        flatten_tensor /= world_size
+
+    split_tensors = [
+        x.reshape(shape) for x, shape in zip(
+            torch.split(flatten_tensor, tensor_numels), tensor_shapes)
+    ]
+    out_dict = dict(zip(py_key, split_tensors))
+    if isinstance(py_dict, OrderedDict):
+        out_dict = OrderedDict(out_dict)
+    return out_dict
+
+
+def sync_random_seed(seed=None, device='cuda'):
+    """Make sure different ranks share the same seed.
+
+    All workers must call this function, otherwise it will deadlock.
+    This method is generally used in `DistributedSampler`,
+    because the seed should be identical across all processes
+    in the distributed group.
+
+    In distributed sampling, different ranks should sample non-overlapped
+    data in the dataset. Sharing one seed lets every rank shuffle the data
+    indices in the same order, so that the ranks can afterwards pick
+    different, non-overlapping indices from the same shuffled list.
+    With a single process the seed is returned without any communication.
+
+    Args:
+        seed (int, Optional): The seed. A random integer below 2**31 is
+            drawn when it is None. Default to None.
+        device (str): The device where the seed will be put on.
+            Default to 'cuda'.
+    Returns:
+        int: Seed to be used.
+    """
+    if seed is None:
+        seed = np.random.randint(2**31)
+    assert isinstance(seed, int)
+
+    rank, world_size = get_dist_info()
+
+    if world_size == 1:
+        return seed
+
+    # Every rank takes part in the broadcast, but only the value held by
+    # rank 0 is kept; the other ranks contribute a placeholder 0.
+    payload = seed if rank == 0 else 0
+    shared_seed = torch.tensor(payload, dtype=torch.int32, device=device)
+    dist.broadcast(shared_seed, src=0)
+    return shared_seed.item()
diff --git a/mmdet/core/utils/misc.py b/mmdet/core/utils/misc.py
new file mode 100755
index 0000000..14cb745
--- /dev/null
+++ b/mmdet/core/utils/misc.py
@@ -0,0 +1,208 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from functools import partial
+
+import numpy as np
+import torch
+from six.moves import map, zip
+
+from ..mask.structures import BitmapMasks, PolygonMasks
+
+
+def multi_apply(func, *args, **kwargs):
+    """Apply function to a list of arguments.
+
+    Note:
+        ``func`` is called once per group of positional inputs (taken in
+        lockstep from ``args``), and its outputs are regrouped so that the
+        k-th list holds the k-th output of every call.
+
+    Args:
+        func (Function): A function that will be applied to a list of
+            arguments; ``kwargs`` are passed to every call.
+
+    Returns:
+        tuple(list): A tuple containing multiple list, each list contains \
+            a kind of returned results by the function
+    """
+    if kwargs:
+        func = partial(func, **kwargs)
+    per_call_outputs = map(func, *args)
+    return tuple(map(list, zip(*per_call_outputs)))
+
+
+def unmap(data, count, inds, fill=0):
+    """Write ``data`` into a ``fill``-valued tensor with ``count`` entries at
+    the positions where ``inds`` (cast to bool) is set."""
+    if data.dim() != 1:
+        full_shape = (count, ) + data.size()[1:]
+        ret = data.new_full(full_shape, fill)
+        ret[inds.type(torch.bool), :] = data
+        return ret
+    ret = data.new_full((count, ), fill)
+    ret[inds.type(torch.bool)] = data
+    return ret
+
+
+def mask2ndarray(mask):
+    """Convert a mask container or tensor to ``np.ndarray``.
+
+    Args:
+        mask (:obj:`BitmapMasks` or :obj:`PolygonMasks` or
+        torch.Tensor or np.ndarray): The mask to be converted.
+
+    Returns:
+        np.ndarray: The converted mask; ndarray inputs are returned as-is.
+    """
+    if isinstance(mask, (BitmapMasks, PolygonMasks)):
+        return mask.to_ndarray()
+    if isinstance(mask, torch.Tensor):
+        return mask.detach().cpu().numpy()
+    if isinstance(mask, np.ndarray):
+        return mask
+    raise TypeError(f'Unsupported {type(mask)} data type')
+
+
+def flip_tensor(src_tensor, flip_direction):
+    """Flip a 4D tensor along the axes selected by ``flip_direction``.
+
+    Args:
+        src_tensor (Tensor): input feature map, shape (B, C, H, W).
+        flip_direction (str): The flipping direction. Options are
+          'horizontal' (dim 3), 'vertical' (dim 2), 'diagonal' (dims 2, 3).
+
+    Returns:
+        out_tensor (Tensor): Flipped tensor.
+    """
+    assert src_tensor.ndim == 4
+    valid_directions = ['horizontal', 'vertical', 'diagonal']
+    assert flip_direction in valid_directions
+    if flip_direction == 'horizontal':
+        flip_dims = [3]
+    elif flip_direction == 'vertical':
+        flip_dims = [2]
+    else:
+        flip_dims = [2, 3]
+    return torch.flip(src_tensor, flip_dims)
+
+
+def select_single_mlvl(mlvl_tensors, batch_id, detach=True):
+    """Extract a multi-scale single image tensor from a multi-scale batch
+    tensor based on batch index.
+
+    Note: The default value of detach is True, because the proposal gradient
+    needs to be detached during the training of the two-stage model. E.g
+    Cascade Mask R-CNN.
+
+    Args:
+        mlvl_tensors (list[Tensor]): Batch tensor for all scale levels,
+           each is a 4D-tensor.
+        batch_id (int): Batch index.
+        detach (bool): Whether detach gradient. Default True.
+
+    Returns:
+        list[Tensor]: Multi-scale single image tensor.
+
+    Example:
+        >>> feats = [torch.rand(2, 3, 8, 8), torch.rand(2, 3, 4, 4)]
+        >>> [t.shape for t in select_single_mlvl(feats, 1)]
+        [torch.Size([3, 8, 8]), torch.Size([3, 4, 4])]
+    """
+    assert isinstance(mlvl_tensors, (list, tuple))
+    # Pick the ``batch_id``-th sample of every level, in level order.
+    single_img_tensors = [level[batch_id] for level in mlvl_tensors]
+    if not detach:
+        return single_img_tensors
+    # Cut the selected tensors off the autograd graph.
+    return [tensor.detach() for tensor in single_img_tensors]
+
+
+def filter_scores_and_topk(scores, score_thr, topk, results=None):
+    """Filter results using score threshold and topk candidates.
+
+    Args:
+        scores (Tensor): The scores, shape (num_bboxes, K).
+        score_thr (float): Scores must be strictly above this threshold.
+        topk (int): The number of topk candidates.
+        results (dict or list or Tensor, Optional): The results to
+           which the filtering rule is to be applied. The shape
+           of each item is (num_bboxes, N).
+
+    Returns:
+        tuple: Filtered results
+
+            - scores (Tensor): The kept scores in descending order, \
+                shape (num_bboxes_filtered, ).
+            - labels (Tensor): The class labels, shape \
+                (num_bboxes_filtered, ).
+            - anchor_idxs (Tensor): The anchor indexes, shape \
+                (num_bboxes_filtered, ).
+            - filtered_results (dict or list or Tensor, Optional): \
+                The filtered results. The shape of each item is \
+                (num_bboxes_filtered, N). None if results is None.
+    """
+    above_thr = scores > score_thr
+    candidate_scores = scores[above_thr]
+    candidate_idxs = torch.nonzero(above_thr)
+
+    num_topk = min(topk, candidate_idxs.size(0))
+    # torch.sort is actually faster than .topk (at least on GPUs)
+    sorted_scores, order = candidate_scores.sort(descending=True)
+    scores = sorted_scores[:num_topk]
+    # Each row of ``candidate_idxs`` is a (bbox index, class index) pair.
+    keep_idxs, labels = candidate_idxs[order[:num_topk]].unbind(dim=1)
+
+    if results is None:
+        filtered_results = None
+    elif isinstance(results, dict):
+        filtered_results = {k: v[keep_idxs] for k, v in results.items()}
+    elif isinstance(results, list):
+        filtered_results = [result[keep_idxs] for result in results]
+    elif isinstance(results, torch.Tensor):
+        filtered_results = results[keep_idxs]
+    else:
+        raise NotImplementedError(f'Only supports dict or list or Tensor, '
+                                  f'but get {type(results)}.')
+    return scores, labels, keep_idxs, filtered_results
+
+
+def center_of_mass(mask, esp=1e-6):
+    """Calculate the centroid coordinates of the mask.
+
+    Args:
+        mask (Tensor): The mask to be calculated, shape (h, w).
+        esp (float): Avoid dividing by zero. Default: 1e-6.
+
+    Returns:
+        tuple[Tensor]: the coordinates of the center point of the mask.
+
+            - center_h (Tensor): the center point of the height.
+            - center_w (Tensor): the center point of the width.
+    """
+    h, w = mask.shape
+    row_idx = torch.arange(h, device=mask.device)[:, None]
+    col_idx = torch.arange(w, device=mask.device)
+    # Clamp the total mass so that an all-zero mask does not divide by zero.
+    total = mask.sum().float().clamp(min=esp)
+    weighted_rows = (mask * row_idx).sum()
+    return weighted_rows / total, (mask * col_idx).sum() / total
+
+
+def generate_coordinate(featmap_sizes, device='cuda'):
+    """Generate the coordinate.
+
+    Args:
+        featmap_sizes (tuple): Size of the feature map; entry 0 is the batch
+            size and the last two entries are the spatial sizes (rows, cols).
+        device (str): The device where the feature will be put on.
+
+    Returns:
+        coord_feat (Tensor): Coordinates in [-1, 1] of shape (N, 2, rows,
+            cols); channel 0 varies along the last dim, channel 1 along rows.
+    """
+    num_imgs = featmap_sizes[0]
+    row_range = torch.linspace(-1, 1, featmap_sizes[-2], device=device)
+    col_range = torch.linspace(-1, 1, featmap_sizes[-1], device=device)
+    rows, cols = torch.meshgrid(row_range, col_range)
+    target_shape = [num_imgs, 1, -1, -1]
+    channels = [cols.expand(target_shape), rows.expand(target_shape)]
+    return torch.cat(channels, 1)
diff --git a/mmdet/core/visualization/__init__.py b/mmdet/core/visualization/__init__.py
new file mode 100755
index 0000000..2eb17c4
--- /dev/null
+++ b/mmdet/core/visualization/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .image import (color_val_matplotlib, imshow_det_bboxes,
+                    imshow_gt_det_bboxes)
+from .palette import get_palette, palette_val
+
+__all__ = [
+    'imshow_det_bboxes', 'imshow_gt_det_bboxes', 'color_val_matplotlib',
+    'palette_val', 'get_palette'
+]
diff --git a/mmdet/core/visualization/image.py b/mmdet/core/visualization/image.py
new file mode 100755
index 0000000..63eae8a
--- /dev/null
+++ b/mmdet/core/visualization/image.py
@@ -0,0 +1,563 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import sys
+
+import cv2
+import matplotlib.pyplot as plt
+import mmcv
+import numpy as np
+import pycocotools.mask as mask_util
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Polygon
+
+from mmdet.core.evaluation.panoptic_utils import INSTANCE_OFFSET
+from ..mask.structures import bitmap_to_polygon
+from ..utils import mask2ndarray
+from .palette import get_palette, palette_val
+
+__all__ = [
+    'color_val_matplotlib', 'draw_masks', 'draw_bboxes', 'draw_labels',
+    'imshow_det_bboxes', 'imshow_gt_det_bboxes'
+]
+
+EPS = 1e-2
+
+
+def color_val_matplotlib(color):
+    """Convert various input in BGR order to normalized RGB matplotlib color
+    tuples.
+
+    Args:
+        color (:obj`Color` | str | tuple | int | ndarray): Color inputs.
+
+    Returns:
+        tuple[float]: The channels in reversed order, each divided by 255.
+    """
+    bgr = mmcv.color_val(color)
+    rgb = bgr[::-1]
+    return tuple(channel / 255 for channel in rgb)
+
+
+def _get_adaptive_scales(areas, min_area=800, max_area=30000):
+    """Get adaptive scales according to areas.
+
+    The scale is ``0.5 + (area - min_area) / (max_area - min_area)``
+    clipped to [0.5, 1.0]: it is 0.5 up to ``min_area`` and reaches 1.0
+    halfway between ``min_area`` and ``max_area``.
+
+    Args:
+        areas (ndarray): The areas of bboxes or masks with the
+            shape of (n, ).
+        min_area (int): Lower bound areas for adaptive scales.
+            Default: 800.
+        max_area (int): Upper bound areas for adaptive scales.
+            Default: 30000.
+
+    Returns:
+        ndarray: The adaptive scales with the shape of (n, ).
+    """
+    span = max_area - min_area
+    raw_scales = 0.5 + (areas - min_area) / span
+    return np.clip(raw_scales, 0.5, 1.0)
+
+
+def _get_bias_color(base, max_dist=30):
+    """Perturb a base color by a random per-channel bias.
+
+    Args:
+        base (ndarray): The base category color with the shape
+            of (3, ).
+        max_dist (int): The max distance of bias. Default: 30.
+
+    Returns:
+        ndarray: The new color for a mask with the shape of (3, ),
+            clipped to [0, 255].
+    """
+    bias = np.random.randint(low=-max_dist, high=max_dist + 1, size=3)
+    new_color = base + bias
+    np.clip(new_color, 0, 255, out=new_color)
+    return new_color
+
+
+def draw_bboxes(ax, bboxes, color='g', alpha=0.8, thickness=2):
+    """Add the outlines of bounding boxes to the axes.
+
+    Args:
+        ax (matplotlib.Axes): The input axes.
+        bboxes (ndarray): The input bounding boxes with the shape
+            of (n, 4). Coordinates are cast to int32 before drawing.
+        color (list[tuple] | matplotlib.color): the colors for each
+            bounding boxes.
+        alpha (float): Transparency of bounding boxes. Default: 0.8.
+        thickness (int): Thickness of lines. Default: 2.
+
+    Returns:
+        matplotlib.Axes: The result axes.
+    """
+    polygons = []
+    for bbox in bboxes:
+        box = bbox.astype(np.int32)
+        # Walk the corners (b0, b1) -> (b0, b3) -> (b2, b3) -> (b2, b1).
+        corners = [[box[0], box[1]], [box[0], box[3]],
+                   [box[2], box[3]], [box[2], box[1]]]
+        polygons.append(Polygon(np.array(corners).reshape((4, 2))))
+    collection = PatchCollection(
+        polygons,
+        facecolor='none',
+        edgecolors=color,
+        linewidths=thickness,
+        alpha=alpha)
+    ax.add_collection(collection)
+
+    return ax
+
+
+def draw_labels(ax,
+                labels,
+                positions,
+                scores=None,
+                class_names=None,
+                color='w',
+                font_size=8,
+                scales=None,
+                horizontal_alignment='left'):
+    """Write one text label per position on the axes.
+
+    Args:
+        ax (matplotlib.Axes): The input axes.
+        labels (ndarray): The labels with the shape of (n, ).
+        positions (ndarray): The positions to draw each labels.
+        scores (ndarray): The scores for each labels, appended as '|0.00'.
+        class_names (list[str]): The class names; 'class <label>' if None.
+        color (list[tuple] | matplotlib.color): The colors for labels.
+        font_size (int): Font size of texts. Default: 8.
+        scales (list[float]): Per-label factors applied to ``font_size``.
+            Default: None.
+        horizontal_alignment (str): The horizontal alignment method of
+            texts. Default: 'left'.
+    Returns:
+        matplotlib.Axes: The result axes.
+    """
+    for i, (pos, label) in enumerate(zip(positions, labels)):
+        if class_names is not None:
+            label_text = class_names[label]
+        else:
+            label_text = f'class {label}'
+        if scores is not None:
+            label_text += f'|{scores[i]:.02f}'
+        text_color = color[i] if isinstance(color, list) else color
+        text_size = font_size if scales is None else font_size * scales[i]
+        box_style = {
+            'facecolor': 'black',
+            'alpha': 0.8,
+            'pad': 0.7,
+            'edgecolor': 'none'
+        }
+        ax.text(
+            pos[0], pos[1], f'{label_text}',
+            bbox=box_style,
+            color=text_color,
+            fontsize=text_size,
+            verticalalignment='top',
+            horizontalalignment=horizontal_alignment)
+
+    return ax
+
+
+def draw_masks(ax, img, masks, color=None, with_edge=True, alpha=0.8):
+    """Draw masks on the image and their edges on the axes.
+
+    Args:
+        ax (matplotlib.Axes): The input axes.
+        img (ndarray): The (h, w, 3) image, blended with the masks in place.
+        masks (ndarray): The masks with the shape of (n, h, w).
+        color (ndarray): The colors for each masks with the shape
+            of (n, 3). Random colors are drawn when None.
+        with_edge (bool): Whether to draw edges. Default: True.
+        alpha (float): Blending weight of the mask colors. Default: 0.8.
+
+    Returns:
+        matplotlib.Axes: The result axes.
+        ndarray: The result image.
+    """
+    # Black is reserved up front so that no mask ends up drawn in it.
+    taken_colors = {(0, 0, 0)}
+    if color is None:
+        color = np.random.randint(0, 255, (masks.shape[0], 3))
+        color = color.astype(np.uint8)
+    polygons = []
+    for i, mask in enumerate(masks):
+        if with_edge:
+            contours, _ = bitmap_to_polygon(mask)
+            polygons += [Polygon(c) for c in contours]
+
+        color_mask = color[i]
+        while tuple(color_mask) in taken_colors:
+            color_mask = _get_bias_color(color_mask)
+        taken_colors.add(tuple(color_mask))
+
+        mask = mask.astype(bool)
+        img[mask] = img[mask] * (1 - alpha) + color_mask * alpha
+
+    p = PatchCollection(
+        polygons, facecolor='none', edgecolors='w', linewidths=1, alpha=0.8)
+    ax.add_collection(p)
+
+    return ax, img
+
+
+def imshow_det_bboxes(img,
+                      bboxes=None,
+                      labels=None,
+                      segms=None,
+                      class_names=None,
+                      score_thr=0,
+                      bbox_color='green',
+                      text_color='green',
+                      mask_color=None,
+                      thickness=2,
+                      font_size=8,
+                      win_name='',
+                      show=True,
+                      wait_time=0,
+                      out_file=None):
+    """Draw bboxes and class labels (with scores) on an image.
+
+    Args:
+        img (str | ndarray): The image to be displayed.
+        bboxes (ndarray): Bounding boxes (with scores), shaped (n, 4) or
+            (n, 5).
+        labels (ndarray): Labels of bboxes.
+        segms (ndarray | None): Masks, shaped (n,h,w) or None.
+        class_names (list[str]): Names of each classes.
+        score_thr (float): Minimum score of bboxes to be shown. Default: 0.
+        bbox_color (list[tuple] | tuple | str | None): Colors of bbox lines.
+           If a single color is given, it will be applied to all classes.
+           The tuple of color should be in RGB order. Default: 'green'.
+        text_color (list[tuple] | tuple | str | None): Colors of texts.
+           If a single color is given, it will be applied to all classes.
+           The tuple of color should be in RGB order. Default: 'green'.
+        mask_color (list[tuple] | tuple | str | None, optional): Colors of
+           masks. If a single color is given, it will be applied to all
+           classes. The tuple of color should be in RGB order.
+           Default: None.
+        thickness (int): Thickness of lines. Default: 2.
+        font_size (int): Font size of texts. Default: 8.
+        show (bool): Whether to show the image. Default: True.
+        win_name (str): The window name. Default: ''.
+        wait_time (float): Value of waitKey param. Default: 0.
+        out_file (str, optional): The filename to write the image.
+            Default: None.
+
+    Returns:
+        ndarray: The image with bboxes drawn on it.
+    """
+    assert bboxes is None or bboxes.ndim == 2, \
+        f' bboxes ndim should be 2, but its ndim is {bboxes.ndim}.'
+    assert labels.ndim == 1, \
+        f' labels ndim should be 1, but its ndim is {labels.ndim}.'
+    assert bboxes is None or bboxes.shape[1] == 4 or bboxes.shape[1] == 5, \
+        f' bboxes.shape[1] should be 4 or 5, but its {bboxes.shape[1]}.'
+    assert bboxes is None or bboxes.shape[0] <= labels.shape[0], \
+        'labels.shape[0] should not be less than bboxes.shape[0].'
+    assert segms is None or segms.shape[0] == labels.shape[0], \
+        'segms.shape[0] and labels.shape[0] should have the same length.'
+    assert segms is not None or bboxes is not None, \
+        'segms and bboxes should not be None at the same time.'
+
+    img = mmcv.imread(img).astype(np.uint8)
+
+    if score_thr > 0:
+        assert bboxes is not None and bboxes.shape[1] == 5
+        scores = bboxes[:, -1]
+        inds = scores > score_thr
+        bboxes = bboxes[inds, :]
+        labels = labels[inds]
+        if segms is not None:
+            segms = segms[inds, ...]
+
+    img = mmcv.bgr2rgb(img)
+    width, height = img.shape[1], img.shape[0]
+    img = np.ascontiguousarray(img)
+
+    fig = plt.figure(win_name, frameon=False)
+    plt.title(win_name)
+    canvas = fig.canvas
+    dpi = fig.get_dpi()
+    # add a small EPS to avoid precision lost due to matplotlib's truncation
+    # (https://github.com/matplotlib/matplotlib/issues/15363)
+    fig.set_size_inches((width + EPS) / dpi, (height + EPS) / dpi)
+
+    # remove white edges by set subplot margin
+    plt.subplots_adjust(left=0, right=1, bottom=0, top=1)
+    ax = plt.gca()
+    ax.axis('off')
+
+    max_label = int(max(labels) if len(labels) > 0 else 0)
+    text_palette = palette_val(get_palette(text_color, max_label + 1))
+    text_colors = [text_palette[label] for label in labels]
+
+    num_bboxes = 0
+    if bboxes is not None:
+        num_bboxes = bboxes.shape[0]
+        bbox_palette = palette_val(get_palette(bbox_color, max_label + 1))
+        colors = [bbox_palette[label] for label in labels[:num_bboxes]]
+        draw_bboxes(ax, bboxes, colors, alpha=0.8, thickness=thickness)
+
+        positions = bboxes[:, :2].astype(np.int32) + thickness
+        areas = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0])
+        scales = _get_adaptive_scales(areas)
+        scores = bboxes[:, 4] if bboxes.shape[1] == 5 else None
+        draw_labels(
+            ax,
+            labels[:num_bboxes],
+            positions,
+            scores=scores,
+            class_names=class_names,
+            color=text_colors,
+            font_size=font_size,
+            scales=scales,
+            horizontal_alignment='left')
+
+    if segms is not None:
+        mask_palette = get_palette(mask_color, max_label + 1)
+        colors = [mask_palette[label] for label in labels]
+        colors = np.array(colors, dtype=np.uint8)
+        draw_masks(ax, img, segms, colors, with_edge=True)
+
+        if num_bboxes < segms.shape[0]:
+            segms = segms[num_bboxes:]
+            areas = []
+            positions = []
+            for mask in segms:
+                _, _, stats, centroids = cv2.connectedComponentsWithStats(
+                    mask.astype(np.uint8), connectivity=8)
+                largest_id = np.argmax(stats[1:, -1]) + 1
+                positions.append(centroids[largest_id])
+                areas.append(stats[largest_id, -1])
+            areas = np.stack(areas, axis=0)
+            scales = _get_adaptive_scales(areas)
+            # `text_colors` covers every label, so skip the colors of the
+            # labels already drawn with their boxes to stay aligned.
+            draw_labels(
+                ax,
+                labels[num_bboxes:],
+                positions,
+                class_names=class_names,
+                color=text_colors[num_bboxes:],
+                font_size=font_size,
+                scales=scales,
+                horizontal_alignment='center')
+
+    plt.imshow(img)
+
+    stream, _ = canvas.print_to_buffer()
+    buffer = np.frombuffer(stream, dtype='uint8')
+    if sys.platform == 'darwin':
+        width, height = canvas.get_width_height(physical=True)
+    img_rgba = buffer.reshape(height, width, 4)
+    rgb, alpha = np.split(img_rgba, [3], axis=2)
+    img = rgb.astype('uint8')
+    img = mmcv.rgb2bgr(img)
+
+    if show:
+        # We do not use cv2 for display because in some cases, opencv will
+        # conflict with Qt, it will output a warning: Current thread
+        # is not the object's thread. You can refer to
+        # https://github.com/opencv/opencv-python/issues/46 for details
+        if wait_time == 0:
+            plt.show()
+        else:
+            plt.show(block=False)
+            plt.pause(wait_time)
+    if out_file is not None:
+        mmcv.imwrite(img, out_file)
+
+    plt.close()
+
+    return img
+
+
+def imshow_gt_det_bboxes(img,
+                         annotation,
+                         result,
+                         class_names=None,
+                         score_thr=0,
+                         gt_bbox_color=(61, 102, 255),
+                         gt_text_color=(200, 200, 200),
+                         gt_mask_color=(61, 102, 255),
+                         det_bbox_color=(241, 101, 72),
+                         det_text_color=(200, 200, 200),
+                         det_mask_color=(241, 101, 72),
+                         thickness=2,
+                         font_size=13,
+                         win_name='',
+                         show=True,
+                         wait_time=0,
+                         out_file=None,
+                         overlay_gt_pred=True):
+    """General visualization GT and result function.
+
+    Args:
+      img (str | ndarray): The image to be displayed.
+      annotation (dict): Ground truth annotations where contain keys of
+          'gt_bboxes' and 'gt_labels' or 'gt_masks'.
+      result (tuple[list] | list): The detection result, can be either
+          (bbox, segm) or just bbox.
+      class_names (list[str]): Names of each classes.
+      score_thr (float): Minimum score of bboxes to be shown. Default: 0.
+      gt_bbox_color (list[tuple] | tuple | str | None): Colors of bbox lines.
+          If a single color is given, it will be applied to all classes.
+          The tuple of color should be in RGB order. Default: (61, 102, 255).
+      gt_text_color (list[tuple] | tuple | str | None): Colors of texts.
+          If a single color is given, it will be applied to all classes.
+          The tuple of color should be in RGB order. Default: (200, 200, 200).
+      gt_mask_color (list[tuple] | tuple | str | None, optional): Colors of
+          masks. If a single color is given, it will be applied to all classes.
+          The tuple of color should be in RGB order. Default: (61, 102, 255).
+      det_bbox_color (list[tuple] | tuple | str | None):Colors of bbox lines.
+          If a single color is given, it will be applied to all classes.
+          The tuple of color should be in RGB order. Default: (241, 101, 72).
+      det_text_color (list[tuple] | tuple | str | None):Colors of texts.
+          If a single color is given, it will be applied to all classes.
+          The tuple of color should be in RGB order. Default: (200, 200, 200).
+      det_mask_color (list[tuple] | tuple | str | None, optional): Color of
+          masks. If a single color is given, it will be applied to all classes.
+          The tuple of color should be in RGB order. Default: (241, 101, 72).
+      thickness (int): Thickness of lines. Default: 2.
+      font_size (int): Font size of texts. Default: 13.
+      win_name (str): The window name. Default: ''.
+      show (bool): Whether to show the image. Default: True.
+      wait_time (float): Value of waitKey param. Default: 0.
+      out_file (str, optional): The filename to write the image.
+          Default: None.
+      overlay_gt_pred (bool): Whether to plot gts and predictions on the
+          same image. If False, they are drawn on two copies of the image
+          that are stacked vertically, gts on top and predictions below.
+          Default: True.
+
+    Returns:
+        ndarray: The image with bboxes or masks drawn on it.
+    """
+    assert 'gt_bboxes' in annotation
+    assert 'gt_labels' in annotation
+    assert isinstance(result, (tuple, list, dict)), 'Expected ' \
+        f'tuple or list or dict, but get {type(result)}'
+
+    gt_bboxes = annotation['gt_bboxes']
+    gt_labels = annotation['gt_labels']
+    gt_masks = annotation.get('gt_masks', None)
+    if gt_masks is not None:
+        gt_masks = mask2ndarray(gt_masks)
+
+    gt_seg = annotation.get('gt_semantic_seg', None)
+    if gt_seg is not None:
+        pad_value = 255  # the padding value of gt_seg
+        sem_labels = np.unique(gt_seg)
+        all_labels = np.concatenate((gt_labels, sem_labels), axis=0)
+        all_labels, counts = np.unique(all_labels, return_counts=True)
+        stuff_labels = all_labels[np.logical_and(counts < 2,
+                                                 all_labels != pad_value)]
+        stuff_masks = gt_seg[None] == stuff_labels[:, None, None]
+        gt_labels = np.concatenate((gt_labels, stuff_labels), axis=0)
+        gt_masks = np.concatenate((gt_masks, stuff_masks.astype(np.uint8)),
+                                  axis=0)
+        # If you need to show the bounding boxes,
+        # please comment the following line
+        # gt_bboxes = None
+
+    img = mmcv.imread(img)
+
+    img_with_gt = imshow_det_bboxes(
+        img,
+        gt_bboxes,
+        gt_labels,
+        gt_masks,
+        class_names=class_names,
+        bbox_color=gt_bbox_color,
+        text_color=gt_text_color,
+        mask_color=gt_mask_color,
+        thickness=thickness,
+        font_size=font_size,
+        win_name=win_name,
+        show=False)
+
+    if not isinstance(result, dict):
+        if isinstance(result, tuple):
+            bbox_result, segm_result = result
+            if isinstance(segm_result, tuple):
+                segm_result = segm_result[0]  # ms rcnn
+        else:
+            bbox_result, segm_result = result, None
+
+        bboxes = np.vstack(bbox_result)
+        labels = [
+            np.full(bbox.shape[0], i, dtype=np.int32)
+            for i, bbox in enumerate(bbox_result)
+        ]
+        labels = np.concatenate(labels)
+
+        segms = None
+        if segm_result is not None and len(labels) > 0:  # non empty
+            segms = mmcv.concat_list(segm_result)
+            segms = mask_util.decode(segms)
+            segms = segms.transpose(2, 0, 1)
+    else:
+        assert class_names is not None, 'We need to know the number ' \
+                                        'of classes.'
+        VOID = len(class_names)
+        bboxes = None
+        pan_results = result['pan_results']
+        # keep objects ahead
+        ids = np.unique(pan_results)[::-1]
+        legal_indices = ids != VOID
+        ids = ids[legal_indices]
+        labels = np.array([id % INSTANCE_OFFSET for id in ids], dtype=np.int64)
+        segms = (pan_results[None] == ids[:, None, None])
+
+    if overlay_gt_pred:
+        img = imshow_det_bboxes(
+            img_with_gt,
+            bboxes,
+            labels,
+            segms=segms,
+            class_names=class_names,
+            score_thr=score_thr,
+            bbox_color=det_bbox_color,
+            text_color=det_text_color,
+            mask_color=det_mask_color,
+            thickness=thickness,
+            font_size=font_size,
+            win_name=win_name,
+            show=show,
+            wait_time=wait_time,
+            out_file=out_file)
+    else:
+        img_with_det = imshow_det_bboxes(
+            img,
+            bboxes,
+            labels,
+            segms=segms,
+            class_names=class_names,
+            score_thr=score_thr,
+            bbox_color=det_bbox_color,
+            text_color=det_text_color,
+            mask_color=det_mask_color,
+            thickness=thickness,
+            font_size=font_size,
+            win_name=win_name,
+            show=False)
+        img = np.concatenate([img_with_gt, img_with_det], axis=0)
+
+        # imshow_det_bboxes returns BGR, while matplotlib expects RGB.
+        plt.imshow(mmcv.bgr2rgb(img))
+        if show:
+            if wait_time == 0:
+                plt.show()
+            else:
+                plt.show(block=False)
+                plt.pause(wait_time)
+        if out_file is not None:
+            mmcv.imwrite(img, out_file)
+        plt.close()
+
+    return img
diff --git a/mmdet/core/visualization/palette.py b/mmdet/core/visualization/palette.py
new file mode 100755
index 0000000..11692cd
--- /dev/null
+++ b/mmdet/core/visualization/palette.py
@@ -0,0 +1,63 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+
+
+def palette_val(palette):
+    """Rescale a 0-255 palette into matplotlib's 0-1 float colors.
+
+    Args:
+        palette (list[tuple]): Color tuples with 0-255 channel values.
+
+    Returns:
+        list[tuple[float]]: The same colors with every channel divided
+            by 255, in the same order.
+    """
+    # matplotlib expects channel values as floats in [0, 1]
+    return [
+        tuple(channel / 255 for channel in rgb) for rgb in palette
+    ]
+
+
+def get_palette(palette, num_classes):
+    """Resolve a palette specification into a list of color tuples.
+
+    Args:
+        palette (list[tuple] | str | tuple | :obj:`Color`): palette inputs.
+        num_classes (int): the number of classes.
+
+    Returns:
+        list[tuple[int]]: At least `num_classes` color tuples.
+    """
+    assert isinstance(num_classes, int)
+
+    if isinstance(palette, list):
+        colors = palette
+    elif isinstance(palette, tuple):
+        colors = [palette] * num_classes
+    elif palette == 'random' or palette is None:
+        # fixed seed for repeatable colors; the global RNG state is restored
+        rng_state = np.random.get_state()
+        np.random.seed(42)
+        sampled = np.random.randint(0, 256, size=(num_classes, 3))
+        np.random.set_state(rng_state)
+        colors = [tuple(row) for row in sampled]
+    elif palette == 'coco':
+        from mmdet.datasets import CocoDataset, CocoPanopticDataset
+        colors = CocoDataset.PALETTE
+        if len(colors) < num_classes:
+            colors = CocoPanopticDataset.PALETTE
+    elif palette == 'citys':
+        from mmdet.datasets import CityscapesDataset
+        colors = CityscapesDataset.PALETTE
+    elif palette == 'voc':
+        from mmdet.datasets import VOCDataset
+        colors = VOCDataset.PALETTE
+    elif mmcv.is_str(palette):
+        colors = [mmcv.color_val(palette)[::-1]] * num_classes
+    else:
+        raise TypeError(f'Invalid type for palette: {type(palette)}')
+
+    assert len(colors) >= num_classes, \
+        'The length of palette should not be less than `num_classes`.'
+    return colors
diff --git a/mmdet/datasets/__init__.py b/mmdet/datasets/__init__.py
new file mode 100755
index 0000000..32e1542
--- /dev/null
+++ b/mmdet/datasets/__init__.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset
+from .cityscapes import CityscapesDataset
+from .coco import CocoDataset
+from .coco_occluded import OccludedSeparatedCocoDataset
+from .coco_panoptic import CocoPanopticDataset
+from .custom import CustomDataset
+from .dataset_wrappers import (ClassBalancedDataset, ConcatDataset,
+                               MultiImageMixDataset, RepeatDataset)
+from .deepfashion import DeepFashionDataset
+from .lvis import LVISDataset, LVISV1Dataset, LVISV05Dataset
+from .objects365 import Objects365V1Dataset, Objects365V2Dataset
+from .openimages import OpenImagesChallengeDataset, OpenImagesDataset
+from .samplers import DistributedGroupSampler, DistributedSampler, GroupSampler
+from .utils import (NumClassCheckHook, get_loading_pipeline,
+                    replace_ImageToTensor)
+from .voc import VOCDataset
+from .wider_face import WIDERFaceDataset
+from .xml_style import XMLDataset
+from .diverseweather import DiverseWeatherDataset
+
+__all__ = [
+    'CustomDataset', 'XMLDataset', 'CocoDataset', 'DeepFashionDataset',
+    'VOCDataset', 'CityscapesDataset', 'LVISDataset', 'LVISV05Dataset',
+    'LVISV1Dataset', 'GroupSampler', 'DistributedGroupSampler',
+    'DistributedSampler', 'build_dataloader', 'ConcatDataset', 'RepeatDataset',
+    'ClassBalancedDataset', 'WIDERFaceDataset', 'DATASETS', 'PIPELINES',
+    'build_dataset', 'replace_ImageToTensor', 'get_loading_pipeline',
+    'NumClassCheckHook', 'CocoPanopticDataset', 'MultiImageMixDataset',
+    'OpenImagesDataset', 'OpenImagesChallengeDataset', 'Objects365V1Dataset',
+    'Objects365V2Dataset', 'OccludedSeparatedCocoDataset', 'DiverseWeatherDataset'
+]
diff --git a/mmdet/datasets/api_wrappers/__init__.py b/mmdet/datasets/api_wrappers/__init__.py
new file mode 100755
index 0000000..af85575
--- /dev/null
+++ b/mmdet/datasets/api_wrappers/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .coco_api import COCO, COCOeval
+from .panoptic_evaluation import pq_compute_multi_core, pq_compute_single_core
+
+__all__ = [
+    'COCO', 'COCOeval', 'pq_compute_multi_core', 'pq_compute_single_core'
+]
diff --git a/mmdet/datasets/api_wrappers/coco_api.py b/mmdet/datasets/api_wrappers/coco_api.py
new file mode 100755
index 0000000..eef6341
--- /dev/null
+++ b/mmdet/datasets/api_wrappers/coco_api.py
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# This file add snake case alias for coco api
+
+import warnings
+
+import pycocotools
+from pycocotools.coco import COCO as _COCO
+from pycocotools.cocoeval import COCOeval as _COCOeval
+
+
+class COCO(_COCO):
+    """Official pycocotools ``COCO`` with snake_case method aliases, giving
+    it the same interface as the LVIS class.
+    """
+
+    def __init__(self, annotation_file=None):
+        # Compare numerically: as strings, '2.0.6' >= '12.0.2' is True.
+        version = getattr(pycocotools, '__version__', '0').split('.')
+        if tuple(int(v) for v in version if v.isdecimal()) >= (12, 0, 2):
+            warnings.warn(
+                'mmpycocotools is deprecated. Please install official pycocotools by "pip install pycocotools"',  # noqa: E501
+                UserWarning)
+        super().__init__(annotation_file=annotation_file)
+        self.img_ann_map = self.imgToAnns
+        self.cat_img_map = self.catToImgs
+
+    def get_ann_ids(self, img_ids=[], cat_ids=[], area_rng=[], iscrowd=None):
+        return self.getAnnIds(img_ids, cat_ids, area_rng, iscrowd)
+
+    def get_cat_ids(self, cat_names=[], sup_names=[], cat_ids=[]):
+        return self.getCatIds(cat_names, sup_names, cat_ids)
+
+    def get_img_ids(self, img_ids=[], cat_ids=[]):
+        return self.getImgIds(img_ids, cat_ids)
+
+    def load_anns(self, ids):
+        return self.loadAnns(ids)
+
+    def load_cats(self, ids):
+        return self.loadCats(ids)
+
+    def load_imgs(self, ids):
+        return self.loadImgs(ids)
+
+
+# re-export the official evaluator unchanged, just for the ease of import
+COCOeval = _COCOeval
diff --git a/mmdet/datasets/api_wrappers/panoptic_evaluation.py b/mmdet/datasets/api_wrappers/panoptic_evaluation.py
new file mode 100755
index 0000000..55f57bf
--- /dev/null
+++ b/mmdet/datasets/api_wrappers/panoptic_evaluation.py
@@ -0,0 +1,228 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+# Copyright (c) 2018, Alexander Kirillov
+# This file supports `file_client` for `panopticapi`,
+# the source code is copied from `panopticapi`,
+# only the way to load the gt images is modified.
+import multiprocessing
+import os
+
+import mmcv
+import numpy as np
+
+try:
+    from panopticapi.evaluation import OFFSET, VOID, PQStat
+    from panopticapi.utils import rgb2id
+except ImportError:
+    PQStat = None
+    rgb2id = None
+    VOID = 0
+    OFFSET = 256 * 256 * 256
+
+
+def pq_compute_single_core(proc_id,
+                           annotation_set,
+                           gt_folder,
+                           pred_folder,
+                           categories,
+                           file_client=None,
+                           print_log=False):
+    """Evaluate the metric of Panoptic Segmentation on a single core.
+
+    Same as the function with the same name in `panopticapi`. Only the function
+    to load the images is changed to use the file client.
+
+    Args:
+        proc_id (int): The id of the mini process.
+        annotation_set (list): Pairs of (gt_ann, pred_ann), one per image.
+        gt_folder (str): The path of the ground truth images.
+        pred_folder (str): The path of the prediction images.
+        categories: The categories of the dataset; only membership of
+            predicted category ids is tested here.
+        file_client (object): File client; defaults to a `disk` backend.
+        print_log (bool): Whether to print the log. Defaults to False.
+
+    Returns:
+        PQStat: The tp / fp / fn / iou statistics accumulated per category.
+    """
+    if PQStat is None:
+        raise RuntimeError(
+            'panopticapi is not installed, please install it by: '
+            'pip install git+https://github.com/cocodataset/'
+            'panopticapi.git.')
+
+    if file_client is None:
+        file_client_args = dict(backend='disk')
+        file_client = mmcv.FileClient(**file_client_args)
+
+    pq_stat = PQStat()
+
+    idx = 0
+    for gt_ann, pred_ann in annotation_set:
+        if print_log and idx % 100 == 0:
+            print('Core: {}, {} from {} images processed'.format(
+                proc_id, idx, len(annotation_set)))
+        idx += 1
+        # The gt images can be on the local disk or `ceph`: use file_client.
+        img_bytes = file_client.get(
+            os.path.join(gt_folder, gt_ann['file_name']))
+        pan_gt = mmcv.imfrombytes(img_bytes, flag='color', channel_order='rgb')
+        pan_gt = rgb2id(pan_gt)
+
+        # The predictions can only be on the local disk now.
+        pan_pred = mmcv.imread(
+            os.path.join(pred_folder, pred_ann['file_name']),
+            flag='color',
+            channel_order='rgb')
+        pan_pred = rgb2id(pan_pred)
+
+        gt_segms = {el['id']: el for el in gt_ann['segments_info']}
+        pred_segms = {el['id']: el for el in pred_ann['segments_info']}
+
+        # predicted segments area calculation + prediction sanity checks
+        pred_labels_set = set(el['id'] for el in pred_ann['segments_info'])
+        labels, labels_cnt = np.unique(pan_pred, return_counts=True)
+        for label, label_cnt in zip(labels, labels_cnt):
+            if label not in pred_segms:
+                if label == VOID:
+                    continue
+                raise KeyError(
+                    'In the image with ID {} segment with ID {} is '
+                    'presented in PNG and not presented in JSON.'.format(
+                        gt_ann['image_id'], label))
+            pred_segms[label]['area'] = label_cnt
+            pred_labels_set.remove(label)
+            if pred_segms[label]['category_id'] not in categories:
+                raise KeyError(
+                    'In the image with ID {} segment with ID {} has '
+                    'unknown category_id {}.'.format(
+                        gt_ann['image_id'], label,
+                        pred_segms[label]['category_id']))
+        if len(pred_labels_set) != 0:
+            raise KeyError(
+                'In the image with ID {} the following segment IDs {} '
+                'are presented in JSON and not presented in PNG.'.format(
+                    gt_ann['image_id'], list(pred_labels_set)))
+
+        # confusion matrix calculation
+        pan_gt_pred = pan_gt.astype(np.uint64) * OFFSET + pan_pred.astype(
+            np.uint64)
+        gt_pred_map = {}
+        labels, labels_cnt = np.unique(pan_gt_pred, return_counts=True)
+        for label, intersection in zip(labels, labels_cnt):
+            gt_id = label // OFFSET
+            pred_id = label % OFFSET
+            gt_pred_map[(gt_id, pred_id)] = intersection
+
+        # count all matched pairs
+        gt_matched = set()
+        pred_matched = set()
+        for label_tuple, intersection in gt_pred_map.items():
+            gt_label, pred_label = label_tuple
+            if gt_label not in gt_segms:
+                continue
+            if pred_label not in pred_segms:
+                continue
+            if gt_segms[gt_label]['iscrowd'] == 1:
+                continue
+            if gt_segms[gt_label]['category_id'] != pred_segms[pred_label][
+                    'category_id']:
+                continue
+
+            union = pred_segms[pred_label]['area'] + gt_segms[gt_label][
+                'area'] - intersection - gt_pred_map.get((VOID, pred_label), 0)
+            iou = intersection / union
+            if iou > 0.5:
+                pq_stat[gt_segms[gt_label]['category_id']].tp += 1
+                pq_stat[gt_segms[gt_label]['category_id']].iou += iou
+                gt_matched.add(gt_label)
+                pred_matched.add(pred_label)
+
+        # count false negatives
+        crowd_labels_dict = {}
+        for gt_label, gt_info in gt_segms.items():
+            if gt_label in gt_matched:
+                continue
+            # crowd segments are ignored
+            if gt_info['iscrowd'] == 1:
+                crowd_labels_dict[gt_info['category_id']] = gt_label
+                continue
+            pq_stat[gt_info['category_id']].fn += 1
+
+        # count false positives
+        for pred_label, pred_info in pred_segms.items():
+            if pred_label in pred_matched:
+                continue
+            # intersection of the segment with VOID
+            intersection = gt_pred_map.get((VOID, pred_label), 0)
+            # plus intersection with corresponding CROWD region if it exists
+            if pred_info['category_id'] in crowd_labels_dict:
+                intersection += gt_pred_map.get(
+                    (crowd_labels_dict[pred_info['category_id']], pred_label),
+                    0)
+            # ignore segments with more than half in VOID and CROWD regions
+            if intersection / pred_info['area'] > 0.5:
+                continue
+            pq_stat[pred_info['category_id']].fp += 1
+
+    if print_log:
+        print('Core: {}, all {} images processed'.format(
+            proc_id, len(annotation_set)))
+    return pq_stat
+
+
+def pq_compute_multi_core(matched_annotations_list,
+                          gt_folder,
+                          pred_folder,
+                          categories,
+                          file_client=None,
+                          nproc=32):
+    """Evaluate Panoptic Segmentation in parallel worker processes.
+
+    Same as the function with the same name in `panopticapi`.
+
+    Args:
+        matched_annotations_list (list): Per-image annotation tuples in
+            the format (gt_anns, pred_anns).
+        gt_folder (str): The path of the ground truth images.
+        pred_folder (str): The path of the prediction images.
+        categories: The categories of the dataset, forwarded as is.
+        file_client (object): The file client of the dataset. If None,
+            the backend will be set to `disk`.
+        nproc (int): Upper bound on the number of worker processes; it
+            is capped at the number of cpu cores. Defaults to 32.
+
+    Returns:
+        PQStat: The statistics summed over all workers.
+    """
+    if PQStat is None:
+        raise RuntimeError(
+            'panopticapi is not installed, please install it by: '
+            'pip install git+https://github.com/cocodataset/'
+            'panopticapi.git.')
+
+    if file_client is None:
+        file_client = mmcv.FileClient(backend='disk')
+
+    num_workers = min(nproc, multiprocessing.cpu_count())
+    chunks = np.array_split(matched_annotations_list, num_workers)
+    print('Number of cores: {}, images per core: {}'.format(
+        num_workers, len(chunks[0])))
+
+    pool = multiprocessing.Pool(processes=num_workers)
+    pending = [
+        pool.apply_async(pq_compute_single_core,
+                         (worker_id, chunk, gt_folder, pred_folder,
+                          categories, file_client))
+        for worker_id, chunk in enumerate(chunks)
+    ]
+
+    # Close the process pool, otherwise it will lead to memory
+    # leaking problems.
+    pool.close()
+    pool.join()
+
+    total = PQStat()
+    for async_result in pending:
+        total += async_result.get()
+
+    return total
diff --git a/mmdet/datasets/builder.py b/mmdet/datasets/builder.py
new file mode 100755
index 0000000..1936296
--- /dev/null
+++ b/mmdet/datasets/builder.py
@@ -0,0 +1,215 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import platform
+import random
+import warnings
+from functools import partial
+
+import numpy as np
+import torch
+from mmcv.parallel import collate
+from mmcv.runner import get_dist_info
+from mmcv.utils import TORCH_VERSION, Registry, build_from_cfg, digit_version
+from torch.utils.data import DataLoader
+
+from .samplers import (ClassAwareSampler, DistributedGroupSampler,
+                       DistributedSampler, GroupSampler, InfiniteBatchSampler,
+                       InfiniteGroupBatchSampler)
+
+if platform.system() != 'Windows':
+    # Raise soft fd limit: https://github.com/pytorch/pytorch/issues/973
+    import resource
+    rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
+    base_soft_limit = rlimit[0]
+    hard_limit = rlimit[1]
+    soft_limit = min(max(4096, base_soft_limit), hard_limit)
+    resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit))
+
+DATASETS = Registry('dataset')
+PIPELINES = Registry('pipeline')
+
+
+def _concat_dataset(cfg, default_args=None):
+    """Build one dataset per entry of ``cfg['ann_file']`` and concatenate.
+
+    ``img_prefix``, ``seg_prefix`` and ``proposal_file`` are indexed per
+    dataset only when given as a list/tuple; other values are left as in
+    ``cfg`` for every sub-dataset.
+    """
+    from .dataset_wrappers import ConcatDataset
+    ann_files = cfg['ann_file']
+    optional_keys = ('img_prefix', 'seg_prefix', 'proposal_file')
+    optional_vals = {key: cfg.get(key, None) for key in optional_keys}
+    separate_eval = cfg.get('separate_eval', True)
+
+    datasets = []
+    for idx, ann_file in enumerate(ann_files):
+        sub_cfg = copy.deepcopy(cfg)
+        # 'separate_eval' is not a valid key for common datasets.
+        sub_cfg.pop('separate_eval', None)
+        sub_cfg['ann_file'] = ann_file
+        for key, value in optional_vals.items():
+            if isinstance(value, (list, tuple)):
+                sub_cfg[key] = value[idx]
+        datasets.append(build_dataset(sub_cfg, default_args))
+
+    return ConcatDataset(datasets, separate_eval)
+
+
+def build_dataset(cfg, default_args=None):
+    """Build a dataset, or a wrapper around datasets, from ``cfg``.
+
+    List/tuple configs and list/tuple ``ann_file`` give a ConcatDataset.
+    """
+    from .dataset_wrappers import (ClassBalancedDataset, ConcatDataset,
+                                   MultiImageMixDataset, RepeatDataset)
+    if isinstance(cfg, (list, tuple)):
+        return ConcatDataset([build_dataset(c, default_args) for c in cfg])
+    if cfg['type'] == 'ConcatDataset':
+        children = [build_dataset(c, default_args) for c in cfg['datasets']]
+        return ConcatDataset(children, cfg.get('separate_eval', True))
+    if cfg['type'] == 'RepeatDataset':
+        inner = build_dataset(cfg['dataset'], default_args)
+        return RepeatDataset(inner, cfg['times'])
+    if cfg['type'] == 'ClassBalancedDataset':
+        inner = build_dataset(cfg['dataset'], default_args)
+        return ClassBalancedDataset(inner, cfg['oversample_thr'])
+    if cfg['type'] == 'MultiImageMixDataset':
+        mix_cfg = copy.deepcopy(cfg)
+        mix_cfg['dataset'] = build_dataset(mix_cfg['dataset'])
+        mix_cfg.pop('type')
+        return MultiImageMixDataset(**mix_cfg)
+    if isinstance(cfg.get('ann_file'), (list, tuple)):
+        return _concat_dataset(cfg, default_args)
+    return build_from_cfg(cfg, DATASETS, default_args)
+
+
+def build_dataloader(dataset,
+                     samples_per_gpu,
+                     workers_per_gpu,
+                     num_gpus=1,
+                     dist=True,
+                     shuffle=True,
+                     seed=None,
+                     runner_type='EpochBasedRunner',
+                     persistent_workers=False,
+                     class_aware_sampler=None,
+                     **kwargs):
+    """Build PyTorch DataLoader.
+
+    In distributed training, each GPU/process has a dataloader.
+    In non-distributed training, there is only one dataloader for all GPUs.
+
+    Args:
+        dataset (Dataset): A PyTorch dataset.
+        samples_per_gpu (int): Number of training samples on each GPU, i.e.,
+            batch size of each GPU.
+        workers_per_gpu (int): How many subprocesses to use for data loading
+            for each GPU.
+        num_gpus (int): Number of GPUs. Only used in non-distributed training.
+        dist (bool): Distributed training/test or not. Default: True.
+        shuffle (bool): Whether to shuffle the data. Default: True.
+        seed (int, optional): Seed for samplers and workers. Default: None.
+        runner_type (str): Type of runner. `IterBasedRunner` switches to
+            the infinite batch samplers. Default: `EpochBasedRunner`
+        persistent_workers (bool): If True, the data loader will not shutdown
+            the worker processes after a dataset has been consumed once.
+            This allows to maintain the workers `Dataset` instances alive.
+            This argument is only valid when PyTorch>=1.7.0. Default: False.
+        class_aware_sampler (dict, optional): If not None, use
+            `ClassAwareSampler`; ignored by `IterBasedRunner`. Default: None.
+        kwargs: Extra DataLoader arguments; `pin_memory` defaults to False.
+
+    Returns:
+        DataLoader: A PyTorch dataloader.
+    """
+    rank, world_size = get_dist_info()
+
+    if dist:
+        # When model is :obj:`DistributedDataParallel`,
+        # `batch_size` of :obj:`dataloader` is the
+        # number of training samples on each GPU.
+        batch_size = samples_per_gpu
+        num_workers = workers_per_gpu
+    else:
+        # When model is :obj:`DataParallel`,
+        # the batch size is the number of samples on all the GPUs.
+        batch_size = num_gpus * samples_per_gpu
+        num_workers = num_gpus * workers_per_gpu
+
+    if runner_type == 'IterBasedRunner':
+        # These are batch samplers: each yields the indices of one
+        # mini-batch at a time, so the DataLoader below gets `batch_size=1`
+        # and no `sampler`. They can be used in both `DataParallel` and
+        # `DistributedDataParallel`.
+        if shuffle:
+            batch_sampler = InfiniteGroupBatchSampler(
+                dataset, batch_size, world_size, rank, seed=seed)
+        else:
+            batch_sampler = InfiniteBatchSampler(
+                dataset,
+                batch_size,
+                world_size,
+                rank,
+                seed=seed,
+                shuffle=False)
+        batch_size = 1
+        sampler = None
+    else:
+        if class_aware_sampler is not None:
+            # ClassAwareSampler can be used in both distributed and
+            # non-distributed training.
+            num_sample_class = class_aware_sampler.get('num_sample_class', 1)
+            sampler = ClassAwareSampler(
+                dataset,
+                samples_per_gpu,
+                world_size,
+                rank,
+                seed=seed,
+                num_sample_class=num_sample_class)
+        elif dist:
+            # DistributedGroupSampler will definitely shuffle the data to
+            # satisfy that images on each GPU are in the same group
+            if shuffle:
+                sampler = DistributedGroupSampler(
+                    dataset, samples_per_gpu, world_size, rank, seed=seed)
+            else:
+                sampler = DistributedSampler(
+                    dataset, world_size, rank, shuffle=False, seed=seed)
+        else:
+            sampler = GroupSampler(dataset,
+                                   samples_per_gpu) if shuffle else None
+        batch_sampler = None
+
+    init_fn = partial(
+        worker_init_fn, num_workers=num_workers, rank=rank,
+        seed=seed) if seed is not None else None
+
+    if (TORCH_VERSION != 'parrots'
+            and digit_version(TORCH_VERSION) >= digit_version('1.7.0')):
+        kwargs['persistent_workers'] = persistent_workers
+    elif persistent_workers is True:
+        warnings.warn('persistent_workers is invalid because your pytorch '
+                      'version is lower than 1.7.0')
+
+    data_loader = DataLoader(
+        dataset,
+        batch_size=batch_size,
+        sampler=sampler,
+        num_workers=num_workers,
+        batch_sampler=batch_sampler,
+        collate_fn=partial(collate, samples_per_gpu=samples_per_gpu),
+        pin_memory=kwargs.pop('pin_memory', False),
+        worker_init_fn=init_fn,
+        **kwargs)
+
+    return data_loader
+
+
+def worker_init_fn(worker_id, num_workers, rank, seed):
+    """Seed numpy, random and torch inside one DataLoader worker."""
+    # Offsetting by num_workers * rank + worker_id derives a separate
+    # seed for each worker of each rank from the user seed.
+    per_worker_seed = num_workers * rank + worker_id + seed
+    for set_seed in (np.random.seed, random.seed, torch.manual_seed):
+        set_seed(per_worker_seed)
diff --git a/mmdet/datasets/cityscapes.py b/mmdet/datasets/cityscapes.py
new file mode 100755
index 0000000..c998d12
--- /dev/null
+++ b/mmdet/datasets/cityscapes.py
@@ -0,0 +1,339 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/datasets/cityscapes.py # noqa
+# and https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa
+
+import glob
+import os
+import os.path as osp
+import tempfile
+from collections import OrderedDict
+
+import mmcv
+import numpy as np
+import pycocotools.mask as maskUtils
+from mmcv.utils import print_log
+
+from .builder import DATASETS
+from .coco import CocoDataset
+
+
+@DATASETS.register_module()
+class CityscapesDataset(CocoDataset):
+    """Cityscapes instance dataset read from COCO-style annotations.
+
+    Besides the COCO metrics inherited from ``CocoDataset``, ``evaluate``
+    accepts the ``'cityscapes'`` metric backed by ``cityscapesscripts``.
+    """
+
+    CLASSES = ('person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
+               'bicycle')
+
+    PALETTE = [(220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70),
+               (0, 60, 100), (0, 80, 100), (0, 0, 230), (119, 11, 32)]
+
+    def _filter_imgs(self, min_size=32):
+        """Return the indices of images that pass the size and GT filters."""
+        valid_inds = []
+        # obtain images that contain annotation
+        ids_with_ann = set(_['image_id'] for _ in self.coco.anns.values())
+        # obtain images that contain annotations of the required categories
+        ids_in_cat = set()
+        for class_id in self.cat_ids:
+            ids_in_cat |= set(self.coco.cat_img_map[class_id])
+        # merge the image id sets of the two conditions and use the merged set
+        # to filter out images if self.filter_empty_gt=True
+        ids_in_cat &= ids_with_ann
+
+        valid_img_ids = []
+        for i, img_info in enumerate(self.data_infos):
+            img_id = img_info['id']
+            ann_ids = self.coco.getAnnIds(imgIds=[img_id])
+            ann_info = self.coco.loadAnns(ann_ids)
+            all_iscrowd = all(ann['iscrowd'] for ann in ann_info)
+            if self.filter_empty_gt and (self.img_ids[i] not in ids_in_cat
+                                         or all_iscrowd):
+                continue
+            if min(img_info['width'], img_info['height']) >= min_size:
+                valid_inds.append(i)
+                valid_img_ids.append(img_id)
+        self.img_ids = valid_img_ids
+        return valid_inds
+
+    def _parse_ann_info(self, img_info, ann_info):
+        """Parse bbox and mask annotation.
+
+        Args:
+            img_info (dict): Image info of an image.
+            ann_info (list[dict]): Annotation info of an image.
+
+        Returns:
+            dict: A dict containing the following keys: bboxes, \
+                bboxes_ignore, labels, masks, seg_map. \
+                "masks" are the raw ``segmentation`` annotations.
+        """
+        gt_bboxes = []
+        gt_labels = []
+        gt_bboxes_ignore = []
+        gt_masks_ann = []
+
+        for ann in ann_info:
+            if ann.get('ignore', False):
+                continue
+            x1, y1, w, h = ann['bbox']
+            if ann['area'] <= 0 or w < 1 or h < 1:
+                continue
+            if ann['category_id'] not in self.cat_ids:
+                continue
+            bbox = [x1, y1, x1 + w, y1 + h]
+            if ann.get('iscrowd', False):
+                gt_bboxes_ignore.append(bbox)
+            else:
+                gt_bboxes.append(bbox)
+                gt_labels.append(self.cat2label[ann['category_id']])
+                gt_masks_ann.append(ann['segmentation'])
+
+        if gt_bboxes:
+            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
+            gt_labels = np.array(gt_labels, dtype=np.int64)
+        else:
+            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
+            gt_labels = np.array([], dtype=np.int64)
+
+        if gt_bboxes_ignore:
+            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
+        else:
+            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
+
+        ann = dict(
+            bboxes=gt_bboxes,
+            labels=gt_labels,
+            bboxes_ignore=gt_bboxes_ignore,
+            masks=gt_masks_ann,
+            seg_map=img_info['segm_file'])
+
+        return ann
+
+    def results2txt(self, results, outfile_prefix):
+        """Dump the detection results to txt files and png masks.
+
+        Args:
+            results (list[list | tuple]): Testing results of the
+                dataset.
+            outfile_prefix (str): Output directory, created if missing.
+                For an image "xxx.png" it receives "xxx_pred.txt" and
+                one "xxx_{i}_{class}.png" mask per predicted instance.
+
+        Returns:
+            list[str]: Result txt files which contains corresponding \
+                instance segmentation images.
+        """
+        try:
+            import cityscapesscripts.helpers.labels as CSLabels
+        except ImportError:
+            raise ImportError('Please run "pip install cityscapesscripts" to '
+                              'install cityscapesscripts first.')
+        result_files = []
+        os.makedirs(outfile_prefix, exist_ok=True)
+        prog_bar = mmcv.ProgressBar(len(self))
+        for idx in range(len(self)):
+            result = results[idx]
+            filename = self.data_infos[idx]['filename']
+            basename = osp.splitext(osp.basename(filename))[0]
+            pred_txt = osp.join(outfile_prefix, basename + '_pred.txt')
+
+            bbox_result, segm_result = result
+            bboxes = np.vstack(bbox_result)
+            # segm results
+            if isinstance(segm_result, tuple):
+                # Some detectors (e.g. Mask Scoring R-CNN) return separate
+                # mask scores. NOTE(review): unlike the masks, the scores
+                # are not flattened with concat_list here -- confirm layout.
+                segms = mmcv.concat_list(segm_result[0])
+                mask_score = segm_result[1]
+            else:
+                # use bbox score for mask score
+                segms = mmcv.concat_list(segm_result)
+                mask_score = [bbox[-1] for bbox in bboxes]
+            labels = [
+                np.full(bbox.shape[0], i, dtype=np.int32)
+                for i, bbox in enumerate(bbox_result)
+            ]
+            labels = np.concatenate(labels)
+
+            assert len(bboxes) == len(segms) == len(labels)
+            num_instances = len(bboxes)
+            prog_bar.update()
+            with open(pred_txt, 'w') as fout:
+                for i in range(num_instances):
+                    pred_class = labels[i]
+                    classes = self.CLASSES[pred_class]
+                    class_id = CSLabels.name2label[classes].id
+                    score = mask_score[i]
+                    mask = maskUtils.decode(segms[i]).astype(np.uint8)
+                    png_filename = osp.join(outfile_prefix,
+                                            basename + f'_{i}_{classes}.png')
+                    mmcv.imwrite(mask, png_filename)
+                    fout.write(f'{osp.basename(png_filename)} {class_id} '
+                               f'{score}\n')
+            result_files.append(pred_txt)
+
+        return result_files
+
+    def format_results(self, results, txtfile_prefix=None):
+        """Format the results to txt (standard format for Cityscapes
+        evaluation).
+
+        Args:
+            results (list): Testing results of the dataset.
+            txtfile_prefix (str | None): Directory that receives the txt/png
+                files, e.g., "a/b/prefix". If not specified, a temporary
+                directory will be created. Default: None.
+
+        Returns:
+            tuple: (result_files, tmp_dir), result_files is a list of the \
+                txt filepaths, tmp_dir is the temporal directory created \
+                for saving txt/png files when txtfile_prefix is not specified.
+        """
+        assert isinstance(results, list), 'results must be a list'
+        assert len(results) == len(self), (
+            'The length of results is not equal to the dataset len: {} != {}'.
+            format(len(results), len(self)))
+
+        if txtfile_prefix is None:
+            tmp_dir = tempfile.TemporaryDirectory()
+            txtfile_prefix = osp.join(tmp_dir.name, 'results')
+        else:
+            tmp_dir = None
+        result_files = self.results2txt(results, txtfile_prefix)
+
+        return result_files, tmp_dir
+
+    def evaluate(self,
+                 results,
+                 metric='bbox',
+                 logger=None,
+                 outfile_prefix=None,
+                 classwise=False,
+                 proposal_nums=(100, 300, 1000),
+                 iou_thrs=np.arange(0.5, 0.96, 0.05)):
+        """Evaluation in Cityscapes/COCO protocol.
+
+        Args:
+            results (list[list | tuple]): Testing results of the dataset.
+            metric (str | list[str]): Metrics to be evaluated. Options are
+                'bbox', 'segm', 'proposal', 'proposal_fast', 'cityscapes'.
+            logger (logging.Logger | str | None): Logger used for printing
+                related information during evaluation. Default: None.
+            outfile_prefix (str | None): The prefix of output file. It includes
+                the file path and the prefix of filename, e.g., "a/b/prefix".
+                If results are evaluated with COCO protocol, it would be the
+                prefix of output json file. For example, the metric is 'bbox'
+                and 'segm', then json files would be "a/b/prefix.bbox.json" and
+                "a/b/prefix.segm.json".
+                If results are evaluated with cityscapes protocol, it would be
+                the prefix of output txt/png files. The output files would be
+                png images under folder "a/b/prefix/xxx/" and the file name of
+                images would be written into a txt file
+                "a/b/prefix/xxx_pred.txt", where "xxx" is the video name of
+                cityscapes. If not specified, a temp file will be created.
+                Default: None.
+            classwise (bool): Whether to evaluating the AP for each class.
+            proposal_nums (Sequence[int]): Proposal number used for evaluating
+                recalls, such as recall@100, recall@1000.
+                Default: (100, 300, 1000).
+            iou_thrs (Sequence[float]): IoU threshold used for evaluating
+                recalls. If set to a list, the average recall of all IoUs will
+                also be computed. Default: np.arange(0.5, 0.96, 0.05).
+
+        Returns:
+            dict[str, float]: COCO style evaluation metric or cityscapes mAP \
+                and AP@50.
+        """
+        eval_results = dict()
+
+        metrics = metric.copy() if isinstance(metric, list) else [metric]
+
+        if 'cityscapes' in metrics:
+            eval_results.update(
+                self._evaluate_cityscapes(results, outfile_prefix, logger))
+            metrics.remove('cityscapes')
+
+        # left metrics are all coco metric
+        if len(metrics) > 0:
+            # create CocoDataset with CityscapesDataset annotation
+            self_coco = CocoDataset(self.ann_file, self.pipeline.transforms,
+                                    None, self.data_root, self.img_prefix,
+                                    self.seg_prefix, self.seg_suffix,
+                                    self.proposal_file, self.test_mode,
+                                    self.filter_empty_gt)
+            # TODO: remove this in the future
+            # reload annotations of correct class
+            self_coco.CLASSES = self.CLASSES
+            self_coco.data_infos = self_coco.load_annotations(self.ann_file)
+            eval_results.update(
+                self_coco.evaluate(results, metrics, logger, outfile_prefix,
+                                   classwise, proposal_nums, iou_thrs))
+
+        return eval_results
+
+    def _evaluate_cityscapes(self, results, txtfile_prefix, logger):
+        """Evaluation in Cityscapes protocol.
+
+        Args:
+            results (list): Testing results of the dataset.
+            txtfile_prefix (str | None): The prefix of output txt file
+            logger (logging.Logger | str | None): Logger used for printing
+                related information during evaluation. Default: None.
+
+        Returns:
+            dict[str: float]: Cityscapes evaluation results, contains 'mAP' \
+                and 'AP@50'.
+        """
+
+        try:
+            import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as CSEval  # noqa
+        except ImportError:
+            raise ImportError('Please run "pip install cityscapesscripts" to '
+                              'install cityscapesscripts first.')
+        msg = 'Evaluating in Cityscapes style'
+        if logger is None:
+            msg = '\n' + msg
+        print_log(msg, logger=logger)
+
+        result_files, tmp_dir = self.format_results(results, txtfile_prefix)
+
+        # results2txt writes straight into the prefix directory, which is
+        # "<tmp_dir>/results" when no txtfile_prefix was given.
+        result_dir = txtfile_prefix if tmp_dir is None else osp.join(
+            tmp_dir.name, 'results')
+
+        eval_results = OrderedDict()
+        print_log(f'Evaluating results under {result_dir} ...', logger=logger)
+
+        # set global states in cityscapes evaluation API
+        CSEval.args.cityscapesPath = os.path.join(self.img_prefix, '../..')
+        CSEval.args.predictionPath = os.path.abspath(result_dir)
+        CSEval.args.predictionWalk = None
+        CSEval.args.JSONOutput = False
+        CSEval.args.colorized = False
+        CSEval.args.gtInstancesFile = os.path.join(result_dir,
+                                                   'gtInstances.json')
+        CSEval.args.groundTruthSearch = os.path.join(
+            self.img_prefix.replace('leftImg8bit', 'gtFine'),
+            '*/*_gtFine_instanceIds.png')
+
+        groundTruthImgList = glob.glob(CSEval.args.groundTruthSearch)
+        assert len(groundTruthImgList), 'Cannot find ground truth images' \
+            f' in {CSEval.args.groundTruthSearch}.'
+        predictionImgList = []
+        for gt in groundTruthImgList:
+            predictionImgList.append(CSEval.getPrediction(gt, CSEval.args))
+        CSEval_results = CSEval.evaluateImgLists(predictionImgList,
+                                                 groundTruthImgList,
+                                                 CSEval.args)['averages']
+
+        eval_results['mAP'] = CSEval_results['allAp']
+        eval_results['AP@50'] = CSEval_results['allAp50%']
+        if tmp_dir is not None:
+            tmp_dir.cleanup()
+        return eval_results
diff --git a/mmdet/datasets/coco.py b/mmdet/datasets/coco.py
new file mode 100755
index 0000000..d20a121
--- /dev/null
+++ b/mmdet/datasets/coco.py
@@ -0,0 +1,649 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import contextlib
+import io
+import itertools
+import logging
+import os.path as osp
+import tempfile
+import warnings
+from collections import OrderedDict
+
+import mmcv
+import numpy as np
+from mmcv.utils import print_log
+from terminaltables import AsciiTable
+
+from mmdet.core import eval_recalls
+from .api_wrappers import COCO, COCOeval
+from .builder import DATASETS
+from .custom import CustomDataset
+
+
+@DATASETS.register_module()
+class CocoDataset(CustomDataset):
+
+    CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
+               'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
+               'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
+               'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
+               'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
+               'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
+               'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
+               'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
+               'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+               'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
+               'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
+               'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
+               'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock',
+               'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')
+
+    PALETTE = [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230),
+               (106, 0, 228), (0, 60, 100), (0, 80, 100), (0, 0, 70),
+               (0, 0, 192), (250, 170, 30), (100, 170, 30), (220, 220, 0),
+               (175, 116, 175), (250, 0, 30), (165, 42, 42), (255, 77, 255),
+               (0, 226, 252), (182, 182, 255), (0, 82, 0), (120, 166, 157),
+               (110, 76, 0), (174, 57, 255), (199, 100, 0), (72, 0, 118),
+               (255, 179, 240), (0, 125, 92), (209, 0, 151), (188, 208, 182),
+               (0, 220, 176), (255, 99, 164), (92, 0, 73), (133, 129, 255),
+               (78, 180, 255), (0, 228, 0), (174, 255, 243), (45, 89, 255),
+               (134, 134, 103), (145, 148, 174), (255, 208, 186),
+               (197, 226, 255), (171, 134, 1), (109, 63, 54), (207, 138, 255),
+               (151, 0, 95), (9, 80, 61), (84, 105, 51), (74, 65, 105),
+               (166, 196, 102), (208, 195, 210), (255, 109, 65), (0, 143, 149),
+               (179, 0, 194), (209, 99, 106), (5, 121, 0), (227, 255, 205),
+               (147, 186, 208), (153, 69, 1), (3, 95, 161), (163, 255, 0),
+               (119, 0, 170), (0, 182, 199), (0, 165, 120), (183, 130, 88),
+               (95, 32, 0), (130, 114, 135), (110, 129, 133), (166, 74, 118),
+               (219, 142, 185), (79, 210, 114), (178, 90, 62), (65, 70, 15),
+               (127, 167, 115), (59, 105, 106), (142, 108, 45), (196, 172, 0),
+               (95, 54, 80), (128, 76, 255), (201, 57, 1), (246, 0, 122),
+               (191, 162, 208)]
+
+    def load_annotations(self, ann_file):
+        """Load annotation from COCO style annotation file.
+
+        Args:
+            ann_file (str): Path of annotation file.
+
+        Returns:
+            list[dict]: Annotation info from COCO api.
+        """
+
+        self.coco = COCO(ann_file)
+        # The order of returned `cat_ids` will not
+        # change with the order of the CLASSES
+        self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES)
+
+        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
+        self.img_ids = self.coco.get_img_ids()
+        data_infos = []
+        total_ann_ids = []
+        for i in self.img_ids:
+            info = self.coco.load_imgs([i])[0]
+            info['filename'] = info['file_name']
+            data_infos.append(info)
+            ann_ids = self.coco.get_ann_ids(img_ids=[i])
+            total_ann_ids.extend(ann_ids)
+        assert len(set(total_ann_ids)) == len(
+            total_ann_ids), f"Annotation ids in '{ann_file}' are not unique!"
+        return data_infos
+
+    def get_ann_info(self, idx):
+        """Get COCO annotation by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            dict: Annotation info of specified index.
+        """
+
+        img_id = self.data_infos[idx]['id']
+        ann_ids = self.coco.get_ann_ids(img_ids=[img_id])
+        ann_info = self.coco.load_anns(ann_ids)
+        return self._parse_ann_info(self.data_infos[idx], ann_info)
+
+    def get_cat_ids(self, idx):
+        """Get COCO category ids by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            list[int]: All categories in the image of specified index.
+        """
+
+        img_id = self.data_infos[idx]['id']
+        ann_ids = self.coco.get_ann_ids(img_ids=[img_id])
+        ann_info = self.coco.load_anns(ann_ids)
+        return [ann['category_id'] for ann in ann_info]
+
+    def _filter_imgs(self, min_size=32):
+        """Filter images too small or without ground truths."""
+        valid_inds = []
+        # obtain images that contain annotation
+        ids_with_ann = set(_['image_id'] for _ in self.coco.anns.values())
+        # obtain images that contain annotations of the required categories
+        ids_in_cat = set()
+        for i, class_id in enumerate(self.cat_ids):
+            ids_in_cat |= set(self.coco.cat_img_map[class_id])
+        # merge the image id sets of the two conditions and use the merged set
+        # to filter out images if self.filter_empty_gt=True
+        ids_in_cat &= ids_with_ann
+
+        valid_img_ids = []
+        for i, img_info in enumerate(self.data_infos):
+            img_id = self.img_ids[i]
+            if self.filter_empty_gt and img_id not in ids_in_cat:
+                continue
+            if min(img_info['width'], img_info['height']) >= min_size:
+                valid_inds.append(i)
+                valid_img_ids.append(img_id)
+        self.img_ids = valid_img_ids
+        return valid_inds
+
+    def _parse_ann_info(self, img_info, ann_info):
+        """Parse bbox and mask annotation.
+
+        Args:
+            img_info (dict): Image info with 'width', 'height', 'filename'.
+            ann_info (list[dict]): Annotation info of an image.
+
+        Returns:
+            dict: A dict containing the following keys: bboxes, bboxes_ignore,\
+                labels, masks, seg_map. "masks" are raw annotations and not \
+                decoded into binary masks.
+        """
+        gt_bboxes = []
+        gt_labels = []
+        gt_bboxes_ignore = []
+        gt_masks_ann = []
+        for ann in ann_info:
+            if ann.get('ignore', False):
+                continue
+            x1, y1, w, h = ann['bbox']
+            inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
+            inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
+            if inter_w * inter_h == 0:  # box does not overlap the image
+                continue
+            if ann['area'] <= 0 or w < 1 or h < 1:  # degenerate annotation
+                continue
+            if ann['category_id'] not in self.cat_ids:
+                continue
+            bbox = [x1, y1, x1 + w, y1 + h]
+            if ann.get('iscrowd', False):  # crowd boxes go to the ignore set
+                gt_bboxes_ignore.append(bbox)
+            else:
+                gt_bboxes.append(bbox)
+                gt_labels.append(self.cat2label[ann['category_id']])
+                gt_masks_ann.append(ann.get('segmentation', None))
+
+        if gt_bboxes:
+            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
+            gt_labels = np.array(gt_labels, dtype=np.int64)
+        else:
+            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
+            gt_labels = np.array([], dtype=np.int64)
+
+        if gt_bboxes_ignore:
+            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
+        else:
+            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
+
+        seg_map = img_info['filename'].rsplit('.', 1)[0] + self.seg_suffix
+
+        ann = dict(
+            bboxes=gt_bboxes,
+            labels=gt_labels,
+            bboxes_ignore=gt_bboxes_ignore,
+            masks=gt_masks_ann,
+            seg_map=seg_map)
+
+        return ann
+
+    def xyxy2xywh(self, bbox):
+        """Turn one ``xyxy`` box into the ``xywh`` layout used for COCO
+        evaluation.
+
+        Args:
+            bbox (numpy.ndarray): A single box whose first four entries are
+                ``x1, y1, x2, y2``; further entries are not read.
+
+        Returns:
+            list[float]: ``[x1, y1, x2 - x1, y2 - y1]``.
+        """
+
+        coords = bbox.tolist()
+        x_min, y_min = coords[0], coords[1]
+        x_max, y_max = coords[2], coords[3]
+        # width and height are plain coordinate differences
+        width = x_max - x_min
+        height = y_max - y_min
+        return [x_min, y_min, width, height]
+
+    def _proposal2json(self, results):
+        """Serialize class-agnostic proposals as COCO-style result dicts."""
+        records = []
+        for idx in range(len(self)):
+            image_id = self.img_ids[idx]
+            proposals = results[idx]
+            for row in range(proposals.shape[0]):
+                records.append({
+                    'image_id': image_id,
+                    'bbox': self.xyxy2xywh(proposals[row]),
+                    'score': float(proposals[row][4]),
+                    'category_id': 1,
+                })
+        return records
+
+    def _det2json(self, results):
+        """Flatten per-image, per-class detections into COCO result dicts."""
+        records = []
+        for idx in range(len(self)):
+            image_id = self.img_ids[idx]
+            per_class = results[idx]
+            # ``label`` indexes ``self.cat_ids``; it is not the category id
+            for label, dets in enumerate(per_class):
+                for row in range(dets.shape[0]):
+                    records.append({
+                        'image_id': image_id,
+                        'bbox': self.xyxy2xywh(dets[row]),
+                        'score': float(dets[row][4]),
+                        'category_id': self.cat_ids[label],
+                    })
+        return records
+
+    def _segm2json(self, results):
+        """Build COCO bbox and mask result dicts from (det, seg) pairs."""
+        bbox_records = []
+        segm_records = []
+        for idx in range(len(self)):
+            image_id = self.img_ids[idx]
+            det, seg = results[idx]
+            for label in range(len(det)):
+                dets = det[label]
+                # box entries
+                for row in range(dets.shape[0]):
+                    bbox_records.append({
+                        'image_id': image_id,
+                        'bbox': self.xyxy2xywh(dets[row]),
+                        'score': float(dets[row][4]),
+                        'category_id': self.cat_ids[label],
+                    })
+
+                # mask entries; a tuple ``seg`` carries separate mask scores
+                if isinstance(seg, tuple):
+                    segms, mask_scores = seg[0][label], seg[1][label]
+                else:
+                    segms = seg[label]
+                    mask_scores = [det_row[4] for det_row in dets]
+                for row in range(dets.shape[0]):
+                    entry = {
+                        'image_id': image_id,
+                        'bbox': self.xyxy2xywh(dets[row]),
+                        'score': float(mask_scores[row]),
+                        'category_id': self.cat_ids[label],
+                    }
+                    # byte ``counts`` are decoded in place (mutates the input)
+                    if isinstance(segms[row]['counts'], bytes):
+                        segms[row]['counts'] = segms[row]['counts'].decode()
+                    entry['segmentation'] = segms[row]
+                    segm_records.append(entry)
+        return bbox_records, segm_records
+
+    def results2json(self, results, outfile_prefix):
+        """Write ``results`` to COCO-style json file(s).
+
+        The element type of ``results`` selects the converter: a ``list`` is
+        treated as bbox predictions, a ``tuple`` as (bbox, mask) predictions
+        and a ``numpy.ndarray`` as proposals.
+
+        Args:
+            results (list[list | tuple | ndarray]): Testing results of the
+                dataset; only ``results[0]`` is inspected to pick the type.
+            outfile_prefix (str): Path prefix of the output files, which get
+                the suffix ".bbox.json", ".segm.json" or ".proposal.json".
+
+        Returns:
+            dict[str, str]: Maps "bbox", "segm" and/or "proposal" to the
+                written filenames. For bbox and mask predictions "proposal"
+                points at the bbox file.
+
+        Raises:
+            TypeError: If ``results[0]`` is none of the supported types.
+        """
+        first = results[0]
+        bbox_file = f'{outfile_prefix}.bbox.json'
+        if isinstance(first, list):
+            # plain detections: the bbox file doubles as the proposal file
+            mmcv.dump(self._det2json(results), bbox_file)
+            return dict(bbox=bbox_file, proposal=bbox_file)
+        if isinstance(first, tuple):
+            # instance segmentation: one file for boxes, one for masks
+            segm_file = f'{outfile_prefix}.segm.json'
+            bbox_json, segm_json = self._segm2json(results)
+            mmcv.dump(bbox_json, bbox_file)
+            mmcv.dump(segm_json, segm_file)
+            return dict(bbox=bbox_file, proposal=bbox_file, segm=segm_file)
+        if isinstance(first, np.ndarray):
+            proposal_file = f'{outfile_prefix}.proposal.json'
+            mmcv.dump(self._proposal2json(results), proposal_file)
+            return dict(proposal=proposal_file)
+        raise TypeError('invalid type of results')
+
+    def fast_eval_recall(self, results, proposal_nums, iou_thrs, logger=None):
+        """Average recall of proposals against the non-crowd GT boxes.
+
+        Args:
+            results: Proposals, forwarded unchanged to ``eval_recalls``.
+            proposal_nums, iou_thrs, logger: Also forwarded to ``eval_recalls``.
+
+        Returns:
+            The recalls from ``eval_recalls`` averaged over axis 1.
+        """
+        gt_bboxes = []
+        for img_id in self.img_ids:
+            ann_ids = self.coco.get_ann_ids(img_ids=img_id)
+            # A missing 'iscrowd' means non-crowd, as in `_parse_ann_info`.
+            kept = [a['bbox'] for a in self.coco.load_anns(ann_ids) if not (
+                a.get('ignore', False) or a.get('iscrowd', False))]
+            bboxes = [[x1, y1, x1 + w, y1 + h] for x1, y1, w, h in kept]
+            # An image without usable GT still gets an explicit (0, 4) array.
+            gt_bboxes.append(np.array(bboxes, dtype=np.float32)
+                             if bboxes else np.zeros((0, 4)))
+        recalls = eval_recalls(
+            gt_bboxes, results, proposal_nums, iou_thrs, logger=logger)
+        return recalls.mean(axis=1)
+
+    def format_results(self, results, jsonfile_prefix=None, **kwargs):
+        """Dump ``results`` as COCO-style json file(s).
+
+        Args:
+            results (list[tuple | numpy.ndarray]): Testing results of the
+                dataset; one entry per dataset item.
+            jsonfile_prefix (str | None): Path prefix of the json files,
+                e.g. "a/b/prefix". When None, the files go into a newly
+                created temporary directory. Default: None.
+
+        Returns:
+            tuple: ``(result_files, tmp_dir)`` where ``result_files`` maps
+                result type to json path and ``tmp_dir`` is the created
+                ``TemporaryDirectory`` or None if a prefix was given.
+        """
+        assert isinstance(results, list), 'results must be a list'
+        num_results, num_items = len(results), len(self)
+        assert num_results == num_items, (
+            'The length of results is not equal to the dataset len: {} != {}'.
+            format(num_results, num_items))
+
+        tmp_dir = None
+        if jsonfile_prefix is None:
+            # scratch directory; returned so the caller can clean it up
+            tmp_dir = tempfile.TemporaryDirectory()
+            jsonfile_prefix = osp.join(tmp_dir.name, 'results')
+        return self.results2json(results, jsonfile_prefix), tmp_dir
+
+    def evaluate_det_segm(self,
+                          results,
+                          result_files,
+                          coco_gt,
+                          metrics,
+                          logger=None,
+                          classwise=False,
+                          proposal_nums=(100, 300, 1000),
+                          iou_thrs=None,
+                          metric_items=None):
+        """Instance segmentation and object detection evaluation in COCO
+        protocol.
+
+        Args:
+            results (list[list | tuple | dict]): Testing results of the
+                dataset.
+            result_files (dict[str, str]): a dict contains json file path.
+            coco_gt (COCO): COCO API object with ground truth annotation.
+            metrics (list[str]): Metrics to be evaluated. Options are
+                'bbox', 'segm', 'proposal', 'proposal_fast'.
+            logger (logging.Logger | str | None): Logger used for printing
+                related information during evaluation. Default: None.
+            classwise (bool): Whether to evaluating the AP for each class.
+            proposal_nums (Sequence[int]): Proposal number used for evaluating
+                recalls, such as recall@100, recall@1000.
+                Default: (100, 300, 1000).
+            iou_thrs (Sequence[float], optional): IoU threshold used for
+                evaluating recalls/mAPs. If set to a list, the average of all
+                IoUs will also be computed. If not specified, [0.50, 0.55,
+                0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95] will be used.
+                Default: None.
+            metric_items (list[str] | str, optional): Metric items that will
+                be returned. If not specified, ``['AR@100', 'AR@300',
+                'AR@1000', 'AR_s@1000', 'AR_m@1000', 'AR_l@1000' ]`` will be
+                used when ``metric=='proposal'``, ``['mAP', 'mAP_50', 'mAP_75',
+                'mAP_s', 'mAP_m', 'mAP_l']`` will be used when
+                ``metric=='bbox' or metric=='segm'``.
+
+        Returns:
+            dict[str, float]: COCO style evaluation metric.
+        """
+        if iou_thrs is None:
+            iou_thrs = np.linspace(
+                .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
+        if metric_items is not None:
+            if not isinstance(metric_items, list):
+                metric_items = [metric_items]
+
+        eval_results = OrderedDict()
+        for metric in metrics:
+            msg = f'Evaluating {metric}...'
+            if logger is None:
+                msg = '\n' + msg
+            print_log(msg, logger=logger)
+
+            if metric == 'proposal_fast':
+                if isinstance(results[0], tuple):
+                    raise KeyError('proposal_fast is not supported for '
+                                   'instance segmentation result.')
+                ar = self.fast_eval_recall(
+                    results, proposal_nums, iou_thrs, logger='silent')
+                log_msg = []
+                for i, num in enumerate(proposal_nums):
+                    eval_results[f'AR@{num}'] = ar[i]
+                    log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}')
+                log_msg = ''.join(log_msg)
+                print_log(log_msg, logger=logger)
+                continue
+
+            iou_type = 'bbox' if metric == 'proposal' else metric
+            if metric not in result_files:
+                raise KeyError(f'{metric} is not in results')
+            try:
+                predictions = mmcv.load(result_files[metric])
+                if iou_type == 'segm':
+                    # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331  # noqa
+                    # When evaluating mask AP, if the results contain bbox,
+                    # cocoapi will use the box area instead of the mask area
+                    # for calculating the instance area. Though the overall AP
+                    # is not affected, this leads to different
+                    # small/medium/large mask AP results.
+                    for x in predictions:
+                        x.pop('bbox')
+                    warnings.simplefilter('once')
+                    warnings.warn(
+                        'The key "bbox" is deleted for more accurate mask AP '
+                        'of small/medium/large instances since v2.12.0. This '
+                        'does not change the overall mAP calculation.',
+                        UserWarning)
+                coco_det = coco_gt.loadRes(predictions)
+            except IndexError:
+                print_log(
+                    'The testing results of the whole dataset is empty.',
+                    logger=logger,
+                    level=logging.ERROR)
+                break
+
+            cocoEval = COCOeval(coco_gt, coco_det, iou_type)
+            cocoEval.params.catIds = self.cat_ids
+            cocoEval.params.imgIds = self.img_ids
+            cocoEval.params.maxDets = list(proposal_nums)
+            cocoEval.params.iouThrs = iou_thrs
+            # mapping of cocoEval.stats
+            coco_metric_names = {
+                'mAP': 0,
+                'mAP_50': 1,
+                'mAP_75': 2,
+                'mAP_s': 3,
+                'mAP_m': 4,
+                'mAP_l': 5,
+                'AR@100': 6,
+                'AR@300': 7,
+                'AR@1000': 8,
+                'AR_s@1000': 9,
+                'AR_m@1000': 10,
+                'AR_l@1000': 11
+            }
+            if metric_items is not None:
+                for metric_item in metric_items:
+                    if metric_item not in coco_metric_names:
+                        raise KeyError(
+                            f'metric item {metric_item} is not supported')
+
+            if metric == 'proposal':
+                cocoEval.params.useCats = 0
+                cocoEval.evaluate()
+                cocoEval.accumulate()
+
+                # Save coco summarize print information to logger
+                redirect_string = io.StringIO()
+                with contextlib.redirect_stdout(redirect_string):
+                    cocoEval.summarize()
+                print_log('\n' + redirect_string.getvalue(), logger=logger)
+
+                # Per-metric default; keep the caller's argument untouched so
+                # it cannot leak into the next metric of the loop.
+                cur_items = metric_items if metric_items is not None else [
+                    'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000',
+                    'AR_m@1000', 'AR_l@1000'
+                ]
+                for item in cur_items:
+                    val = float(
+                        f'{cocoEval.stats[coco_metric_names[item]]:.4f}')
+                    eval_results[item] = val
+            else:
+                cocoEval.evaluate()
+                cocoEval.accumulate()
+
+                # Save coco summarize print information to logger
+                redirect_string = io.StringIO()
+                with contextlib.redirect_stdout(redirect_string):
+                    cocoEval.summarize()
+                print_log('\n' + redirect_string.getvalue(), logger=logger)
+
+                if classwise:  # Compute per-category AP
+                    # Compute per-category AP
+                    # from https://github.com/facebookresearch/detectron2/
+                    precisions = cocoEval.eval['precision']
+                    # precision: (iou, recall, cls, area range, max dets)
+                    assert len(self.cat_ids) == precisions.shape[2]
+
+                    results_per_category = []
+                    for idx, catId in enumerate(self.cat_ids):
+                        # area range index 0: all area ranges
+                        # max dets index -1: typically 100 per image
+                        nm = self.coco.loadCats(catId)[0]
+                        precision = precisions[:, :, idx, 0, -1]
+                        precision = precision[precision > -1]
+                        if precision.size:
+                            ap = np.mean(precision)
+                        else:
+                            ap = float('nan')
+                        results_per_category.append(
+                            (f'{nm["name"]}', f'{float(ap):0.3f}'))
+
+                    num_columns = min(6, len(results_per_category) * 2)
+                    results_flatten = list(
+                        itertools.chain(*results_per_category))
+                    headers = ['category', 'AP'] * (num_columns // 2)
+                    results_2d = itertools.zip_longest(*[
+                        results_flatten[i::num_columns]
+                        for i in range(num_columns)
+                    ])
+                    table_data = [headers]
+                    table_data += [result for result in results_2d]
+                    table = AsciiTable(table_data)
+                    print_log('\n' + table.table, logger=logger)
+
+                # Per-metric default, never written back to ``metric_items``.
+                cur_items = metric_items if metric_items is not None else [
+                    'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
+                ]
+
+                for metric_item in cur_items:
+                    key = f'{metric}_{metric_item}'
+                    val = float(
+                        f'{cocoEval.stats[coco_metric_names[metric_item]]:.4f}'
+                    )
+                    eval_results[key] = val
+                ap = cocoEval.stats[:6]
+                eval_results[f'{metric}_mAP_copypaste'] = (
+                    f'{ap[0]:.4f} {ap[1]:.4f} {ap[2]:.4f} {ap[3]:.4f} '
+                    f'{ap[4]:.4f} {ap[5]:.4f}')
+
+        return eval_results
+
+    def evaluate(self,
+                 results,
+                 metric='bbox',
+                 logger=None,
+                 jsonfile_prefix=None,
+                 classwise=False,
+                 proposal_nums=(100, 300, 1000),
+                 iou_thrs=None,
+                 metric_items=None):
+        """Evaluation in COCO protocol.
+
+        Args:
+            results (list[list | tuple]): Testing results of the dataset.
+            metric (str | list[str]): Metrics to be evaluated. Options are
+                'bbox', 'segm', 'proposal', 'proposal_fast'.
+            logger (logging.Logger | str | None): Logger used for printing
+                related information during evaluation. Default: None.
+            jsonfile_prefix (str | None): The prefix of json files. It includes
+                the file path and the prefix of filename, e.g., "a/b/prefix".
+                If not specified, a temp file will be created. Default: None.
+            classwise (bool): Whether to evaluating the AP for each class.
+            proposal_nums (Sequence[int]): Proposal number used for evaluating
+                recalls, such as recall@100, recall@1000.
+                Default: (100, 300, 1000).
+            iou_thrs (Sequence[float], optional): IoU threshold used for
+                evaluating recalls/mAPs. If set to a list, the average of all
+                IoUs will also be computed. If not specified, [0.50, 0.55,
+                0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95] will be used.
+                Default: None.
+            metric_items (list[str] | str, optional): Metric items that will
+                be returned. If not specified, ``['AR@100', 'AR@300',
+                'AR@1000', 'AR_s@1000', 'AR_m@1000', 'AR_l@1000' ]`` will be
+                used when ``metric=='proposal'``, ``['mAP', 'mAP_50', 'mAP_75',
+                'mAP_s', 'mAP_m', 'mAP_l']`` will be used when
+                ``metric=='bbox' or metric=='segm'``.
+
+        Returns:
+            dict[str, float]: COCO style evaluation metric.
+        """
+
+        metrics = metric if isinstance(metric, list) else [metric]
+        allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast']
+        for metric in metrics:
+            if metric not in allowed_metrics:
+                raise KeyError(f'metric {metric} is not supported')
+
+        coco_gt = self.coco
+        self.cat_ids = coco_gt.get_cat_ids(cat_names=self.CLASSES)
+
+        result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
+        try:
+            return self.evaluate_det_segm(
+                results, result_files, coco_gt, metrics, logger, classwise,
+                proposal_nums, iou_thrs, metric_items)
+        finally:
+            # Remove the scratch json files even when evaluation raises.
+            if tmp_dir is not None:
+                tmp_dir.cleanup()
diff --git a/mmdet/datasets/coco_occluded.py b/mmdet/datasets/coco_occluded.py
new file mode 100755
index 0000000..96e439a
--- /dev/null
+++ b/mmdet/datasets/coco_occluded.py
@@ -0,0 +1,219 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+
+import mmcv
+import numpy as np
+from mmcv.fileio import load
+from mmcv.utils import print_log
+from pycocotools import mask as coco_mask
+from terminaltables import AsciiTable
+
+from .builder import DATASETS
+from .coco import CocoDataset
+
+
+@DATASETS.register_module()
+class OccludedSeparatedCocoDataset(CocoDataset):
+    """COCO dataset with evaluation on separated and occluded masks which
+    presented in paper `A Tri-Layer Plugin to Improve Occluded Detection.
+
+    <https://arxiv.org/abs/2210.10046>`_.
+
+    Separated COCO and Occluded COCO are automatically generated subsets of
+    COCO val dataset, collecting separated objects and partially occluded
+    objects for a large variety of categories. In this way, we define
+    occlusion into two major categories: separated and partially occluded.
+
+    - Separation: target object segmentation mask is separated into distinct
+      regions by the occluder.
+    - Partial Occlusion: target object is partially occluded but the
+      segmentation mask is connected.
+
+    These two new scalable real-image datasets are to benchmark a model's
+    capability to detect occluded objects of 80 common categories.
+
+    Please cite the paper if you use this dataset:
+
+    @article{zhan2022triocc,
+        title={A Tri-Layer Plugin to Improve Occluded Detection},
+        author={Zhan, Guanqi and Xie, Weidi and Zisserman, Andrew},
+        journal={British Machine Vision Conference},
+        year={2022}
+    }
+
+    Args:
+        occluded_ann (str): Path to the occluded coco annotation file.
+        separated_ann (str): Path to the separated coco annotation file.
+    """  # noqa
+
+    def __init__(
+            self,
+            *args,
+            occluded_ann='https://www.robots.ox.ac.uk/~vgg/research/tpod/datasets/occluded_coco.pkl',  # noqa
+            separated_ann='https://www.robots.ox.ac.uk/~vgg/research/tpod/datasets/separated_coco.pkl',  # noqa
+            **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # load from local file
+        if osp.isfile(occluded_ann) and not osp.isabs(occluded_ann):
+            occluded_ann = osp.join(self.data_root, occluded_ann)
+        if osp.isfile(separated_ann) and not osp.isabs(separated_ann):
+            separated_ann = osp.join(self.data_root, separated_ann)
+        # NOTE(review): presumably unpickles .pkl input; use trusted sources.
+        self.occluded_ann = load(occluded_ann)
+        self.separated_ann = load(separated_ann)
+
+    def evaluate(self,
+                 results,
+                 metric=None,
+                 score_thr=0.3,
+                 iou_thr=0.75,
+                 **kwargs):
+        """Occluded and separated mask evaluation in COCO protocol.
+
+        Args:
+            results (list[tuple]): Testing results of the dataset.
+            metric (str | list[str] | None): Metrics to evaluate. Options are
+                'bbox', 'segm', 'proposal', 'proposal_fast'. Defaults to None.
+            score_thr (float): Score threshold of the detection masks.
+                Defaults to 0.3.
+            iou_thr (float): IoU threshold for the recall calculation.
+                Defaults to 0.75.
+        Returns:
+            dict[str, float]: The recall of occluded and separated masks and
+            COCO style evaluation metric.
+        """
+        metric = [] if metric is None else metric  # fresh list per call
+        coco_metric_res = super().evaluate(results, metric=metric, **kwargs)
+        coco_metric_res.update(
+            self.evaluate_occluded_separated(results, score_thr, iou_thr))
+        return coco_metric_res
+
+    def evaluate_occluded_separated(self,
+                                    results,
+                                    score_thr=0.3,
+                                    iou_thr=0.75):
+        """Compute the recall of occluded and separated masks.
+
+        Args:
+            results (list[tuple]): Testing results of the dataset.
+            score_thr (float): Score threshold of the detection masks.
+                Defaults to 0.3.
+            iou_thr (float): IoU threshold for the recall calculation.
+                Defaults to 0.75.
+        Returns:
+            dict[str, float]: The recall of occluded and separated masks.
+        """
+        dict_det = {}
+        print_log('processing detection results...')
+        prog_bar = mmcv.ProgressBar(len(results))
+        for i in range(len(results)):
+            cur_img_name = self.data_infos[i]['filename']
+            if cur_img_name not in dict_det:
+                dict_det[cur_img_name] = []
+            for cat_id in range(len(results[i][1])):
+                assert len(results[i][1][cat_id]) == len(results[i][0][cat_id])
+                for instance_id in range(len(results[i][1][cat_id])):
+                    cur_binary_mask = coco_mask.decode(
+                        results[i][1][cat_id][instance_id])
+                    cur_det_bbox = results[i][0][cat_id][instance_id][:4]
+                    dict_det[cur_img_name].append([
+                        results[i][0][cat_id][instance_id][4],
+                        self.CLASSES[cat_id], cur_binary_mask, cur_det_bbox
+                    ])
+            dict_det[cur_img_name].sort(
+                key=lambda x: (-x[0], x[3][0], x[3][1])
+            )  # rank by confidence from high to low, avoid same confidence
+            prog_bar.update()
+        print_log('\ncomputing occluded mask recall...')
+        occluded_correct_num, occluded_recall = self.compute_recall(
+            dict_det,
+            gt_ann=self.occluded_ann,
+            score_thr=score_thr,
+            iou_thr=iou_thr,
+            is_occ=True)
+        print_log(f'\nCOCO occluded mask recall: {occluded_recall:.2f}%')
+        print_log(f'COCO occluded mask success num: {occluded_correct_num}')
+        print_log('computing separated mask recall...')
+        separated_correct_num, separated_recall = self.compute_recall(
+            dict_det,
+            gt_ann=self.separated_ann,
+            score_thr=score_thr,
+            iou_thr=iou_thr,
+            is_occ=False)
+        print_log(f'\nCOCO separated mask recall: {separated_recall:.2f}%')
+        print_log(f'COCO separated mask success num: {separated_correct_num}')
+        table_data = [
+            ['mask type', 'recall', 'num correct'],
+            ['occluded', f'{occluded_recall:.2f}%', occluded_correct_num],
+            ['separated', f'{separated_recall:.2f}%', separated_correct_num]
+        ]
+        table = AsciiTable(table_data)
+        print_log('\n' + table.table)
+        return dict(
+            occluded_recall=occluded_recall, separated_recall=separated_recall)
+
+    def compute_recall(self,
+                       result_dict,
+                       gt_ann,
+                       score_thr=0.3,
+                       iou_thr=0.75,
+                       is_occ=True):
+        """Compute the recall of occluded or separated masks.
+
+        Args:
+            result_dict (dict): Detections per image name, sorted by score.
+            gt_ann (list): Occluded or separated coco annotations.
+            score_thr (float): Score threshold of the detection masks.
+                Defaults to 0.3.
+            iou_thr (float): IoU threshold for the recall calculation.
+                Defaults to 0.75.
+            is_occ (bool): Whether the annotation is occluded mask.
+                Defaults to True.
+        Returns:
+            tuple: number of correct masks and the recall.
+        """
+        correct = 0
+        prog_bar = mmcv.ProgressBar(len(gt_ann))
+        for iter_i in range(len(gt_ann)):
+            cur_item = gt_ann[iter_i]
+            cur_img_name = cur_item[0]
+            cur_gt_bbox = cur_item[3]
+            if is_occ:
+                cur_gt_bbox = [
+                    cur_gt_bbox[0], cur_gt_bbox[1],
+                    cur_gt_bbox[0] + cur_gt_bbox[2],
+                    cur_gt_bbox[1] + cur_gt_bbox[3]
+                ]
+            cur_gt_class = cur_item[1]
+            cur_gt_mask = coco_mask.decode(cur_item[4])
+
+            assert cur_img_name in result_dict
+            cur_detections = result_dict[cur_img_name]
+
+            correct_flag = False
+            for i in range(len(cur_detections)):
+                cur_det_confidence = cur_detections[i][0]
+                if cur_det_confidence < score_thr:
+                    break
+                cur_det_class = cur_detections[i][1]
+                if cur_det_class != cur_gt_class:
+                    continue
+                cur_det_mask = cur_detections[i][2]
+                cur_iou = self.mask_iou(cur_det_mask, cur_gt_mask)
+                if cur_iou >= iou_thr:
+                    correct_flag = True
+                    break
+            if correct_flag:
+                correct += 1
+            prog_bar.update()
+        recall = correct / len(gt_ann) * 100 if len(gt_ann) else 0.0
+        return correct, recall
+
+    def mask_iou(self, mask1, mask2):
+        """Compute IoU between two binary masks (0.0 if both are empty)."""
+        mask1_area = np.count_nonzero(mask1 == 1)
+        mask2_area = np.count_nonzero(mask2 == 1)
+        intersection = np.count_nonzero(np.logical_and(mask1 == 1, mask2 == 1))
+        union = mask1_area + mask2_area - intersection
+        return intersection / union if union > 0 else 0.0
diff --git a/mmdet/datasets/coco_panoptic.py b/mmdet/datasets/coco_panoptic.py
new file mode 100755
index 0000000..53ef594
--- /dev/null
+++ b/mmdet/datasets/coco_panoptic.py
@@ -0,0 +1,692 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import itertools
+import os
+from collections import defaultdict
+
+import mmcv
+import numpy as np
+from mmcv.utils import print_log
+from terminaltables import AsciiTable
+
+from mmdet.core import INSTANCE_OFFSET
+from .api_wrappers import COCO, pq_compute_multi_core
+from .builder import DATASETS
+from .coco import CocoDataset
+
+try:
+    import panopticapi
+    from panopticapi.evaluation import VOID
+    from panopticapi.utils import id2rgb
+except ImportError:
+    panopticapi = None
+    id2rgb = None
+    VOID = None
+
+__all__ = ['CocoPanopticDataset']
+
+
+class COCOPanoptic(COCO):
+    """This wrapper is for loading the panoptic style annotation file.
+
+    The format is shown in the CocoPanopticDataset class.
+
+    Args:
+        annotation_file (str): Path of annotation file.
+    """
+
+    def __init__(self, annotation_file=None):
+        # The optional panopticapi import must have succeeded to go on.
+        if panopticapi is None:
+            raise RuntimeError(
+                'panopticapi is not installed, please install it by: '
+                'pip install git+https://github.com/cocodataset/'
+                'panopticapi.git.')
+        super().__init__(annotation_file)
+
+    def createIndex(self):
+        """Build the id-keyed lookup tables from ``self.dataset``.
+
+        ``self.anns`` maps a segment id to a *list* of segment annotations,
+        because segment ids are not unique across images. Every segment
+        annotation is extended in place with the ``image_id``, ``height``
+        and ``width`` of its image.
+        """
+        print('creating index...')
+        anns, cats, imgs = {}, {}, {}
+        img_to_anns, cat_to_imgs = defaultdict(list), defaultdict(list)
+        has_categories = 'categories' in self.dataset
+
+        for img in self.dataset.get('images', []):
+            imgs[img['id']] = img
+        for cat in self.dataset.get('categories', []):
+            cats[cat['id']] = cat
+
+        for ann in self.dataset.get('annotations', []):
+            image_id = ann['image_id']
+            # Look the image up by id instead of pairing the two lists by
+            # position, which silently mixes images up when the lists
+            # differ in order or length.
+            img_info = imgs[image_id]
+            img_info['segm_file'] = ann['file_name']
+            for seg_ann in ann['segments_info']:
+                # to match with instance.json
+                seg_ann['image_id'] = image_id
+                seg_ann['height'] = img_info['height']
+                seg_ann['width'] = img_info['width']
+                img_to_anns[image_id].append(seg_ann)
+                anns.setdefault(seg_ann['id'], []).append(seg_ann)
+                if has_categories:
+                    cat_to_imgs[seg_ann['category_id']].append(image_id)
+
+        print('index created!')
+
+        self.anns = anns
+        self.imgToAnns = img_to_anns
+        self.catToImgs = cat_to_imgs
+        self.imgs = imgs
+        self.cats = cats
+
+    def load_anns(self, ids=[]):
+        """Load the segment annotations with the specified ids.
+
+        ``self.anns`` maps each id to a list of annotations (segment ids
+        are not unique), so the per-id lists are concatenated.
+
+        Args:
+            ids (Sequence[int] | int): Segment id(s) to look up.
+
+        Returns:
+            list[dict] | None: Annotations of all requested ids for a sized
+                iterable, the annotation list of a single integer id, or
+                None for any other input.
+        """
+        if hasattr(ids, '__iter__') and hasattr(ids, '__len__'):
+            anns = []
+            for ann_id in ids:  # ``ann_id`` avoids shadowing builtin ``id``
+                anns += self.anns[ann_id]
+            return anns
+        # isinstance (not ``type(ids) == int``) also accepts numpy integers.
+        if isinstance(ids, (int, np.integer)):
+            return self.anns[ids]
+
+
+@DATASETS.register_module()
+class CocoPanopticDataset(CocoDataset):
+    """Coco dataset for Panoptic segmentation.
+
+    The annotation format is shown as follows. The `ann` field is optional
+    for testing.
+
+    .. code-block:: none
+
+        [
+            {
+                'file_name': f'{image_id:012}.png',
+                'image_id': 9,
+                # one dict per segment of the image; segment ids are not
+                # unique across images
+                'segments_info': [
+                    {
+                        'id': 8345037, (segment_id in panoptic png,
+                                        convert from rgb)
+                        'category_id': 51,
+                        'iscrowd': 0,
+                        'bbox': (x1, y1, w, h),
+                        'area': 24315,
+                        'segmentation': list,(coded mask)
+                    },
+                    ...
+                ]
+            },
+            ...
+        ]
+
+    Args:
+        ann_file (str): Panoptic segmentation annotation file path.
+        pipeline (list[dict]): Processing pipeline.
+        ins_ann_file (str): Instance segmentation annotation file path.
+            Defaults to None.
+        classes (str | Sequence[str], optional): Specify classes to load.
+            If is None, ``cls.CLASSES`` will be used. Defaults to None.
+        data_root (str, optional): Data root for ``ann_file``,
+            ``ins_ann_file``, ``img_prefix``, ``seg_prefix``, ``proposal_file``
+            if specified. Defaults to None.
+        img_prefix (str, optional): Prefix of path to images. Defaults to ''.
+        seg_prefix (str, optional): Prefix of path to segmentation files.
+            Defaults to None.
+        proposal_file (str, optional): Path to proposal file. Defaults to None.
+        test_mode (bool, optional): If set True, annotation will not be loaded.
+            Defaults to False.
+        filter_empty_gt (bool, optional): If set true, images without bounding
+            boxes of the dataset's classes will be filtered out. This option
+            only works when `test_mode=False`, i.e., we never filter images
+            during tests. Defaults to True.
+        file_client_args (:obj:`mmcv.ConfigDict` | dict): file client args.
+            Defaults to dict(backend='disk').
+    """
+    CLASSES = [
+        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+        'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+        'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
+        'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
+        'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
+        'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
+        'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
+        'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
+        'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
+        'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
+        'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
+        'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+        'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'banner',
+        'blanket', 'bridge', 'cardboard', 'counter', 'curtain', 'door-stuff',
+        'floor-wood', 'flower', 'fruit', 'gravel', 'house', 'light',
+        'mirror-stuff', 'net', 'pillow', 'platform', 'playingfield',
+        'railroad', 'river', 'road', 'roof', 'sand', 'sea', 'shelf', 'snow',
+        'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', 'wall-tile',
+        'wall-wood', 'water-other', 'window-blind', 'window-other',
+        'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+        'cabinet-merged', 'table-merged', 'floor-other-merged',
+        'pavement-merged', 'mountain-merged', 'grass-merged', 'dirt-merged',
+        'paper-merged', 'food-other-merged', 'building-other-merged',
+        'rock-merged', 'wall-other-merged', 'rug-merged'
+    ]
+    THING_CLASSES = [
+        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+        'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+        'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
+        'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
+        'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
+        'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
+        'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
+        'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
+        'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
+        'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
+        'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
+        'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+        'scissors', 'teddy bear', 'hair drier', 'toothbrush'
+    ]
+    STUFF_CLASSES = [
+        'banner', 'blanket', 'bridge', 'cardboard', 'counter', 'curtain',
+        'door-stuff', 'floor-wood', 'flower', 'fruit', 'gravel', 'house',
+        'light', 'mirror-stuff', 'net', 'pillow', 'platform', 'playingfield',
+        'railroad', 'river', 'road', 'roof', 'sand', 'sea', 'shelf', 'snow',
+        'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', 'wall-tile',
+        'wall-wood', 'water-other', 'window-blind', 'window-other',
+        'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+        'cabinet-merged', 'table-merged', 'floor-other-merged',
+        'pavement-merged', 'mountain-merged', 'grass-merged', 'dirt-merged',
+        'paper-merged', 'food-other-merged', 'building-other-merged',
+        'rock-merged', 'wall-other-merged', 'rug-merged'
+    ]
+
+    PALETTE = [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230),
+               (106, 0, 228), (0, 60, 100), (0, 80, 100), (0, 0, 70),
+               (0, 0, 192), (250, 170, 30), (100, 170, 30), (220, 220, 0),
+               (175, 116, 175), (250, 0, 30), (165, 42, 42), (255, 77, 255),
+               (0, 226, 252), (182, 182, 255), (0, 82, 0), (120, 166, 157),
+               (110, 76, 0), (174, 57, 255), (199, 100, 0), (72, 0, 118),
+               (255, 179, 240), (0, 125, 92), (209, 0, 151), (188, 208, 182),
+               (0, 220, 176), (255, 99, 164), (92, 0, 73), (133, 129, 255),
+               (78, 180, 255), (0, 228, 0), (174, 255, 243), (45, 89, 255),
+               (134, 134, 103), (145, 148, 174), (255, 208, 186),
+               (197, 226, 255), (171, 134, 1), (109, 63, 54), (207, 138, 255),
+               (151, 0, 95), (9, 80, 61), (84, 105, 51), (74, 65, 105),
+               (166, 196, 102), (208, 195, 210), (255, 109, 65), (0, 143, 149),
+               (179, 0, 194), (209, 99, 106), (5, 121, 0), (227, 255, 205),
+               (147, 186, 208), (153, 69, 1), (3, 95, 161), (163, 255, 0),
+               (119, 0, 170), (0, 182, 199), (0, 165, 120), (183, 130, 88),
+               (95, 32, 0), (130, 114, 135), (110, 129, 133), (166, 74, 118),
+               (219, 142, 185), (79, 210, 114), (178, 90, 62), (65, 70, 15),
+               (127, 167, 115), (59, 105, 106), (142, 108, 45), (196, 172, 0),
+               (95, 54, 80), (128, 76, 255), (201, 57, 1), (246, 0, 122),
+               (191, 162, 208), (255, 255, 128), (147, 211, 203),
+               (150, 100, 100), (168, 171, 172), (146, 112, 198),
+               (210, 170, 100), (92, 136, 89), (218, 88, 184), (241, 129, 0),
+               (217, 17, 255), (124, 74, 181), (70, 70, 70), (255, 228, 255),
+               (154, 208, 0), (193, 0, 92), (76, 91, 113), (255, 180, 195),
+               (106, 154, 176),
+               (230, 150, 140), (60, 143, 255), (128, 64, 128), (92, 82, 55),
+               (254, 212, 124), (73, 77, 174), (255, 160, 98), (255, 255, 255),
+               (104, 84, 109), (169, 164, 131), (225, 199, 255), (137, 54, 74),
+               (135, 158, 223), (7, 246, 231), (107, 255, 200), (58, 41, 149),
+               (183, 121, 142), (255, 73, 97), (107, 142, 35), (190, 153, 153),
+               (146, 139, 141),
+               (70, 130, 180), (134, 199, 156), (209, 226, 140), (96, 36, 108),
+               (96, 96, 96), (64, 170, 64), (152, 251, 152), (208, 229, 228),
+               (206, 186, 171), (152, 161, 64), (116, 112, 0), (0, 114, 143),
+               (102, 102, 156), (250, 141, 255)]
+
+    def __init__(self,
+                 ann_file,
+                 pipeline,
+                 ins_ann_file=None,
+                 classes=None,
+                 data_root=None,
+                 img_prefix='',
+                 seg_prefix=None,
+                 proposal_file=None,
+                 test_mode=False,
+                 filter_empty_gt=True,
+                 file_client_args=dict(backend='disk')):
+        # ``ins_ann_file`` is stored here; the rest is handed to the parent.
+        parent_kwargs = dict(
+            classes=classes,
+            data_root=data_root,
+            img_prefix=img_prefix,
+            seg_prefix=seg_prefix,
+            proposal_file=proposal_file,
+            test_mode=test_mode,
+            filter_empty_gt=filter_empty_gt,
+            file_client_args=file_client_args)
+        super().__init__(ann_file, pipeline, **parent_kwargs)
+        self.ins_ann_file = ins_ann_file
+
+    def load_annotations(self, ann_file):
+        """Read a COCO Panoptic style annotation file.
+
+        Args:
+            ann_file (str): Path of annotation file.
+
+        Returns:
+            list[dict]: The image infos of the COCO api, each extended with
+                the ``filename`` and ``segm_file`` keys.
+        """
+        coco = self.coco = COCOPanoptic(ann_file)
+        self.cat_ids = coco.get_cat_ids()
+        self.cat2label = {cid: lbl for lbl, cid in enumerate(self.cat_ids)}
+        self.categories = coco.cats
+        self.img_ids = coco.get_img_ids()
+        data_infos = [coco.load_imgs([img_id])[0] for img_id in self.img_ids]
+        for info in data_infos:
+            filename = info['filename'] = info['file_name']
+            # the segment map is named like the image, 'jpg' becoming 'png'
+            info['segm_file'] = filename.replace('jpg', 'png')
+        return data_infos
+
+    def get_ann_info(self, idx):
+        """Get the parsed panoptic annotation by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            dict: Annotation info of specified index.
+        """
+        img_info = self.data_infos[idx]
+        img_id = img_info['id']
+        found = self.coco.load_anns(self.coco.get_ann_ids(img_ids=[img_id]))
+        # a looked-up id may also return segments of other images; drop them
+        own_anns = [ann for ann in found if ann['image_id'] == img_id]
+        return self._parse_ann_info(img_info, own_anns)
+
+    def _parse_ann_info(self, img_info, ann_info):
+        """Turn the segment annotations of one image into ground truths.
+
+        Segments with a non-positive area or a box narrower or shorter than
+        one pixel are skipped. Non-crowd "thing" segments fill ``bboxes``
+        and ``labels``; crowd "thing" segments fill ``bboxes_ignore`` and
+        are recorded with ``is_thing=False``. Every kept segment gets an
+        entry in ``masks``.
+
+        Args:
+            img_info (dict): Image info of an image.
+            ann_info (list[dict]): Annotation info of an image.
+
+        Returns:
+            dict: A dict containing the following keys: bboxes, bboxes_ignore,
+                labels, masks, seg_map. Boxes are float32 arrays of shape
+                (n, 4) in (x1, y1, x2, y2) order, labels an int64 array and
+                seg_map the ``segm_file`` of the image.
+        """
+        boxes, labels = [], []
+        ignored_boxes, mask_infos = [], []
+
+        for ann in ann_info:
+            x1, y1, w, h = ann['bbox']
+            if ann['area'] <= 0 or w < 1 or h < 1:
+                continue
+            # (x, y, w, h) -> (x1, y1, x2, y2)
+            box = [x1, y1, x1 + w, y1 + h]
+
+            cat_id = ann['category_id']
+            label = self.cat2label[cat_id]
+            is_thing = self.coco.load_cats(ids=cat_id)[0]['isthing']
+
+            if is_thing and ann.get('iscrowd', False):
+                # a crowd region is ignored as a box and no longer a thing
+                ignored_boxes.append(box)
+                is_thing = False
+            elif is_thing:
+                boxes.append(box)
+                labels.append(label)
+
+            # every kept segment is described, thing or not
+            mask_infos.append({
+                'id': ann['id'],
+                'category': label,
+                'is_thing': is_thing
+            })
+
+        # ``reshape(-1, 4)`` yields the (0, 4) shape when a list is empty
+        gt_bboxes = np.array(boxes, dtype=np.float32).reshape(-1, 4)
+        gt_labels = np.array(labels, dtype=np.int64)
+        gt_bboxes_ignore = np.array(
+            ignored_boxes, dtype=np.float32).reshape(-1, 4)
+
+        parsed = dict(
+            bboxes=gt_bboxes,
+            labels=gt_labels,
+            bboxes_ignore=gt_bboxes_ignore,
+            masks=mask_infos,
+            seg_map=img_info['segm_file'])
+
+        return parsed
+
+    def _filter_imgs(self, min_size=32):
+        """Filter out images that are too small or have no thing annotation.
+
+        An image is kept when its shorter side is at least ``min_size`` and,
+        if ``self.filter_empty_gt`` is set, it owns a "thing" annotation.
+        ``self.img_ids`` is narrowed to the kept images as a side effect.
+        """
+        # ids of the images that own at least one "thing" segment
+        imgs_with_things = {
+            seg['image_id']
+            for segs in self.coco.anns.values() for seg in segs
+            if self.coco.load_cats(ids=seg['category_id'])[0]['isthing']
+        }
+
+        kept = []
+        for idx, img_info in enumerate(self.data_infos):
+            img_id = self.img_ids[idx]
+            if self.filter_empty_gt and img_id not in imgs_with_things:
+                continue
+            if min(img_info['width'], img_info['height']) >= min_size:
+                kept.append((idx, img_id))
+
+        self.img_ids = [img_id for _, img_id in kept]
+        return [idx for idx, _ in kept]
+
+    def _pan2json(self, results, outfile_prefix):
+        """Convert panoptic results to COCO panoptic json style.
+
+        The colour-encoded segment maps are also written as images to the
+        ``panoptic`` directory next to ``outfile_prefix``.
+        """
+        label2cat = {label: cat for cat, label in self.cat2label.items()}
+        void_label = len(self.CLASSES)
+        outdir = os.path.join(os.path.dirname(outfile_prefix), 'panoptic')
+        pred_annotations = []
+        for idx in range(len(self)):
+            img_id = self.img_ids[idx]
+            segm_file = self.data_infos[idx]['segm_file']
+            # Copy first: the VOID relabelling below must not overwrite the
+            # caller's results, or a second evaluation would misread them.
+            pan = results[idx].copy()
+
+            segm_info = []
+            for pan_label in np.unique(pan):
+                sem_label = pan_label % INSTANCE_OFFSET
+                # We reserve the length of self.CLASSES for VOID label
+                if sem_label == void_label:
+                    continue
+                # convert sem_label to json label
+                cat_id = label2cat[sem_label]
+                segm_info.append({
+                    'id': int(pan_label),
+                    'category_id': cat_id,
+                    'isthing': self.categories[cat_id]['isthing'],
+                    'area': int((pan == pan_label).sum())
+                })
+            # evaluation script uses 0 for VOID label.
+            pan[pan % INSTANCE_OFFSET == void_label] = VOID
+            pan = id2rgb(pan).astype(np.uint8)
+            mmcv.imwrite(pan[:, :, ::-1], os.path.join(outdir, segm_file))
+            pred_annotations.append({
+                'image_id': img_id,
+                'segments_info': segm_info,
+                'file_name': segm_file
+            })
+        return dict(annotations=pred_annotations)
+
+    def results2json(self, results, outfile_prefix):
+        """Dump the results to COCO style json files.
+
+        Each result dict may hold panoptic predictions (``pan_results``)
+        and/or instance predictions (``ins_results``); the keys of the first
+        result decide which json files are written. The expected layout
+        of ``results`` is:
+
+        .. code-block:: none
+
+            [
+                {
+                    'pan_results': np.array, # shape (h, w)
+                    # ins_results which includes bboxes and RLE encoded masks
+                    # is optional.
+                    'ins_results': (list[np.array], list[list[str]])
+                },
+                ...
+            ]
+
+        Args:
+            results (list[dict]): Testing results of the dataset.
+            outfile_prefix (str): The filename prefix of the json files. If the
+                prefix is "somepath/xxx", the json files will be named
+                "somepath/xxx.panoptic.json", "somepath/xxx.bbox.json",
+                "somepath/xxx.segm.json"
+
+        Returns:
+            dict[str: str]: Possible keys are "panoptic", "bbox", "segm",
+                "proposal", and values are corresponding filenames.
+        """
+        result_files = {}
+        if 'pan_results' in results[0]:
+            # panoptic segmentation results
+            pan_json = self._pan2json(
+                [res['pan_results'] for res in results], outfile_prefix)
+            result_files['panoptic'] = f'{outfile_prefix}.panoptic.json'
+            mmcv.dump(pan_json, result_files['panoptic'])
+
+        if 'ins_results' in results[0]:
+            # instance segmentation results
+            bbox_json, segm_json = self._segm2json(
+                [res['ins_results'] for res in results])
+            bbox_file = f'{outfile_prefix}.bbox.json'
+            segm_file = f'{outfile_prefix}.segm.json'
+            result_files.update(
+                bbox=bbox_file, proposal=bbox_file, segm=segm_file)
+            mmcv.dump(bbox_json, bbox_file)
+            mmcv.dump(segm_json, segm_file)
+        return result_files
+
+    def evaluate_pan_json(self,
+                          result_files,
+                          outfile_prefix,
+                          logger=None,
+                          classwise=False,
+                          nproc=32):
+        """Evaluate PQ according to the panoptic results json file.
+
+        Returns:
+            dict: PQ/SQ/RQ percentages for all, thing (``_th``) and stuff
+                (``_st``) classes plus the ``PQ_copypaste`` summary string.
+
+        Raises:
+            Exception: If an image with annotations has no prediction.
+        """
+        imgs = self.coco.imgs
+        # ground truth: one record per entry of the image -> annotations map
+        gt_records = [{
+            'image_id': img_id,
+            'segments_info': segments,
+            'file_name': imgs[img_id]['segm_file']
+        } for img_id, segments in self.coco.img_ann_map.items()]
+        predictions = {
+            pred['image_id']: pred
+            for pred in mmcv.load(result_files['panoptic'])['annotations']
+        }
+
+        # match the gt_anns and pred_anns in the same image
+        matched = []
+        for gt_record in gt_records:
+            img_id = gt_record['image_id']
+            if img_id not in predictions:
+                raise Exception('no prediction for the image'
+                                ' with id: {}'.format(img_id))
+            matched.append((gt_record, predictions[img_id]))
+
+        pq_stat = pq_compute_multi_core(
+            matched,
+            self.seg_prefix,  # folder of the ground-truth segment maps
+            os.path.join(os.path.dirname(outfile_prefix), 'panoptic'),
+            self.categories,
+            self.file_client,
+            nproc=nproc)
+
+        pq_results = {}
+        subsets = [('All', None), ('Things', True), ('Stuff', False)]
+        for name, isthing in subsets:
+            averaged, per_class = pq_stat.pq_average(
+                self.categories, isthing=isthing)
+            pq_results[name] = averaged
+            if name == 'All':
+                pq_results['classwise'] = per_class
+
+        classwise_results = None
+        if classwise:
+            classwise_results = dict(
+                zip(self.CLASSES, pq_results['classwise'].values()))
+        print_panoptic_table(pq_results, classwise_results, logger=logger)
+        results = parse_pq_results(pq_results)
+        summary_keys = ('PQ', 'SQ', 'RQ', 'PQ_th', 'SQ_th', 'RQ_th', 'PQ_st',
+                        'SQ_st', 'RQ_st')
+        results['PQ_copypaste'] = ' '.join(
+            f'{results[key]:.3f}' for key in summary_keys)
+        return results
+
+    def evaluate(self,
+                 results,
+                 metric='PQ',
+                 logger=None,
+                 jsonfile_prefix=None,
+                 classwise=False,
+                 nproc=32,
+                 **kwargs):
+        """Evaluation in COCO Panoptic protocol.
+
+        Args:
+            results (list[dict]): Testing results of the dataset.
+            metric (str | list[str]): Metrics to be evaluated. 'PQ', 'bbox',
+                'segm', 'proposal' are supported. 'pq' will be regarded as 'PQ'.
+            logger (logging.Logger | str | None): Logger used for printing
+                related information during evaluation. Default: None.
+            jsonfile_prefix (str | None): The prefix of json files. It includes
+                the file path and the prefix of filename, e.g., "a/b/prefix".
+                If not specified, a temp file will be created. Default: None.
+            classwise (bool): Whether to print classwise evaluation results.
+                Default: False.
+            nproc (int): Number of processes for panoptic quality computing.
+                Defaults to 32. When `nproc` exceeds the number of cpu cores,
+                the number of cpu cores is used.
+
+        Returns:
+            dict[str, float]: COCO Panoptic style evaluation metric.
+        """
+        metrics = metric if isinstance(metric, list) else [metric]
+        # Compatible with lowercase 'pq'
+        metrics = ['PQ' if metric == 'pq' else metric for metric in metrics]
+        allowed_metrics = ['PQ', 'bbox', 'segm', 'proposal']
+        for metric in metrics:
+            if metric not in allowed_metrics:
+                raise KeyError(f'metric {metric} is not supported')
+
+        result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
+        eval_results = {}
+        outfile_prefix = os.path.join(tmp_dir.name, 'results') \
+            if tmp_dir is not None else jsonfile_prefix
+        try:  # always clean the temporary directory up, even on failure
+            if 'PQ' in metrics:
+                eval_pan_results = self.evaluate_pan_json(
+                    result_files, outfile_prefix, logger, classwise,
+                    nproc=nproc)
+                eval_results.update(eval_pan_results)
+                metrics.remove('PQ')
+
+            if (('bbox' in metrics) or ('segm' in metrics)
+                    or ('proposal' in metrics)):
+                assert 'ins_results' in results[0], 'instance segmentation ' \
+                    'results are absent from results'
+                assert self.ins_ann_file is not None, 'Annotation ' \
+                    'file for instance segmentation or object detection ' \
+                    'should not be None'
+                coco_gt = COCO(self.ins_ann_file)
+                panoptic_cat_ids = self.cat_ids
+                self.cat_ids = coco_gt.get_cat_ids(
+                    cat_names=self.THING_CLASSES)
+                try:
+                    eval_ins_results = self.evaluate_det_segm(
+                        results, result_files, coco_gt, metrics, logger,
+                        classwise, **kwargs)
+                finally:
+                    # restore the panoptic ids even if evaluation fails
+                    self.cat_ids = panoptic_cat_ids
+                eval_results.update(eval_ins_results)
+        finally:
+            if tmp_dir is not None:
+                tmp_dir.cleanup()
+        return eval_results
+
+
+def parse_pq_results(pq_results):
+    """Flatten the grouped PQ results into percentage-valued metrics.
+
+    Keys are ``PQ``/``SQ``/``RQ`` for all classes, with a ``_th`` suffix for
+    things and a ``_st`` suffix for stuff.
+    """
+    result = {}
+    groups = (('All', ''), ('Things', '_th'), ('Stuff', '_st'))
+    for group, suffix in groups:
+        group_stats = pq_results[group]
+        for metric in ('pq', 'sq', 'rq'):
+            result[metric.upper() + suffix] = 100 * group_stats[metric]
+    return result
+
+
+def print_panoptic_table(pq_results, classwise_results=None, logger=None):
+    """Print the panoptic evaluation results table.
+
+    Args:
+        pq_results(dict): The Panoptic Quality results.
+        classwise_results(dict | None): The classwise Panoptic Quality results.
+            The keys are class names and the values are metrics.
+        logger (logging.Logger | str | None): Logger used for printing
+            related information during evaluation. Default: None.
+    """
+
+    def _percentages(stats):
+        # PQ, SQ and RQ as percentage strings with three decimals
+        return [f'{(stats[k] * 100):0.3f}' for k in ['pq', 'sq', 'rq']]
+
+    data = [['', 'PQ', 'SQ', 'RQ', 'categories']]
+    for name in ['All', 'Things', 'Stuff']:
+        stats = pq_results[name]
+        data.append([name] + _percentages(stats) + [stats['n']])
+    print_log(
+        'Panoptic Evaluation Results:\n' + AsciiTable(data).table,
+        logger=logger)
+
+    if classwise_results is None:
+        return
+
+    class_metrics = [(name, *_percentages(metrics))
+                     for name, metrics in classwise_results.items()]
+    # at most two classes (four columns each) are laid out per table row
+    num_columns = min(8, len(class_metrics) * 4)
+    results_flatten = list(itertools.chain.from_iterable(class_metrics))
+    data = [['category', 'PQ', 'SQ', 'RQ'] * (num_columns // 4)]
+    data.extend(itertools.zip_longest(
+        *[results_flatten[i::num_columns] for i in range(num_columns)]))
+    print_log(
+        'Classwise Panoptic Evaluation Results:\n' + AsciiTable(data).table,
+        logger=logger)
diff --git a/mmdet/datasets/custom.py b/mmdet/datasets/custom.py
new file mode 100755
index 0000000..3b97685
--- /dev/null
+++ b/mmdet/datasets/custom.py
@@ -0,0 +1,412 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import warnings
+from collections import OrderedDict
+
+import mmcv
+import numpy as np
+from mmcv.utils import print_log
+from terminaltables import AsciiTable
+from torch.utils.data import Dataset
+
+from mmdet.core import eval_map, eval_recalls
+from .builder import DATASETS
+from .pipelines import Compose
+
+
+@DATASETS.register_module()
+class CustomDataset(Dataset):
+    """Custom dataset for detection.
+
+    The annotation format is shown as follows. The `ann` field is optional for
+    testing.
+
+    .. code-block:: none
+
+        [
+            {
+                'filename': 'a.jpg',
+                'width': 1280,
+                'height': 720,
+                'ann': {
+                    'bboxes': <np.ndarray> (n, 4) in (x1, y1, x2, y2) order.
+                    'labels': <np.ndarray> (n, ),
+                    'bboxes_ignore': <np.ndarray> (k, 4), (optional field)
+                    'labels_ignore': <np.ndarray> (k, ) (optional field)
+                }
+            },
+            ...
+        ]
+
+    Args:
+        ann_file (str): Annotation file path.
+        pipeline (list[dict]): Processing pipeline.
+        classes (str | Sequence[str], optional): Specify classes to load.
+            If is None, ``cls.CLASSES`` will be used. Default: None.
+        data_root (str, optional): Data root for ``ann_file``,
+            ``img_prefix``, ``seg_prefix``, ``proposal_file`` if specified.
+        test_mode (bool, optional): If set True, annotation will not be loaded.
+        filter_empty_gt (bool, optional): If set true, images without bounding
+            boxes of the dataset's classes will be filtered out. This option
+            only works when `test_mode=False`, i.e., we never filter images
+            during tests.
+    """
+
+    CLASSES = None
+
+    PALETTE = None
+
+    def __init__(self,
+                 ann_file,
+                 pipeline,
+                 classes=None,
+                 data_root=None,
+                 img_prefix='',
+                 seg_prefix=None,
+                 seg_suffix='.png',
+                 proposal_file=None,
+                 test_mode=False,
+                 filter_empty_gt=True,
+                 file_client_args=dict(backend='disk')):
+        self.ann_file = ann_file
+        self.data_root = data_root
+        self.img_prefix = img_prefix
+        self.seg_prefix = seg_prefix
+        self.seg_suffix = seg_suffix
+        self.proposal_file = proposal_file
+        self.test_mode = test_mode
+        self.filter_empty_gt = filter_empty_gt
+        self.file_client = mmcv.FileClient(**file_client_args)
+        self.CLASSES = self.get_classes(classes)
+
+        # join paths if data_root is specified
+        if self.data_root is not None:
+            if not osp.isabs(self.ann_file):
+                self.ann_file = osp.join(self.data_root, self.ann_file)
+            if not (self.img_prefix is None or osp.isabs(self.img_prefix)):
+                self.img_prefix = osp.join(self.data_root, self.img_prefix)
+            if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)):
+                self.seg_prefix = osp.join(self.data_root, self.seg_prefix)
+            if not (self.proposal_file is None
+                    or osp.isabs(self.proposal_file)):
+                self.proposal_file = osp.join(self.data_root,
+                                              self.proposal_file)
+        # load annotations (and proposals)
+        if hasattr(self.file_client, 'get_local_path'):
+            with self.file_client.get_local_path(self.ann_file) as local_path:
+                self.data_infos = self.load_annotations(local_path)
+        else:
+            warnings.warn(
+                'The used MMCV version does not have get_local_path. '
+                f'We treat the {self.ann_file} as local paths and it '
+                'might cause errors if the path is not a local path. '
+                'Please use MMCV>= 1.3.16 if you meet errors.')
+            self.data_infos = self.load_annotations(self.ann_file)
+
+        if self.proposal_file is not None:
+            if hasattr(self.file_client, 'get_local_path'):
+                with self.file_client.get_local_path(
+                        self.proposal_file) as local_path:
+                    self.proposals = self.load_proposals(local_path)
+            else:
+                warnings.warn(
+                    'The used MMCV version does not have get_local_path. '
+                    f'We treat the {self.proposal_file} as local paths and it '
+                    'might cause errors if the path is not a local path. '
+                    'Please use MMCV>= 1.3.16 if you meet errors.')
+                self.proposals = self.load_proposals(self.proposal_file)
+        else:
+            self.proposals = None
+
+        # filter images too small and containing no annotations
+        if not test_mode:
+            valid_inds = self._filter_imgs()
+            self.data_infos = [self.data_infos[i] for i in valid_inds]
+            if self.proposals is not None:
+                self.proposals = [self.proposals[i] for i in valid_inds]
+            # set group flag for the sampler
+            self._set_group_flag()
+
+        # processing pipeline
+        self.pipeline = Compose(pipeline)
+
+    def __len__(self):
+        """Total number of samples of data."""
+        return len(self.data_infos)
+
+    def load_annotations(self, ann_file):
+        """Load annotation from annotation file."""
+        return mmcv.load(ann_file)
+
+    def load_proposals(self, proposal_file):
+        """Load proposal from proposal file."""
+        return mmcv.load(proposal_file)
+
+    def get_ann_info(self, idx):
+        """Get annotation by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            dict: Annotation info of specified index.
+        """
+
+        return self.data_infos[idx]['ann']
+
+    def get_cat_ids(self, idx):
+        """Get category ids by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            list[int]: All categories in the image of specified index.
+        """
+        # ``np.int`` was removed in NumPy 1.24; it aliased the builtin ``int``.
+        return self.data_infos[idx]['ann']['labels'].astype(int).tolist()
+
+    def pre_pipeline(self, results):
+        """Prepare results dict for pipeline."""
+        results['img_prefix'] = self.img_prefix
+        results['seg_prefix'] = self.seg_prefix
+        results['proposal_file'] = self.proposal_file
+        results['bbox_fields'] = []
+        results['mask_fields'] = []
+        results['seg_fields'] = []
+
+    def _filter_imgs(self, min_size=32):
+        """Filter images too small."""
+        if self.filter_empty_gt:
+            warnings.warn(
+                'CustomDataset does not support filtering empty gt images.')
+        valid_inds = []
+        for i, img_info in enumerate(self.data_infos):
+            if min(img_info['width'], img_info['height']) >= min_size:
+                valid_inds.append(i)
+        return valid_inds
+
+    def _set_group_flag(self):
+        """Set flag according to image aspect ratio.
+
+        Images with aspect ratio greater than 1 will be set as group 1,
+        otherwise group 0.
+        """
+        self.flag = np.zeros(len(self), dtype=np.uint8)
+        for i in range(len(self)):
+            img_info = self.data_infos[i]
+            if img_info['width'] / img_info['height'] > 1:
+                self.flag[i] = 1
+
+    def _rand_another(self, idx):
+        """Get another random index from the same group as the given index."""
+        pool = np.where(self.flag == self.flag[idx])[0]
+        return np.random.choice(pool)
+
+    def __getitem__(self, idx):
+        """Get training/test data after pipeline.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            dict: Training/test data (with annotation if `test_mode` is set \
+                False).
+        """
+
+        if self.test_mode:
+            return self.prepare_test_img(idx)
+        while True:
+            data = self.prepare_train_img(idx)
+            if data is None:
+                idx = self._rand_another(idx)
+                continue
+            return data
+
+    def prepare_train_img(self, idx):
+        """Get training data and annotations after pipeline.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            dict: Training data and annotation after pipeline with new keys \
+                introduced by pipeline.
+        """
+
+        img_info = self.data_infos[idx]
+        ann_info = self.get_ann_info(idx)
+        results = dict(img_info=img_info, ann_info=ann_info)
+        if self.proposals is not None:
+            results['proposals'] = self.proposals[idx]
+        self.pre_pipeline(results)
+        return self.pipeline(results)
+
+    def prepare_test_img(self, idx):
+        """Get testing data after pipeline.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            dict: Testing data after pipeline with new keys introduced by \
+                pipeline.
+        """
+
+        img_info = self.data_infos[idx]
+        results = dict(img_info=img_info)
+        if self.proposals is not None:
+            results['proposals'] = self.proposals[idx]
+        self.pre_pipeline(results)
+        return self.pipeline(results)
+
+    @classmethod
+    def get_classes(cls, classes=None):
+        """Get class names of current dataset.
+
+        Args:
+            classes (Sequence[str] | str | None): If classes is None, use
+                default CLASSES defined by builtin dataset. If classes is a
+                string, take it as a file name. The file contains the name of
+                classes where each line contains one class name. If classes is
+                a tuple or list, override the CLASSES defined by the dataset.
+
+        Returns:
+            tuple[str] or list[str]: Names of categories of the dataset.
+        """
+        if classes is None:
+            return cls.CLASSES
+
+        if isinstance(classes, str):
+            # take it as a file path
+            class_names = mmcv.list_from_file(classes)
+        elif isinstance(classes, (tuple, list)):
+            class_names = classes
+        else:
+            raise ValueError(f'Unsupported type {type(classes)} of classes.')
+
+        return class_names
+
+    def get_cat2imgs(self):
+        """Get a dict with class as key and img_ids as values, which will be
+        used in :class:`ClassAwareSampler`.
+
+        Returns:
+            dict[list]: A dict of per-label image list,
+            the item of the dict indicates a label index,
+            corresponds to the image index that contains the label.
+        """
+        if self.CLASSES is None:
+            raise ValueError('self.CLASSES can not be None')
+        # sort the label index
+        cat2imgs = {i: [] for i in range(len(self.CLASSES))}
+        for i in range(len(self)):
+            cat_ids = set(self.get_cat_ids(i))
+            for cat in cat_ids:
+                cat2imgs[cat].append(i)
+        return cat2imgs
+
+    def format_results(self, results, **kwargs):
+        """Place holder to format result to dataset specific output."""
+
+    def evaluate(self,
+                 results,
+                 metric='mAP',
+                 logger=None,
+                 proposal_nums=(100, 300, 1000),
+                 iou_thr=0.5,
+                 scale_ranges=None):
+        """Evaluate the dataset.
+
+        Args:
+            results (list): Testing results of the dataset.
+            metric (str | list[str]): Metrics to be evaluated.
+            logger (logging.Logger | None | str): Logger used for printing
+                related information during evaluation. Default: None.
+            proposal_nums (Sequence[int]): Proposal number used for evaluating
+                recalls, such as recall@100, recall@1000.
+                Default: (100, 300, 1000).
+            iou_thr (float | list[float]): IoU threshold. Default: 0.5.
+            scale_ranges (list[tuple] | None): Scale ranges for evaluating mAP.
+                Default: None.
+
+        Returns:
+            OrderedDict: ``AP<thr>`` and ``mAP`` entries for ``'mAP'``, or
+                ``recall@<num>@<iou>`` (plus ``AR@<num>``) for ``'recall'``.
+        """
+        if not isinstance(metric, str):
+            assert len(metric) == 1
+            metric = metric[0]
+        if metric not in ('mAP', 'recall'):
+            raise KeyError(f'metric {metric} is not supported')
+        annotations = [self.get_ann_info(i) for i in range(len(self))]
+        eval_results = OrderedDict()
+        iou_thrs = [iou_thr] if isinstance(iou_thr, float) else iou_thr
+        if metric == 'mAP':
+            assert isinstance(iou_thrs, list)
+            mean_aps = []
+            for thr in iou_thrs:
+                # log the separator via ``logger`` too, as ``eval_map`` does
+                print_log(
+                    f'\n{"-" * 15}iou_thr: {thr}{"-" * 15}', logger=logger)
+                mean_ap, _ = eval_map(
+                    results, annotations, scale_ranges=scale_ranges,
+                    iou_thr=thr, dataset=self.CLASSES, logger=logger)
+                mean_aps.append(mean_ap)
+                eval_results[f'AP{int(thr * 100):02d}'] = round(mean_ap, 3)
+            eval_results['mAP'] = sum(mean_aps) / len(mean_aps)
+        elif metric == 'recall':
+            gt_bboxes = [ann['bboxes'] for ann in annotations]
+            recalls = eval_recalls(
+                gt_bboxes, results, proposal_nums, iou_thr, logger=logger)
+            for i, num in enumerate(proposal_nums):
+                for j, iou in enumerate(iou_thrs):
+                    eval_results[f'recall@{num}@{iou}'] = recalls[i, j]
+            if recalls.shape[1] > 1:
+                ar = recalls.mean(axis=1)
+                for i, num in enumerate(proposal_nums):
+                    eval_results[f'AR@{num}'] = ar[i]
+        return eval_results
+
+    def __repr__(self):
+        """Summarize the dataset size and the per-class instance counts."""
+        dataset_type = 'Test' if self.test_mode else 'Train'
+        result = (f'\n{self.__class__.__name__} {dataset_type} dataset '
+                  f'with number of images {len(self)}, '
+                  f'and instance counts: \n')
+        if self.CLASSES is None:
+            result += 'Category names are not provided. \n'
+            return result
+        instance_count = np.zeros(len(self.CLASSES) + 1).astype(int)
+        # count the instance number in each image
+        for idx in range(len(self)):
+            label = self.get_ann_info(idx)['labels']
+            unique, counts = np.unique(label, return_counts=True)
+            if len(unique) > 0:
+                # add the occurrence number to each class
+                instance_count[unique] += counts
+            else:
+                # background is the last index
+                instance_count[-1] += 1
+        # create a table with category count
+        table_data = [['category', 'count'] * 5]
+        row_data = []
+        for cls, count in enumerate(instance_count):
+            if cls < len(self.CLASSES):
+                row_data += [f'{cls} [{self.CLASSES[cls]}]', f'{count}']
+            else:
+                # add the background number
+                row_data += ['-1 background', f'{count}']
+            if len(row_data) == 10:
+                table_data.append(row_data)
+                row_data = []
+        if len(row_data) >= 2:
+            if row_data[-1] == '0':
+                row_data = row_data[:-2]
+            if len(row_data) >= 2:
+                table_data.append([])
+                table_data.append(row_data)
+
+        table = AsciiTable(table_data)
+        result += table.table
+        return result
diff --git a/mmdet/datasets/dataset_wrappers.py b/mmdet/datasets/dataset_wrappers.py
new file mode 100755
index 0000000..d6ceffb
--- /dev/null
+++ b/mmdet/datasets/dataset_wrappers.py
@@ -0,0 +1,456 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import bisect
+import collections
+import copy
+import math
+from collections import defaultdict
+
+import numpy as np
+from mmcv.utils import build_from_cfg, print_log
+from torch.utils.data.dataset import ConcatDataset as _ConcatDataset
+
+from .builder import DATASETS, PIPELINES
+from .coco import CocoDataset
+
+
+@DATASETS.register_module()
+class ConcatDataset(_ConcatDataset):
+    """A wrapper of concatenated dataset.
+
+    Same as :obj:`torch.utils.data.dataset.ConcatDataset`, but
+    concat the group flag for image aspect ratio.
+
+    Args:
+        datasets (list[:obj:`Dataset`]): A list of datasets.
+        separate_eval (bool): Whether to evaluate the results
+            separately if it is used as validation dataset.
+            Defaults to True.
+    """
+
+    def __init__(self, datasets, separate_eval=True):
+        super(ConcatDataset, self).__init__(datasets)
+        self.CLASSES = datasets[0].CLASSES
+        self.PALETTE = getattr(datasets[0], 'PALETTE', None)
+        self.separate_eval = separate_eval
+        if not separate_eval:
+            if any([isinstance(ds, CocoDataset) for ds in datasets]):
+                raise NotImplementedError(
+                    'Evaluating concatenated CocoDataset as a whole is not'
+                    ' supported! Please set "separate_eval=True"')
+            elif len(set([type(ds) for ds in datasets])) != 1:
+                raise NotImplementedError(
+                    'All the datasets should have same types')
+
+        if hasattr(datasets[0], 'flag'):
+            flags = []
+            for i in range(0, len(datasets)):
+                flags.append(datasets[i].flag)
+            self.flag = np.concatenate(flags)
+
+    def get_cat_ids(self, idx):
+        """Get category ids of concatenated dataset by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            list[int]: All categories in the image of specified index.
+        """
+
+        if idx < 0:
+            if -idx > len(self):
+                raise ValueError(
+                    'absolute value of index should not exceed dataset length')
+            idx = len(self) + idx
+        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
+        if dataset_idx == 0:
+            sample_idx = idx
+        else:
+            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
+        return self.datasets[dataset_idx].get_cat_ids(sample_idx)
+
+    def get_ann_info(self, idx):
+        """Get annotation of concatenated dataset by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            dict: Annotation info of specified index.
+        """
+
+        if idx < 0:
+            if -idx > len(self):
+                raise ValueError(
+                    'absolute value of index should not exceed dataset length')
+            idx = len(self) + idx
+        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
+        if dataset_idx == 0:
+            sample_idx = idx
+        else:
+            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
+        return self.datasets[dataset_idx].get_ann_info(sample_idx)
+
+    def evaluate(self, results, logger=None, **kwargs):
+        """Evaluate the results.
+
+        Args:
+            results (list[list | tuple]): Testing results of the dataset.
+            logger (logging.Logger | str | None): Logger used for printing
+                related information during evaluation. Default: None.
+
+        Returns:
+            dict[str: float]: AP results of the total dataset or each separate
+            dataset if `self.separate_eval=True`.
+        """
+        assert len(results) == self.cumulative_sizes[-1], \
+            ('Dataset and results have different sizes: '
+             f'{self.cumulative_sizes[-1]} v.s. {len(results)}')
+
+        # Check whether all the datasets support evaluation
+        for dataset in self.datasets:
+            assert hasattr(dataset, 'evaluate'), \
+                f'{type(dataset)} does not implement evaluate function'
+
+        if self.separate_eval:
+            total_eval_results = dict()
+            start_idx = 0
+            for dataset_idx, (end_idx, dataset) in enumerate(
+                    zip(self.cumulative_sizes, self.datasets)):
+                # each dataset owns the slice up to its cumulative size
+                results_per_dataset = results[start_idx:end_idx]
+                start_idx = end_idx
+                print_log(
+                    f'\nEvaluating {dataset.ann_file} with '
+                    f'{len(results_per_dataset)} images now',
+                    logger=logger)
+
+                eval_results_per_dataset = dataset.evaluate(
+                    results_per_dataset, logger=logger, **kwargs)
+                for k, v in eval_results_per_dataset.items():
+                    total_eval_results.update({f'{dataset_idx}_{k}': v})
+
+            return total_eval_results
+        elif any([isinstance(ds, CocoDataset) for ds in self.datasets]):
+            raise NotImplementedError(
+                'Evaluating concatenated CocoDataset as a whole is not'
+                ' supported! Please set "separate_eval=True"')
+        elif len(set([type(ds) for ds in self.datasets])) != 1:
+            raise NotImplementedError(
+                'All the datasets should have same types')
+        else:
+            original_data_infos = self.datasets[0].data_infos
+            self.datasets[0].data_infos = sum(
+                [dataset.data_infos for dataset in self.datasets], [])
+            try:
+                return self.datasets[0].evaluate(
+                    results, logger=logger, **kwargs)
+            finally:
+                # undo the temporary merge even if evaluation raises
+                self.datasets[0].data_infos = original_data_infos
+
+
+@DATASETS.register_module()
+class RepeatDataset:
+    """A wrapper of repeated dataset.
+
+    The length of the repeated dataset is `times` times the length of the
+    original dataset. This is useful when the data loading time is long but
+    the dataset is small. Using RepeatDataset can reduce the data loading
+    time between epochs.
+
+    Args:
+        dataset (:obj:`Dataset`): The dataset to be repeated.
+        times (int): Repeat times.
+    """
+
+    def __init__(self, dataset, times):
+        self.dataset = dataset
+        self.times = times
+        self.CLASSES = dataset.CLASSES
+        self.PALETTE = getattr(dataset, 'PALETTE', None)
+        if hasattr(self.dataset, 'flag'):
+            self.flag = np.tile(self.dataset.flag, times)
+        # length of the wrapped dataset, used to fold indices with modulo
+        self._ori_len = len(self.dataset)
+
+    def __getitem__(self, idx):
+        """Return item ``idx % self._ori_len`` of the wrapped dataset."""
+        return self.dataset[idx % self._ori_len]
+
+    def get_cat_ids(self, idx):
+        """Get category ids of repeat dataset by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            list[int]: All categories in the image of specified index.
+        """
+        return self.dataset.get_cat_ids(idx % self._ori_len)
+
+    def get_ann_info(self, idx):
+        """Get annotation of repeat dataset by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            dict: Annotation info of specified index.
+        """
+
+        return self.dataset.get_ann_info(idx % self._ori_len)
+
+    def __len__(self):
+        """Length after repetition."""
+        return self.times * self._ori_len
+
+
+# Modified from https://github.com/facebookresearch/detectron2/blob/41d475b75a230221e21d9cac5d69655e3415e3a4/detectron2/data/samplers/distributed_sampler.py#L57 # noqa
+@DATASETS.register_module()
+class ClassBalancedDataset:
+    """A wrapper of repeated dataset with repeat factor.
+
+    Suitable for training on class imbalanced datasets like LVIS. Following
+    the sampling strategy in the `paper <https://arxiv.org/abs/1908.03195>`_,
+    in each epoch, an image may appear multiple times based on its
+    "repeat factor".
+    The repeat factor for an image is a function of the frequency of the
+    rarest category labeled in that image. The "frequency of category c" in
+    [0, 1] is defined by the fraction of images in the training set (without
+    repeats) in which category c appears.
+    The dataset needs to implement :func:`self.get_cat_ids` to support
+    ClassBalancedDataset.
+
+    The repeat factor is computed as follows.
+
+    1. For each category c, compute the fraction # of images
+       that contain it: :math:`f(c)`
+    2. For each category c, compute the category-level repeat factor:
+       :math:`r(c) = max(1, sqrt(t/f(c)))`
+    3. For each image I, compute the image-level repeat factor:
+       :math:`r(I) = max_{c in I} r(c)`
+
+    Args:
+        dataset (:obj:`CustomDataset`): The dataset to be repeated.
+        oversample_thr (float): frequency threshold below which data is
+            repeated. For categories with ``f_c >= oversample_thr``, there is
+            no oversampling. For categories with ``f_c < oversample_thr``, the
+            degree of oversampling follows the square-root inverse frequency
+            heuristic above.
+        filter_empty_gt (bool, optional): If set true, images without bounding
+            boxes will not be oversampled. Otherwise, they will be categorized
+            as the pure background class and involved into the oversampling.
+            Default: True.
+    """
+
+    def __init__(self, dataset, oversample_thr, filter_empty_gt=True):
+        self.dataset = dataset
+        self.oversample_thr = oversample_thr
+        self.filter_empty_gt = filter_empty_gt
+        self.CLASSES = dataset.CLASSES
+        self.PALETTE = getattr(dataset, 'PALETTE', None)
+
+        repeat_factors = self._get_repeat_factors(dataset, oversample_thr)
+        repeat_indices = []
+        for dataset_idx, repeat_factor in enumerate(repeat_factors):
+            repeat_indices.extend([dataset_idx] * math.ceil(repeat_factor))
+        self.repeat_indices = repeat_indices
+
+        flags = []
+        if hasattr(self.dataset, 'flag'):
+            for flag, repeat_factor in zip(self.dataset.flag, repeat_factors):
+                flags.extend([flag] * int(math.ceil(repeat_factor)))
+            assert len(flags) == len(repeat_indices)
+        self.flag = np.asarray(flags, dtype=np.uint8)
+
+    def _get_repeat_factors(self, dataset, repeat_thr):
+        """Get repeat factor for each image in the dataset.
+
+        Args:
+            dataset (:obj:`CustomDataset`): The dataset
+            repeat_thr (float): The threshold of frequency. If an image
+                contains categories whose frequency is below the threshold,
+                it would be repeated.
+
+        Returns:
+            list[float]: The repeat factors for each image in the dataset.
+        """
+
+        # 1. For each category c, compute the fraction # of images
+        #   that contain it: f(c)
+        category_freq = defaultdict(int)
+        num_images = len(dataset)
+        for idx in range(num_images):
+            cat_ids = set(self.dataset.get_cat_ids(idx))
+            if len(cat_ids) == 0 and not self.filter_empty_gt:
+                cat_ids = set([len(self.CLASSES)])
+            for cat_id in cat_ids:
+                category_freq[cat_id] += 1
+        for k, v in category_freq.items():
+            category_freq[k] = v / num_images
+
+        # 2. For each category c, compute the category-level repeat factor:
+        #    r(c) = max(1, sqrt(t/f(c)))
+        category_repeat = {
+            cat_id: max(1.0, math.sqrt(repeat_thr / cat_freq))
+            for cat_id, cat_freq in category_freq.items()
+        }
+
+        # 3. For each image I, compute the image-level repeat factor:
+        #    r(I) = max_{c in I} r(c)
+        repeat_factors = []
+        for idx in range(num_images):
+            cat_ids = set(self.dataset.get_cat_ids(idx))
+            if len(cat_ids) == 0 and not self.filter_empty_gt:
+                cat_ids = set([len(self.CLASSES)])
+            repeat_factor = 1
+            if len(cat_ids) > 0:
+                repeat_factor = max(
+                    {category_repeat[cat_id]
+                     for cat_id in cat_ids})
+            repeat_factors.append(repeat_factor)
+
+        return repeat_factors
+
+    def __getitem__(self, idx):
+        ori_index = self.repeat_indices[idx]
+        return self.dataset[ori_index]
+
+    def get_ann_info(self, idx):
+        """Get annotation of dataset by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            dict: Annotation info of specified index.
+        """
+        ori_index = self.repeat_indices[idx]
+        return self.dataset.get_ann_info(ori_index)
+
+    def __len__(self):
+        """Length after repetition."""
+        return len(self.repeat_indices)
+
+
+@DATASETS.register_module()
+class MultiImageMixDataset:
+    """A wrapper of multiple images mixed dataset.
+
+    Suitable for training on multiple images mixed data augmentation like
+    mosaic and mixup. For the augmentation pipeline of mixed image data,
+    the `get_indexes` method needs to be provided to obtain the image
+    indexes, and you can set `skip_flags` to change the pipeline running
+    process. At the same time, we provide the `dynamic_scale` parameter
+    to dynamically change the output image size.
+
+    Args:
+        dataset (:obj:`CustomDataset`): The dataset to be mixed.
+        pipeline (Sequence[dict]): Sequence of transform object or
+            config dict to be composed.
+        dynamic_scale (tuple[int], optional): The image scale can be changed
+            dynamically. Default to None. It is deprecated.
+        skip_type_keys (list[str], optional): Sequence of type string to
+            be skip pipeline. Default to None.
+        max_refetch (int): The maximum number of retry iterations for getting
+            valid results from the pipeline. If the number of iterations is
+            greater than `max_refetch`, but results is still None, then the
+            iteration is terminated and raise the error. Default: 15.
+    """
+
+    def __init__(self,
+                 dataset,
+                 pipeline,
+                 dynamic_scale=None,
+                 skip_type_keys=None,
+                 max_refetch=15):
+        if dynamic_scale is not None:
+            raise RuntimeError(
+                'dynamic_scale is deprecated. Please use Resize pipeline '
+                'to achieve similar functions')
+        assert isinstance(pipeline, collections.abc.Sequence)
+        if skip_type_keys is not None:
+            assert all([
+                isinstance(skip_type_key, str)
+                for skip_type_key in skip_type_keys
+            ])
+        self._skip_type_keys = skip_type_keys
+
+        self.pipeline = []
+        self.pipeline_types = []
+        for transform in pipeline:
+            if isinstance(transform, dict):
+                self.pipeline_types.append(transform['type'])
+                transform = build_from_cfg(transform, PIPELINES)
+                self.pipeline.append(transform)
+            else:
+                raise TypeError('pipeline must be a dict')
+
+        self.dataset = dataset
+        self.CLASSES = dataset.CLASSES
+        self.PALETTE = getattr(dataset, 'PALETTE', None)
+        if hasattr(self.dataset, 'flag'):
+            self.flag = dataset.flag
+        self.num_samples = len(dataset)
+        self.max_refetch = max_refetch
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, idx):
+        """Load sample ``idx`` and run it through the wrapper pipeline."""
+        results = copy.deepcopy(self.dataset[idx])
+        for (transform, transform_type) in zip(self.pipeline,
+                                               self.pipeline_types):
+            if self._skip_type_keys is not None and \
+                    transform_type in self._skip_type_keys:
+                continue
+            if hasattr(transform, 'get_indexes'):
+                for _ in range(self.max_refetch):
+                    # Make sure the results passed the loading pipeline
+                    # of the original dataset is not None.
+                    indexes = transform.get_indexes(self.dataset)
+                    if not isinstance(indexes, collections.abc.Sequence):
+                        indexes = [indexes]
+                    mix_results = [
+                        copy.deepcopy(self.dataset[index]) for index in indexes
+                    ]
+                    if None not in mix_results:
+                        results['mix_results'] = mix_results
+                        break
+                else:
+                    raise RuntimeError(
+                        'The loading pipeline of the original dataset'
+                        ' always return None. Please check the correctness '
+                        'of the dataset and its pipeline.')
+
+            for _ in range(self.max_refetch):
+                # To confirm the results passed the training pipeline
+                # of the wrapper is not None.
+                updated_results = transform(copy.deepcopy(results))
+                if updated_results is not None:
+                    results = updated_results
+                    break
+            else:
+                raise RuntimeError(
+                    'The training pipeline of the dataset wrapper'
+                    ' always return None. Please check the correctness '
+                    'of the dataset and its pipeline.')
+
+            if 'mix_results' in results:
+                results.pop('mix_results')
+
+        return results
+
+    def update_skip_type_keys(self, skip_type_keys):
+        """Update skip_type_keys. It is called by an external hook.
+
+        Args:
+            skip_type_keys (list[str], optional): Sequence of type
+                string to be skip pipeline.
+        """
+        assert all([
+            isinstance(skip_type_key, str) for skip_type_key in skip_type_keys
+        ])
+        self._skip_type_keys = skip_type_keys
diff --git a/mmdet/datasets/deepfashion.py b/mmdet/datasets/deepfashion.py
new file mode 100755
index 0000000..609f809
--- /dev/null
+++ b/mmdet/datasets/deepfashion.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .builder import DATASETS
+from .coco import CocoDataset
+
+
+@DATASETS.register_module()
+class DeepFashionDataset(CocoDataset):
+    """COCO-style dataset that only overrides ``CLASSES`` and ``PALETTE``."""
+    CLASSES = ('top', 'skirt', 'leggings', 'dress', 'outer', 'pants', 'bag',
+               'neckwear', 'headwear', 'eyeglass', 'belt', 'footwear', 'hair',
+               'skin', 'face')
+    # 15 colour triples, one per CLASSES entry (presumably index-aligned).
+    PALETTE = [(0, 192, 64), (0, 64, 96), (128, 192, 192), (0, 64, 64),
+               (0, 192, 224), (0, 192, 192), (128, 192, 64), (0, 192, 96),
+               (128, 32, 192), (0, 0, 224), (0, 0, 64), (0, 160, 192),
+               (128, 0, 96), (128, 0, 192), (0, 32, 192)]
diff --git a/mmdet/datasets/diverseweather.py b/mmdet/datasets/diverseweather.py
new file mode 100755
index 0000000..8e88fda
--- /dev/null
+++ b/mmdet/datasets/diverseweather.py
@@ -0,0 +1,106 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import OrderedDict
+
+from mmcv.utils import print_log
+
+from mmdet.core import eval_map, eval_recalls
+from .builder import DATASETS
+from .xml_style import XMLDataset
+
+
+@DATASETS.register_module()
+class DiverseWeatherDataset(XMLDataset):
+    """XML-style dataset with seven classes, evaluated VOC-style."""
+
+    CLASSES = ('bus', 'bike', 'car', 'motor', 'person', 'rider', 'truck')
+
+    PALETTE = [(106, 0, 228), (119, 11, 32), (165, 42, 42), (0, 0, 192),
+               (197, 226, 255), (0, 60, 100), (0, 0, 142)]
+
+    def __init__(self, **kwargs):
+        """Pass every keyword argument straight to ``XMLDataset``.
+
+        No dataset year is derived from ``img_prefix`` here, so
+        ``evaluate`` hands ``CLASSES`` to ``eval_map`` as the dataset.
+        """
+        super(DiverseWeatherDataset, self).__init__(**kwargs)
+
+    def evaluate(self,
+                 results,
+                 metric='mAP',
+                 logger=None,
+                 proposal_nums=(100, 300, 1000),
+                 iou_thr=0.5,
+                 scale_ranges=None):
+        """Evaluate in VOC protocol.
+
+        Args:
+            results (list[list | tuple]): Testing results of the dataset.
+            metric (str | list[str]): Metrics to be evaluated. Options are
+                'mAP', 'recall'. A non-str value must hold exactly one entry.
+            logger (logging.Logger | str, optional): Logger used for printing
+                related information during evaluation. Default: None.
+            proposal_nums (Sequence[int]): Proposal number used for evaluating
+                recalls, such as recall@100, recall@1000.
+                Default: (100, 300, 1000).
+            iou_thr (float | list[float]): IoU threshold. Default: 0.5.
+            scale_ranges (list[tuple], optional): Currently unused; mAP is
+                always computed with ``scale_ranges=None``. Default: None.
+
+        Returns:
+            dict[str, float]: AP/recall metrics.
+
+        Raises:
+            KeyError: If ``metric`` is neither 'mAP' nor 'recall'.
+        """
+
+        if not isinstance(metric, str):
+            assert len(metric) == 1
+            metric = metric[0]
+        allowed_metrics = ['mAP', 'recall']
+        if metric not in allowed_metrics:
+            raise KeyError(f'metric {metric} is not supported')
+        annotations = [self.get_ann_info(i) for i in range(len(self))]
+        eval_results = OrderedDict()
+        iou_thrs = [iou_thr] if isinstance(iou_thr, float) else iou_thr
+        if metric == 'mAP':
+            assert isinstance(iou_thrs, list)
+            mean_aps = []
+            for iou_thr in iou_thrs:
+                # Route the banner through ``logger`` like eval_map's output.
+                print_log(
+                    f'\n{"-" * 15}iou_thr: {iou_thr}{"-" * 15}', logger=logger)
+                # Follow the official implementation,
+                # http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCdevkit_18-May-2011.tar
+                # we should use the legacy coordinate system in mmdet 1.x,
+                # which means w, h should be computed as `x2 - x1 + 1` and
+                # `y2 - y1 + 1`
+                mean_ap, _ = eval_map(
+                    results,
+                    annotations,
+                    scale_ranges=None,
+                    iou_thr=iou_thr,
+                    dataset=self.CLASSES,
+                    logger=logger,
+                    use_legacy_coordinate=True)
+                mean_aps.append(mean_ap)
+                eval_results[f'AP{int(iou_thr * 100):02d}'] = round(mean_ap, 3)
+            eval_results['mAP'] = sum(mean_aps) / len(mean_aps)
+            eval_results.move_to_end('mAP', last=False)
+        elif metric == 'recall':
+            gt_bboxes = [ann['bboxes'] for ann in annotations]
+            recalls = eval_recalls(
+                gt_bboxes,
+                results,
+                proposal_nums,
+                iou_thrs,
+                logger=logger,
+                use_legacy_coordinate=True)
+            for i, num in enumerate(proposal_nums):
+                for j, iou_thr in enumerate(iou_thrs):
+                    eval_results[f'recall@{num}@{iou_thr}'] = recalls[i, j]
+            if recalls.shape[1] > 1:
+                ar = recalls.mean(axis=1)
+                for i, num in enumerate(proposal_nums):
+                    eval_results[f'AR@{num}'] = ar[i]
+        return eval_results
diff --git a/mmdet/datasets/lvis.py b/mmdet/datasets/lvis.py
new file mode 100755
index 0000000..5f6196e
--- /dev/null
+++ b/mmdet/datasets/lvis.py
@@ -0,0 +1,742 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import itertools
+import logging
+import os.path as osp
+import tempfile
+import warnings
+from collections import OrderedDict
+
+import numpy as np
+from mmcv.utils import print_log
+from terminaltables import AsciiTable
+
+from .builder import DATASETS
+from .coco import CocoDataset
+
+
+@DATASETS.register_module()
+class LVISV05Dataset(CocoDataset):
+
+    CLASSES = (
+        'acorn', 'aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock',
+        'alcohol', 'alligator', 'almond', 'ambulance', 'amplifier', 'anklet',
+        'antenna', 'apple', 'apple_juice', 'applesauce', 'apricot', 'apron',
+        'aquarium', 'armband', 'armchair', 'armoire', 'armor', 'artichoke',
+        'trash_can', 'ashtray', 'asparagus', 'atomizer', 'avocado', 'award',
+        'awning', 'ax', 'baby_buggy', 'basketball_backboard', 'backpack',
+        'handbag', 'suitcase', 'bagel', 'bagpipe', 'baguet', 'bait', 'ball',
+        'ballet_skirt', 'balloon', 'bamboo', 'banana', 'Band_Aid', 'bandage',
+        'bandanna', 'banjo', 'banner', 'barbell', 'barge', 'barrel',
+        'barrette', 'barrow', 'baseball_base', 'baseball', 'baseball_bat',
+        'baseball_cap', 'baseball_glove', 'basket', 'basketball_hoop',
+        'basketball', 'bass_horn', 'bat_(animal)', 'bath_mat', 'bath_towel',
+        'bathrobe', 'bathtub', 'batter_(food)', 'battery', 'beachball', 'bead',
+        'beaker', 'bean_curd', 'beanbag', 'beanie', 'bear', 'bed',
+        'bedspread', 'cow', 'beef_(food)', 'beeper', 'beer_bottle', 'beer_can',
+        'beetle', 'bell', 'bell_pepper', 'belt', 'belt_buckle', 'bench',
+        'beret', 'bib', 'Bible', 'bicycle', 'visor', 'binder', 'binoculars',
+        'bird', 'birdfeeder', 'birdbath', 'birdcage', 'birdhouse',
+        'birthday_cake', 'birthday_card', 'biscuit_(bread)', 'pirate_flag',
+        'black_sheep', 'blackboard', 'blanket', 'blazer', 'blender', 'blimp',
+        'blinker', 'blueberry', 'boar', 'gameboard', 'boat', 'bobbin',
+        'bobby_pin', 'boiled_egg', 'bolo_tie', 'deadbolt', 'bolt', 'bonnet',
+        'book', 'book_bag', 'bookcase', 'booklet', 'bookmark',
+        'boom_microphone', 'boot', 'bottle', 'bottle_opener', 'bouquet',
+        'bow_(weapon)', 'bow_(decorative_ribbons)', 'bow-tie', 'bowl',
+        'pipe_bowl', 'bowler_hat', 'bowling_ball', 'bowling_pin',
+        'boxing_glove', 'suspenders', 'bracelet', 'brass_plaque', 'brassiere',
+        'bread-bin', 'breechcloth', 'bridal_gown', 'briefcase',
+        'bristle_brush', 'broccoli', 'broach', 'broom', 'brownie',
+        'brussels_sprouts', 'bubble_gum', 'bucket', 'horse_buggy', 'bull',
+        'bulldog', 'bulldozer', 'bullet_train', 'bulletin_board',
+        'bulletproof_vest', 'bullhorn', 'corned_beef', 'bun', 'bunk_bed',
+        'buoy', 'burrito', 'bus_(vehicle)', 'business_card', 'butcher_knife',
+        'butter', 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car',
+        'cabinet', 'locker', 'cake', 'calculator', 'calendar', 'calf',
+        'camcorder', 'camel', 'camera', 'camera_lens', 'camper_(vehicle)',
+        'can', 'can_opener', 'candelabrum', 'candle', 'candle_holder',
+        'candy_bar', 'candy_cane', 'walking_cane', 'canister', 'cannon',
+        'canoe', 'cantaloup', 'canteen', 'cap_(headwear)', 'bottle_cap',
+        'cape', 'cappuccino', 'car_(automobile)', 'railcar_(part_of_a_train)',
+        'elevator_car', 'car_battery', 'identity_card', 'card', 'cardigan',
+        'cargo_ship', 'carnation', 'horse_carriage', 'carrot', 'tote_bag',
+        'cart', 'carton', 'cash_register', 'casserole', 'cassette', 'cast',
+        'cat', 'cauliflower', 'caviar', 'cayenne_(spice)', 'CD_player',
+        'celery', 'cellular_telephone', 'chain_mail', 'chair', 'chaise_longue',
+        'champagne', 'chandelier', 'chap', 'checkbook', 'checkerboard',
+        'cherry', 'chessboard', 'chest_of_drawers_(furniture)',
+        'chicken_(animal)', 'chicken_wire', 'chickpea', 'Chihuahua',
+        'chili_(vegetable)', 'chime', 'chinaware', 'crisp_(potato_chip)',
+        'poker_chip', 'chocolate_bar', 'chocolate_cake', 'chocolate_milk',
+        'chocolate_mousse', 'choker', 'chopping_board', 'chopstick',
+        'Christmas_tree', 'slide', 'cider', 'cigar_box', 'cigarette',
+        'cigarette_case', 'cistern', 'clarinet', 'clasp', 'cleansing_agent',
+        'clementine', 'clip', 'clipboard', 'clock', 'clock_tower',
+        'clothes_hamper', 'clothespin', 'clutch_bag', 'coaster', 'coat',
+        'coat_hanger', 'coatrack', 'cock', 'coconut', 'coffee_filter',
+        'coffee_maker', 'coffee_table', 'coffeepot', 'coil', 'coin',
+        'colander', 'coleslaw', 'coloring_material', 'combination_lock',
+        'pacifier', 'comic_book', 'computer_keyboard', 'concrete_mixer',
+        'cone', 'control', 'convertible_(automobile)', 'sofa_bed', 'cookie',
+        'cookie_jar', 'cooking_utensil', 'cooler_(for_food)',
+        'cork_(bottle_plug)', 'corkboard', 'corkscrew', 'edible_corn',
+        'cornbread', 'cornet', 'cornice', 'cornmeal', 'corset',
+        'romaine_lettuce', 'costume', 'cougar', 'coverall', 'cowbell',
+        'cowboy_hat', 'crab_(animal)', 'cracker', 'crape', 'crate', 'crayon',
+        'cream_pitcher', 'credit_card', 'crescent_roll', 'crib', 'crock_pot',
+        'crossbar', 'crouton', 'crow', 'crown', 'crucifix', 'cruise_ship',
+        'police_cruiser', 'crumb', 'crutch', 'cub_(animal)', 'cube',
+        'cucumber', 'cufflink', 'cup', 'trophy_cup', 'cupcake', 'hair_curler',
+        'curling_iron', 'curtain', 'cushion', 'custard', 'cutting_tool',
+        'cylinder', 'cymbal', 'dachshund', 'dagger', 'dartboard',
+        'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk',
+        'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table', 'tux',
+        'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher',
+        'dishwasher_detergent', 'diskette', 'dispenser', 'Dixie_cup', 'dog',
+        'dog_collar', 'doll', 'dollar', 'dolphin', 'domestic_ass', 'eye_mask',
+        'doorbell', 'doorknob', 'doormat', 'doughnut', 'dove', 'dragonfly',
+        'drawer', 'underdrawers', 'dress', 'dress_hat', 'dress_suit',
+        'dresser', 'drill', 'drinking_fountain', 'drone', 'dropper',
+        'drum_(musical_instrument)', 'drumstick', 'duck', 'duckling',
+        'duct_tape', 'duffel_bag', 'dumbbell', 'dumpster', 'dustpan',
+        'Dutch_oven', 'eagle', 'earphone', 'earplug', 'earring', 'easel',
+        'eclair', 'eel', 'egg', 'egg_roll', 'egg_yolk', 'eggbeater',
+        'eggplant', 'electric_chair', 'refrigerator', 'elephant', 'elk',
+        'envelope', 'eraser', 'escargot', 'eyepatch', 'falcon', 'fan',
+        'faucet', 'fedora', 'ferret', 'Ferris_wheel', 'ferry', 'fig_(fruit)',
+        'fighter_jet', 'figurine', 'file_cabinet', 'file_(tool)', 'fire_alarm',
+        'fire_engine', 'fire_extinguisher', 'fire_hose', 'fireplace',
+        'fireplug', 'fish', 'fish_(food)', 'fishbowl', 'fishing_boat',
+        'fishing_rod', 'flag', 'flagpole', 'flamingo', 'flannel', 'flash',
+        'flashlight', 'fleece', 'flip-flop_(sandal)', 'flipper_(footwear)',
+        'flower_arrangement', 'flute_glass', 'foal', 'folding_chair',
+        'food_processor', 'football_(American)', 'football_helmet',
+        'footstool', 'fork', 'forklift', 'freight_car', 'French_toast',
+        'freshener', 'frisbee', 'frog', 'fruit_juice', 'fruit_salad',
+        'frying_pan', 'fudge', 'funnel', 'futon', 'gag', 'garbage',
+        'garbage_truck', 'garden_hose', 'gargle', 'gargoyle', 'garlic',
+        'gasmask', 'gazelle', 'gelatin', 'gemstone', 'giant_panda',
+        'gift_wrap', 'ginger', 'giraffe', 'cincture',
+        'glass_(drink_container)', 'globe', 'glove', 'goat', 'goggles',
+        'goldfish', 'golf_club', 'golfcart', 'gondola_(boat)', 'goose',
+        'gorilla', 'gourd', 'surgical_gown', 'grape', 'grasshopper', 'grater',
+        'gravestone', 'gravy_boat', 'green_bean', 'green_onion', 'griddle',
+        'grillroom', 'grinder_(tool)', 'grits', 'grizzly', 'grocery_bag',
+        'guacamole', 'guitar', 'gull', 'gun', 'hair_spray', 'hairbrush',
+        'hairnet', 'hairpin', 'ham', 'hamburger', 'hammer', 'hammock',
+        'hamper', 'hamster', 'hair_dryer', 'hand_glass', 'hand_towel',
+        'handcart', 'handcuff', 'handkerchief', 'handle', 'handsaw',
+        'hardback_book', 'harmonium', 'hat', 'hatbox', 'hatch', 'veil',
+        'headband', 'headboard', 'headlight', 'headscarf', 'headset',
+        'headstall_(for_horses)', 'hearing_aid', 'heart', 'heater',
+        'helicopter', 'helmet', 'heron', 'highchair', 'hinge', 'hippopotamus',
+        'hockey_stick', 'hog', 'home_plate_(baseball)', 'honey', 'fume_hood',
+        'hook', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce',
+        'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear',
+        'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate',
+        'ice_tea', 'igniter', 'incense', 'inhaler', 'iPod',
+        'iron_(for_clothing)', 'ironing_board', 'jacket', 'jam', 'jean',
+        'jeep', 'jelly_bean', 'jersey', 'jet_plane', 'jewelry', 'joystick',
+        'jumpsuit', 'kayak', 'keg', 'kennel', 'kettle', 'key', 'keycard',
+        'kilt', 'kimono', 'kitchen_sink', 'kitchen_table', 'kite', 'kitten',
+        'kiwi_fruit', 'knee_pad', 'knife', 'knight_(chess_piece)',
+        'knitting_needle', 'knob', 'knocker_(on_a_door)', 'koala', 'lab_coat',
+        'ladder', 'ladle', 'ladybug', 'lamb_(animal)', 'lamb-chop', 'lamp',
+        'lamppost', 'lampshade', 'lantern', 'lanyard', 'laptop_computer',
+        'lasagna', 'latch', 'lawn_mower', 'leather', 'legging_(clothing)',
+        'Lego', 'lemon', 'lemonade', 'lettuce', 'license_plate', 'life_buoy',
+        'life_jacket', 'lightbulb', 'lightning_rod', 'lime', 'limousine',
+        'linen_paper', 'lion', 'lip_balm', 'lipstick', 'liquor', 'lizard',
+        'Loafer_(type_of_shoe)', 'log', 'lollipop', 'lotion',
+        'speaker_(stereo_equipment)', 'loveseat', 'machine_gun', 'magazine',
+        'magnet', 'mail_slot', 'mailbox_(at_home)', 'mallet', 'mammoth',
+        'mandarin_orange', 'manger', 'manhole', 'map', 'marker', 'martini',
+        'mascot', 'mashed_potato', 'masher', 'mask', 'mast',
+        'mat_(gym_equipment)', 'matchbox', 'mattress', 'measuring_cup',
+        'measuring_stick', 'meatball', 'medicine', 'melon', 'microphone',
+        'microscope', 'microwave_oven', 'milestone', 'milk', 'minivan',
+        'mint_candy', 'mirror', 'mitten', 'mixer_(kitchen_tool)', 'money',
+        'monitor_(computer_equipment) computer_monitor', 'monkey', 'motor',
+        'motor_scooter', 'motor_vehicle', 'motorboat', 'motorcycle',
+        'mound_(baseball)', 'mouse_(animal_rodent)',
+        'mouse_(computer_equipment)', 'mousepad', 'muffin', 'mug', 'mushroom',
+        'music_stool', 'musical_instrument', 'nailfile', 'nameplate', 'napkin',
+        'neckerchief', 'necklace', 'necktie', 'needle', 'nest', 'newsstand',
+        'nightshirt', 'nosebag_(for_animals)', 'noseband_(for_animals)',
+        'notebook', 'notepad', 'nut', 'nutcracker', 'oar', 'octopus_(food)',
+        'octopus_(animal)', 'oil_lamp', 'olive_oil', 'omelet', 'onion',
+        'orange_(fruit)', 'orange_juice', 'oregano', 'ostrich', 'ottoman',
+        'overalls_(clothing)', 'owl', 'packet', 'inkpad', 'pad', 'paddle',
+        'padlock', 'paintbox', 'paintbrush', 'painting', 'pajamas', 'palette',
+        'pan_(for_cooking)', 'pan_(metal_container)', 'pancake', 'pantyhose',
+        'papaya', 'paperclip', 'paper_plate', 'paper_towel', 'paperback_book',
+        'paperweight', 'parachute', 'parakeet', 'parasail_(sports)',
+        'parchment', 'parka', 'parking_meter', 'parrot',
+        'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport',
+        'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter',
+        'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'pegboard',
+        'pelican', 'pen', 'pencil', 'pencil_box', 'pencil_sharpener',
+        'pendulum', 'penguin', 'pennant', 'penny_(coin)', 'pepper',
+        'pepper_mill', 'perfume', 'persimmon', 'baby', 'pet', 'petfood',
+        'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano',
+        'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow',
+        'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball',
+        'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)',
+        'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat',
+        'plate', 'platter', 'playing_card', 'playpen', 'pliers',
+        'plow_(farm_equipment)', 'pocket_watch', 'pocketknife',
+        'poker_(fire_stirring_tool)', 'pole', 'police_van', 'polo_shirt',
+        'poncho', 'pony', 'pool_table', 'pop_(soda)', 'portrait',
+        'postbox_(public)', 'postcard', 'poster', 'pot', 'flowerpot', 'potato',
+        'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn', 'printer',
+        'projectile_(weapon)', 'projector', 'propeller', 'prune', 'pudding',
+        'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', 'puncher', 'puppet',
+        'puppy', 'quesadilla', 'quiche', 'quilt', 'rabbit', 'race_car',
+        'racket', 'radar', 'radiator', 'radio_receiver', 'radish', 'raft',
+        'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', 'rat',
+        'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt',
+        'recliner', 'record_player', 'red_cabbage', 'reflector',
+        'remote_control', 'rhinoceros', 'rib_(food)', 'rifle', 'ring',
+        'river_boat', 'road_map', 'robe', 'rocking_chair', 'roller_skate',
+        'Rollerblade', 'rolling_pin', 'root_beer',
+        'router_(computer_equipment)', 'rubber_band', 'runner_(carpet)',
+        'plastic_bag', 'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag',
+        'safety_pin', 'sail', 'salad', 'salad_plate', 'salami',
+        'salmon_(fish)', 'salmon_(food)', 'salsa', 'saltshaker',
+        'sandal_(type_of_shoe)', 'sandwich', 'satchel', 'saucepan', 'saucer',
+        'sausage', 'sawhorse', 'saxophone', 'scale_(measuring_instrument)',
+        'scarecrow', 'scarf', 'school_bus', 'scissors', 'scoreboard',
+        'scrambled_eggs', 'scraper', 'scratcher', 'screwdriver',
+        'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane',
+        'seashell', 'seedling', 'serving_dish', 'sewing_machine', 'shaker',
+        'shampoo', 'shark', 'sharpener', 'Sharpie', 'shaver_(electric)',
+        'shaving_cream', 'shawl', 'shears', 'sheep', 'shepherd_dog',
+        'sherbert', 'shield', 'shirt', 'shoe', 'shopping_bag', 'shopping_cart',
+        'short_pants', 'shot_glass', 'shoulder_bag', 'shovel', 'shower_head',
+        'shower_curtain', 'shredder_(for_paper)', 'sieve', 'signboard', 'silo',
+        'sink', 'skateboard', 'skewer', 'ski', 'ski_boot', 'ski_parka',
+        'ski_pole', 'skirt', 'sled', 'sleeping_bag', 'sling_(bandage)',
+        'slipper_(footwear)', 'smoothie', 'snake', 'snowboard', 'snowman',
+        'snowmobile', 'soap', 'soccer_ball', 'sock', 'soda_fountain',
+        'carbonated_water', 'sofa', 'softball', 'solar_array', 'sombrero',
+        'soup', 'soup_bowl', 'soupspoon', 'sour_cream', 'soya_milk',
+        'space_shuttle', 'sparkler_(fireworks)', 'spatula', 'spear',
+        'spectacles', 'spice_rack', 'spider', 'sponge', 'spoon', 'sportswear',
+        'spotlight', 'squirrel', 'stapler_(stapling_machine)', 'starfish',
+        'statue_(sculpture)', 'steak_(food)', 'steak_knife',
+        'steamer_(kitchen_appliance)', 'steering_wheel', 'stencil',
+        'stepladder', 'step_stool', 'stereo_(sound_system)', 'stew', 'stirrer',
+        'stirrup', 'stockings_(leg_wear)', 'stool', 'stop_sign', 'brake_light',
+        'stove', 'strainer', 'strap', 'straw_(for_drinking)', 'strawberry',
+        'street_sign', 'streetlight', 'string_cheese', 'stylus', 'subwoofer',
+        'sugar_bowl', 'sugarcane_(plant)', 'suit_(clothing)', 'sunflower',
+        'sunglasses', 'sunhat', 'sunscreen', 'surfboard', 'sushi', 'mop',
+        'sweat_pants', 'sweatband', 'sweater', 'sweatshirt', 'sweet_potato',
+        'swimsuit', 'sword', 'syringe', 'Tabasco_sauce', 'table-tennis_table',
+        'table', 'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag',
+        'taillight', 'tambourine', 'army_tank', 'tank_(storage_vessel)',
+        'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure',
+        'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup',
+        'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth',
+        'telephone_pole', 'telephoto_lens', 'television_camera',
+        'television_set', 'tennis_ball', 'tennis_racket', 'tequila',
+        'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread',
+        'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer', 'tinfoil',
+        'tinsel', 'tissue_paper', 'toast_(food)', 'toaster', 'toaster_oven',
+        'toilet', 'toilet_tissue', 'tomato', 'tongs', 'toolbox', 'toothbrush',
+        'toothpaste', 'toothpick', 'cover', 'tortilla', 'tow_truck', 'towel',
+        'towel_rack', 'toy', 'tractor_(farm_equipment)', 'traffic_light',
+        'dirt_bike', 'trailer_truck', 'train_(railroad_vehicle)', 'trampoline',
+        'tray', 'tree_house', 'trench_coat', 'triangle_(musical_instrument)',
+        'tricycle', 'tripod', 'trousers', 'truck', 'truffle_(chocolate)',
+        'trunk', 'vat', 'turban', 'turkey_(bird)', 'turkey_(food)', 'turnip',
+        'turtle', 'turtleneck_(clothing)', 'typewriter', 'umbrella',
+        'underwear', 'unicycle', 'urinal', 'urn', 'vacuum_cleaner', 'valve',
+        'vase', 'vending_machine', 'vent', 'videotape', 'vinegar', 'violin',
+        'vodka', 'volleyball', 'vulture', 'waffle', 'waffle_iron', 'wagon',
+        'wagon_wheel', 'walking_stick', 'wall_clock', 'wall_socket', 'wallet',
+        'walrus', 'wardrobe', 'wasabi', 'automatic_washer', 'watch',
+        'water_bottle', 'water_cooler', 'water_faucet', 'water_filter',
+        'water_heater', 'water_jug', 'water_gun', 'water_scooter', 'water_ski',
+        'water_tower', 'watering_can', 'watermelon', 'weathervane', 'webcam',
+        'wedding_cake', 'wedding_ring', 'wet_suit', 'wheel', 'wheelchair',
+        'whipped_cream', 'whiskey', 'whistle', 'wick', 'wig', 'wind_chime',
+        'windmill', 'window_box_(for_plants)', 'windshield_wiper', 'windsock',
+        'wine_bottle', 'wine_bucket', 'wineglass', 'wing_chair',
+        'blinder_(for_horses)', 'wok', 'wolf', 'wooden_spoon', 'wreath',
+        'wrench', 'wristband', 'wristlet', 'yacht', 'yak', 'yogurt',
+        'yoke_(animal_equipment)', 'zebra', 'zucchini')
+
+    PALETTE = None
+
+    def load_annotations(self, ann_file):
+        """Load annotation from lvis style annotation file.
+
+        Args:
+            ann_file (str): Path of annotation file.
+
+        Returns:
+            list[dict]: Image info from the LVIS api, with ``filename`` added.
+
+        Raises:
+            ImportError: If the ``lvis`` package cannot be imported.
+        """
+        import re
+        try:
+            import lvis
+            ver = re.findall(r'\d+', str(getattr(lvis, '__version__', '0')))
+            if tuple(map(int, ver)) >= (10, 5, 3):  # not lexicographic
+                warnings.warn(
+                    'mmlvis is deprecated, please install official lvis-api by "pip install git+https://github.com/lvis-dataset/lvis-api.git"',  # noqa: E501
+                    UserWarning)
+            from lvis import LVIS
+        except ImportError:
+            raise ImportError(
+                'Package lvis is not installed. Please run "pip install git+https://github.com/lvis-dataset/lvis-api.git".'  # noqa: E501
+            )
+        # Keep the API object and its id lookups on the instance.
+        self.coco = LVIS(ann_file)
+        self.cat_ids = self.coco.get_cat_ids()
+        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
+        self.img_ids = self.coco.get_img_ids()
+        data_infos = []
+        for i in self.img_ids:
+            info = self.coco.load_imgs([i])[0]
+            name = info['file_name']
+            # COCO 2014 names -> 2017-style 000000000000.jpg (last 16 chars).
+            info['filename'] = name[-16:] if name.startswith('COCO') else name
+            data_infos.append(info)
+        return data_infos
+
+    def evaluate(self,
+                 results,
+                 metric='bbox',
+                 logger=None,
+                 jsonfile_prefix=None,
+                 classwise=False,
+                 proposal_nums=(100, 300, 1000),
+                 iou_thrs=np.arange(0.5, 0.96, 0.05)):
+        """Evaluation in LVIS protocol.
+
+        Args:
+            results (list[list | tuple]): Testing results of the dataset.
+            metric (str | list[str]): Metrics to be evaluated. Options are
+                'bbox', 'segm', 'proposal', 'proposal_fast'.
+            logger (logging.Logger | str | None): Logger used for printing
+                related information during evaluation. Default: None.
+            jsonfile_prefix (str | None): Path prefix handed to
+                ``results2json``. If None, a temporary directory is used and
+                cleaned up before returning. Default: None.
+            classwise (bool): Whether to evaluating the AP for each class.
+            proposal_nums (Sequence[int]): Proposal number used for evaluating
+                recalls, such as recall@100, recall@1000.
+                Default: (100, 300, 1000).
+            iou_thrs (Sequence[float]): IoU thresholds used by the
+                'proposal_fast' recall. Default: np.arange(0.5, 0.96, 0.05).
+
+        Returns:
+            dict[str, float | str]: LVIS style metrics.
+        """
+
+        import re
+        try:
+            import lvis
+            ver = re.findall(r'\d+', str(getattr(lvis, '__version__', '0')))
+            if tuple(map(int, ver)) >= (10, 5, 3):  # not lexicographic
+                warnings.warn(
+                    'mmlvis is deprecated, please install official lvis-api by "pip install git+https://github.com/lvis-dataset/lvis-api.git"',  # noqa: E501
+                    UserWarning)
+            from lvis import LVISEval, LVISResults
+        except ImportError:
+            raise ImportError(
+                'Package lvis is not installed. Please run "pip install git+https://github.com/lvis-dataset/lvis-api.git".'  # noqa: E501
+            )
+        assert isinstance(results, list), 'results must be a list'
+        assert len(results) == len(self), (
+            'The length of results is not equal to the dataset len: {} != {}'.
+            format(len(results), len(self)))
+
+        metrics = metric if isinstance(metric, list) else [metric]
+        allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast']
+        for metric in metrics:
+            if metric not in allowed_metrics:
+                raise KeyError('metric {} is not supported'.format(metric))
+
+        if jsonfile_prefix is None:
+            tmp_dir = tempfile.TemporaryDirectory()
+            jsonfile_prefix = osp.join(tmp_dir.name, 'results')
+        else:
+            tmp_dir = None
+        result_files = self.results2json(results, jsonfile_prefix)
+
+        eval_results = OrderedDict()
+        # get original api
+        lvis_gt = self.coco
+        for metric in metrics:
+            msg = 'Evaluating {}...'.format(metric)
+            if logger is None:
+                msg = '\n' + msg
+            print_log(msg, logger=logger)
+
+            if metric == 'proposal_fast':
+                ar = self.fast_eval_recall(
+                    results, proposal_nums, iou_thrs, logger='silent')
+                log_msg = []
+                for i, num in enumerate(proposal_nums):
+                    eval_results['AR@{}'.format(num)] = ar[i]
+                    log_msg.append('\nAR@{}\t{:.4f}'.format(num, ar[i]))
+                log_msg = ''.join(log_msg)
+                print_log(log_msg, logger=logger)
+                continue
+
+            if metric not in result_files:
+                raise KeyError('{} is not in results'.format(metric))
+            try:
+                lvis_dt = LVISResults(lvis_gt, result_files[metric])
+            except IndexError:
+                print_log(
+                    'The testing results of the whole dataset is empty.',
+                    logger=logger,
+                    level=logging.ERROR)
+                break
+
+            iou_type = 'bbox' if metric == 'proposal' else metric
+            lvis_eval = LVISEval(lvis_gt, lvis_dt, iou_type)
+            lvis_eval.params.imgIds = self.img_ids
+            if metric == 'proposal':
+                lvis_eval.params.useCats = 0
+                lvis_eval.params.maxDets = list(proposal_nums)
+                lvis_eval.evaluate()
+                lvis_eval.accumulate()
+                lvis_eval.summarize()
+                for k, v in lvis_eval.get_results().items():
+                    if k.startswith('AR'):
+                        val = float('{:.4f}'.format(float(v)))
+                        eval_results[k] = val
+            else:
+                lvis_eval.evaluate()
+                lvis_eval.accumulate()
+                lvis_eval.summarize()
+                lvis_results = lvis_eval.get_results()
+                if classwise:  # Compute per-category AP
+                    # from https://github.com/facebookresearch/detectron2/
+                    precisions = lvis_eval.eval['precision']
+                    assert len(self.cat_ids) == precisions.shape[2]
+
+                    results_per_category = []
+                    for idx, catId in enumerate(self.cat_ids):
+                        # the dimensions of precisions are
+                        # [num_thrs, num_recalls, num_cats, num_area_rngs]
+                        # area range index 0: all area ranges
+                        nm = self.coco.load_cats([catId])[0]
+                        precision = precisions[:, :, idx, 0]
+                        precision = precision[precision > -1]
+                        if precision.size:
+                            ap = np.mean(precision)
+                        else:
+                            ap = float('nan')
+                        results_per_category.append(
+                            (f'{nm["name"]}', f'{float(ap):0.3f}'))
+
+                    num_columns = min(6, len(results_per_category) * 2)
+                    results_flatten = list(
+                        itertools.chain(*results_per_category))
+                    headers = ['category', 'AP'] * (num_columns // 2)
+                    results_2d = itertools.zip_longest(*[
+                        results_flatten[i::num_columns]
+                        for i in range(num_columns)
+                    ])
+                    table_data = [headers]
+                    table_data += [result for result in results_2d]
+                    table = AsciiTable(table_data)
+                    print_log('\n' + table.table, logger=logger)
+
+                for k, v in lvis_results.items():
+                    if k.startswith('AP'):
+                        key = '{}_{}'.format(metric, k)
+                        val = float('{:.4f}'.format(float(v)))
+                        eval_results[key] = val
+                ap_summary = ' '.join([
+                    '{}:{:.4f}'.format(k, float(v))
+                    for k, v in lvis_results.items() if k.startswith('AP')
+                ])
+                eval_results['{}_mAP_copypaste'.format(metric)] = ap_summary
+            lvis_eval.print_results()
+        if tmp_dir is not None:
+            tmp_dir.cleanup()
+        return eval_results
+
+
+LVISDataset = LVISV05Dataset  # alias: LVISDataset refers to the v0.5 class
+DATASETS.register_module(name='LVISDataset', module=LVISDataset)
+
+
+@DATASETS.register_module()
+class LVISV1Dataset(LVISDataset):
+
+    CLASSES = (
+        'aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock', 'alcohol',
+        'alligator', 'almond', 'ambulance', 'amplifier', 'anklet', 'antenna',
+        'apple', 'applesauce', 'apricot', 'apron', 'aquarium',
+        'arctic_(type_of_shoe)', 'armband', 'armchair', 'armoire', 'armor',
+        'artichoke', 'trash_can', 'ashtray', 'asparagus', 'atomizer',
+        'avocado', 'award', 'awning', 'ax', 'baboon', 'baby_buggy',
+        'basketball_backboard', 'backpack', 'handbag', 'suitcase', 'bagel',
+        'bagpipe', 'baguet', 'bait', 'ball', 'ballet_skirt', 'balloon',
+        'bamboo', 'banana', 'Band_Aid', 'bandage', 'bandanna', 'banjo',
+        'banner', 'barbell', 'barge', 'barrel', 'barrette', 'barrow',
+        'baseball_base', 'baseball', 'baseball_bat', 'baseball_cap',
+        'baseball_glove', 'basket', 'basketball', 'bass_horn', 'bat_(animal)',
+        'bath_mat', 'bath_towel', 'bathrobe', 'bathtub', 'batter_(food)',
+        'battery', 'beachball', 'bead', 'bean_curd', 'beanbag', 'beanie',
+        'bear', 'bed', 'bedpan', 'bedspread', 'cow', 'beef_(food)', 'beeper',
+        'beer_bottle', 'beer_can', 'beetle', 'bell', 'bell_pepper', 'belt',
+        'belt_buckle', 'bench', 'beret', 'bib', 'Bible', 'bicycle', 'visor',
+        'billboard', 'binder', 'binoculars', 'bird', 'birdfeeder', 'birdbath',
+        'birdcage', 'birdhouse', 'birthday_cake', 'birthday_card',
+        'pirate_flag', 'black_sheep', 'blackberry', 'blackboard', 'blanket',
+        'blazer', 'blender', 'blimp', 'blinker', 'blouse', 'blueberry',
+        'gameboard', 'boat', 'bob', 'bobbin', 'bobby_pin', 'boiled_egg',
+        'bolo_tie', 'deadbolt', 'bolt', 'bonnet', 'book', 'bookcase',
+        'booklet', 'bookmark', 'boom_microphone', 'boot', 'bottle',
+        'bottle_opener', 'bouquet', 'bow_(weapon)', 'bow_(decorative_ribbons)',
+        'bow-tie', 'bowl', 'pipe_bowl', 'bowler_hat', 'bowling_ball', 'box',
+        'boxing_glove', 'suspenders', 'bracelet', 'brass_plaque', 'brassiere',
+        'bread-bin', 'bread', 'breechcloth', 'bridal_gown', 'briefcase',
+        'broccoli', 'broach', 'broom', 'brownie', 'brussels_sprouts',
+        'bubble_gum', 'bucket', 'horse_buggy', 'bull', 'bulldog', 'bulldozer',
+        'bullet_train', 'bulletin_board', 'bulletproof_vest', 'bullhorn',
+        'bun', 'bunk_bed', 'buoy', 'burrito', 'bus_(vehicle)', 'business_card',
+        'butter', 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car',
+        'cabinet', 'locker', 'cake', 'calculator', 'calendar', 'calf',
+        'camcorder', 'camel', 'camera', 'camera_lens', 'camper_(vehicle)',
+        'can', 'can_opener', 'candle', 'candle_holder', 'candy_bar',
+        'candy_cane', 'walking_cane', 'canister', 'canoe', 'cantaloup',
+        'canteen', 'cap_(headwear)', 'bottle_cap', 'cape', 'cappuccino',
+        'car_(automobile)', 'railcar_(part_of_a_train)', 'elevator_car',
+        'car_battery', 'identity_card', 'card', 'cardigan', 'cargo_ship',
+        'carnation', 'horse_carriage', 'carrot', 'tote_bag', 'cart', 'carton',
+        'cash_register', 'casserole', 'cassette', 'cast', 'cat', 'cauliflower',
+        'cayenne_(spice)', 'CD_player', 'celery', 'cellular_telephone',
+        'chain_mail', 'chair', 'chaise_longue', 'chalice', 'chandelier',
+        'chap', 'checkbook', 'checkerboard', 'cherry', 'chessboard',
+        'chicken_(animal)', 'chickpea', 'chili_(vegetable)', 'chime',
+        'chinaware', 'crisp_(potato_chip)', 'poker_chip', 'chocolate_bar',
+        'chocolate_cake', 'chocolate_milk', 'chocolate_mousse', 'choker',
+        'chopping_board', 'chopstick', 'Christmas_tree', 'slide', 'cider',
+        'cigar_box', 'cigarette', 'cigarette_case', 'cistern', 'clarinet',
+        'clasp', 'cleansing_agent', 'cleat_(for_securing_rope)', 'clementine',
+        'clip', 'clipboard', 'clippers_(for_plants)', 'cloak', 'clock',
+        'clock_tower', 'clothes_hamper', 'clothespin', 'clutch_bag', 'coaster',
+        'coat', 'coat_hanger', 'coatrack', 'cock', 'cockroach',
+        'cocoa_(beverage)', 'coconut', 'coffee_maker', 'coffee_table',
+        'coffeepot', 'coil', 'coin', 'colander', 'coleslaw',
+        'coloring_material', 'combination_lock', 'pacifier', 'comic_book',
+        'compass', 'computer_keyboard', 'condiment', 'cone', 'control',
+        'convertible_(automobile)', 'sofa_bed', 'cooker', 'cookie',
+        'cooking_utensil', 'cooler_(for_food)', 'cork_(bottle_plug)',
+        'corkboard', 'corkscrew', 'edible_corn', 'cornbread', 'cornet',
+        'cornice', 'cornmeal', 'corset', 'costume', 'cougar', 'coverall',
+        'cowbell', 'cowboy_hat', 'crab_(animal)', 'crabmeat', 'cracker',
+        'crape', 'crate', 'crayon', 'cream_pitcher', 'crescent_roll', 'crib',
+        'crock_pot', 'crossbar', 'crouton', 'crow', 'crowbar', 'crown',
+        'crucifix', 'cruise_ship', 'police_cruiser', 'crumb', 'crutch',
+        'cub_(animal)', 'cube', 'cucumber', 'cufflink', 'cup', 'trophy_cup',
+        'cupboard', 'cupcake', 'hair_curler', 'curling_iron', 'curtain',
+        'cushion', 'cylinder', 'cymbal', 'dagger', 'dalmatian', 'dartboard',
+        'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk',
+        'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table', 'tux',
+        'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher',
+        'dishwasher_detergent', 'dispenser', 'diving_board', 'Dixie_cup',
+        'dog', 'dog_collar', 'doll', 'dollar', 'dollhouse', 'dolphin',
+        'domestic_ass', 'doorknob', 'doormat', 'doughnut', 'dove', 'dragonfly',
+        'drawer', 'underdrawers', 'dress', 'dress_hat', 'dress_suit',
+        'dresser', 'drill', 'drone', 'dropper', 'drum_(musical_instrument)',
+        'drumstick', 'duck', 'duckling', 'duct_tape', 'duffel_bag', 'dumbbell',
+        'dumpster', 'dustpan', 'eagle', 'earphone', 'earplug', 'earring',
+        'easel', 'eclair', 'eel', 'egg', 'egg_roll', 'egg_yolk', 'eggbeater',
+        'eggplant', 'electric_chair', 'refrigerator', 'elephant', 'elk',
+        'envelope', 'eraser', 'escargot', 'eyepatch', 'falcon', 'fan',
+        'faucet', 'fedora', 'ferret', 'Ferris_wheel', 'ferry', 'fig_(fruit)',
+        'fighter_jet', 'figurine', 'file_cabinet', 'file_(tool)', 'fire_alarm',
+        'fire_engine', 'fire_extinguisher', 'fire_hose', 'fireplace',
+        'fireplug', 'first-aid_kit', 'fish', 'fish_(food)', 'fishbowl',
+        'fishing_rod', 'flag', 'flagpole', 'flamingo', 'flannel', 'flap',
+        'flash', 'flashlight', 'fleece', 'flip-flop_(sandal)',
+        'flipper_(footwear)', 'flower_arrangement', 'flute_glass', 'foal',
+        'folding_chair', 'food_processor', 'football_(American)',
+        'football_helmet', 'footstool', 'fork', 'forklift', 'freight_car',
+        'French_toast', 'freshener', 'frisbee', 'frog', 'fruit_juice',
+        'frying_pan', 'fudge', 'funnel', 'futon', 'gag', 'garbage',
+        'garbage_truck', 'garden_hose', 'gargle', 'gargoyle', 'garlic',
+        'gasmask', 'gazelle', 'gelatin', 'gemstone', 'generator',
+        'giant_panda', 'gift_wrap', 'ginger', 'giraffe', 'cincture',
+        'glass_(drink_container)', 'globe', 'glove', 'goat', 'goggles',
+        'goldfish', 'golf_club', 'golfcart', 'gondola_(boat)', 'goose',
+        'gorilla', 'gourd', 'grape', 'grater', 'gravestone', 'gravy_boat',
+        'green_bean', 'green_onion', 'griddle', 'grill', 'grits', 'grizzly',
+        'grocery_bag', 'guitar', 'gull', 'gun', 'hairbrush', 'hairnet',
+        'hairpin', 'halter_top', 'ham', 'hamburger', 'hammer', 'hammock',
+        'hamper', 'hamster', 'hair_dryer', 'hand_glass', 'hand_towel',
+        'handcart', 'handcuff', 'handkerchief', 'handle', 'handsaw',
+        'hardback_book', 'harmonium', 'hat', 'hatbox', 'veil', 'headband',
+        'headboard', 'headlight', 'headscarf', 'headset',
+        'headstall_(for_horses)', 'heart', 'heater', 'helicopter', 'helmet',
+        'heron', 'highchair', 'hinge', 'hippopotamus', 'hockey_stick', 'hog',
+        'home_plate_(baseball)', 'honey', 'fume_hood', 'hook', 'hookah',
+        'hornet', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce',
+        'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear',
+        'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate',
+        'igniter', 'inhaler', 'iPod', 'iron_(for_clothing)', 'ironing_board',
+        'jacket', 'jam', 'jar', 'jean', 'jeep', 'jelly_bean', 'jersey',
+        'jet_plane', 'jewel', 'jewelry', 'joystick', 'jumpsuit', 'kayak',
+        'keg', 'kennel', 'kettle', 'key', 'keycard', 'kilt', 'kimono',
+        'kitchen_sink', 'kitchen_table', 'kite', 'kitten', 'kiwi_fruit',
+        'knee_pad', 'knife', 'knitting_needle', 'knob', 'knocker_(on_a_door)',
+        'koala', 'lab_coat', 'ladder', 'ladle', 'ladybug', 'lamb_(animal)',
+        'lamb-chop', 'lamp', 'lamppost', 'lampshade', 'lantern', 'lanyard',
+        'laptop_computer', 'lasagna', 'latch', 'lawn_mower', 'leather',
+        'legging_(clothing)', 'Lego', 'legume', 'lemon', 'lemonade', 'lettuce',
+        'license_plate', 'life_buoy', 'life_jacket', 'lightbulb',
+        'lightning_rod', 'lime', 'limousine', 'lion', 'lip_balm', 'liquor',
+        'lizard', 'log', 'lollipop', 'speaker_(stereo_equipment)', 'loveseat',
+        'machine_gun', 'magazine', 'magnet', 'mail_slot', 'mailbox_(at_home)',
+        'mallard', 'mallet', 'mammoth', 'manatee', 'mandarin_orange', 'manger',
+        'manhole', 'map', 'marker', 'martini', 'mascot', 'mashed_potato',
+        'masher', 'mask', 'mast', 'mat_(gym_equipment)', 'matchbox',
+        'mattress', 'measuring_cup', 'measuring_stick', 'meatball', 'medicine',
+        'melon', 'microphone', 'microscope', 'microwave_oven', 'milestone',
+        'milk', 'milk_can', 'milkshake', 'minivan', 'mint_candy', 'mirror',
+        'mitten', 'mixer_(kitchen_tool)', 'money',
+        'monitor_(computer_equipment) computer_monitor', 'monkey', 'motor',
+        'motor_scooter', 'motor_vehicle', 'motorcycle', 'mound_(baseball)',
+        'mouse_(computer_equipment)', 'mousepad', 'muffin', 'mug', 'mushroom',
+        'music_stool', 'musical_instrument', 'nailfile', 'napkin',
+        'neckerchief', 'necklace', 'necktie', 'needle', 'nest', 'newspaper',
+        'newsstand', 'nightshirt', 'nosebag_(for_animals)',
+        'noseband_(for_animals)', 'notebook', 'notepad', 'nut', 'nutcracker',
+        'oar', 'octopus_(food)', 'octopus_(animal)', 'oil_lamp', 'olive_oil',
+        'omelet', 'onion', 'orange_(fruit)', 'orange_juice', 'ostrich',
+        'ottoman', 'oven', 'overalls_(clothing)', 'owl', 'packet', 'inkpad',
+        'pad', 'paddle', 'padlock', 'paintbrush', 'painting', 'pajamas',
+        'palette', 'pan_(for_cooking)', 'pan_(metal_container)', 'pancake',
+        'pantyhose', 'papaya', 'paper_plate', 'paper_towel', 'paperback_book',
+        'paperweight', 'parachute', 'parakeet', 'parasail_(sports)', 'parasol',
+        'parchment', 'parka', 'parking_meter', 'parrot',
+        'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport',
+        'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter',
+        'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'wooden_leg',
+        'pegboard', 'pelican', 'pen', 'pencil', 'pencil_box',
+        'pencil_sharpener', 'pendulum', 'penguin', 'pennant', 'penny_(coin)',
+        'pepper', 'pepper_mill', 'perfume', 'persimmon', 'person', 'pet',
+        'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano',
+        'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow',
+        'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball',
+        'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)',
+        'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat',
+        'plate', 'platter', 'playpen', 'pliers', 'plow_(farm_equipment)',
+        'plume', 'pocket_watch', 'pocketknife', 'poker_(fire_stirring_tool)',
+        'pole', 'polo_shirt', 'poncho', 'pony', 'pool_table', 'pop_(soda)',
+        'postbox_(public)', 'postcard', 'poster', 'pot', 'flowerpot', 'potato',
+        'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn', 'pretzel',
+        'printer', 'projectile_(weapon)', 'projector', 'propeller', 'prune',
+        'pudding', 'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', 'puncher',
+        'puppet', 'puppy', 'quesadilla', 'quiche', 'quilt', 'rabbit',
+        'race_car', 'racket', 'radar', 'radiator', 'radio_receiver', 'radish',
+        'raft', 'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', 'rat',
+        'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt',
+        'recliner', 'record_player', 'reflector', 'remote_control',
+        'rhinoceros', 'rib_(food)', 'rifle', 'ring', 'river_boat', 'road_map',
+        'robe', 'rocking_chair', 'rodent', 'roller_skate', 'Rollerblade',
+        'rolling_pin', 'root_beer', 'router_(computer_equipment)',
+        'rubber_band', 'runner_(carpet)', 'plastic_bag',
+        'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag', 'safety_pin',
+        'sail', 'salad', 'salad_plate', 'salami', 'salmon_(fish)',
+        'salmon_(food)', 'salsa', 'saltshaker', 'sandal_(type_of_shoe)',
+        'sandwich', 'satchel', 'saucepan', 'saucer', 'sausage', 'sawhorse',
+        'saxophone', 'scale_(measuring_instrument)', 'scarecrow', 'scarf',
+        'school_bus', 'scissors', 'scoreboard', 'scraper', 'screwdriver',
+        'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane',
+        'seashell', 'sewing_machine', 'shaker', 'shampoo', 'shark',
+        'sharpener', 'Sharpie', 'shaver_(electric)', 'shaving_cream', 'shawl',
+        'shears', 'sheep', 'shepherd_dog', 'sherbert', 'shield', 'shirt',
+        'shoe', 'shopping_bag', 'shopping_cart', 'short_pants', 'shot_glass',
+        'shoulder_bag', 'shovel', 'shower_head', 'shower_cap',
+        'shower_curtain', 'shredder_(for_paper)', 'signboard', 'silo', 'sink',
+        'skateboard', 'skewer', 'ski', 'ski_boot', 'ski_parka', 'ski_pole',
+        'skirt', 'skullcap', 'sled', 'sleeping_bag', 'sling_(bandage)',
+        'slipper_(footwear)', 'smoothie', 'snake', 'snowboard', 'snowman',
+        'snowmobile', 'soap', 'soccer_ball', 'sock', 'sofa', 'softball',
+        'solar_array', 'sombrero', 'soup', 'soup_bowl', 'soupspoon',
+        'sour_cream', 'soya_milk', 'space_shuttle', 'sparkler_(fireworks)',
+        'spatula', 'spear', 'spectacles', 'spice_rack', 'spider', 'crawfish',
+        'sponge', 'spoon', 'sportswear', 'spotlight', 'squid_(food)',
+        'squirrel', 'stagecoach', 'stapler_(stapling_machine)', 'starfish',
+        'statue_(sculpture)', 'steak_(food)', 'steak_knife', 'steering_wheel',
+        'stepladder', 'step_stool', 'stereo_(sound_system)', 'stew', 'stirrer',
+        'stirrup', 'stool', 'stop_sign', 'brake_light', 'stove', 'strainer',
+        'strap', 'straw_(for_drinking)', 'strawberry', 'street_sign',
+        'streetlight', 'string_cheese', 'stylus', 'subwoofer', 'sugar_bowl',
+        'sugarcane_(plant)', 'suit_(clothing)', 'sunflower', 'sunglasses',
+        'sunhat', 'surfboard', 'sushi', 'mop', 'sweat_pants', 'sweatband',
+        'sweater', 'sweatshirt', 'sweet_potato', 'swimsuit', 'sword',
+        'syringe', 'Tabasco_sauce', 'table-tennis_table', 'table',
+        'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag', 'taillight',
+        'tambourine', 'army_tank', 'tank_(storage_vessel)',
+        'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure',
+        'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup',
+        'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth',
+        'telephone_pole', 'telephoto_lens', 'television_camera',
+        'television_set', 'tennis_ball', 'tennis_racket', 'tequila',
+        'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread',
+        'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer', 'tinfoil',
+        'tinsel', 'tissue_paper', 'toast_(food)', 'toaster', 'toaster_oven',
+        'toilet', 'toilet_tissue', 'tomato', 'tongs', 'toolbox', 'toothbrush',
+        'toothpaste', 'toothpick', 'cover', 'tortilla', 'tow_truck', 'towel',
+        'towel_rack', 'toy', 'tractor_(farm_equipment)', 'traffic_light',
+        'dirt_bike', 'trailer_truck', 'train_(railroad_vehicle)', 'trampoline',
+        'tray', 'trench_coat', 'triangle_(musical_instrument)', 'tricycle',
+        'tripod', 'trousers', 'truck', 'truffle_(chocolate)', 'trunk', 'vat',
+        'turban', 'turkey_(food)', 'turnip', 'turtle', 'turtleneck_(clothing)',
+        'typewriter', 'umbrella', 'underwear', 'unicycle', 'urinal', 'urn',
+        'vacuum_cleaner', 'vase', 'vending_machine', 'vent', 'vest',
+        'videotape', 'vinegar', 'violin', 'vodka', 'volleyball', 'vulture',
+        'waffle', 'waffle_iron', 'wagon', 'wagon_wheel', 'walking_stick',
+        'wall_clock', 'wall_socket', 'wallet', 'walrus', 'wardrobe',
+        'washbasin', 'automatic_washer', 'watch', 'water_bottle',
+        'water_cooler', 'water_faucet', 'water_heater', 'water_jug',
+        'water_gun', 'water_scooter', 'water_ski', 'water_tower',
+        'watering_can', 'watermelon', 'weathervane', 'webcam', 'wedding_cake',
+        'wedding_ring', 'wet_suit', 'wheel', 'wheelchair', 'whipped_cream',
+        'whistle', 'wig', 'wind_chime', 'windmill', 'window_box_(for_plants)',
+        'windshield_wiper', 'windsock', 'wine_bottle', 'wine_bucket',
+        'wineglass', 'blinder_(for_horses)', 'wok', 'wolf', 'wooden_spoon',
+        'wreath', 'wrench', 'wristband', 'wristlet', 'yacht', 'yogurt',
+        'yoke_(animal_equipment)', 'zebra', 'zucchini')
+
+    def load_annotations(self, ann_file):
+        """Load LVIS v1 image infos from ``ann_file`` through the lvis API."""
+        try:
+            import lvis
+            # Numeric compare: as strings, '2.0' >= '10.5.3' is True.
+            ver = str(getattr(lvis, '__version__', '0')).split('.')
+            if tuple(int(p) for p in ver if p.isdigit()) >= (10, 5, 3):
+                warnings.warn(
+                    'mmlvis is deprecated, please install official lvis-api by "pip install git+https://github.com/lvis-dataset/lvis-api.git"',  # noqa: E501
+                    UserWarning)
+            from lvis import LVIS
+        except ImportError:
+            raise ImportError(
+                'Package lvis is not installed. Please run "pip install git+https://github.com/lvis-dataset/lvis-api.git".')  # noqa: E501
+        self.coco = LVIS(ann_file)
+        self.cat_ids = self.coco.get_cat_ids()
+        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
+        self.img_ids = self.coco.get_img_ids()
+        data_infos = []
+        for i in self.img_ids:
+            info = self.coco.load_imgs([i])[0]
+            # LVIS v1 uses coco_url, not file_name; its path holds the split.
+            # e.g. http://images.cocodataset.org/train2017/000000391895.jpg
+            info['filename'] = info['coco_url'].replace(
+                'http://images.cocodataset.org/', '')
+            data_infos.append(info)
+        return data_infos
diff --git a/mmdet/datasets/objects365.py b/mmdet/datasets/objects365.py
new file mode 100755
index 0000000..930f470
--- /dev/null
+++ b/mmdet/datasets/objects365.py
@@ -0,0 +1,232 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+
+from .api_wrappers import COCO
+from .builder import DATASETS
+from .coco import CocoDataset
+
+# Images that exist in the annotations but not in the image folder.
+objv2_ignore_list = [
+    osp.join('patch16', 'objects365_v2_00908726.jpg'),
+    osp.join('patch6', 'objects365_v1_00320532.jpg'),
+    osp.join('patch6', 'objects365_v1_00320534.jpg'),
+]
+
+
+@DATASETS.register_module()
+class Objects365V1Dataset(CocoDataset):
+    """Objects365 v1 dataset for detection."""
+    CLASSES = (
+        'person', 'sneakers', 'chair', 'hat', 'lamp', 'bottle',
+        'cabinet/shelf', 'cup', 'car', 'glasses', 'picture/frame', 'desk',
+        'handbag', 'street lights', 'book', 'plate', 'helmet', 'leather shoes',
+        'pillow', 'glove', 'potted plant', 'bracelet', 'flower', 'tv',
+        'storage box', 'vase', 'bench', 'wine glass', 'boots', 'bowl',
+        'dining table', 'umbrella', 'boat', 'flag', 'speaker', 'trash bin/can',
+        'stool', 'backpack', 'couch', 'belt', 'carpet', 'basket',
+        'towel/napkin', 'slippers', 'barrel/bucket', 'coffee table', 'suv',
+        'toy', 'tie', 'bed', 'traffic light', 'pen/pencil', 'microphone',
+        'sandals', 'canned', 'necklace', 'mirror', 'faucet', 'bicycle',
+        'bread', 'high heels', 'ring', 'van', 'watch', 'sink', 'horse', 'fish',
+        'apple', 'camera', 'candle', 'teddy bear', 'cake', 'motorcycle',
+        'wild bird', 'laptop', 'knife', 'traffic sign', 'cell phone', 'paddle',
+        'truck', 'cow', 'power outlet', 'clock', 'drum', 'fork', 'bus',
+        'hanger', 'nightstand', 'pot/pan', 'sheep', 'guitar', 'traffic cone',
+        'tea pot', 'keyboard', 'tripod', 'hockey', 'fan', 'dog', 'spoon',
+        'blackboard/whiteboard', 'balloon', 'air conditioner', 'cymbal',
+        'mouse', 'telephone', 'pickup truck', 'orange', 'banana', 'airplane',
+        'luggage', 'skis', 'soccer', 'trolley', 'oven', 'remote',
+        'baseball glove', 'paper towel', 'refrigerator', 'train', 'tomato',
+        'machinery vehicle', 'tent', 'shampoo/shower gel', 'head phone',
+        'lantern', 'donut', 'cleaning products', 'sailboat', 'tangerine',
+        'pizza', 'kite', 'computer box', 'elephant', 'toiletries', 'gas stove',
+        'broccoli', 'toilet', 'stroller', 'shovel', 'baseball bat',
+        'microwave', 'skateboard', 'surfboard', 'surveillance camera', 'gun',
+        'life saver', 'cat', 'lemon', 'liquid soap', 'zebra', 'duck',
+        'sports car', 'giraffe', 'pumpkin', 'piano', 'stop sign', 'radiator',
+        'converter', 'tissue ', 'carrot', 'washing machine', 'vent', 'cookies',
+        'cutting/chopping board', 'tennis racket', 'candy',
+        'skating and skiing shoes', 'scissors', 'folder', 'baseball',
+        'strawberry', 'bow tie', 'pigeon', 'pepper', 'coffee machine',
+        'bathtub', 'snowboard', 'suitcase', 'grapes', 'ladder', 'pear',
+        'american football', 'basketball', 'potato', 'paint brush', 'printer',
+        'billiards', 'fire hydrant', 'goose', 'projector', 'sausage',
+        'fire extinguisher', 'extension cord', 'facial mask', 'tennis ball',
+        'chopsticks', 'electronic stove and gas stove', 'pie', 'frisbee',
+        'kettle', 'hamburger', 'golf club', 'cucumber', 'clutch', 'blender',
+        'tong', 'slide', 'hot dog', 'toothbrush', 'facial cleanser', 'mango',
+        'deer', 'egg', 'violin', 'marker', 'ship', 'chicken', 'onion',
+        'ice cream', 'tape', 'wheelchair', 'plum', 'bar soap', 'scale',
+        'watermelon', 'cabbage', 'router/modem', 'golf ball', 'pine apple',
+        'crane', 'fire truck', 'peach', 'cello', 'notepaper', 'tricycle',
+        'toaster', 'helicopter', 'green beans', 'brush', 'carriage', 'cigar',
+        'earphone', 'penguin', 'hurdle', 'swing', 'radio', 'CD',
+        'parking meter', 'swan', 'garlic', 'french fries', 'horn', 'avocado',
+        'saxophone', 'trumpet', 'sandwich', 'cue', 'kiwi fruit', 'bear',
+        'fishing rod', 'cherry', 'tablet', 'green vegetables', 'nuts', 'corn',
+        'key', 'screwdriver', 'globe', 'broom', 'pliers', 'volleyball',
+        'hammer', 'eggplant', 'trophy', 'dates', 'board eraser', 'rice',
+        'tape measure/ruler', 'dumbbell', 'hamimelon', 'stapler', 'camel',
+        'lettuce', 'goldfish', 'meat balls', 'medal', 'toothpaste', 'antelope',
+        'shrimp', 'rickshaw', 'trombone', 'pomegranate', 'coconut',
+        'jellyfish', 'mushroom', 'calculator', 'treadmill', 'butterfly',
+        'egg tart', 'cheese', 'pig', 'pomelo', 'race car', 'rice cooker',
+        'tuba', 'crosswalk sign', 'papaya', 'hair drier', 'green onion',
+        'chips', 'dolphin', 'sushi', 'urinal', 'donkey', 'electric drill',
+        'spring rolls', 'tortoise/turtle', 'parrot', 'flute', 'measuring cup',
+        'shark', 'steak', 'poker card', 'binoculars', 'llama', 'radish',
+        'noodles', 'yak', 'mop', 'crab', 'microscope', 'barbell', 'bread/bun',
+        'baozi', 'lion', 'red cabbage', 'polar bear', 'lighter', 'seal',
+        'mangosteen', 'comb', 'eraser', 'pitaya', 'scallop', 'pencil case',
+        'saw', 'table tennis paddle', 'okra', 'starfish', 'eagle', 'monkey',
+        'durian', 'game board', 'rabbit', 'french horn', 'ambulance',
+        'asparagus', 'hoverboard', 'pasta', 'target', 'hotair balloon',
+        'chainsaw', 'lobster', 'iron', 'flashlight')
+
+    PALETTE = None
+
+    def load_annotations(self, ann_file):
+        """Read a COCO-format Objects365 v1 annotation file.
+
+        Args:
+            ann_file (str): Path of annotation file.
+
+        Returns:
+            list[dict]: One image info dict per image id, each with a
+                ``filename`` key copied from ``file_name``.
+        """
+
+        coco = self.coco = COCO(ann_file)
+        # The 'categories' order differs between objects365_train.json and
+        # objects365_val.json, so re-order both category containers by id
+        # before looking up cat_ids.
+        coco.cats = {cid: coco.cats[cid] for cid in sorted(coco.cats)}
+        coco.dataset['categories'] = sorted(
+            coco.dataset['categories'], key=lambda cat: cat['id'])
+
+        # The order of returned `cat_ids` will not
+        # change with the order of the CLASSES
+        self.cat_ids = coco.get_cat_ids(cat_names=self.CLASSES)
+        self.cat2label = dict(
+            (cat_id, label) for label, cat_id in enumerate(self.cat_ids))
+        self.img_ids = coco.get_img_ids()
+        data_infos, seen_ann_ids = [], []
+        for img_id in self.img_ids:
+            img_info = coco.load_imgs([img_id])[0]
+            img_info['filename'] = img_info['file_name']
+            data_infos.append(img_info)
+            seen_ann_ids += coco.get_ann_ids(img_ids=[img_id])
+        # Annotation ids collected across all images must not repeat.
+        num_unique = len(set(seen_ann_ids))
+        assert num_unique == len(
+            seen_ann_ids), f"Annotation ids in '{ann_file}' are not unique!"
+        return data_infos
+
+
+@DATASETS.register_module()
+class Objects365V2Dataset(CocoDataset):
+    """Objects365 v2 dataset for detection."""
+
+    CLASSES = (
+        'Person', 'Sneakers', 'Chair', 'Other Shoes', 'Hat', 'Car', 'Lamp',
+        'Glasses', 'Bottle', 'Desk', 'Cup', 'Street Lights', 'Cabinet/shelf',
+        'Handbag/Satchel', 'Bracelet', 'Plate', 'Picture/Frame', 'Helmet',
+        'Book', 'Gloves', 'Storage box', 'Boat', 'Leather Shoes', 'Flower',
+        'Bench', 'Potted Plant', 'Bowl/Basin', 'Flag', 'Pillow', 'Boots',
+        'Vase', 'Microphone', 'Necklace', 'Ring', 'SUV', 'Wine Glass', 'Belt',
+        'Moniter/TV', 'Backpack', 'Umbrella', 'Traffic Light', 'Speaker',
+        'Watch', 'Tie', 'Trash bin Can', 'Slippers', 'Bicycle', 'Stool',
+        'Barrel/bucket', 'Van', 'Couch', 'Sandals', 'Bakset', 'Drum',
+        'Pen/Pencil', 'Bus', 'Wild Bird', 'High Heels', 'Motorcycle', 'Guitar',
+        'Carpet', 'Cell Phone', 'Bread', 'Camera', 'Canned', 'Truck',
+        'Traffic cone', 'Cymbal', 'Lifesaver', 'Towel', 'Stuffed Toy',
+        'Candle', 'Sailboat', 'Laptop', 'Awning', 'Bed', 'Faucet', 'Tent',
+        'Horse', 'Mirror', 'Power outlet', 'Sink', 'Apple', 'Air Conditioner',
+        'Knife', 'Hockey Stick', 'Paddle', 'Pickup Truck', 'Fork',
+        'Traffic Sign', 'Ballon', 'Tripod', 'Dog', 'Spoon',
+        'Clock', 'Pot', 'Cow', 'Cake', 'Dinning Table', 'Sheep', 'Hanger',
+        'Blackboard/Whiteboard', 'Napkin', 'Other Fish', 'Orange/Tangerine',
+        'Toiletry', 'Keyboard', 'Tomato', 'Lantern',
+        'Machinery Vehicle', 'Fan', 'Green Vegetables', 'Banana',
+        'Baseball Glove', 'Airplane', 'Mouse', 'Train', 'Pumpkin', 'Soccer',
+        'Skiboard', 'Luggage', 'Nightstand', 'Tea pot', 'Telephone', 'Trolley',
+        'Head Phone', 'Sports Car', 'Stop Sign', 'Dessert', 'Scooter',
+        'Stroller', 'Crane', 'Remote', 'Refrigerator', 'Oven', 'Lemon', 'Duck',
+        'Baseball Bat', 'Surveillance Camera', 'Cat', 'Jug', 'Broccoli',
+        'Piano', 'Pizza', 'Elephant', 'Skateboard', 'Surfboard', 'Gun',
+        'Skating and Skiing shoes', 'Gas stove', 'Donut', 'Bow Tie', 'Carrot',
+        'Toilet', 'Kite', 'Strawberry', 'Other Balls', 'Shovel', 'Pepper',
+        'Computer Box', 'Toilet Paper', 'Cleaning Products', 'Chopsticks',
+        'Microwave', 'Pigeon', 'Baseball', 'Cutting/chopping Board',
+        'Coffee Table', 'Side Table', 'Scissors', 'Marker', 'Pie', 'Ladder',
+        'Snowboard', 'Cookies', 'Radiator', 'Fire Hydrant', 'Basketball',
+        'Zebra', 'Grape', 'Giraffe', 'Potato', 'Sausage', 'Tricycle', 'Violin',
+        'Egg', 'Fire Extinguisher', 'Candy', 'Fire Truck', 'Billards',
+        'Converter', 'Bathtub', 'Wheelchair', 'Golf Club', 'Briefcase',
+        'Cucumber', 'Cigar/Cigarette ', 'Paint Brush', 'Pear', 'Heavy Truck',
+        'Hamburger', 'Extractor', 'Extention Cord', 'Tong', 'Tennis Racket',
+        'Folder', 'American Football', 'earphone', 'Mask', 'Kettle', 'Tennis',
+        'Ship', 'Swing', 'Coffee Machine', 'Slide', 'Carriage', 'Onion',
+        'Green beans', 'Projector', 'Frisbee',
+        'Washing Machine/Drying Machine', 'Chicken', 'Printer', 'Watermelon',
+        'Saxophone', 'Tissue', 'Toothbrush', 'Ice cream', 'Hotair ballon',
+        'Cello', 'French Fries', 'Scale', 'Trophy', 'Cabbage', 'Hot dog',
+        'Blender', 'Peach', 'Rice', 'Wallet/Purse', 'Volleyball', 'Deer',
+        'Goose', 'Tape', 'Tablet', 'Cosmetics', 'Trumpet', 'Pineapple',
+        'Golf Ball', 'Ambulance', 'Parking meter', 'Mango', 'Key', 'Hurdle',
+        'Fishing Rod', 'Medal', 'Flute', 'Brush', 'Penguin', 'Megaphone',
+        'Corn', 'Lettuce', 'Garlic', 'Swan', 'Helicopter', 'Green Onion',
+        'Sandwich', 'Nuts', 'Speed Limit Sign', 'Induction Cooker', 'Broom',
+        'Trombone', 'Plum', 'Rickshaw', 'Goldfish', 'Kiwi fruit',
+        'Router/modem', 'Poker Card', 'Toaster', 'Shrimp', 'Sushi', 'Cheese',
+        'Notepaper', 'Cherry', 'Pliers', 'CD', 'Pasta', 'Hammer', 'Cue',
+        'Avocado', 'Hamimelon', 'Flask', 'Mushroon', 'Screwdriver', 'Soap',
+        'Recorder', 'Bear', 'Eggplant', 'Board Eraser', 'Coconut',
+        'Tape Measur/ Ruler', 'Pig', 'Showerhead', 'Globe', 'Chips', 'Steak',
+        'Crosswalk Sign', 'Stapler', 'Campel', 'Formula 1 ', 'Pomegranate',
+        'Dishwasher', 'Crab', 'Hoverboard', 'Meat ball', 'Rice Cooker', 'Tuba',
+        'Calculator', 'Papaya', 'Antelope', 'Parrot', 'Seal', 'Buttefly',
+        'Dumbbell', 'Donkey', 'Lion', 'Urinal', 'Dolphin', 'Electric Drill',
+        'Hair Dryer', 'Egg tart', 'Jellyfish', 'Treadmill', 'Lighter',
+        'Grapefruit', 'Game board', 'Mop', 'Radish', 'Baozi', 'Target',
+        'French', 'Spring Rolls', 'Monkey', 'Rabbit', 'Pencil Case', 'Yak',
+        'Red Cabbage', 'Binoculars', 'Asparagus', 'Barbell', 'Scallop',
+        'Noddles', 'Comb', 'Dumpling', 'Oyster', 'Table Teniis paddle',
+        'Cosmetics Brush/Eyeliner Pencil', 'Chainsaw', 'Eraser', 'Lobster',
+        'Durian', 'Okra', 'Lipstick', 'Cosmetics Mirror', 'Curling',
+        'Table Tennis ')
+
+    def load_annotations(self, ann_file):
+        """Load annotation from COCO style annotation file.
+
+        Images listed in ``objv2_ignore_list`` are dropped from both the
+        returned list and ``self.img_ids`` so the two stay index-aligned.
+
+        Args:
+            ann_file (str): Path of annotation file.
+
+        Returns:
+            list[dict]: Annotation info from COCO api.
+        """
+        self.coco = COCO(ann_file)
+        # The order of returned `cat_ids` will not
+        # change with the order of the CLASSES
+        self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES)
+        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
+        data_infos, kept_img_ids, total_ann_ids = [], [], []
+        for i in self.coco.get_img_ids():
+            info = self.coco.load_imgs([i])[0]
+            # Keep only '<parent dir>/<base name>' of the annotated path.
+            parent, base = osp.split(info['file_name'])
+            info['file_name'] = osp.join(osp.basename(parent), base)
+            if info['file_name'] in objv2_ignore_list:
+                continue
+            info['filename'] = info['file_name']
+            data_infos.append(info)
+            kept_img_ids.append(i)
+            total_ann_ids.extend(self.coco.get_ann_ids(img_ids=[i]))
+        # img_ids mirrors data_infos: skipped images are left out of both.
+        self.img_ids = kept_img_ids
+        assert len(set(total_ann_ids)) == len(
+            total_ann_ids), f"Annotation ids in '{ann_file}' are not unique!"
+        return data_infos
diff --git a/mmdet/datasets/openimages.py b/mmdet/datasets/openimages.py
new file mode 100755
index 0000000..1315349
--- /dev/null
+++ b/mmdet/datasets/openimages.py
@@ -0,0 +1,891 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import csv
+import json
+import os.path as osp
+import warnings
+from collections import OrderedDict, defaultdict
+
+import mmcv
+import numpy as np
+import torch.distributed as dist
+from mmcv.runner import get_dist_info
+from mmcv.utils import print_log
+
+from mmdet.core import eval_map
+from .builder import DATASETS
+from .custom import CustomDataset
+
+
+@DATASETS.register_module()
+class OpenImagesDataset(CustomDataset):
+    """Open Images dataset for detection.
+
+    Args:
+        ann_file (str): Annotation file path.
+        label_file (str): File path of the label description file that
+            maps the classes names in MID format to their short
+            descriptions.
+        image_level_ann_file (str): Image level annotation, which is used
+            in evaluation.
+        get_supercategory (bool): Whether to get parent class of the
+            current class. Default: True.
+        hierarchy_file (str): The file path of the class hierarchy.
+            Default: None.
+        get_metas (bool): Whether to get image metas in testing or
+            validation time. This should be `True` during evaluation.
+            Default: True. The OpenImages annotations do not have image
+            metas (width and height of the image), which will be used
+            during evaluation. We provide two ways to get image metas
+            in `OpenImagesDataset`:
+
+            - 1. `load from file`: Load image metas from pkl file, which
+              is suggested to use. We provided a script to get image metas:
+              `tools/misc/get_image_metas.py`, which need to run
+              this script before training/testing. Please refer to
+              `config/openimages/README.md` for more details.
+
+            - 2. `load from pipeline`, which will get image metas during
+              test time. However, this may reduce the inference speed,
+              especially when using distribution.
+
+        load_from_file (bool): Whether to get image metas from pkl file.
+        meta_file (str): File path to get image metas.
+        filter_labels (bool): Whether filter unannotated classes.
+            Default: True.
+        load_image_level_labels (bool): Whether load and consider image
+            level labels during evaluation. Default: True.
+        file_client_args (dict): Arguments to instantiate a FileClient.
+            See :class:`mmcv.fileio.FileClient` for details.
+            Defaults to ``dict(backend='disk')``.
+    """
+
+    def __init__(self,
+                 ann_file,
+                 label_file='',
+                 image_level_ann_file='',
+                 get_supercategory=True,
+                 hierarchy_file=None,
+                 get_metas=True,
+                 load_from_file=True,
+                 meta_file='',
+                 filter_labels=True,
+                 load_image_level_labels=True,
+                 file_client_args=dict(backend='disk'),
+                 **kwargs):
+        # Other file client backends than the default may raise errors.
+        self.file_client_args = file_client_args
+        # Both mappings are filled in by `get_classes_from_csv` below.
+        self.cat2label = defaultdict(str)
+        self.index_dict = {}
+
+        # `CustomDataset` also inits a file client, but one is needed
+        # here already to read the label file before the parent init.
+        file_client = mmcv.FileClient(**file_client_args)
+        # `index_dict` must exist before the annotations are loaded.
+        assert label_file.endswith('csv')
+        if hasattr(file_client, 'get_local_path'):
+            with file_client.get_local_path(label_file) as local_path:
+                class_names = self.get_classes_from_csv(local_path)
+        else:
+            class_names = self.get_classes_from_csv(label_file)
+        super(OpenImagesDataset, self).__init__(
+            ann_file=ann_file, file_client_args=file_client_args, **kwargs)
+        self.CLASSES = class_names
+        self.image_level_ann_file = image_level_ann_file
+        self.load_image_level_labels = load_image_level_labels
+        if get_supercategory is True:
+            assert hierarchy_file is not None
+            if self.__class__.__name__ == 'OpenImagesDataset':
+                assert hierarchy_file.endswith('json')
+            elif self.__class__.__name__ == 'OpenImagesChallengeDataset':
+                assert hierarchy_file.endswith('np')
+            else:
+                raise NotImplementedError
+            if hasattr(self.file_client, 'get_local_path'):
+                with self.file_client.get_local_path(
+                        hierarchy_file) as local_path:
+                    self.class_label_tree = self.get_relation_matrix(
+                        local_path)
+            else:
+                self.class_label_tree = self.get_relation_matrix(
+                    hierarchy_file)
+        self.get_supercategory = get_supercategory
+        self.get_metas = get_metas
+        self.load_from_file = load_from_file
+        self.meta_file = meta_file
+        if self.data_root is not None:
+            if not osp.isabs(self.meta_file):
+                self.meta_file = osp.join(self.data_root, self.meta_file)
+        self.filter_labels = filter_labels
+        self.rank, self.world_size = get_dist_info()
+        self.temp_img_metas = []
+        self.test_img_metas = []
+        self.test_img_shapes = []
+        self.load_from_pipeline = False if load_from_file else True
+
+    def get_classes_from_csv(self, label_file):
+        """Read the class names from the label description file.
+
+        Args:
+            label_file (str): File path of the label description file that
+                maps the classes names in MID format to their short
+                descriptions. `cat2label` and `index_dict` are filled.
+
+        Returns:
+            list[str]: Class name of OpenImages.
+        """
+        names = []
+        row_of_mid = {}
+        with open(label_file, 'r') as csv_file:
+            for position, row in enumerate(csv.reader(csv_file)):
+                mid, description = row[0], row[1]
+                self.cat2label[mid] = description
+                names.append(description)
+                # a repeated name keeps the position of its last row
+                row_of_mid[mid] = position
+        self.index_dict = row_of_mid
+        return names
+
+    def load_annotations(self, ann_file):
+        """Load annotation from annotation file.
+
+        Besides the returned list, `self.ann_infos`
+        (defaultdict[list[dict]]) is filled: it maps an image id to the
+        (n) annotation dicts of that image. Keys of dicts are:
+
+            - `bbox` (list): coordinates of the box, in normalized image
+              coordinates, of shape 4.
+            - `label` (int): the label id.
+            - `is_group_of` (bool):  Indicates that the box spans a group
+              of objects (e.g., a bed of flowers or a crowd of people).
+            - `is_occluded` (bool): Indicates that the object is occluded
+              by another object in the image.
+            - `is_truncated` (bool): Indicates that the object extends
+              beyond the boundary of the image.
+            - `is_depiction` (bool): Indicates that the object is a
+              depiction.
+            - `is_inside` (bool): Indicates a picture taken from the
+              inside of the object.
+
+        Args:
+            ann_file (str): CSV style annotation file path.
+
+        Returns:
+            list[dict]:  Data infos where each item of the list
+            indicates an image. Keys of annotations are:
+
+                - `img_id` (str): Image name.
+                - `filename` (str): Image name with suffix.
+        """
+        self.ann_infos = defaultdict(list)
+        data_infos = []
+        last_filename = None
+        with open(ann_file, 'r') as f:
+            rows = csv.reader(f)
+            next(rows, None)  # the first row is the header
+            for row in rows:
+                img_id = row[0]
+                label_id = row[2]
+                assert label_id in self.index_dict
+                label = int(self.index_dict[label_id])
+
+                # Columns 4-7 are read as xmin, xmax, ymin, ymax and
+                # reordered to [xmin, ymin, xmax, ymax].
+                bbox = [float(row[k]) for k in (4, 6, 5, 7)]
+                # Columns 8-12 are flags that are set when equal to 1.
+                (is_occluded, is_truncated, is_group_of, is_depiction,
+                 is_inside) = [int(row[k]) == 1 for k in range(8, 13)]
+
+                self.ann_infos[img_id].append({
+                    'bbox': bbox,
+                    'label': label,
+                    'is_occluded': is_occluded,
+                    'is_truncated': is_truncated,
+                    'is_group_of': is_group_of,
+                    'is_depiction': is_depiction,
+                    'is_inside': is_inside,
+                })
+
+                # An image entry is added whenever the file name differs
+                # from the one of the previous row.
+                filename = f'{img_id}.jpg'
+                if filename != last_filename:
+                    data_infos.append(
+                        dict(img_id=img_id, filename=filename))
+                    last_filename = filename
+
+        return data_infos
+
+    def get_ann_info(self, idx):
+        """Get OpenImages annotation by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            dict: Arrays of boxes, labels and per-box flags of the image.
+        """
+        img_id = self.data_infos[idx]['img_id']
+        bboxes = []
+        labels = []
+        bboxes_ignore = []
+        labels_ignore = []
+        is_occludeds = []
+        is_truncateds = []
+        is_group_ofs = []
+        is_depictions = []
+        is_insides = []
+        for obj in self.ann_infos[img_id]:
+            label = int(obj['label'])
+            bbox = [
+                float(obj['bbox'][0]),
+                float(obj['bbox'][1]),
+                float(obj['bbox'][2]),
+                float(obj['bbox'][3])
+            ]
+            bboxes.append(bbox)
+            labels.append(label)
+
+            # Per-box attribute flags
+            is_occludeds.append(obj['is_occluded'])
+            is_truncateds.append(obj['is_truncated'])
+            is_group_ofs.append(obj['is_group_of'])
+            is_depictions.append(obj['is_depiction'])
+            is_insides.append(obj['is_inside'])
+        if not bboxes:
+            bboxes = np.zeros((0, 4))
+            labels = np.zeros((0, ))
+        else:
+            bboxes = np.array(bboxes)
+            labels = np.array(labels)
+        if not bboxes_ignore:
+            bboxes_ignore = np.zeros((0, 4))
+            labels_ignore = np.zeros((0, ))
+        else:
+            bboxes_ignore = np.array(bboxes_ignore)
+            labels_ignore = np.array(labels_ignore)
+        # NOTE: the ignore lists are never filled, so both arrays are empty.
+        assert len(is_group_ofs) == len(labels) == len(bboxes)
+        gt_is_group_ofs = np.array(is_group_ofs, dtype=bool)
+
+        # These flags are not used yet.
+        is_occludeds = np.array(is_occludeds, dtype=bool)
+        is_truncateds = np.array(is_truncateds, dtype=bool)
+        is_depictions = np.array(is_depictions, dtype=bool)
+        is_insides = np.array(is_insides, dtype=bool)
+
+        ann = dict(
+            bboxes=bboxes.astype(np.float32),
+            labels=labels.astype(np.int64),
+            bboxes_ignore=bboxes_ignore.astype(np.float32),
+            labels_ignore=labels_ignore.astype(np.int64),
+            gt_is_group_ofs=gt_is_group_ofs,
+            is_occludeds=is_occludeds,
+            is_truncateds=is_truncateds,
+            is_depictions=is_depictions,
+            is_insides=is_insides)
+
+        return ann
+
+    def get_meta_from_file(self, meta_file=''):
+        """Get image metas from pkl file.
+
+        Appends the `ori_shape[:2]` of every meta to `self.test_img_shapes`.
+        """
+        metas = mmcv.load(
+            meta_file, file_format='pkl',
+            file_client_args=self.file_client_args)
+        assert len(metas) == len(self)
+        for meta, data_info in zip(metas, self.data_infos):
+            file_name = osp.split(meta['filename'])[-1]
+            img_info = data_info.get('img_info', None)
+            expected = (data_info['filename'] if img_info is None else
+                        osp.split(img_info['filename'])[-1])
+            assert file_name == expected
+            self.test_img_shapes.append(meta['ori_shape'][:2])
+
+    def get_meta_from_pipeline(self, results):
+        """Accumulate the image metas produced by the test pipeline."""
+        self.temp_img_metas.extend(results['img_metas'])
+        if not (dist.is_available() and self.world_size > 1):
+            self.test_img_metas = self.temp_img_metas
+            return
+        # world_size > 1: hand the metas to `collect_results_cpu`.
+        from mmdet.apis.test import collect_results_cpu
+        self.test_img_metas = collect_results_cpu(
+            self.temp_img_metas, len(self))
+
+    def get_img_shape(self, metas):
+        """Append the `ori_shape[:2]` of every meta to `test_img_shapes`."""
+        assert len(metas) == len(self)
+        for meta, data_info in zip(metas, self.data_infos):
+            # Each meta keeps its dict under `.data`.
+            meta_dict = meta.data
+            file_name = osp.split(meta_dict['ori_filename'])[-1]
+            img_info = data_info.get('img_info', None)
+            expected = (data_info['filename'] if img_info is None else
+                        osp.split(img_info['filename'])[-1])
+            assert file_name == expected
+            self.test_img_shapes.append(meta_dict['ori_shape'][:2])
+
+    def prepare_test_img(self, idx):
+        """Run the pipeline on one test image and return its output."""
+        results = {'img_info': self.data_infos[idx]}
+        if self.proposals is not None:
+            results['proposals'] = self.proposals[idx]
+        self.pre_pipeline(results)
+        processed = self.pipeline(results)
+        # Metas are gathered here only when they are not read from file.
+        if self.get_metas and self.load_from_pipeline:
+            self.get_meta_from_pipeline(processed)
+        return processed
+
+    def _filter_imgs(self, min_size=32):
+        """Keep every image; `min_size` is accepted but not used."""
+        if self.filter_empty_gt:
+            # Warn only: no filtering is actually performed.
+            warnings.warn('OpenImageDatasets does not support '
+                          'filtering empty gt images.')
+        return list(range(len(self)))
+
+    def _set_group_flag(self):
+        """Set the group flag of every image to 0."""
+        self.flag = np.zeros(len(self), dtype=np.uint8)
+        # TODO: set flag without width and height
+
+    def get_relation_matrix(self, hierarchy_file):
+        """Build the class relation matrix from a json hierarchy file.
+
+        Args:
+            hierarchy_file (str): File path to the hierarchy for classes.
+                A relative path is resolved against `self.data_root`.
+
+        Returns:
+            ndarray: The matrix of the corresponding relationship between
+            the parent class and the child class, of shape
+            (class_num, class_num).
+        """
+
+        if self.data_root is not None and not osp.isabs(hierarchy_file):
+            hierarchy_file = osp.join(self.data_root, hierarchy_file)
+        with open(hierarchy_file, 'r') as json_file:
+            hierarchy = json.load(json_file)
+        # Start from the identity: each class is related to itself.
+        num_classes = len(self.CLASSES)
+        identity = np.eye(num_classes, num_classes)
+        relation_matrix = self._convert_hierarchy_tree(hierarchy, identity)
+        return relation_matrix
+
+    def _convert_hierarchy_tree(self,
+                                hierarchy_map,
+                                class_label_tree,
+                                parents=None,
+                                get_all_parents=True):
+        """Get matrix of the corresponding relationship between the parent
+        class and the child class.
+
+        Args:
+            hierarchy_map (dict): Including label name and corresponding
+                subcategory. Keys of dicts are:
+
+                - `LabelName` (str): Name of the label.
+                - `Subcategory` (dict | list): Corresponding subcategory(ies).
+            class_label_tree (ndarray): The matrix of the corresponding
+                relationship between the parent class and the child class,
+                of shape (class_num, class_num). It is updated in place.
+            parents (list, optional): Indices of the parent classes of
+                `hierarchy_map`. Default: None, i.e. no parent.
+            get_all_parents (bool): Whether to relate a class to all of
+                its ancestors instead of only its direct parent.
+                Default: True
+
+        Returns:
+            ndarray: The matrix of the corresponding relationship between
+            the parent class and the child class, of shape
+            (class_num, class_num).
+        """
+        parents = [] if parents is None else parents
+        if 'Subcategory' not in hierarchy_map:
+            return class_label_tree
+        for node in hierarchy_map['Subcategory']:
+            if 'LabelName' not in node:
+                continue
+            children_index = self.index_dict[node['LabelName']]
+            children = [children_index]
+            for parent_index in parents:
+                if get_all_parents:
+                    children.append(parent_index)
+                class_label_tree[children_index, parent_index] = 1
+            # Forward `get_all_parents` so that it also applies below the
+            # first level of the hierarchy.
+            class_label_tree = self._convert_hierarchy_tree(
+                node, class_label_tree, children, get_all_parents)
+        return class_label_tree
+
+    def add_supercategory_ann(self, annotations):
+        """Add parent classes of the corresponding class of the ground truth
+        bboxes.
+
+        Every box is repeated once for each class that `class_label_tree`
+        relates to its label. The list is modified in place and returned;
+        images without boxes keep `bboxes` of shape (0, 4).
+        """
+        for i, ann in enumerate(annotations):
+            assert len(ann['labels']) == len(ann['bboxes']) == \
+                   len(ann['gt_is_group_ofs'])
+            gt_bboxes = []
+            gt_is_group_ofs = []
+            gt_labels = []
+            for label, bbox, is_group in zip(ann['labels'], ann['bboxes'],
+                                             ann['gt_is_group_ofs']):
+                related = np.where(self.class_label_tree[label])[0]
+                # A class without any related entry raises IndexError here.
+                if len(related) <= 1:
+                    related = [related[0]]
+                for related_label in related:
+                    gt_bboxes.append(bbox)
+                    gt_is_group_ofs.append(is_group)
+                    gt_labels.append(related_label)
+            annotations[i] = dict(
+                bboxes=np.array(gt_bboxes, dtype=np.float32).reshape(-1, 4),
+                labels=np.array(gt_labels).astype(np.int64),
+                bboxes_ignore=ann['bboxes_ignore'],
+                gt_is_group_ofs=np.array(gt_is_group_ofs).astype(bool))
+        return annotations
+
+    def process_results(self, det_results, annotations,
+                        image_level_annotations):
+        """Process results of the corresponding class of the detection bboxes.
+
+        Note: `det_results` is modified in place. Depending on the
+        parameters, two kinds of processing are applied:
+
+        1. `self.get_supercategory`: copy the boxes of a detected class to
+        its related classes that are annotated on that image.
+
+        2. `self.filter_labels`: empty the classes not annotated on it.
+        """
+        if image_level_annotations is not None:
+            assert len(annotations) == \
+                   len(image_level_annotations) == \
+                   len(det_results)
+        else:
+            assert len(annotations) == len(det_results)
+        for i in range(len(det_results)):
+            results = copy.deepcopy(det_results[i])
+            valid_classes = np.where(
+                np.array([[bbox.shape[0]] for bbox in det_results[i]]) != 0)[0]
+            if image_level_annotations is not None:
+                labels = annotations[i]['labels']
+                image_level_labels = \
+                    image_level_annotations[i]['image_level_labels']
+                allowed_labeles = np.unique(
+                    np.append(labels, image_level_labels))
+            else:
+                allowed_labeles = np.unique(annotations[i]['labels'])
+            # `results` keeps the untouched boxes that get copied below.
+            for valid_class in valid_classes:
+                det_cls = np.where(self.class_label_tree[valid_class])[0]
+                for index in det_cls:
+                    if index in allowed_labeles and \
+                            index != valid_class and \
+                            self.get_supercategory:
+                        det_results[i][index] = \
+                            np.concatenate((det_results[i][index],
+                                            results[valid_class]))
+                    elif index not in allowed_labeles and self.filter_labels:
+                        # Class is not annotated here: drop its detections
+                        det_results[i][index] = np.empty(
+                            (0, 5)).astype(np.float32)
+        return det_results
+
+    def load_image_label_from_csv(self, image_level_ann_file):
+        """Read the image level annotations of a csv style ann_file.
+
+        Args:
+            image_level_ann_file (str): CSV style image level annotation
+                file path.
+
+        Returns:
+            defaultdict[list[dict]]: Annotations where item of the defaultdict
+            indicates an image, each of which has (n) dicts.
+            Keys of dicts are:
+
+                - `image_level_label` (int): Label id.
+                - `confidence` (float): Labels that are human-verified to be
+                  present in an image have confidence = 1 (positive labels).
+                  Labels that are human-verified to be absent from an image
+                  have confidence = 0 (negative labels). Machine-generated
+                  labels have fractional confidences, generally >= 0.5.
+                  The higher the confidence, the smaller the chance for
+                  the label to be a false positive.
+        """
+
+        item_lists = defaultdict(list)
+        with open(image_level_ann_file, 'r') as f:
+            rows = csv.reader(f)
+            next(rows, None)  # the first row is the header
+            for row in rows:
+                img_id = row[0]
+                # Column 2 is the class name, column 3 the confidence.
+                item_lists[img_id].append({
+                    'image_level_label': int(self.index_dict[row[2]]),
+                    'confidence': float(row[3]),
+                })
+        return item_lists
+
+    def get_image_level_ann(self, image_level_ann_file):
+        """Get the image level annotations of all images.
+
+        Args:
+            image_level_ann_file (str): CSV style image level annotation
+                file path.
+
+        Returns:
+            list[dict]: One item per image, in the order of
+            `self.data_infos`. Keys of dicts are:
+
+                - `image_level_labels` (np.ndarray): Label ids of the
+                  image, of dtype int64.
+                - `confidences` (np.ndarray): Confidence of each label,
+                  of dtype float32.
+            Both arrays are empty for images absent from the file.
+        """
+        if hasattr(self.file_client, 'get_local_path'):
+            with self.file_client.get_local_path(
+                    image_level_ann_file) as local_path:
+                item_lists = self.load_image_label_from_csv(local_path)
+        else:
+            item_lists = self.load_image_label_from_csv(image_level_ann_file)
+
+        image_level_annotations = []
+        for i in range(len(self)):
+            data_info = self.data_infos[i]
+            img_info = data_info.get('img_info', None)
+            if img_info is None:
+                # for Open Images v6
+                img_id = data_info['img_id']
+            else:
+                # for Open Images Challenges: the file name without its
+                # last four characters
+                img_id = osp.split(img_info['filename'])[-1][:-4]
+
+            # Images missing from the file get empty arrays.
+            item_list = item_lists.get(img_id, None)
+            if item_list:
+                image_level_labels = np.array(
+                    [int(obj['image_level_label']) for obj in item_list])
+                confidences = np.array(
+                    [float(obj['confidence']) for obj in item_list])
+            else:
+                image_level_labels = np.zeros((0, ))
+                confidences = np.zeros((0, ))
+            image_level_annotations.append(
+                dict(
+                    image_level_labels=image_level_labels.astype(np.int64),
+                    confidences=confidences.astype(np.float32)))
+
+        return image_level_annotations
+
+    def denormalize_gt_bboxes(self, annotations):
+        """Scale normalized ground truth bboxes to absolute positions.
+
+        The boxes are modified in place. Only used in evaluating time.
+        """
+        assert len(self.test_img_shapes) == len(annotations)
+        for (height, width), ann in zip(self.test_img_shapes, annotations):
+            gt_bboxes = ann['bboxes']
+            # x coordinates sit in the even columns, y in the odd ones.
+            gt_bboxes[:, 0::2] *= width
+            gt_bboxes[:, 1::2] *= height
+        return annotations
+
+    def get_cat_ids(self, idx):
+        """Get category ids by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            list[int]: All categories in the image of specified index.
+        """
+        return self.get_ann_info(idx)['labels'].astype(np.int64).tolist()
+
+    def evaluate(self,
+                 results,
+                 metric='mAP',
+                 logger=None,
+                 iou_thr=0.5,
+                 ioa_thr=0.5,
+                 scale_ranges=None,
+                 denorm_gt_bbox=True,
+                 use_group_of=True):
+        """Evaluate the detection results on OpenImages.
+
+        Args:
+            results (list[list | tuple]): Testing results of the dataset.
+            metric (str | list[str]): Metrics to be evaluated. Option is
+                 'mAP'. Default: 'mAP'.
+            logger (logging.Logger | str, optional): Logger used for printing
+                related information during evaluation. Default: None.
+            iou_thr (float | list[float]): IoU threshold. Default: 0.5.
+            ioa_thr (float | list[float]): IoA threshold. Default: 0.5.
+            scale_ranges (list[tuple], optional): Scale ranges for evaluating
+                mAP. If not specified, all bounding boxes would be included in
+                evaluation. Default: None
+            denorm_gt_bbox (bool): Whether to denorm ground truth bboxes from
+                relative position to absolute position. Default: True
+            use_group_of (bool): Whether to consider group-of ground truth
+                bboxes during evaluating. Default: True.
+
+        Returns:
+            dict[str, float]: One `APxx` per IoU threshold plus `mAP`.
+        """
+
+        if not isinstance(metric, str):
+            assert len(metric) == 1
+            metric = metric[0]
+        allowed_metrics = ['mAP']
+        if metric not in allowed_metrics:
+            raise KeyError(f'metric {metric} is not supported')
+        annotations = [self.get_ann_info(i) for i in range(len(self))]
+
+        if self.load_image_level_labels:
+            image_level_annotations = \
+                self.get_image_level_ann(self.image_level_ann_file)
+        else:
+            image_level_annotations = None
+
+        # load metas from file
+        if self.get_metas and self.load_from_file:
+            assert self.meta_file.endswith(
+                'pkl'), 'File name must be pkl suffix'
+            self.get_meta_from_file(self.meta_file)
+        # load metas from pipeline
+        else:
+            self.get_img_shape(self.test_img_metas)
+
+        if len(self.test_img_shapes) > len(self):
+            self.test_img_shapes = self.test_img_shapes[:len(self)]
+
+        if denorm_gt_bbox:
+            annotations = self.denormalize_gt_bboxes(annotations)
+
+        # Reset test_image_metas, temp_image_metas and test_img_shapes
+        # to avoid potential error
+        self.temp_img_metas = []
+        self.test_img_shapes = []
+        self.test_img_metas = []
+        if self.get_supercategory:
+            annotations = self.add_supercategory_ann(annotations)
+
+        results = self.process_results(results, annotations,
+                                       image_level_annotations)
+        if use_group_of:
+            assert ioa_thr is not None, \
+                'ioa_thr must have value when using group_of in evaluation.'
+
+        eval_results = OrderedDict()
+        iou_thrs = [iou_thr] if isinstance(iou_thr, float) else iou_thr
+        ioa_thrs = [ioa_thr] if isinstance(ioa_thr, float) or ioa_thr is None \
+            else ioa_thr
+
+        # get dataset type from the number of classes
+        if len(self.CLASSES) == 500:
+            ds_name = 'oid_challenge'
+        elif len(self.CLASSES) == 601:
+            ds_name = 'oid_v6'
+        else:
+            ds_name = self.CLASSES
+            warnings.warn('Cannot infer dataset type from the length of the '
+                          'classes. Set `oid_v6` as dataset type.')
+        # NOTE(review): the fallback passes the class names, not 'oid_v6'.
+        if metric == 'mAP':
+            assert isinstance(iou_thrs, list) and isinstance(ioa_thrs, list)
+            assert len(ioa_thrs) == len(iou_thrs)
+            mean_aps = []
+            for iou_thr, ioa_thr in zip(iou_thrs, ioa_thrs):
+                print_log(f'\n{"-" * 15}iou_thr, ioa_thr: {iou_thr}, {ioa_thr}'
+                          f'{"-" * 15}')
+                mean_ap, _ = eval_map(
+                    results,
+                    annotations,
+                    scale_ranges=scale_ranges,
+                    iou_thr=iou_thr,
+                    ioa_thr=ioa_thr,
+                    dataset=ds_name,
+                    logger=logger,
+                    use_group_of=use_group_of)
+                mean_aps.append(mean_ap)
+                eval_results[f'AP{int(iou_thr * 100):02d}'] = round(mean_ap, 3)
+            eval_results['mAP'] = sum(mean_aps) / len(mean_aps)
+        return eval_results
+
+
+@DATASETS.register_module()
+class OpenImagesChallengeDataset(OpenImagesDataset):
+    """Open Images Challenge dataset for detection."""
+
+    def __init__(self, ann_file, **kwargs):
+        # The challenge annotations are expected as a txt file.
+        assert ann_file.endswith('txt')
+        super().__init__(ann_file=ann_file, **kwargs)
+
+    def get_classes_from_csv(self, label_file):
+        """Get classes name from file.
+
+        Unlike the parent class, the row position does not define the
+        class index: it is read from the third column.
+
+        Args:
+            label_file (str): File path of the label description file that
+                maps the classes names in MID format to their short
+                descriptions. Every row is expected to hold the name in
+                MID format, the short description and the class id.
+
+        Returns:
+            list: Class name of OpenImages, sorted by class id.
+        """
+
+        descriptions = []
+        class_ids = []
+        with open(label_file, 'r') as csv_file:
+            for row in csv.reader(csv_file):
+                mid, description = row[0], row[1]
+                class_id = int(row[2])
+                descriptions.append(description)
+                class_ids.append(class_id)
+                # shift the id to a 0-based index
+                self.index_dict[mid] = class_id - 1
+
+        # Order the names by ascending class id.
+        return [descriptions[k] for k in np.argsort(class_ids)]
+
+    def load_annotations(self, ann_file):
+        """Load annotation from annotation file.
+
+        The file is read as a sequence of records: a file name line, one
+        line that is skipped, a line with the number of boxes and then one
+        line per box (label, four box coordinates, group-of flag).
+
+        Args:
+            ann_file (str): Path of the txt annotation file.
+
+        Returns:
+            list[dict]: One dict per image with the keys `img_info`
+            (holding `filename`) and `ann_info` (holding `bboxes`,
+            `labels`, `bboxes_ignore` and `gt_is_group_ofs`).
+        """
+        with open(ann_file) as f:
+            lines = f.readlines()
+        ann_infos = []
+        cursor = 0
+        while cursor < len(lines):
+            filename = lines[cursor].rstrip()
+            num_boxes = int(lines[cursor + 2])
+            first_box = cursor + 3
+            bboxes, labels, is_group_ofs = [], [], []
+            for row in range(first_box, first_box + num_boxes):
+                fields = lines[row].split()
+                bboxes.append([float(fields[k]) for k in range(1, 5)])
+                labels.append(int(fields[0]) - 1)  # labels begin from 1
+                is_group_ofs.append(int(fields[5]) == 1)
+            cursor = first_box + num_boxes
+            ann_info = dict(
+                # reshape keeps an image without boxes at shape (0, 4)
+                bboxes=np.array(bboxes, dtype=np.float32).reshape(-1, 4),
+                labels=np.array(labels, dtype=np.int64),
+                bboxes_ignore=np.zeros((0, 4), dtype=np.float32),
+                gt_is_group_ofs=np.array(is_group_ofs, dtype=bool))
+            ann_infos.append(
+                dict(img_info=dict(filename=filename), ann_info=ann_info))
+        return ann_infos
+
+    def prepare_train_img(self, idx):
+        """Get training data and annotations after pipeline."""
+        ann_info = self.data_infos[idx]
+        results = dict(
+            img_info=ann_info['img_info'],
+            ann_info=ann_info['ann_info'],
+        )
+        if self.proposals is not None:
+            results['proposals'] = self.proposals[idx]
+        self.pre_pipeline(results)
+        return self.pipeline(results)
+
+    def prepare_test_img(self, idx):
+        """Get testing data after pipeline."""
+        ann_info = self.data_infos[idx]
+        results = dict(img_info=ann_info['img_info'])
+        if self.proposals is not None:
+            results['proposals'] = self.proposals[idx]
+        self.pre_pipeline(results)
+
+        results = self.pipeline(results)
+        if self.get_metas and self.load_from_pipeline:
+            self.get_meta_from_pipeline(results)
+        return results
+
+    def get_relation_matrix(self, hierarchy_file):
+        """Get hierarchy for classes.
+
+        Args:
+            hierarchy_file (str): File path to the hierarchy for classes.
+
+        Returns:
+            ndarray: The matrix of the corresponding
+            relationship between the parent class and the child class,
+            of shape (class_num, class_num).
+        """
+        class_label_tree = np.load(hierarchy_file, allow_pickle=True)
+        return class_label_tree[1:, 1:]
+
+    def get_ann_info(self, idx):
+        """Get OpenImages annotation by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            dict: Annotation info of specified index.
+        """
+        # avoid some potential error
+        data_infos = copy.deepcopy(self.data_infos[idx]['ann_info'])
+        return data_infos
+
+    def load_image_label_from_csv(self, image_level_ann_file):
+        """Load image level annotations from csv style ann_file.
+
+        Args:
+            image_level_ann_file (str): CSV style image level annotation
+                file path.
+
+        Returns:
+            defaultdict[list[dict]]: Annotations where item of the defaultdict
+            indicates an image, each of which has (n) dicts.
+            Keys of dicts are:
+
+                - `image_level_label` (int): of shape 1.
+                - `confidence` (float): of shape 1.
+        """
+
+        item_lists = defaultdict(list)
+        with open(image_level_ann_file, 'r') as f:
+            reader = csv.reader(f)
+            i = -1
+            for line in reader:
+                i += 1
+                if i == 0:
+                    continue
+                else:
+                    img_id = line[0]
+                    label_id = line[1]
+                    assert label_id in self.index_dict
+                    image_level_label = int(self.index_dict[label_id])
+                    confidence = float(line[2])
+                    item_lists[img_id].append(
+                        dict(
+                            image_level_label=image_level_label,
+                            confidence=confidence))
+        return item_lists
diff --git a/mmdet/datasets/pipelines/__init__.py b/mmdet/datasets/pipelines/__init__.py
new file mode 100755
index 0000000..8260da6
--- /dev/null
+++ b/mmdet/datasets/pipelines/__init__.py
@@ -0,0 +1,31 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .auto_augment import (AutoAugment, BrightnessTransform, ColorTransform,
+                           ContrastTransform, EqualizeTransform, Rotate, Shear,
+                           Translate)
+from .compose import Compose
+from .formatting import (Collect, DefaultFormatBundle, ImageToTensor,
+                         ToDataContainer, ToTensor, Transpose, to_tensor)
+from .instaboost import InstaBoost
+from .loading import (FilterAnnotations, LoadAnnotations, LoadImageFromFile,
+                      LoadImageFromWebcam, LoadMultiChannelImageFromFiles,
+                      LoadPanopticAnnotations, LoadProposals)
+from .test_time_aug import MultiScaleFlipAug
+from .transforms import (Albu, CopyPaste, CutOut, Expand, MinIoURandomCrop,
+                         MixUp, Mosaic, Normalize, Pad, PhotoMetricDistortion,
+                         RandomAffine, RandomCenterCropPad, RandomCrop,
+                         RandomFlip, RandomShift, Resize, SegRescale,
+                         YOLOXHSVRandomAug)
+
+__all__ = [  # public names re-exported from the submodules imported above
+    'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer',
+    'Transpose', 'Collect', 'DefaultFormatBundle', 'LoadAnnotations',
+    'LoadImageFromFile', 'LoadImageFromWebcam', 'LoadPanopticAnnotations',
+    'LoadMultiChannelImageFromFiles', 'LoadProposals', 'FilterAnnotations',
+    'MultiScaleFlipAug', 'Resize', 'RandomFlip', 'Pad', 'RandomCrop',
+    'Normalize', 'SegRescale', 'MinIoURandomCrop', 'Expand',
+    'PhotoMetricDistortion', 'Albu', 'InstaBoost', 'RandomCenterCropPad',
+    'AutoAugment', 'CutOut', 'Shear', 'Rotate', 'ColorTransform',
+    'EqualizeTransform', 'BrightnessTransform', 'ContrastTransform',
+    'Translate', 'RandomShift', 'Mosaic', 'MixUp', 'RandomAffine',
+    'YOLOXHSVRandomAug', 'CopyPaste'
+]
diff --git a/mmdet/datasets/pipelines/auto_augment.py b/mmdet/datasets/pipelines/auto_augment.py
new file mode 100755
index 0000000..b0ff67d
--- /dev/null
+++ b/mmdet/datasets/pipelines/auto_augment.py
@@ -0,0 +1,894 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import cv2
+import mmcv
+import numpy as np
+
+from ..builder import PIPELINES
+from .compose import Compose
+
+_MAX_LEVEL = 10  # upper bound accepted for the ``level`` of a transform
+
+
+def level_to_value(level, max_value):
+    """Scale ``max_value`` by the fraction ``level / _MAX_LEVEL``."""
+    return max_value * (level / _MAX_LEVEL)
+
+
+def enhance_level_to_value(level, a=1.8, b=0.1):
+    """Return ``b`` plus ``a`` scaled by the fraction ``level / _MAX_LEVEL``."""
+    return b + a * (level / _MAX_LEVEL)
+
+
+def random_negative(value, random_negative_prob):
+    """Return ``-value`` with probability ``random_negative_prob``."""
+    return -value if np.random.rand() < random_negative_prob else value
+
+
+def bbox2fields():
+    """Return the key mappings from bbox fields to their companion fields.
+
+    Returns:
+        tuple[dict]: ``(bbox2label, bbox2mask, bbox2seg)``. Each dict maps a
+        bbox key to the key of the matching labels, masks or semantic
+        segmentation respectively.
+    """
+    bbox2label = dict(
+        gt_bboxes='gt_labels', gt_bboxes_ignore='gt_labels_ignore')
+    bbox2mask = dict(
+        gt_bboxes='gt_masks', gt_bboxes_ignore='gt_masks_ignore')
+    # only ``gt_bboxes`` has a semantic segmentation counterpart
+    bbox2seg = dict(gt_bboxes='gt_semantic_seg')
+    return bbox2label, bbox2mask, bbox2seg
+
+
+@PIPELINES.register_module()
+class AutoAugment:
+    """Auto augmentation.
+
+    This data augmentation is proposed in `Learning Data Augmentation
+    Strategies for Object Detection <https://arxiv.org/pdf/1906.11172>`_.
+
+    TODO: Implement the 'Sharpness' transform.
+
+    Args:
+        policies (list[list[dict]]): The policies of auto augmentation. Each
+            policy in ``policies`` is a specific augmentation policy, and is
+            composed by several augmentations (dict). Every time AutoAugment
+            is called, one policy is drawn at random from ``policies`` and
+            applied to the results.
+
+    Examples:
+        >>> replace = (104, 116, 124)
+        >>> policies = [
+        >>>     [
+        >>>         dict(type='Translate', prob=0.0, level=8),
+        >>>         dict(
+        >>>             type='Shear',
+        >>>             prob=0.4,
+        >>>             level=0,
+        >>>             img_fill_val=replace,
+        >>>             direction='horizontal')
+        >>>     ],
+        >>>     [
+        >>>         dict(
+        >>>             type='Rotate',
+        >>>             prob=0.6,
+        >>>             level=10,
+        >>>             img_fill_val=replace),
+        >>>         dict(type='Translate', prob=1.0, level=6)
+        >>>     ]
+        >>> ]
+        >>> augmentation = AutoAugment(policies)
+        >>> img = np.ones((100, 100, 3))
+        >>> gt_bboxes = np.ones((10, 4))
+        >>> results = dict(img=img, gt_bboxes=gt_bboxes)
+        >>> results = augmentation(results)
+    """
+
+    def __init__(self, policies):
+        assert isinstance(policies, list) and policies, \
+            'Policies must be a non-empty list.'
+        for sub_policy in policies:
+            assert isinstance(sub_policy, list) and sub_policy, \
+                'Each policy in policies must be a non-empty list.'
+            assert all(
+                isinstance(cfg, dict) and 'type' in cfg
+                for cfg in sub_policy), \
+                'Each specific augmentation must be a dict with key "type".'
+        # Work on a deep copy so the caller's ``policies`` is not shared.
+        self.policies = copy.deepcopy(policies)
+        self.transforms = [Compose(cfgs) for cfgs in self.policies]
+
+    def __call__(self, results):
+        """Apply one randomly chosen policy to ``results``."""
+        return np.random.choice(self.transforms)(results)
+
+    def __repr__(self):
+        return f'{type(self).__name__}(policies={self.policies})'
+
+
+@PIPELINES.register_module()
+class Shear:
+    """Apply Shear Transformation to image (and its corresponding bbox, mask,
+    segmentation).
+
+    Args:
+        level (int | float): The level should be in range [0,_MAX_LEVEL].
+        img_fill_val (int | float | tuple): The filled values for image border.
+            If float, the same fill value will be used for all the three
+            channels of image. If tuple, it should have 3 elements.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Default 255.
+        prob (float): The probability for performing Shear and should be in
+            range [0, 1].
+        direction (str): The direction for shear, either "horizontal"
+            or "vertical".
+        max_shear_magnitude (int | float): The maximum magnitude for Shear
+            transformation. Should be in range [0, 1].
+        random_negative_prob (float): The probability that turns the
+                offset negative. Should be in range [0,1]
+        interpolation (str): Same as in :func:`mmcv.imshear`.
+    """
+
+    def __init__(self,
+                 level,
+                 img_fill_val=128,
+                 seg_ignore_label=255,
+                 prob=0.5,
+                 direction='horizontal',
+                 max_shear_magnitude=0.3,
+                 random_negative_prob=0.5,
+                 interpolation='bilinear'):
+        assert isinstance(level, (int, float)), 'The level must be type ' \
+            f'int or float, got {type(level)}.'
+        assert 0 <= level <= _MAX_LEVEL, 'The level should be in range ' \
+            f'[0,{_MAX_LEVEL}], got {level}.'
+        if isinstance(img_fill_val, (float, int)):
+            img_fill_val = tuple([float(img_fill_val)] * 3)
+        elif isinstance(img_fill_val, tuple):
+            assert len(img_fill_val) == 3, 'img_fill_val as tuple must ' \
+                f'have 3 elements. got {len(img_fill_val)}.'
+            img_fill_val = tuple([float(val) for val in img_fill_val])
+        else:
+            raise ValueError(
+                'img_fill_val must be float or tuple with 3 elements.')
+        assert np.all([0 <= val <= 255 for val in img_fill_val]), 'all ' \
+            'elements of img_fill_val should between range [0,255]. ' \
+            f'got {img_fill_val}.'
+        assert 0 <= prob <= 1.0, 'The probability of shear should be in ' \
+            f'range [0,1]. got {prob}.'
+        assert direction in ('horizontal', 'vertical'), 'direction must ' \
+            f'be either "horizontal" or "vertical". got {direction}.'
+        assert isinstance(max_shear_magnitude, (int, float)), \
+            'max_shear_magnitude should be type int or float. ' \
+            f'got {type(max_shear_magnitude)}.'
+        assert 0. <= max_shear_magnitude <= 1., 'max_shear_magnitude ' \
+            f'should be in range [0,1]. got {max_shear_magnitude}.'
+        self.level = level
+        self.magnitude = level_to_value(level, max_shear_magnitude)
+        self.img_fill_val = img_fill_val
+        self.seg_ignore_label = seg_ignore_label
+        self.prob = prob
+        self.direction = direction
+        self.max_shear_magnitude = max_shear_magnitude
+        self.random_negative_prob = random_negative_prob
+        self.interpolation = interpolation
+
+    def _shear_img(self,
+                   results,
+                   magnitude,
+                   direction='horizontal',
+                   interpolation='bilinear'):
+        """Shear every image field in place and refresh ``img_shape``.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+            magnitude (int | float): The magnitude used for shear.
+            direction (str): The direction for shear, either "horizontal"
+                or "vertical".
+            interpolation (str): Same as in :func:`mmcv.imshear`.
+        """
+        for key in results.get('img_fields', ['img']):
+            img = results[key]
+            img_sheared = mmcv.imshear(
+                img,
+                magnitude,
+                direction,
+                border_value=self.img_fill_val,
+                interpolation=interpolation)
+            results[key] = img_sheared.astype(img.dtype)
+            results['img_shape'] = results[key].shape
+
+    def _shear_bboxes(self, results, magnitude):
+        """Shear bbox corners and take their axis-aligned envelope."""
+        h, w = results['img_shape'][:2]  # a channel axis is not required
+        if self.direction == 'horizontal':
+            shear_matrix = np.stack([[1, magnitude],
+                                     [0, 1]]).astype(np.float32)  # [2, 2]
+        else:
+            shear_matrix = np.stack([[1, 0], [magnitude,
+                                              1]]).astype(np.float32)
+        for key in results.get('bbox_fields', []):
+            min_x, min_y, max_x, max_y = np.split(
+                results[key], results[key].shape[-1], axis=-1)
+            coordinates = np.stack([[min_x, min_y], [max_x, min_y],
+                                    [min_x, max_y],
+                                    [max_x, max_y]])  # [4, 2, nb_box, 1]
+            coordinates = coordinates[..., 0].transpose(
+                (2, 1, 0)).astype(np.float32)  # [nb_box, 2, 4]
+            new_coords = np.matmul(shear_matrix[None, :, :],
+                                   coordinates)  # [nb_box, 2, 4]
+            min_x = np.min(new_coords[:, 0, :], axis=-1)
+            min_y = np.min(new_coords[:, 1, :], axis=-1)
+            max_x = np.max(new_coords[:, 0, :], axis=-1)
+            max_y = np.max(new_coords[:, 1, :], axis=-1)
+            min_x = np.clip(min_x, a_min=0, a_max=w)
+            min_y = np.clip(min_y, a_min=0, a_max=h)
+            max_x = np.clip(max_x, a_min=min_x, a_max=w)
+            max_y = np.clip(max_y, a_min=min_y, a_max=h)
+            results[key] = np.stack([min_x, min_y, max_x, max_y],
+                                    axis=-1).astype(results[key].dtype)
+
+    def _shear_masks(self,
+                     results,
+                     magnitude,
+                     direction='horizontal',
+                     fill_val=0,
+                     interpolation='bilinear'):
+        """Shear every mask field to the current image height and width."""
+        h, w = results['img_shape'][:2]  # a channel axis is not required
+        for key in results.get('mask_fields', []):
+            masks = results[key]
+            results[key] = masks.shear((h, w),
+                                       magnitude,
+                                       direction,
+                                       border_value=fill_val,
+                                       interpolation=interpolation)
+
+    def _shear_seg(self,
+                   results,
+                   magnitude,
+                   direction='horizontal',
+                   fill_val=255,
+                   interpolation='bilinear'):
+        """Shear the segmentation maps, filling the border with fill_val."""
+        for key in results.get('seg_fields', []):
+            seg = results[key]
+            results[key] = mmcv.imshear(
+                seg,
+                magnitude,
+                direction,
+                border_value=fill_val,
+                interpolation=interpolation).astype(seg.dtype)
+
+    def _filter_invalid(self, results, min_bbox_size=0):
+        """Filter bboxes and corresponding masks too small after shear
+        augmentation."""
+        bbox2label, bbox2mask, _ = bbox2fields()
+        for key in results.get('bbox_fields', []):
+            bbox_w = results[key][:, 2] - results[key][:, 0]
+            bbox_h = results[key][:, 3] - results[key][:, 1]
+            valid_inds = (bbox_w > min_bbox_size) & (bbox_h > min_bbox_size)
+            valid_inds = np.nonzero(valid_inds)[0]
+            results[key] = results[key][valid_inds]
+            # label fields. e.g. gt_labels and gt_labels_ignore
+            label_key = bbox2label.get(key)
+            if label_key in results:
+                results[label_key] = results[label_key][valid_inds]
+            # mask fields, e.g. gt_masks and gt_masks_ignore
+            mask_key = bbox2mask.get(key)
+            if mask_key in results:
+                results[mask_key] = results[mask_key][valid_inds]
+
+    def __call__(self, results):
+        """Call function to shear images, bounding boxes, masks and semantic
+        segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Sheared results.
+        """
+        if np.random.rand() > self.prob:
+            return results
+        magnitude = random_negative(self.magnitude, self.random_negative_prob)
+        self._shear_img(results, magnitude, self.direction, self.interpolation)
+        self._shear_bboxes(results, magnitude)
+        # fill_val set to 0 for background of mask.
+        self._shear_masks(
+            results,
+            magnitude,
+            self.direction,
+            fill_val=0,
+            interpolation=self.interpolation)
+        self._shear_seg(
+            results,
+            magnitude,
+            self.direction,
+            fill_val=self.seg_ignore_label,
+            interpolation=self.interpolation)
+        self._filter_invalid(results)
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(level={self.level}, '
+        repr_str += f'img_fill_val={self.img_fill_val}, '
+        repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
+        repr_str += f'prob={self.prob}, '
+        repr_str += f'direction={self.direction}, '
+        repr_str += f'max_shear_magnitude={self.max_shear_magnitude}, '
+        repr_str += f'random_negative_prob={self.random_negative_prob}, '
+        repr_str += f'interpolation={self.interpolation})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class Rotate:
+    """Apply Rotate Transformation to image (and its corresponding bbox, mask,
+    segmentation).
+
+    Args:
+        level (int | float): The level should be in range [0,_MAX_LEVEL].
+        scale (int | float): Isotropic scale factor. Same in
+            ``mmcv.imrotate``.
+        center (int | float | tuple[float]): Center point (w, h) of the
+            rotation in the source image. If None, the center of the
+            image will be used. Same in ``mmcv.imrotate``.
+        img_fill_val (int | float | tuple): The fill value for image border.
+            If float, the same value will be used for all the three
+            channels of image. If tuple, it should have 3 elements (e.g.
+            equals the number of channels for image).
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Default 255.
+        prob (float): The probability for perform transformation and
+            should be in range 0 to 1.
+        max_rotate_angle (int | float): The maximum angles for rotate
+            transformation.
+        random_negative_prob (float): The probability that turns the
+             offset negative.
+    """
+
+    def __init__(self,
+                 level,
+                 scale=1,
+                 center=None,
+                 img_fill_val=128,
+                 seg_ignore_label=255,
+                 prob=0.5,
+                 max_rotate_angle=30,
+                 random_negative_prob=0.5):
+        assert isinstance(level, (int, float)), \
+            f'The level must be type int or float. got {type(level)}.'
+        assert 0 <= level <= _MAX_LEVEL, \
+            f'The level should be in range [0,{_MAX_LEVEL}]. got {level}.'
+        assert isinstance(scale, (int, float)), \
+            f'The scale must be type int or float. got type {type(scale)}.'
+        if isinstance(center, (int, float)):
+            center = (center, center)
+        elif isinstance(center, tuple):
+            assert len(center) == 2, 'center with type tuple must have '\
+                f'2 elements. got {len(center)} elements.'
+        else:
+            assert center is None, 'center must be None or type int, '\
+                f'float or tuple, got type {type(center)}.'
+        if isinstance(img_fill_val, (float, int)):
+            img_fill_val = tuple([float(img_fill_val)] * 3)
+        elif isinstance(img_fill_val, tuple):
+            assert len(img_fill_val) == 3, 'img_fill_val as tuple must '\
+                f'have 3 elements. got {len(img_fill_val)}.'
+            img_fill_val = tuple([float(val) for val in img_fill_val])
+        else:
+            raise ValueError(
+                'img_fill_val must be float or tuple with 3 elements.')
+        assert np.all([0 <= val <= 255 for val in img_fill_val]), \
+            'all elements of img_fill_val should between range [0,255]. '\
+            f'got {img_fill_val}.'
+        assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. '\
+            f'got {prob}.'
+        assert isinstance(max_rotate_angle, (int, float)), 'max_rotate_angle '\
+            f'should be type int or float. got type {type(max_rotate_angle)}.'
+        self.level = level
+        self.scale = scale
+        # Rotation angle in degrees. Positive values mean
+        # clockwise rotation.
+        self.angle = level_to_value(level, max_rotate_angle)
+        self.center = center
+        self.img_fill_val = img_fill_val
+        self.seg_ignore_label = seg_ignore_label
+        self.prob = prob
+        self.max_rotate_angle = max_rotate_angle
+        self.random_negative_prob = random_negative_prob
+
+    def _rotate_img(self, results, angle, center=None, scale=1.0):
+        """Rotate every image field in place and refresh ``img_shape``.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+            angle (float): Rotation angle in degrees, positive values
+                mean clockwise rotation. Same in ``mmcv.imrotate``.
+            center (tuple[float], optional): Center point (w, h) of the
+                rotation. Same in ``mmcv.imrotate``.
+            scale (int | float): Isotropic scale factor. Same in
+                ``mmcv.imrotate``.
+        """
+        for key in results.get('img_fields', ['img']):
+            img = results[key].copy()
+            img_rotated = mmcv.imrotate(
+                img, angle, center, scale, border_value=self.img_fill_val)
+            results[key] = img_rotated.astype(img.dtype)
+            results['img_shape'] = results[key].shape
+
+    def _rotate_bboxes(self, results, rotate_matrix):
+        """Rotate bbox corners and take their axis-aligned envelope."""
+        h, w = results['img_shape'][:2]  # a channel axis is not required
+        for key in results.get('bbox_fields', []):
+            min_x, min_y, max_x, max_y = np.split(
+                results[key], results[key].shape[-1], axis=-1)
+            coordinates = np.stack([[min_x, min_y], [max_x, min_y],
+                                    [min_x, max_y],
+                                    [max_x, max_y]])  # [4, 2, nb_bbox, 1]
+            # pad 1 to convert from format [x, y] to homogeneous
+            # coordinates format [x, y, 1]
+            coordinates = np.concatenate(
+                (coordinates,
+                 np.ones((4, 1, coordinates.shape[2], 1), coordinates.dtype)),
+                axis=1)  # [4, 3, nb_bbox, 1]
+            coordinates = coordinates.transpose(
+                (2, 0, 1, 3))  # [nb_bbox, 4, 3, 1]
+            rotated_coords = np.matmul(rotate_matrix,
+                                       coordinates)  # [nb_bbox, 4, 2, 1]
+            rotated_coords = rotated_coords[..., 0]  # [nb_bbox, 4, 2]
+            min_x, min_y = np.min(
+                rotated_coords[:, :, 0], axis=1), np.min(
+                    rotated_coords[:, :, 1], axis=1)
+            max_x, max_y = np.max(
+                rotated_coords[:, :, 0], axis=1), np.max(
+                    rotated_coords[:, :, 1], axis=1)
+            min_x, min_y = np.clip(
+                min_x, a_min=0, a_max=w), np.clip(
+                    min_y, a_min=0, a_max=h)
+            max_x, max_y = np.clip(
+                max_x, a_min=min_x, a_max=w), np.clip(
+                    max_y, a_min=min_y, a_max=h)
+            results[key] = np.stack([min_x, min_y, max_x, max_y],
+                                    axis=-1).astype(results[key].dtype)
+
+    def _rotate_masks(self,
+                      results,
+                      angle,
+                      center=None,
+                      scale=1.0,
+                      fill_val=0):
+        """Rotate every mask field to the current image height and width."""
+        h, w = results['img_shape'][:2]  # a channel axis is not required
+        for key in results.get('mask_fields', []):
+            masks = results[key]
+            results[key] = masks.rotate((h, w), angle, center, scale, fill_val)
+
+    def _rotate_seg(self,
+                    results,
+                    angle,
+                    center=None,
+                    scale=1.0,
+                    fill_val=255):
+        """Rotate the segmentation map, filling the border with fill_val."""
+        for key in results.get('seg_fields', []):
+            seg = results[key].copy()
+            results[key] = mmcv.imrotate(
+                seg, angle, center, scale,
+                border_value=fill_val).astype(seg.dtype)
+
+    def _filter_invalid(self, results, min_bbox_size=0):
+        """Filter bboxes and corresponding masks too small after rotate
+        augmentation."""
+        bbox2label, bbox2mask, _ = bbox2fields()
+        for key in results.get('bbox_fields', []):
+            bbox_w = results[key][:, 2] - results[key][:, 0]
+            bbox_h = results[key][:, 3] - results[key][:, 1]
+            valid_inds = (bbox_w > min_bbox_size) & (bbox_h > min_bbox_size)
+            valid_inds = np.nonzero(valid_inds)[0]
+            results[key] = results[key][valid_inds]
+            # label fields. e.g. gt_labels and gt_labels_ignore
+            label_key = bbox2label.get(key)
+            if label_key in results:
+                results[label_key] = results[label_key][valid_inds]
+            # mask fields, e.g. gt_masks and gt_masks_ignore
+            mask_key = bbox2mask.get(key)
+            if mask_key in results:
+                results[mask_key] = results[mask_key][valid_inds]
+
+    def __call__(self, results):
+        """Call function to rotate images, bounding boxes, masks and semantic
+        segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Rotated results.
+        """
+        if np.random.rand() > self.prob:
+            return results
+        h, w = results['img'].shape[:2]
+        center = self.center
+        if center is None:
+            center = ((w - 1) * 0.5, (h - 1) * 0.5)
+        angle = random_negative(self.angle, self.random_negative_prob)
+        self._rotate_img(results, angle, center, self.scale)
+        rotate_matrix = cv2.getRotationMatrix2D(center, -angle, self.scale)
+        self._rotate_bboxes(results, rotate_matrix)
+        self._rotate_masks(results, angle, center, self.scale, fill_val=0)
+        self._rotate_seg(
+            results, angle, center, self.scale, fill_val=self.seg_ignore_label)
+        self._filter_invalid(results)
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(level={self.level}, '
+        repr_str += f'scale={self.scale}, '
+        repr_str += f'center={self.center}, '
+        repr_str += f'img_fill_val={self.img_fill_val}, '
+        repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
+        repr_str += f'prob={self.prob}, '
+        repr_str += f'max_rotate_angle={self.max_rotate_angle}, '
+        repr_str += f'random_negative_prob={self.random_negative_prob})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class Translate:
+    """Translate the images, bboxes, masks and segmentation maps horizontally
+    or vertically.
+
+    Args:
+        level (int | float): The level for Translate and should be in
+            range [0,_MAX_LEVEL].
+        prob (float): The probability for performing translation and
+            should be in range [0, 1].
+        img_fill_val (int | float | tuple): The filled value for image
+            border. If float, the same fill value will be used for all
+            the three channels of image. If tuple, the should be 3
+            elements (e.g. equals the number of channels for image).
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Default 255.
+        direction (str): The translate direction, either "horizontal"
+            or "vertical".
+        max_translate_offset (int | float): The maximum pixel's offset for
+            Translate.
+        random_negative_prob (float): The probability that turns the
+            offset negative.
+        min_size (int | float): The minimum pixel for filtering
+            invalid bboxes after the translation.
+    """
+
+    def __init__(self,
+                 level,
+                 prob=0.5,
+                 img_fill_val=128,
+                 seg_ignore_label=255,
+                 direction='horizontal',
+                 max_translate_offset=250.,
+                 random_negative_prob=0.5,
+                 min_size=0):
+        # Checks run in order: level, prob, fill value, direction, offset.
+        assert isinstance(level, (int, float)), \
+            'The level must be type int or float.'
+        assert 0 <= level <= _MAX_LEVEL, \
+            'The level used for calculating Translate\'s offset should be ' \
+            'in range [0,_MAX_LEVEL]'
+        assert 0 <= prob <= 1.0, \
+            'The probability of translation should be in range [0, 1].'
+        # Normalise the fill value into a 3-tuple of floats.
+        if isinstance(img_fill_val, tuple):
+            assert len(img_fill_val) == 3, \
+                'img_fill_val as tuple must have 3 elements.'
+            fill = tuple(float(item) for item in img_fill_val)
+        elif isinstance(img_fill_val, (float, int)):
+            fill = (float(img_fill_val), ) * 3
+        else:
+            raise ValueError('img_fill_val must be type float or tuple.')
+        assert all(0 <= item <= 255 for item in fill), \
+            'all elements of img_fill_val should between range [0,255].'
+        assert direction in ('horizontal', 'vertical'), \
+            'direction should be "horizontal" or "vertical".'
+        assert isinstance(max_translate_offset, (int, float)), \
+            'The max_translate_offset must be type int or float.'
+        self.offset = int(level_to_value(level, max_translate_offset))
+        self.level, self.prob = level, prob
+        self.img_fill_val = fill
+        self.seg_ignore_label = seg_ignore_label
+        self.direction = direction
+        self.max_translate_offset = max_translate_offset
+        self.random_negative_prob = random_negative_prob
+        self.min_size = min_size
+
+    def _translate_img(self, results, offset, direction='horizontal'):
+        """Shift every image field in place and refresh ``img_shape``.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+            offset (int | float): Amount handed to ``mmcv.imtranslate``.
+            direction (str): "horizontal" or "vertical".
+        """
+        for field in results.get('img_fields', ['img']):
+            source = results[field].copy()
+            shifted = mmcv.imtranslate(source, offset, direction,
+                                       self.img_fill_val)
+            results[field] = shifted.astype(source.dtype)
+            results['img_shape'] = results[field].shape
+
+    def _translate_bboxes(self, results, offset):
+        """Move boxes by ``offset`` along ``self.direction`` and clip them.
+
+        Shifted coordinates are clamped to ``[0, w]`` / ``[0, h]`` taken from
+        ``results['img_shape']``; nothing is removed here.
+        """
+        img_h, img_w, _ = results['img_shape']
+        for field in results.get('bbox_fields', []):
+            boxes = results[field]
+            x1, y1, x2, y2 = np.split(boxes, boxes.shape[-1], axis=-1)
+            if self.direction == 'horizontal':
+                x1 = np.maximum(0, x1 + offset)
+                x2 = np.minimum(img_w, x2 + offset)
+            elif self.direction == 'vertical':
+                y1 = np.maximum(0, y1 + offset)
+                y2 = np.minimum(img_h, y2 + offset)
+            results[field] = np.concatenate([x1, y1, x2, y2], axis=-1)
+
+    def _translate_masks(self,
+                         results,
+                         offset,
+                         direction='horizontal',
+                         fill_val=0):
+        """Replace each mask field by its ``translate()`` result."""
+        img_h, img_w, _ = results['img_shape']
+        for field in results.get('mask_fields', []):
+            results[field] = results[field].translate(
+                (img_h, img_w), offset, direction, fill_val)
+
+    def _translate_seg(self,
+                       results,
+                       offset,
+                       direction='horizontal',
+                       fill_val=255):
+        """Shift each segmentation map, keeping its original dtype."""
+        for field in results.get('seg_fields', []):
+            seg_map = results[field].copy()
+            moved = mmcv.imtranslate(seg_map, offset, direction, fill_val)
+            results[field] = moved.astype(seg_map.dtype)
+
+    def _filter_invalid(self, results, min_size=0):
+        """Drop boxes not wider and taller than ``min_size``.
+
+        Matching label and mask fields are indexed with the same kept rows.
+        """
+        label_of, mask_of, _ = bbox2fields()
+        for field in results.get('bbox_fields', []):
+            boxes = results[field]
+            widths = boxes[:, 2] - boxes[:, 0]
+            heights = boxes[:, 3] - boxes[:, 1]
+            # strict comparison: a side equal to ``min_size`` is rejected
+            keep = np.nonzero((widths > min_size) & (heights > min_size))[0]
+            results[field] = boxes[keep]
+            # prune the companion label field first, then the mask field
+            for companion in (label_of.get(field), mask_of.get(field)):
+                if companion in results:
+                    results[companion] = results[companion][keep]
+        return results
+
+    def __call__(self, results):
+        """Randomly translate the image together with its annotations.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: ``results``, unchanged when the random draw exceeds ``prob``.
+        """
+        if np.random.rand() > self.prob:
+            return results
+        direction = self.direction
+        # one signed offset is shared by image, boxes, masks and seg maps
+        offset = random_negative(self.offset, self.random_negative_prob)
+        self._translate_img(results, offset, direction)
+        self._translate_bboxes(results, offset)
+        # masks rely on the default ``fill_val`` of ``_translate_masks``
+        self._translate_masks(results, offset, direction)
+        # uncovered regions of the segmentation map get ``seg_ignore_label``
+        self._translate_seg(
+            results, offset, direction, fill_val=self.seg_ignore_label)
+        self._filter_invalid(results, min_size=self.min_size)
+        return results
+
+
+@PIPELINES.register_module()
+class ColorTransform:
+    """Randomly adjust the color of the image fields.
+
+    Bboxes, masks and segmentations are left untouched.
+
+    Args:
+        level (int | float): Should be in range [0,_MAX_LEVEL].
+        prob (float): The probability for performing Color transformation.
+    """
+
+    def __init__(self, level, prob=0.5):
+        assert isinstance(level, (int, float)), \
+            'The level must be type int or float.'
+        assert 0 <= level <= _MAX_LEVEL, \
+            'The level should be in range [0,_MAX_LEVEL].'
+        assert 0 <= prob <= 1.0, \
+            'The probability should be in range [0,1].'
+        self.level, self.prob = level, prob
+        self.factor = enhance_level_to_value(level)
+
+    def _adjust_color_img(self, results, factor=1.0):
+        """Run ``mmcv.adjust_color`` on every image field, keeping dtype."""
+        # NOTE defaultly the image should be BGR format
+        for field in results.get('img_fields', ['img']):
+            original = results[field]
+            adjusted = mmcv.adjust_color(original, factor)
+            results[field] = adjusted.astype(original.dtype)
+
+    def __call__(self, results):
+        """Apply the color adjustment to ``results`` at random.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Colored results.
+        """
+        # a random draw above ``prob`` leaves the input untouched
+        if np.random.rand() > self.prob:
+            return results
+        self._adjust_color_img(results, self.factor)
+        return results
+
+    def __repr__(self):
+        cls_name = self.__class__.__name__
+        return f'{cls_name}(level={self.level}, prob={self.prob})'
+
+
+@PIPELINES.register_module()
+class EqualizeTransform:
+    """Apply Equalize transformation to image. The bboxes, masks and
+    segmentations are not modified.
+
+    Args:
+        prob (float): The probability for performing Equalize transformation.
+    """
+
+    def __init__(self, prob=0.5):
+        assert 0 <= prob <= 1.0, \
+            'The probability should be in range [0,1].'
+        self.prob = prob
+
+    def _imequalize(self, results):
+        """Equalize the histogram of every image field, keeping its dtype."""
+        for key in results.get('img_fields', ['img']):
+            img = results[key]
+            results[key] = mmcv.imequalize(img).astype(img.dtype)
+
+    def __call__(self, results):
+        """Call function for Equalize transformation.
+
+        Args:
+            results (dict): Results dict from loading pipeline.
+
+        Returns:
+            dict: Results after the transformation.
+        """
+        if np.random.rand() > self.prob:
+            return results
+        self._imequalize(results)
+        return results
+
+    def __repr__(self):
+        # Must return the string: a ``__repr__`` yielding None raises TypeError.
+        return f'{self.__class__.__name__}(prob={self.prob})'
+
+
+@PIPELINES.register_module()
+class BrightnessTransform:
+    """Randomly change the brightness of the image fields.
+
+    Bboxes, masks and segmentations are left untouched.
+
+    Args:
+        level (int | float): Should be in range [0,_MAX_LEVEL].
+        prob (float): The probability for performing Brightness
+            transformation.
+    """
+
+    def __init__(self, level, prob=0.5):
+        assert isinstance(level, (int, float)), \
+            'The level must be type int or float.'
+        assert 0 <= level <= _MAX_LEVEL, \
+            'The level should be in range [0,_MAX_LEVEL].'
+        assert 0 <= prob <= 1.0, \
+            'The probability should be in range [0,1].'
+        self.level, self.prob = level, prob
+        self.factor = enhance_level_to_value(level)
+
+    def _adjust_brightness_img(self, results, factor=1.0):
+        """Run ``mmcv.adjust_brightness`` on every image field."""
+        for field in results.get('img_fields', ['img']):
+            original = results[field]
+            adjusted = mmcv.adjust_brightness(original, factor)
+            results[field] = adjusted.astype(original.dtype)
+
+    def __call__(self, results):
+        """Apply the brightness adjustment to ``results`` at random.
+
+        Args:
+            results (dict): Results dict from loading pipeline.
+
+        Returns:
+            dict: Results after the transformation.
+        """
+        # a random draw above ``prob`` leaves the input untouched
+        if np.random.rand() > self.prob:
+            return results
+        self._adjust_brightness_img(results, self.factor)
+        return results
+
+    def __repr__(self):
+        cls_name = self.__class__.__name__
+        return f'{cls_name}(level={self.level}, prob={self.prob})'
+
+
+@PIPELINES.register_module()
+class ContrastTransform:
+    """Randomly change the contrast of the image fields.
+
+    Bboxes, masks and segmentations are left untouched.
+
+    Args:
+        level (int | float): Should be in range [0,_MAX_LEVEL].
+        prob (float): The probability for performing Contrast transformation.
+    """
+
+    def __init__(self, level, prob=0.5):
+        assert isinstance(level, (int, float)), \
+            'The level must be type int or float.'
+        assert 0 <= level <= _MAX_LEVEL, \
+            'The level should be in range [0,_MAX_LEVEL].'
+        assert 0 <= prob <= 1.0, \
+            'The probability should be in range [0,1].'
+        self.level, self.prob = level, prob
+        self.factor = enhance_level_to_value(level)
+
+    def _adjust_contrast_img(self, results, factor=1.0):
+        """Run ``mmcv.adjust_contrast`` on every image field."""
+        for field in results.get('img_fields', ['img']):
+            original = results[field]
+            adjusted = mmcv.adjust_contrast(original, factor)
+            results[field] = adjusted.astype(original.dtype)
+
+    def __call__(self, results):
+        """Apply the contrast adjustment to ``results`` at random.
+
+        Args:
+            results (dict): Results dict from loading pipeline.
+
+        Returns:
+            dict: Results after the transformation.
+        """
+        # a random draw above ``prob`` leaves the input untouched
+        if np.random.rand() > self.prob:
+            return results
+        self._adjust_contrast_img(results, self.factor)
+        return results
+
+    def __repr__(self):
+        cls_name = self.__class__.__name__
+        return f'{cls_name}(level={self.level}, prob={self.prob})'
diff --git a/mmdet/datasets/pipelines/compose.py b/mmdet/datasets/pipelines/compose.py
new file mode 100755
index 0000000..d759220
--- /dev/null
+++ b/mmdet/datasets/pipelines/compose.py
@@ -0,0 +1,55 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import collections
+
+from mmcv.utils import build_from_cfg
+
+from ..builder import PIPELINES
+
+
+@PIPELINES.register_module()
+class Compose:
+    """Chain several transforms and run them one after another.
+
+    Args:
+        transforms (Sequence[dict | callable]): Sequence of transform object or
+            config dict to be composed.
+    """
+
+    def __init__(self, transforms):
+        assert isinstance(transforms, collections.abc.Sequence)
+        self.transforms = []
+        for item in transforms:
+            # a dict is a config that is built through the registry;
+            # anything else has to be callable already
+            if isinstance(item, dict):
+                item = build_from_cfg(item, PIPELINES)
+            elif not callable(item):
+                raise TypeError('transform must be callable or a dict')
+            self.transforms.append(item)
+
+    def __call__(self, data):
+        """Feed ``data`` through every transform in order.
+
+        Args:
+            data (dict): A result dict contains the data to transform.
+
+        Returns:
+           dict: Transformed data, or None as soon as a transform returns
+               None.
+        """
+        for transform in self.transforms:
+            data = transform(data)
+            if data is None:
+                return None
+        return data
+
+    def __repr__(self):
+        pieces = [self.__class__.__name__ + '(']
+        for transform in self.transforms:
+            text = transform.__repr__()
+            # indent nested Compose output one extra level
+            if 'Compose(' in text:
+                text = text.replace('\n', '\n    ')
+            pieces.append(f'\n    {text}')
+        pieces.append('\n)')
+        return ''.join(pieces)
diff --git a/mmdet/datasets/pipelines/formating.py b/mmdet/datasets/pipelines/formating.py
new file mode 100755
index 0000000..3b3e45a
--- /dev/null
+++ b/mmdet/datasets/pipelines/formating.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# flake8: noqa
+import warnings
+
+from .formatting import *
+
+warnings.warn('DeprecationWarning: mmdet.datasets.pipelines.formating will be '
+              'deprecated, please replace it with '
+              'mmdet.datasets.pipelines.formatting.')  # no category given -> UserWarning
diff --git a/mmdet/datasets/pipelines/formatting.py b/mmdet/datasets/pipelines/formatting.py
new file mode 100755
index 0000000..2e07f38
--- /dev/null
+++ b/mmdet/datasets/pipelines/formatting.py
@@ -0,0 +1,403 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections.abc import Sequence
+
+import mmcv
+import numpy as np
+import torch
+from mmcv.parallel import DataContainer as DC
+
+from ..builder import PIPELINES
+
+
+def to_tensor(data):
+    """Turn a supported python/numpy object into a :obj:`torch.Tensor`.
+
+    Tensors pass through unchanged, arrays go through ``torch.from_numpy``,
+    non-string sequences through ``torch.tensor``; an ``int`` or ``float``
+    becomes a one-element ``LongTensor`` / ``FloatTensor``.
+    A ``TypeError`` is raised for anything else.
+
+    Args:
+        data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
+            be converted.
+    """
+    if isinstance(data, torch.Tensor):
+        return data
+    if isinstance(data, np.ndarray):
+        return torch.from_numpy(data)
+    if isinstance(data, Sequence) and not mmcv.is_str(data):
+        return torch.tensor(data)
+    if isinstance(data, int):
+        return torch.LongTensor([data])
+    if isinstance(data, float):
+        return torch.FloatTensor([data])
+    raise TypeError(f'type {type(data)} cannot be converted to tensor.')
+
+
+@PIPELINES.register_module()
+class ToTensor:
+    """Run :func:`to_tensor` on the selected entries of the result dict.
+
+    Args:
+        keys (Sequence[str]): Keys that need to be converted to Tensor.
+    """
+
+    def __init__(self, keys):
+        self.keys = keys
+
+    def __call__(self, results):
+        """Replace ``results[key]`` by a tensor for each configured key.
+
+        Args:
+            results (dict): Result dict contains the data to convert.
+
+        Returns:
+            dict: The same dict with the listed entries converted.
+        """
+        for name in self.keys:
+            value = results[name]
+            results[name] = to_tensor(value)
+        return results
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}(keys={self.keys})'
+
+
+@PIPELINES.register_module()
+class ImageToTensor:
+    """Convert image to :obj:`torch.Tensor` by given keys.
+
+    An (H, W, C) input is permuted to (C, H, W). An input with fewer than
+    three dimensions first gets a trailing axis, so (H, W) becomes
+    (1, H, W).
+
+    Args:
+        keys (Sequence[str]): Key of images to be converted to Tensor.
+    """
+
+    def __init__(self, keys):
+        self.keys = keys
+
+    def __call__(self, results):
+        """Convert the configured images to channel-first tensors.
+
+        Args:
+            results (dict): Result dict contains the image data to convert.
+
+        Returns:
+            dict: The same dict, images permuted to (C, H, W) order.
+        """
+        for name in self.keys:
+            image = results[name]
+            # give 2-D input a trailing channel axis before permuting
+            if len(image.shape) < 3:
+                image = np.expand_dims(image, -1)
+            tensor = to_tensor(image)
+            results[name] = tensor.permute(2, 0, 1).contiguous()
+        return results
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}(keys={self.keys})'
+
+
+@PIPELINES.register_module()
+class Transpose:
+    """Transpose some results by given keys.
+
+    Args:
+        keys (Sequence[str]): Keys of results to be transposed.
+        order (Sequence[int]): Axis order passed to each value's
+            ``transpose`` method.
+    """
+
+    def __init__(self, keys, order):
+        self.keys, self.order = keys, order
+
+    def __call__(self, results):
+        """Call ``.transpose(self.order)`` on each configured entry.
+
+        Args:
+            results (dict): Result dict contains the data to transpose.
+
+        Returns:
+            dict: The same dict with the listed entries transposed.
+        """
+        order = self.order
+        for name in self.keys:
+            results[name] = results[name].transpose(order)
+        return results
+
+    def __repr__(self):
+        cls_name = self.__class__.__name__
+        return f'{cls_name}(keys={self.keys}, order={self.order})'
+
+
+@PIPELINES.register_module()
+class ToDataContainer:
+    """Convert results to :obj:`mmcv.DataContainer` by given fields.
+
+    Args:
+        fields (Sequence[dict]): Each field is a dict like
+            ``dict(key='xxx', **kwargs)``. ``results['xxx']`` is wrapped
+            in a :obj:`mmcv.DataContainer` created with ``**kwargs``.
+            Default: ``(dict(key='img', stack=True), dict(key='gt_bboxes'),
+            dict(key='gt_labels'))``.
+    """
+
+    def __init__(self,
+                 fields=(dict(key='img', stack=True), dict(key='gt_bboxes'),
+                         dict(key='gt_labels'))):
+        self.fields = fields
+
+    def __call__(self, results):
+        """Wrap the configured entries in :obj:`mmcv.DataContainer`.
+
+        Args:
+            results (dict): Result dict contains the data to convert.
+
+        Returns:
+            dict: The same dict, each listed entry replaced by a
+                :obj:`mmcv.DataContainer` built with that field's extra
+                keyword arguments.
+        """
+        for spec in self.fields:
+            # copy so that popping ``key`` leaves the stored config intact
+            kwargs = spec.copy()
+            name = kwargs.pop('key')
+            results[name] = DC(results[name], **kwargs)
+        return results
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}(fields={self.fields})'
+
+
+@PIPELINES.register_module()
+class DefaultFormatBundle:
+    """Default formatting bundle.
+
+    It simplifies the pipeline of formatting common fields, including "img",
+    "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg".
+    Each field that is present is handled as follows.
+
+    - img: (1)transpose & to tensor, (2)to DataContainer (stack=True)
+    - proposals: (1)to tensor, (2)to DataContainer
+    - gt_bboxes: (1)to tensor, (2)to DataContainer
+    - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
+    - gt_labels: (1)to tensor, (2)to DataContainer
+    - gt_masks: to DataContainer (cpu_only=True)
+    - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \
+                       (3)to DataContainer (stack=True)
+
+    Args:
+        img_to_float (bool): Whether to force the image to be converted to
+            float type. Default: True.
+        pad_val (dict): A dict for padding value in batch collating,
+            the default value is `dict(img=0, masks=0, seg=255)`.
+            Without this argument, the padding value of "gt_semantic_seg"
+            will be set to 0 by default, which should be 255.
+    """
+
+    def __init__(self,
+                 img_to_float=True,
+                 pad_val=dict(img=0, masks=0, seg=255)):
+        self.pad_val = pad_val
+        self.img_to_float = img_to_float
+
+    def __call__(self, results):
+        """Format the common fields of ``results`` for batch collation.
+
+        Fields that are absent from ``results`` are skipped.
+
+        Args:
+            results (dict): Result dict contains the data to convert.
+
+        Returns:
+            dict: The result dict contains the data that is formatted with
+                default bundle.
+        """
+        if 'img' in results:
+            image = results['img']
+            # Images are normally still uint8 when no normalization ran
+            # before this step; they are cast to float32 here, otherwise
+            # model training and inference will be wrong. Only used for
+            # YOLOX currently.
+            if self.img_to_float is True and image.dtype == np.uint8:
+                image = image.astype(np.float32)
+            # fill in meta keys that earlier transforms may not have set
+            results = self._add_default_meta_keys(results)
+            if len(image.shape) < 3:
+                image = np.expand_dims(image, -1)
+            # Two HWC -> CHW routes, chosen to improve the computational
+            # speed by 3-5 times (refer to
+            # https://github.com/open-mmlab/mmdetection/pull/9533):
+            # a non-contiguous image uses `numpy.transpose()` followed by
+            # `numpy.ascontiguousarray()`, a contiguous one uses
+            # `torch.permute()` followed by `torch.contiguous()`.
+            if image.flags.c_contiguous:
+                tensor = to_tensor(image).permute(2, 0, 1).contiguous()
+            else:
+                chw = np.ascontiguousarray(image.transpose(2, 0, 1))
+                tensor = to_tensor(chw)
+            results['img'] = DC(
+                tensor, padding_value=self.pad_val['img'], stack=True)
+        # plain tensors in DataContainers with default options
+        for name in ('proposals', 'gt_bboxes', 'gt_bboxes_ignore',
+                     'gt_labels'):
+            if name in results:
+                results[name] = DC(to_tensor(results[name]))
+        if 'gt_masks' in results:
+            # masks are wrapped as they are, without tensor conversion
+            masks = results['gt_masks']
+            results['gt_masks'] = DC(
+                masks, padding_value=self.pad_val['masks'], cpu_only=True)
+        if 'gt_semantic_seg' in results:
+            seg = to_tensor(results['gt_semantic_seg'][None, ...])
+            results['gt_semantic_seg'] = DC(
+                seg, padding_value=self.pad_val['seg'], stack=True)
+        return results
+
+    def _add_default_meta_keys(self, results):
+        """Fill in meta keys that no earlier transform has provided.
+
+        `pad_shape`, `scale_factor` and `img_norm_cfg` receive defaults
+        (image shape, 1.0, zero mean / unit std with ``to_rgb=False``) to
+        cover pipelines without `Resize`, `Normalize` and `Pad`. Keys that
+        are already present are left as they are.
+
+        Args:
+            results (dict): Result dict contains the data to convert.
+
+        Returns:
+            results (dict): Updated result dict contains the data to convert.
+        """
+        shape = results['img'].shape
+        results.setdefault('pad_shape', shape)
+        results.setdefault('scale_factor', 1.0)
+        channels = shape[2] if len(shape) >= 3 else 1
+        default_norm_cfg = dict(
+            mean=np.zeros(channels, dtype=np.float32),
+            std=np.ones(channels, dtype=np.float32),
+            to_rgb=False)
+        results.setdefault('img_norm_cfg', default_norm_cfg)
+        return results
+
+    def __repr__(self):
+        cls_name = self.__class__.__name__
+        return f'{cls_name}(img_to_float={self.img_to_float})'
+
+
+@PIPELINES.register_module()
+class Collect:
+    """Collect data from the loader relevant to the specific task.
+
+    This is usually the last stage of the data loader pipeline. Typically keys
+    is set to some subset of "img", "proposals", "gt_bboxes",
+    "gt_bboxes_ignore", "gt_labels", and/or "gt_masks".
+
+    The "img_metas" item is always populated. Its contents are selected by
+    "meta_keys". By default this includes:
+
+        - "img_shape": shape of the image input to the network as a tuple \
+            (h, w, c).  Note that images may be zero padded on the \
+            bottom/right if the batch tensor is larger than this shape.
+
+        - "scale_factor": a float indicating the preprocessing scale
+
+        - "flip": a boolean indicating if image flip transform was used
+
+        - "filename": path to the image file
+
+        - "ori_shape": original shape of the image as a tuple (h, w, c)
+
+        - "pad_shape": image shape after padding
+
+        - "img_norm_cfg": a dict of normalization information:
+
+            - mean - per channel mean subtraction
+            - std - per channel std divisor
+            - to_rgb - bool indicating if bgr was converted to rgb
+
+    Args:
+        keys (Sequence[str]): Keys of results to be collected in ``data``.
+        meta_keys (Sequence[str], optional): Meta keys to be converted to
+            ``mmcv.DataContainer`` and collected in ``data[img_metas]``.
+            Default: ``('filename', 'ori_filename', 'ori_shape', 'img_shape',
+            'pad_shape', 'scale_factor', 'flip', 'flip_direction',
+            'img_norm_cfg')``
+    """
+
+    def __init__(self,
+                 keys,
+                 meta_keys=('filename', 'ori_filename', 'ori_shape',
+                            'img_shape', 'pad_shape', 'scale_factor', 'flip',
+                            'flip_direction', 'img_norm_cfg')):
+        self.meta_keys = meta_keys
+        self.keys = keys
+
+    def __call__(self, results):
+        """Gather the requested entries and the image meta information.
+
+        The values of ``meta_keys`` are copied into one dict that is wrapped
+        in a CPU-only :obj:`mmcv.DataContainer` and stored as ``img_metas``.
+        The values of ``keys`` are then passed through unchanged.
+
+        Args:
+            results (dict): Result dict contains the data to collect.
+
+        Returns:
+            dict: The result dict contains the following keys
+
+                - ``img_metas``
+                - keys in ``self.keys``
+        """
+        # a missing meta key or data key surfaces as ``KeyError``
+        img_meta = {key: results[key] for key in self.meta_keys}
+        data = {'img_metas': DC(img_meta, cpu_only=True)}
+        # ``img_metas`` goes in first, the plain data entries follow
+        data.update((key, results[key]) for key in self.keys)
+        return data
+
+    def __repr__(self):
+        cls_name = self.__class__.__name__
+        return f'{cls_name}(keys={self.keys}, meta_keys={self.meta_keys})'
+
+
+@PIPELINES.register_module()
+class WrapFieldsToLists:
+    """Wrap every value of the data dictionary in a list for evaluation.
+
+    This class can be used as a last step of a test or validation
+    pipeline for single image evaluation or inference.
+
+    Example:
+        >>> test_pipeline = [
+        >>>    dict(type='LoadImageFromFile'),
+        >>>    dict(type='Normalize',
+                    mean=[123.675, 116.28, 103.53],
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True),
+        >>>    dict(type='Pad', size_divisor=32),
+        >>>    dict(type='ImageToTensor', keys=['img']),
+        >>>    dict(type='Collect', keys=['img']),
+        >>>    dict(type='WrapFieldsToLists')
+        >>> ]
+    """
+
+    def __call__(self, results):
+        """Put every value of ``results`` into a single-element list.
+
+        The dict is modified in place.
+
+        Args:
+            results (dict): Result dict contains the data to wrap.
+
+        Returns:
+            dict: The same dict, each value ``v`` replaced by ``[v]``.
+        """
+        # snapshot the keys, then rewrite the entries of the same dict
+        for name in list(results):
+            results[name] = [results[name]]
+        return results
+
+    def __repr__(self):
+        return self.__class__.__name__ + '()'
diff --git a/mmdet/datasets/pipelines/instaboost.py b/mmdet/datasets/pipelines/instaboost.py
new file mode 100755
index 0000000..ca10c4c
--- /dev/null
+++ b/mmdet/datasets/pipelines/instaboost.py
@@ -0,0 +1,118 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+
+from ..builder import PIPELINES
+
+
+@PIPELINES.register_module()
+class InstaBoost:
+    """Data augmentation method in `InstaBoost: Boosting Instance
+    Segmentation Via Probability Map Guided Copy-Pasting
+    <https://arxiv.org/abs/1908.07801>`_.
+
+    Refer to https://github.com/GothicAi/Instaboost for implementation details.
+
+    Args:
+        action_candidate (tuple): Action candidates. "normal", "horizontal",
+            "vertical", "skip" are supported. Default: ('normal',
+            'horizontal', 'skip').
+        action_prob (tuple): Corresponding action probabilities. Should be
+            the same length as action_candidate. Default: (1, 0, 0).
+        scale (tuple): (min scale, max scale). Default: (0.8, 1.2).
+        dx (int): The maximum x-axis shift will be (instance width) / dx.
+            Default 15.
+        dy (int): The maximum y-axis shift will be (instance height) / dy.
+            Default 15.
+        theta (tuple): (min rotation degree, max rotation degree).
+            Default: (-1, 1).
+        color_prob (float): Probability of images for color augmentation.
+            Default 0.5.
+        hflag (bool): Whether to use heatmap guided. Default False.
+        aug_ratio (float): Probability of applying this transformation.
+            Default 0.5.
+    """
+
+    def __init__(self,
+                 action_candidate=('normal', 'horizontal', 'skip'),
+                 action_prob=(1, 0, 0),
+                 scale=(0.8, 1.2),
+                 dx=15,
+                 dy=15,
+                 theta=(-1, 1),
+                 color_prob=0.5,
+                 hflag=False,
+                 aug_ratio=0.5):
+        try:
+            import instaboostfast as instaboost
+        except ImportError:
+            message = ('Please run "pip install instaboostfast" to install '
+                       'instaboostfast first for instaboost augmentation.')
+            raise ImportError(message)
+        self.aug_ratio = aug_ratio
+        self.cfg = instaboost.InstaBoostConfig(
+            action_candidate, action_prob, scale, dx, dy, theta,
+            color_prob, hflag)
+
+    def _load_anns(self, results):
+        """Convert ``ann_info`` into a list of per-instance dicts.
+
+        Each dict holds ``category_id``, ``segmentation`` and ``bbox``;
+        boxes are changed from (x1, y1, x2, y2) to (x1, y1, w, h).
+        """
+        ann_info = results['ann_info']
+        labels = ann_info['labels']
+        masks = ann_info['masks']
+        bboxes = ann_info['bboxes']
+        anns = []
+        for idx in range(len(labels)):
+            label, bbox, mask = labels[idx], bboxes[idx], masks[idx]
+            x1, y1, x2, y2 = bbox
+            # corner format -> top-left corner plus width and height
+            anns.append(
+                dict(
+                    category_id=label,
+                    segmentation=mask,
+                    bbox=[x1, y1, x2 - x1, y2 - y1]))
+        return anns
+
+    def _parse_anns(self, results, anns, img):
+        """Write annotations and image back into ``results``.
+
+        Boxes return to (x1, y1, x2, y2); degenerate ones are skipped.
+        """
+        boxes, labels, masks = [], [], []
+        for ann in anns:
+            x1, y1, w, h = ann['bbox']
+            # TODO: more essential bug need to be fixed in instaboost
+            if w <= 0 or h <= 0:
+                continue
+            boxes.append([x1, y1, x1 + w, y1 + h])
+            labels.append(ann['category_id'])
+            masks.append(ann['segmentation'])
+        box_array = np.array(boxes, dtype=np.float32)
+        label_array = np.array(labels, dtype=np.int64)
+        results['ann_info'].update(
+            labels=label_array, bboxes=box_array, masks=masks)
+        results['img'] = img
+        return results
+
+    def __call__(self, results):
+        """Augment ``results`` with probability ``aug_ratio``."""
+        image = results['img']
+        ori_type = image.dtype
+        anns = self._load_anns(results)
+        ratio = self.aug_ratio
+        if np.random.choice([0, 1], p=[1 - ratio, ratio]):
+            try:
+                import instaboostfast as instaboost
+            except ImportError:
+                raise ImportError('Please run "pip install instaboostfast" '
+                                  'to install instaboostfast first.')
+            anns, image = instaboost.get_new_data(
+                anns, image.astype(np.uint8), self.cfg, background=None)
+        return self._parse_anns(results, anns, image.astype(ori_type))
+
+    def __repr__(self):
+        cls_name = self.__class__.__name__
+        details = f'(cfg={self.cfg}, aug_ratio={self.aug_ratio})'
+        return cls_name + details
diff --git a/mmdet/datasets/pipelines/loading.py b/mmdet/datasets/pipelines/loading.py
new file mode 100755
index 0000000..8af8cf3
--- /dev/null
+++ b/mmdet/datasets/pipelines/loading.py
@@ -0,0 +1,645 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+
+import mmcv
+import numpy as np
+import pycocotools.mask as maskUtils
+
+from mmdet.core import BitmapMasks, PolygonMasks
+from ..builder import PIPELINES
+
+try:
+    from panopticapi.utils import rgb2id
+except ImportError:
+    rgb2id = None
+
+
+@PIPELINES.register_module()
+class LoadImageFromFile:
+    """Load an image from file.
+
+    Required keys are "img_prefix" and "img_info" (a dict that must contain the
+    key "filename"). Added or updated keys are "filename", "ori_filename",
+    "img", "img_shape", "ori_shape" (same as `img_shape`) and "img_fields"
+    (always ``['img']``).
+
+    Args:
+        to_float32 (bool): Whether to convert the loaded image to a float32
+            numpy array. Defaults to False.
+        color_type (str): The flag argument for :func:`mmcv.imfrombytes`.
+            Defaults to 'color'.
+        channel_order (str): The channel_order argument for
+            :func:`mmcv.imfrombytes`. Defaults to 'bgr'.
+        file_client_args (dict): Arguments to instantiate a FileClient.
+            Defaults to ``dict(backend='disk')``.
+    """
+
+    def __init__(self,
+                 to_float32=False,
+                 color_type='color',
+                 channel_order='bgr',
+                 file_client_args=dict(backend='disk')):
+        # copy so the shared default dict is never mutated
+        self.file_client_args = file_client_args.copy()
+        self.file_client = None  # created lazily on first call
+        self.to_float32, self.color_type = to_float32, color_type
+        self.channel_order = channel_order
+
+    def __call__(self, results):
+        """Read the image referenced by ``results`` and record its meta info.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet.CustomDataset`.
+
+        Returns:
+            dict: The dict contains loaded image and meta information.
+        """
+
+        if self.file_client is None:
+            self.file_client = mmcv.FileClient(**self.file_client_args)
+
+        prefix = results['img_prefix']
+        ori_filename = results['img_info']['filename']
+        # a ``None`` prefix means the stored filename is used as-is
+        filename = ori_filename if prefix is None else osp.join(
+            prefix, ori_filename)
+
+        img = mmcv.imfrombytes(
+            self.file_client.get(filename),
+            flag=self.color_type,
+            channel_order=self.channel_order)
+        if self.to_float32:
+            img = img.astype(np.float32)
+
+        # publish the image together with its meta information
+        results.update(
+            filename=filename, ori_filename=ori_filename, img=img,
+            img_shape=img.shape, ori_shape=img.shape)
+        results['img_fields'] = ['img']
+        return results
+
+    def __repr__(self):
+        """Return a string listing the constructor arguments."""
+        return (f'{self.__class__.__name__}('
+                f'to_float32={self.to_float32}, '
+                f"color_type='{self.color_type}', "
+                f"channel_order='{self.channel_order}', "
+                f'file_client_args={self.file_client_args})')
+
+
+@PIPELINES.register_module()
+class LoadImageFromWebcam(LoadImageFromFile):
+    """Load an image from webcam.
+
+    Similar to :obj:`LoadImageFromFile`, but the frame is already present in
+    ``results['img']``, so nothing is read and both filename keys are None.
+    """
+
+    def __call__(self, results):
+        """Call functions to add image meta information.
+
+        Args:
+            results (dict): Result dict with Webcam read image in
+                ``results['img']``.
+
+        Returns:
+            dict: The dict contains loaded image and meta information.
+        """
+
+        frame = results['img']
+        if self.to_float32:
+            frame = frame.astype(np.float32)
+
+        # there is no source file for a webcam frame
+        results['filename'] = results['ori_filename'] = None
+        results['img'] = frame
+        results['img_shape'] = frame.shape
+        results['ori_shape'] = frame.shape
+        results['img_fields'] = ['img']
+        return results
+
+
+@PIPELINES.register_module()
+class LoadMultiChannelImageFromFiles:
+    """Load multi-channel images from a list of separate channel files.
+
+    Required keys are "img_prefix" and "img_info" (a dict that must contain the
+    key "filename", which is expected to be a list of filenames).
+    Added or updated keys are "filename", "ori_filename", "img", "img_shape",
+    "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`),
+    "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1).
+
+    Args:
+        to_float32 (bool): Whether to convert the stacked image to a float32
+            numpy array after the channels have been stacked.
+            Defaults to False.
+        color_type (str): The flag argument for :func:`mmcv.imfrombytes`.
+            Defaults to 'unchanged'.
+        file_client_args (dict): Arguments to instantiate a FileClient.
+            See :class:`mmcv.fileio.FileClient` for details.
+            Defaults to ``dict(backend='disk')``.
+    """
+
+    def __init__(self,
+                 to_float32=False,
+                 color_type='unchanged',
+                 file_client_args=dict(backend='disk')):
+        # copy so the shared default dict is never mutated
+        self.file_client_args = file_client_args.copy()
+        self.file_client = None  # created lazily on first call
+        self.to_float32, self.color_type = to_float32, color_type
+
+    def __call__(self, results):
+        """Call functions to load multiple images and get images meta
+        information.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet.CustomDataset`.
+
+        Returns:
+            dict: The dict contains loaded images and meta information.
+        """
+
+        if self.file_client is None:
+            self.file_client = mmcv.FileClient(**self.file_client_args)
+
+        prefix = results['img_prefix']
+        ori_filename = results['img_info']['filename']
+        if prefix is None:
+            filename = ori_filename
+        else:
+            filename = [osp.join(prefix, fname) for fname in ori_filename]
+
+        # decode every file separately, then stack them along a new last axis
+        channels = [
+            mmcv.imfrombytes(self.file_client.get(name), flag=self.color_type)
+            for name in filename
+        ]
+        img = np.stack(channels, axis=-1)
+        if self.to_float32:
+            img = img.astype(np.float32)
+
+        results['filename'] = filename
+        results['ori_filename'] = ori_filename
+        results['img'] = img
+        results['img_shape'] = img.shape
+        results['ori_shape'] = img.shape
+        # Set initial values for default meta_keys
+        results['pad_shape'] = img.shape
+        results['scale_factor'] = 1.0
+        num_channels = img.shape[2] if img.ndim >= 3 else 1
+        results['img_norm_cfg'] = dict(
+            mean=np.zeros(num_channels, dtype=np.float32),
+            std=np.ones(num_channels, dtype=np.float32),
+            to_rgb=False)
+        return results
+
+    def __repr__(self):
+        """Return a string listing the constructor arguments."""
+        return (f'{self.__class__.__name__}('
+                f'to_float32={self.to_float32}, '
+                f"color_type='{self.color_type}', "
+                f'file_client_args={self.file_client_args})')
+
+
+@PIPELINES.register_module()
+class LoadAnnotations:
+    """Load multiple types of annotations.
+
+    Args:
+        with_bbox (bool): Whether to parse and load the bbox annotation.
+            Default: True.
+        with_label (bool): Whether to parse and load the label annotation.
+            Default: True.
+        with_mask (bool): Whether to parse and load the mask annotation.
+            Default: False.
+        with_seg (bool): Whether to parse and load the semantic segmentation
+            annotation. Default: False.
+        poly2mask (bool): Whether to convert the instance masks from polygons
+            to bitmaps. Default: True.
+        denorm_bbox (bool): Whether to convert bbox from relative value to
+            absolute value. Only used in OpenImage Dataset.
+            Default: False.
+        file_client_args (dict): Arguments to instantiate a FileClient.
+            See :class:`mmcv.fileio.FileClient` for details.
+            Defaults to ``dict(backend='disk')``.
+    """
+
+    def __init__(self,
+                 with_bbox=True,
+                 with_label=True,
+                 with_mask=False,
+                 with_seg=False,
+                 poly2mask=True,
+                 denorm_bbox=False,
+                 file_client_args=dict(backend='disk')):
+        self.with_bbox = with_bbox
+        self.with_label = with_label
+        self.with_mask = with_mask
+        self.with_seg = with_seg
+        self.poly2mask = poly2mask
+        self.denorm_bbox = denorm_bbox
+        self.file_client_args = file_client_args.copy()
+        self.file_client = None
+
+    def _load_bboxes(self, results):
+        """Private function to load bounding box annotations.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet.CustomDataset`.
+
+        Returns:
+            dict: The dict contains loaded bounding box annotations.
+        """
+
+        ann_info = results['ann_info']
+        results['gt_bboxes'] = ann_info['bboxes'].copy()
+        # denorm: scale x by image width and y by image height, in place
+        if self.denorm_bbox:
+            bbox_num = results['gt_bboxes'].shape[0]
+            if bbox_num != 0:
+                h, w = results['img_shape'][:2]
+                results['gt_bboxes'][:, 0::2] *= w
+                results['gt_bboxes'][:, 1::2] *= h
+
+        gt_bboxes_ignore = ann_info.get('bboxes_ignore', None)
+        if gt_bboxes_ignore is not None:
+            results['gt_bboxes_ignore'] = gt_bboxes_ignore.copy()
+            results['bbox_fields'].append('gt_bboxes_ignore')
+        results['bbox_fields'].append('gt_bboxes')
+
+        gt_is_group_ofs = ann_info.get('gt_is_group_ofs', None)
+        if gt_is_group_ofs is not None:
+            results['gt_is_group_ofs'] = gt_is_group_ofs.copy()
+
+        return results
+
+    def _load_labels(self, results):
+        """Private function to load label annotations.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet.CustomDataset`.
+
+        Returns:
+            dict: The dict contains loaded label annotations.
+        """
+
+        results['gt_labels'] = results['ann_info']['labels'].copy()
+        return results
+
+    def _poly2mask(self, mask_ann, img_h, img_w):
+        """Private function to convert masks represented with polygon to
+        bitmaps.
+
+        Args:
+            mask_ann (list | dict): Polygon mask annotation input.
+            img_h (int): The height of output mask.
+            img_w (int): The width of output mask.
+
+        Returns:
+            numpy.ndarray: The decode bitmap mask of shape (img_h, img_w).
+        """
+
+        if isinstance(mask_ann, list):
+            # polygon -- a single object might consist of multiple parts
+            # we merge all parts into one mask rle code
+            rles = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+            rle = maskUtils.merge(rles)
+        elif isinstance(mask_ann['counts'], list):
+            # uncompressed RLE
+            rle = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+        else:
+            # rle
+            rle = mask_ann
+        mask = maskUtils.decode(rle)
+        return mask
+
+    def process_polygons(self, polygons):
+        """Convert polygons to list of ndarray and filter invalid polygons.
+
+        Args:
+            polygons (list[list]): Polygons of one instance.
+
+        Returns:
+            list[numpy.ndarray]: Processed polygons.
+        """
+
+        # a polygon is kept only if it holds an even number of values and
+        # at least six of them
+        return [
+            polygon for polygon in [np.array(p) for p in polygons]
+            if len(polygon) % 2 == 0 and len(polygon) >= 6
+        ]
+
+    def _load_masks(self, results):
+        """Private function to load mask annotations.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet.CustomDataset`.
+
+        Returns:
+            dict: The dict contains loaded mask annotations.
+                If ``self.poly2mask`` is set ``True``, `gt_masks` will contain
+                :obj:`BitmapMasks`. Otherwise, :obj:`PolygonMasks` is used.
+        """
+
+        h, w = results['img_info']['height'], results['img_info']['width']
+        gt_masks = results['ann_info']['masks']
+        if self.poly2mask:
+            gt_masks = BitmapMasks(
+                [self._poly2mask(mask, h, w) for mask in gt_masks], h, w)
+        else:
+            gt_masks = PolygonMasks(
+                [self.process_polygons(polygons) for polygons in gt_masks], h,
+                w)
+        results['gt_masks'] = gt_masks
+        results['mask_fields'].append('gt_masks')
+        return results
+
+    def _load_semantic_seg(self, results):
+        """Private function to load semantic segmentation annotations.
+
+        Args:
+            results (dict): Result dict from :obj:`dataset`.
+
+        Returns:
+            dict: The dict contains loaded semantic segmentation annotations.
+        """
+
+        if self.file_client is None:
+            self.file_client = mmcv.FileClient(**self.file_client_args)
+
+        filename = osp.join(results['seg_prefix'],
+                            results['ann_info']['seg_map'])
+        img_bytes = self.file_client.get(filename)
+        results['gt_semantic_seg'] = mmcv.imfrombytes(
+            img_bytes, flag='unchanged').squeeze()
+        results['seg_fields'].append('gt_semantic_seg')
+        return results
+
+    def __call__(self, results):
+        """Call function to load multiple types annotations.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet.CustomDataset`.
+
+        Returns:
+            dict: The dict contains loaded bounding box, label, mask and
+                semantic segmentation annotations.
+        """
+
+        if self.with_bbox:
+            results = self._load_bboxes(results)
+            if results is None:
+                return None
+        if self.with_label:
+            results = self._load_labels(results)
+        if self.with_mask:
+            results = self._load_masks(results)
+        if self.with_seg:
+            results = self._load_semantic_seg(results)
+        return results
+
+    def __repr__(self):
+        return (f'{self.__class__.__name__}('
+                f'with_bbox={self.with_bbox}, '
+                f'with_label={self.with_label}, '
+                f'with_mask={self.with_mask}, '
+                f'with_seg={self.with_seg}, '
+                f'poly2mask={self.poly2mask}, '
+                f'denorm_bbox={self.denorm_bbox}, '
+                f'file_client_args={self.file_client_args})')
+
+
+@PIPELINES.register_module()
+class LoadPanopticAnnotations(LoadAnnotations):
+    """Load multiple types of panoptic annotations.
+
+    Args:
+        with_bbox (bool): Whether to parse and load the bbox annotation.
+            Default: True.
+        with_label (bool): Whether to parse and load the label annotation.
+            Default: True.
+        with_mask (bool): Whether to parse and load the mask annotation.
+            Default: True.
+        with_seg (bool): Whether to parse and load the semantic segmentation
+            annotation. Default: True.
+        file_client_args (dict): Arguments to instantiate a FileClient.
+            See :class:`mmcv.fileio.FileClient` for details.
+            Defaults to ``dict(backend='disk')``.
+    """
+
+    def __init__(self,
+                 with_bbox=True,
+                 with_label=True,
+                 with_mask=True,
+                 with_seg=True,
+                 file_client_args=dict(backend='disk')):
+        if rgb2id is None:
+            raise RuntimeError(
+                'panopticapi is not installed, please install it by: '
+                'pip install git+https://github.com/cocodataset/'
+                'panopticapi.git.')
+        # poly2mask and denorm_bbox are fixed, not exposed to the caller
+        super().__init__(
+            with_bbox=with_bbox,
+            with_label=with_label,
+            with_mask=with_mask,
+            with_seg=with_seg,
+            poly2mask=True,
+            denorm_bbox=False,
+            file_client_args=file_client_args)
+
+    def _load_masks_and_semantic_segs(self, results):
+        """Private function to load mask and semantic segmentation annotations.
+
+        In gt_semantic_seg, the foreground label is from `0` to
+        `num_things - 1`, the background label is from `num_things` to
+        `num_things + num_stuff - 1`, 255 means the ignored label (`VOID`).
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet.CustomDataset`.
+
+        Returns:
+            dict: The dict contains loaded mask and semantic segmentation
+                annotations. `BitmapMasks` is used for mask annotations.
+        """
+
+        if self.file_client is None:
+            self.file_client = mmcv.FileClient(**self.file_client_args)
+
+        seg_path = osp.join(results['seg_prefix'],
+                            results['ann_info']['seg_map'])
+        # decode as RGB and let rgb2id turn each pixel into a single id value
+        pan_ids = rgb2id(
+            mmcv.imfrombytes(
+                self.file_client.get(seg_path), flag='color',
+                channel_order='rgb').squeeze())
+
+        thing_masks = []
+        sem_seg = np.zeros_like(pan_ids) + 255  # 255 as ignore
+
+        for seg_info in results['ann_info']['masks']:
+            seg_mask = pan_ids == seg_info['id']
+            sem_seg = np.where(seg_mask, seg_info['category'], sem_seg)
+            # The legal thing masks
+            if seg_info.get('is_thing'):
+                thing_masks.append(seg_mask.astype(np.uint8))
+
+        if self.with_mask:
+            img_info = results['img_info']
+            results['gt_masks'] = BitmapMasks(
+                thing_masks, img_info['height'], img_info['width'])
+            results['mask_fields'].append('gt_masks')
+
+        if self.with_seg:
+            results['gt_semantic_seg'] = sem_seg
+            results['seg_fields'].append('gt_semantic_seg')
+        return results
+
+    def __call__(self, results):
+        """Call function to load multiple types panoptic annotations.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet.CustomDataset`.
+
+        Returns:
+            dict: The dict contains loaded bounding box, label, mask and
+                semantic segmentation annotations.
+        """
+
+        if self.with_bbox:
+            results = self._load_bboxes(results)
+            if results is None:
+                return None
+        if self.with_label:
+            results = self._load_labels(results)
+        # One merged loader replaces '_load_masks' and '_load_semantic_seg'
+        # of LoadAnnotations.
+        needs_pixels = self.with_mask or self.with_seg
+        if needs_pixels:
+            results = self._load_masks_and_semantic_segs(results)
+        return results
+
+
+@PIPELINES.register_module()
+class LoadProposals:
+    """Load proposal pipeline.
+
+    Required key is "proposals". Updated keys are "proposals", "bbox_fields".
+    An empty proposal set is replaced by a single all-zero box.
+
+    Args:
+        num_max_proposals (int, optional): Maximum number of proposals to load.
+            If not specified, all proposals will be loaded.
+    """
+
+    def __init__(self, num_max_proposals=None):
+        self.num_max_proposals = num_max_proposals
+
+    def __call__(self, results):
+        """Trim ``results['proposals']`` to box coordinates and register it.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet.CustomDataset`.
+
+        Returns:
+            dict: The dict contains loaded proposal annotations.
+        """
+
+        boxes = results['proposals']
+        if boxes.shape[1] not in (4, 5):
+            raise AssertionError(
+                'proposals should have shapes (n, 4) or (n, 5), '
+                f'but found {boxes.shape}')
+        boxes = boxes[:, :4]
+        if self.num_max_proposals is not None:
+            boxes = boxes[:self.num_max_proposals]
+
+        if len(boxes) == 0:
+            boxes = np.array([[0, 0, 0, 0]], dtype=np.float32)
+        results['proposals'] = boxes
+        results['bbox_fields'].append('proposals')
+        return results
+
+    def __repr__(self):
+        name = self.__class__.__name__
+        return f'{name}(num_max_proposals={self.num_max_proposals})'
+
+
+@PIPELINES.register_module()
+class FilterAnnotations:
+    """Filter invalid annotations.
+
+    Args:
+        min_gt_bbox_wh (tuple[float]): Minimum width and height of ground truth
+            boxes. Default: (1., 1.)
+        min_gt_mask_area (int): Minimum foreground area of ground truth masks.
+            Default: 1
+        by_box (bool): Filter instances with bounding boxes not meeting the
+            min_gt_bbox_wh threshold. Default: True
+        by_mask (bool): Filter instances with masks not meeting
+            min_gt_mask_area threshold. Default: False
+        keep_empty (bool): Whether to return None when no instance is left
+            after filtering. Default: True
+    """
+
+    def __init__(self,
+                 min_gt_bbox_wh=(1., 1.),
+                 min_gt_mask_area=1,
+                 by_box=True,
+                 by_mask=False,
+                 keep_empty=True):
+        # TODO: add more filter options
+        assert by_box or by_mask
+        self.min_gt_bbox_wh = min_gt_bbox_wh
+        self.min_gt_mask_area = min_gt_mask_area
+        self.by_box = by_box
+        self.by_mask = by_mask
+        self.keep_empty = keep_empty
+
+    def __call__(self, results):
+        """Drop instances failing the enabled checks; may return None."""
+        if self.by_box:
+            assert 'gt_bboxes' in results
+            gt_bboxes = results['gt_bboxes']
+            instance_num = gt_bboxes.shape[0]
+        if self.by_mask:
+            assert 'gt_masks' in results
+            gt_masks = results['gt_masks']
+            instance_num = len(gt_masks)
+
+        if instance_num == 0:
+            return results
+
+        tests = []
+        if self.by_box:
+            w = gt_bboxes[:, 2] - gt_bboxes[:, 0]
+            h = gt_bboxes[:, 3] - gt_bboxes[:, 1]
+            tests.append((w > self.min_gt_bbox_wh[0])
+                         & (h > self.min_gt_bbox_wh[1]))
+        if self.by_mask:
+            tests.append(gt_masks.areas >= self.min_gt_mask_area)
+
+        keep = tests[0]
+        for t in tests[1:]:
+            keep = keep & t
+
+        keep = keep.nonzero()[0]
+
+        keys = ('gt_bboxes', 'gt_labels', 'gt_masks')
+        for key in keys:
+            if key in results:
+                results[key] = results[key][keep]
+        # NOTE: despite its name, ``keep_empty=True`` discards the sample
+        if keep.size == 0 and self.keep_empty:
+            return None
+        return results
+
+    def __repr__(self):
+        return self.__class__.__name__ + \
+            f'(min_gt_bbox_wh={self.min_gt_bbox_wh},' \
+            f'min_gt_mask_area={self.min_gt_mask_area},' \
+            f'by_box={self.by_box},' \
+            f'by_mask={self.by_mask},' \
+            f'keep_empty={self.keep_empty})'
diff --git a/mmdet/datasets/pipelines/test_time_aug.py b/mmdet/datasets/pipelines/test_time_aug.py
new file mode 100755
index 0000000..5f1ab7b
--- /dev/null
+++ b/mmdet/datasets/pipelines/test_time_aug.py
@@ -0,0 +1,121 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import mmcv
+
+from ..builder import PIPELINES
+from .compose import Compose
+
+
+@PIPELINES.register_module()
+class MultiScaleFlipAug:
+    """Test-time augmentation with multiple scales and flipping.
+
+    An example configuration is as follows:
+
+    .. code-block::
+
+        img_scale=[(1333, 400), (1333, 800)],
+        flip=True,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ]
+
+    After MultiScaleFlipAug with above configuration, the results are wrapped
+    into lists of the same length as follows:
+
+    .. code-block::
+
+        dict(
+            img=[...],
+            img_shape=[...],
+            scale=[(1333, 400), (1333, 400), (1333, 800), (1333, 800)]
+            flip=[False, True, False, True]
+            ...
+        )
+
+    Args:
+        transforms (list[dict]): Transforms to apply in each augmentation.
+        img_scale (tuple | list[tuple] | None): Images scales for resizing.
+        scale_factor (float | list[float] | None): Scale factors for resizing.
+        flip (bool): Whether apply flip augmentation. Default: False.
+        flip_direction (str | list[str]): Flip augmentation directions,
+            options are "horizontal", "vertical" and "diagonal". If
+            flip_direction is a list, multiple flip augmentations will be
+            applied. It has no effect when flip == False. Default:
+            "horizontal".
+    """
+
+    def __init__(self,
+                 transforms,
+                 img_scale=None,
+                 scale_factor=None,
+                 flip=False,
+                 flip_direction='horizontal'):
+        self.transforms = Compose(transforms)
+        assert (img_scale is None) ^ (scale_factor is None), (
+            'Must have but only one variable can be set')
+        if img_scale is not None:
+            if not isinstance(img_scale, list):
+                img_scale = [img_scale]
+            self.img_scale, self.scale_key = img_scale, 'scale'
+            assert mmcv.is_list_of(self.img_scale, tuple)
+        else:
+            if not isinstance(scale_factor, list):
+                scale_factor = [scale_factor]
+            self.img_scale, self.scale_key = scale_factor, 'scale_factor'
+
+        if not isinstance(flip_direction, list):
+            flip_direction = [flip_direction]
+        self.flip, self.flip_direction = flip, flip_direction
+        assert mmcv.is_list_of(self.flip_direction, str)
+        if not self.flip and self.flip_direction != ['horizontal']:
+            warnings.warn(
+                'flip_direction has no effect when flip is set to False')
+        if self.flip and not any(
+                t['type'] == 'RandomFlip' for t in transforms):
+            warnings.warn(
+                'flip has no effect when RandomFlip is not in transforms')
+
+    def __call__(self, results):
+        """Call function to apply test time augment transforms on results.
+
+        Args:
+            results (dict): Result dict contains the data to transform.
+
+        Returns:
+           dict[str: list]: The augmented data, where each value is wrapped
+               into a list.
+        """
+
+        flip_args = [(False, None)]
+        if self.flip:
+            flip_args.extend(
+                (True, direction) for direction in self.flip_direction)
+        aug_data = []
+        for scale in self.img_scale:
+            for flip, direction in flip_args:
+                # shallow copy: each variant gets its own top-level dict
+                _results = results.copy()
+                _results[self.scale_key] = scale
+                _results['flip'] = flip
+                _results['flip_direction'] = direction
+                aug_data.append(self.transforms(_results))
+        # list of dict to dict of list
+        aug_data_dict = {key: [] for key in aug_data[0]}
+        for data in aug_data:
+            for key, val in data.items():
+                aug_data_dict[key].append(val)
+        return aug_data_dict
+
+    def __repr__(self):
+        """Return a string listing transforms, scales and flip settings."""
+        return (f'{self.__class__.__name__}('
+                f'transforms={self.transforms}, '
+                f'img_scale={self.img_scale}, flip={self.flip}, '
+                f'flip_direction={self.flip_direction})')
diff --git a/mmdet/datasets/pipelines/transforms.py b/mmdet/datasets/pipelines/transforms.py
new file mode 100755
index 0000000..ca71593
--- /dev/null
+++ b/mmdet/datasets/pipelines/transforms.py
@@ -0,0 +1,3022 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import inspect
+import math
+import warnings
+
+import cv2
+import mmcv
+import numpy as np
+from numpy import random
+
+from mmdet.core import BitmapMasks, PolygonMasks, find_inside_bboxes
+from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
+from mmdet.utils import log_img_scale
+from ..builder import PIPELINES
+
+try:
+    from imagecorruptions import corrupt
+except ImportError:
+    corrupt = None
+
+try:
+    import albumentations
+    from albumentations import Compose
+except ImportError:
+    albumentations = None
+    Compose = None
+
+
+@PIPELINES.register_module()
+class Resize:
+    """Resize images & bbox & mask.
+
+    This transform resizes the input image to some scale. Bboxes and masks are
+    then resized with the same scale factor. If the input dict contains the key
+    "scale", then the scale in the input dict is used, otherwise the specified
+    scale in the init method is used. If the input dict contains the key
+    "scale_factor" (if MultiScaleFlipAug does not give img_scale but
+    scale_factor), the actual scale will be computed by image shape and
+    scale_factor.
+
+    `img_scale` can either be a tuple (single-scale) or a list of tuple
+    (multi-scale). There are 3 multiscale modes:
+
+    - ``ratio_range is not None``: randomly sample a ratio from the ratio \
+      range and multiply it with the image scale.
+    - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \
+      sample a scale from the multiscale range.
+    - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \
+      sample a scale from multiple scales.
+
+    Args:
+        img_scale (tuple or list[tuple]): Images scales for resizing.
+        multiscale_mode (str): Either "range" or "value".
+        ratio_range (tuple[float]): (min_ratio, max_ratio)
+        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
+            image.
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
+            These two backends generates slightly different results. Defaults
+            to 'cv2'.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend.
+        override (bool, optional): Whether to override `scale` and
+            `scale_factor` so as to call resize twice. Default False. If True,
+            after the first resizing, the existed `scale` and `scale_factor`
+            will be ignored so the second resizing can be allowed.
+            This option is a work-around for multiple times of resize in DETR.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 img_scale=None,
+                 multiscale_mode='range',
+                 ratio_range=None,
+                 keep_ratio=True,
+                 bbox_clip_border=True,
+                 backend='cv2',
+                 interpolation='bilinear',
+                 override=False):
+        if img_scale is None or isinstance(img_scale, list):
+            self.img_scale = img_scale
+        else:
+            # a single scale is wrapped so img_scale is always a list
+            self.img_scale = [img_scale]
+        if self.img_scale is not None:
+            assert mmcv.is_list_of(self.img_scale, tuple)
+
+        if ratio_range is not None:
+            # mode 1: given a scale and a range of image ratio; this needs
+            # exactly one base scale, so a missing img_scale is rejected too
+            assert self.img_scale is not None and len(self.img_scale) == 1
+        else:
+            # mode 2: given multiple scales or a range of scales
+            assert multiscale_mode in ['value', 'range']
+
+        self.backend = backend
+        self.multiscale_mode = multiscale_mode
+        self.ratio_range = ratio_range
+        self.keep_ratio = keep_ratio
+        self.interpolation = interpolation
+        # TODO: refactor the override option in Resize
+        self.override = override
+        self.bbox_clip_border = bbox_clip_border
+
+    @staticmethod
+    def random_select(img_scales):
+        """Pick one image scale uniformly at random from the candidates.
+
+        Args:
+            img_scales (list[tuple]): Images scales for selection.
+
+        Returns:
+            (tuple, int): Returns a tuple ``(img_scale, scale_idx)``, \
+                where ``img_scale`` is the selected image scale and \
+                ``scale_idx`` is the selected index in the given candidates.
+        """
+
+        assert mmcv.is_list_of(img_scales, tuple)
+        # randint excludes its upper bound, so the index is always valid
+        idx = np.random.randint(len(img_scales))
+        return img_scales[idx], idx
+
+    @staticmethod
+    def random_sample(img_scales):
+        """Sample a (long edge, short edge) scale from a range of scales.
+
+        Args:
+            img_scales (list[tuple]): Exactly two scales. Their long edges
+                bound the sampled long edge and their short edges bound the
+                sampled short edge, both ends included.
+
+        Returns:
+            (tuple, None): Returns a tuple ``(img_scale, None)``, where \
+                ``img_scale`` is the sampled scale and None is just a \
+                placeholder to be consistent with :func:`random_select`.
+        """
+
+        assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
+        long_edges = [max(scale) for scale in img_scales]
+        short_edges = [min(scale) for scale in img_scales]
+        # the long edge is drawn before the short edge; randint excludes
+        # its upper bound, hence the ``+ 1`` to keep both ends reachable
+        long_lo, long_hi = min(long_edges), max(long_edges)
+        short_lo, short_hi = min(short_edges), max(short_edges)
+        sampled_long = np.random.randint(long_lo, long_hi + 1)
+        sampled_short = np.random.randint(short_lo, short_hi + 1)
+        # the second element is only a placeholder for the scale index
+        return (sampled_long, sampled_short), None
+
+    @staticmethod
+    def random_sample_ratio(img_scale, ratio_range):
+        """Scale ``img_scale`` by a ratio drawn from ``ratio_range``.
+
+        The ratio is sampled with ``np.random.random_sample`` and mapped
+        linearly onto the span between the two ratios; both edges of
+        ``img_scale`` are multiplied by it and truncated to int.
+
+        Args:
+            img_scale (tuple): Images scale base to multiply with ratio.
+            ratio_range (tuple[float]): The minimum and maximum ratio to scale
+                the ``img_scale``.
+
+        Returns:
+            (tuple, None): Returns a tuple ``(scale, None)``, where \
+                ``scale`` is sampled ratio multiplied with ``img_scale`` and \
+                None is just a placeholder to be consistent with \
+                :func:`random_select`.
+        """
+
+        assert isinstance(img_scale, tuple) and len(img_scale) == 2
+        lo, hi = ratio_range
+        assert lo <= hi
+        # random_sample() lies in [0, 1), so the ratio never exceeds hi
+        ratio = np.random.random_sample() * (hi - lo) + lo
+        return tuple(int(edge * ratio) for edge in img_scale), None
+
+    def _random_scale(self, results):
+        """Choose the target scale and store it in ``results``.
+
+        If ``ratio_range`` is specified, a ratio will be sampled and be
+        multiplied with ``img_scale``.
+        If only one scale is specified by ``img_scale``, it is used as is.
+        Otherwise a scale will be sampled from ``img_scale`` according to
+        ``multiscale_mode``.
+
+        Args:
+            results (dict): Result dict from :obj:`dataset`.
+
+        Returns:
+            None: ``results`` is updated in place with the two keys \
+                ``'scale'`` and ``'scale_idx'``, which would be used by \
+                subsequent pipelines.
+        """
+
+        scales = self.img_scale
+        if self.ratio_range is not None:
+            picked = self.random_sample_ratio(scales[0], self.ratio_range)
+        elif len(scales) == 1:
+            picked = scales[0], 0
+        elif self.multiscale_mode == 'range':
+            picked = self.random_sample(scales)
+        elif self.multiscale_mode == 'value':
+            picked = self.random_select(scales)
+        else:
+            raise NotImplementedError
+
+        # every branch yields a (scale, scale_idx) pair
+        results['scale'], results['scale_idx'] = picked
+
+    def _resize_img(self, results):
+        """Resize every image field to ``results['scale']``."""
+        for key in results.get('img_fields', ['img']):
+            src = results[key]
+            if self.keep_ratio:
+                resized, _ = mmcv.imrescale(
+                    src,
+                    results['scale'],
+                    return_scale=True,
+                    interpolation=self.interpolation,
+                    backend=self.backend)
+                # the single scale returned by imrescale is ignored because
+                # w_scale and h_scale differ slightly; see mmcv.imrescale
+                new_h, new_w = resized.shape[:2]
+                old_h, old_w = src.shape[:2]
+                w_scale = new_w / old_w
+                h_scale = new_h / old_h
+            else:
+                resized, w_scale, h_scale = mmcv.imresize(
+                    src,
+                    results['scale'],
+                    return_scale=True,
+                    interpolation=self.interpolation,
+                    backend=self.backend)
+            results[key] = resized
+
+            results['img_shape'] = resized.shape
+            # in case that there is no padding
+            results['pad_shape'] = resized.shape
+            results['scale_factor'] = np.array(
+                [w_scale, h_scale, w_scale, h_scale], dtype=np.float32)
+            results['keep_ratio'] = self.keep_ratio
+
+    def _resize_bboxes(self, results):
+        """Scale every bbox field by ``results['scale_factor']``."""
+        for key in results.get('bbox_fields', []):
+            scaled = results[key] * results['scale_factor']
+            if self.bbox_clip_border:
+                shape = results['img_shape']
+                scaled[:, 0::2] = np.clip(scaled[:, 0::2], 0, shape[1])
+                scaled[:, 1::2] = np.clip(scaled[:, 1::2], 0, shape[0])
+            results[key] = scaled
+
+    def _resize_masks(self, results):
+        """Rescale or resize every mask field that is not None."""
+        for key in results.get('mask_fields', []):
+            masks = results[key]
+            if masks is None:
+                continue
+            # keep_ratio follows the target scale, otherwise the image shape
+            results[key] = (masks.rescale(results['scale']) if self.keep_ratio
+                            else masks.resize(results['img_shape'][:2]))
+
+    def _resize_seg(self, results):
+        """Resize semantic segmentation maps with ``results['scale']``.
+
+        Nearest interpolation is always used for these maps.
+        """
+        for key in results.get('seg_fields', []):
+            seg_map = results[key]
+            target = results['scale']
+            if self.keep_ratio:
+                results[key] = mmcv.imrescale(
+                    seg_map, target, interpolation='nearest',
+                    backend=self.backend)
+            else:
+                results[key] = mmcv.imresize(
+                    seg_map, target, interpolation='nearest',
+                    backend=self.backend)
+
+    def __call__(self, results):
+        """Resize images, bounding boxes, masks and semantic segmentation
+        maps after settling the target scale.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \
+                'keep_ratio' keys are added into result dict.
+        """
+
+        if 'scale' in results:
+            if self.override:
+                # forget the previous resize and sample a fresh scale
+                results.pop('scale')
+                results.pop('scale_factor', None)
+                self._random_scale(results)
+            else:
+                assert 'scale_factor' not in results, (
+                    'scale and scale_factor cannot be both set.')
+        elif 'scale_factor' in results:
+            img_hw = results['img'].shape[:2]
+            factor = results['scale_factor']
+            assert isinstance(factor, float)
+            # the image shape is (h, w) whereas the scale is (w, h)
+            results['scale'] = tuple(
+                int(edge * factor) for edge in reversed(img_hw))
+        else:
+            self._random_scale(results)
+
+        self._resize_img(results)
+        self._resize_bboxes(results)
+        self._resize_masks(results)
+        self._resize_seg(results)
+        return results
+
+    def __repr__(self):
+        """Return the class name with the main resize options."""
+        return (f'{self.__class__.__name__}('
+                f'img_scale={self.img_scale}, '
+                f'multiscale_mode={self.multiscale_mode}, '
+                f'ratio_range={self.ratio_range}, '
+                f'keep_ratio={self.keep_ratio}, '
+                f'bbox_clip_border={self.bbox_clip_border})')
+
+
+@PIPELINES.register_module()
+class RandomFlip:
+    """Flip the image & bbox & mask.
+
+    If the input dict contains the key "flip", then the flag will be used,
+    otherwise it will be randomly decided by a ratio specified in the init
+    method.
+
+    When random flip is enabled, ``flip_ratio``/``direction`` can either be a
+    float/string or tuple of float/string. There are 3 flip modes:
+
+    - ``flip_ratio`` is float, ``direction`` is string: the image will be
+        ``direction``ly flipped with probability of ``flip_ratio`` .
+        E.g., ``flip_ratio=0.5``, ``direction='horizontal'``,
+        then image will be horizontally flipped with probability of 0.5.
+    - ``flip_ratio`` is float, ``direction`` is list of string: the image will
+        be ``direction[i]``ly flipped with probability of
+        ``flip_ratio/len(direction)``.
+        E.g., ``flip_ratio=0.5``, ``direction=['horizontal', 'vertical']``,
+        then image will be horizontally flipped with probability of 0.25,
+        vertically with probability of 0.25.
+    - ``flip_ratio`` is list of float, ``direction`` is list of string:
+        given ``len(flip_ratio) == len(direction)``, the image will
+        be ``direction[i]``ly flipped with probability of ``flip_ratio[i]``.
+        E.g., ``flip_ratio=[0.3, 0.5]``, ``direction=['horizontal',
+        'vertical']``, then image will be horizontally flipped with probability
+        of 0.3, vertically with probability of 0.5.
+
+    Args:
+        flip_ratio (float | list[float], optional): The flipping probability.
+            Default: None.
+        direction(str | list[str], optional): The flipping direction. Options
+            are 'horizontal', 'vertical', 'diagonal'. Default: 'horizontal'.
+            If input is a list, the length must equal ``flip_ratio``. Each
+            element in ``flip_ratio`` indicates the flip probability of
+            corresponding direction.
+    """
+
+    def __init__(self, flip_ratio=None, direction='horizontal'):
+        if flip_ratio is not None:
+            if isinstance(flip_ratio, list):
+                assert mmcv.is_list_of(flip_ratio, float)
+                assert 0 <= sum(flip_ratio) <= 1
+            elif isinstance(flip_ratio, float):
+                assert 0 <= flip_ratio <= 1
+            else:
+                raise ValueError('flip_ratios must be None, float, '
+                                 'or list of float')
+        self.flip_ratio = flip_ratio
+
+        valid_directions = ('horizontal', 'vertical', 'diagonal')
+        if isinstance(direction, str):
+            assert direction in valid_directions
+        elif isinstance(direction, list):
+            assert mmcv.is_list_of(direction, str)
+            assert set(direction) <= set(valid_directions)
+        else:
+            raise ValueError('direction must be either str or list of str')
+        self.direction = direction
+
+        # per-direction ratios must pair up one-to-one with the directions
+        if isinstance(flip_ratio, list):
+            assert len(flip_ratio) == len(direction)
+
+    def bbox_flip(self, bboxes, img_shape, direction):
+        """Flip bboxes horizontally, vertically or diagonally.
+
+        The input array is not modified; a flipped copy is returned.
+
+        Args:
+            bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k)
+            img_shape (tuple[int]): Image shape (height, width)
+            direction (str): Flip direction. Options are 'horizontal',
+                'vertical', 'diagonal'.
+
+        Returns:
+            numpy.ndarray: Flipped bounding boxes.
+
+        Raises:
+            ValueError: If ``direction`` is not one of the options above.
+        """
+
+        assert bboxes.shape[-1] % 4 == 0
+        if direction not in ('horizontal', 'vertical', 'diagonal'):
+            raise ValueError(f"Invalid flipping direction '{direction}'")
+        flipped = bboxes.copy()
+        if direction != 'vertical':
+            # mirror x: new x1 = w - old x2 and new x2 = w - old x1
+            w = img_shape[1]
+            flipped[..., 0::4] = w - bboxes[..., 2::4]
+            flipped[..., 2::4] = w - bboxes[..., 0::4]
+        if direction != 'horizontal':
+            # mirror y: new y1 = h - old y2 and new y2 = h - old y1
+            h = img_shape[0]
+            flipped[..., 1::4] = h - bboxes[..., 3::4]
+            flipped[..., 3::4] = h - bboxes[..., 1::4]
+        return flipped
+
+    def __call__(self, results):
+        """Flip images, bounding boxes, masks and semantic segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline. A preset
+                'flip' without 'flip_direction' uses the configured direction.
+
+        Returns:
+            dict: Flipped results, 'flip', 'flip_direction' keys are added \
+                into result dict.
+        """
+
+        if 'flip' not in results:
+            # candidates are the configured directions plus None (non-flip)
+            if isinstance(self.direction, list):
+                direction_list = self.direction + [None]
+            else:
+                direction_list = [self.direction, None]
+            if isinstance(self.flip_ratio, list):
+                non_flip_ratio = 1 - sum(self.flip_ratio)
+                flip_ratio_list = self.flip_ratio + [non_flip_ratio]
+            else:
+                non_flip_ratio = 1 - self.flip_ratio
+                num_dirs = len(direction_list) - 1
+                single_ratio = self.flip_ratio / num_dirs
+                flip_ratio_list = [single_ratio] * num_dirs + [non_flip_ratio]
+            cur_dir = np.random.choice(direction_list, p=flip_ratio_list)
+            results['flip'] = cur_dir is not None
+        else:
+            # 'flip' was preset: default to the (first) configured direction
+            cur_dir = self.direction
+            if isinstance(cur_dir, list):
+                cur_dir = cur_dir[0]
+        if 'flip_direction' not in results:
+            results['flip_direction'] = cur_dir
+        if results['flip']:
+            # flip image
+            for key in results.get('img_fields', ['img']):
+                results[key] = mmcv.imflip(
+                    results[key], direction=results['flip_direction'])
+            # flip bboxes
+            for key in results.get('bbox_fields', []):
+                results[key] = self.bbox_flip(results[key],
+                                              results['img_shape'],
+                                              results['flip_direction'])
+            # flip masks
+            for key in results.get('mask_fields', []):
+                results[key] = results[key].flip(results['flip_direction'])
+
+            # flip segs
+            for key in results.get('seg_fields', []):
+                results[key] = mmcv.imflip(
+                    results[key], direction=results['flip_direction'])
+        return results
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}(flip_ratio={self.flip_ratio})'
+
+
+@PIPELINES.register_module()
+class RandomShift:
+    """Shift the image and box given shift pixels and probability.
+
+    Args:
+        shift_ratio (float): Probability of shifts. Default 0.5.
+        max_shift_px (int): The max pixels for shifting. Default 32.
+        filter_thr_px (int): The width and height threshold for filtering.
+            The bbox and the rest of the targets below the width and
+            height threshold will be filtered. Default 1.
+    """
+
+    def __init__(self, shift_ratio=0.5, max_shift_px=32, filter_thr_px=1):
+        # shift_ratio is a probability, max_shift_px a non-negative bound
+        assert 0 <= shift_ratio <= 1
+        assert max_shift_px >= 0
+        self.shift_ratio, self.max_shift_px = shift_ratio, max_shift_px
+        self.filter_thr_px = int(filter_thr_px)
+        # Maps each bbox field to the label field filtered alongside it.
+        self.bbox2label = dict(
+            gt_bboxes='gt_labels',
+            gt_bboxes_ignore='gt_labels_ignore',
+        )
+
+    def __call__(self, results):
+        """Randomly shift the images and bounding boxes.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Shift results.
+        """
+        if random.random() < self.shift_ratio:
+            img_shape = results['img'].shape[:2]
+            # numpy's randint excludes the upper bound, hence the ``+ 1``
+            shift_hi = self.max_shift_px + 1
+            random_shift_x = random.randint(-self.max_shift_px, shift_hi)
+            random_shift_y = random.randint(-self.max_shift_px, shift_hi)
+            # destination (new_*) and source (ori_*) offsets of the kept region
+            new_x = max(0, random_shift_x)
+            ori_x = max(0, -random_shift_x)
+            new_y = max(0, random_shift_y)
+            ori_y = max(0, -random_shift_y)
+
+            # TODO: support mask and semantic segmentation maps.
+            for key in results.get('bbox_fields', []):
+                bboxes = results[key].copy()
+                bboxes[..., 0::2] += random_shift_x
+                bboxes[..., 1::2] += random_shift_y
+
+                # clip border
+                bboxes[..., 0::2] = np.clip(bboxes[..., 0::2], 0, img_shape[1])
+                bboxes[..., 1::2] = np.clip(bboxes[..., 1::2], 0, img_shape[0])
+
+                # remove invalid bboxes
+                bbox_w = bboxes[..., 2] - bboxes[..., 0]
+                bbox_h = bboxes[..., 3] - bboxes[..., 1]
+                valid_inds = (bbox_w > self.filter_thr_px) & (
+                    bbox_h > self.filter_thr_px)
+                # If the shift does not contain any gt-bbox area, skip this
+                # image.
+                if key == 'gt_bboxes' and not valid_inds.any():
+                    return results
+                bboxes = bboxes[valid_inds]
+                results[key] = bboxes
+
+                # label fields. e.g. gt_labels and gt_labels_ignore
+                label_key = self.bbox2label.get(key)
+                if label_key in results:
+                    results[label_key] = results[label_key][valid_inds]
+
+            for key in results.get('img_fields', ['img']):
+                img = results[key]
+                new_img = np.zeros_like(img)
+                img_h, img_w = img.shape[:2]
+                new_h = img_h - np.abs(random_shift_y)
+                new_w = img_w - np.abs(random_shift_x)
+                new_img[new_y:new_y + new_h, new_x:new_x + new_w] \
+                    = img[ori_y:ori_y + new_h, ori_x:ori_x + new_w]
+                results[key] = new_img
+
+        return results
+
+    def __repr__(self):
+        return (f'{self.__class__.__name__}(shift_ratio={self.shift_ratio}, '
+                f'max_shift_px={self.max_shift_px}, '
+                f'filter_thr_px={self.filter_thr_px})')
+
+
+@PIPELINES.register_module()
+class Pad:
+    """Pad the image & masks & segmentation map.
+
+    There are two padding modes: (1) pad to a fixed size and (2) pad to the
+    minimum size that is divisible by some number.
+    Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor",
+
+    Args:
+        size (tuple, optional): Fixed padding size.
+        size_divisor (int, optional): The divisor of padded size.
+        pad_to_square (bool): Whether to pad the image into a square.
+            Currently only used for YOLOX. Default: False.
+        pad_val (dict, optional): A dict for padding value, the default
+            value is `dict(img=0, masks=0, seg=255)`.
+    """
+
+    def __init__(self,
+                 size=None,
+                 size_divisor=None,
+                 pad_to_square=False,
+                 pad_val=dict(img=0, masks=0, seg=255)):
+        self.size = size
+        self.size_divisor = size_divisor
+        if isinstance(pad_val, (float, int)):
+            warnings.warn(
+                'pad_val of float type is deprecated now, '
+                f'please use pad_val=dict(img={pad_val}, '
+                f'masks={pad_val}, seg=255) instead.', DeprecationWarning)
+            pad_val = dict(img=pad_val, masks=pad_val, seg=255)
+        assert isinstance(pad_val, dict)
+        self.pad_val = pad_val
+        self.pad_to_square = pad_to_square
+
+        if pad_to_square:
+            assert size is None and size_divisor is None, (
+                'The size and size_divisor must be None '
+                'when pad2square is True')
+        else:
+            assert size is not None or size_divisor is not None, (
+                'only one of size and size_divisor should be valid')
+            assert size is None or size_divisor is None
+
+    def _pad_img(self, results):
+        """Pad images to the fixed, square or divisor-aligned target size."""
+        pad_val = self.pad_val.get('img', 0)
+        size = self.size  # local copy: the configured size is never mutated
+        for key in results.get('img_fields', ['img']):
+            if self.pad_to_square:
+                size = (max(results[key].shape[:2]), ) * 2
+            if size is not None:
+                padded_img = mmcv.impad(
+                    results[key], shape=size, pad_val=pad_val)
+            elif self.size_divisor is not None:
+                padded_img = mmcv.impad_to_multiple(
+                    results[key], self.size_divisor, pad_val=pad_val)
+            results[key] = padded_img
+        results['pad_shape'] = padded_img.shape
+        results['pad_fixed_size'] = size
+        results['pad_size_divisor'] = self.size_divisor
+
+    def _pad_masks(self, results):
+        """Pad every mask field to ``results['pad_shape']``."""
+        target_hw = results['pad_shape'][:2]
+        fill = self.pad_val.get('masks', 0)
+        for key in results.get('mask_fields', []):
+            results[key] = results[key].pad(target_hw, pad_val=fill)
+
+    def _pad_seg(self, results):
+        """Pad each semantic segmentation map to ``results['pad_shape']``."""
+        fill = self.pad_val.get('seg', 255)
+        for key in results.get('seg_fields', []):
+            seg = results[key]
+            hw = results['pad_shape'][:2]
+            results[key] = mmcv.impad(seg, shape=hw, pad_val=fill)
+
+    def __call__(self, results):
+        """Pad the images first, then the masks and segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Updated result dict.
+        """
+        # masks and seg maps rely on the pad_shape written by _pad_img
+        for pad_fn in (self._pad_img, self._pad_masks, self._pad_seg):
+            pad_fn(results)
+        return results
+
+    def __repr__(self):
+        """Return the class name followed by the padding options."""
+        return (f'{self.__class__.__name__}('
+                f'size={self.size}, '
+                f'size_divisor={self.size_divisor}, '
+                f'pad_to_square={self.pad_to_square}, '
+                f'pad_val={self.pad_val})')
+
+
+@PIPELINES.register_module()
+class Normalize:
+    """Normalize the image.
+
+    Added key is "img_norm_cfg".
+
+    Args:
+        mean (sequence): Mean values of 3 channels.
+        std (sequence): Std values of 3 channels.
+        to_rgb (bool): Whether to convert the image from BGR to RGB,
+            default is true.
+    """
+
+    def __init__(self, mean, std, to_rgb=True):
+        self.mean, self.std = (
+            np.array(stat, dtype=np.float32) for stat in (mean, std))
+        self.to_rgb = to_rgb
+
+    def __call__(self, results):
+        """Normalize every image field and record the settings used.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Normalized results, 'img_norm_cfg' key is added into
+                result dict.
+        """
+        norm_cfg = dict(mean=self.mean, std=self.std, to_rgb=self.to_rgb)
+        for key in results.get('img_fields', ['img']):
+            results[key] = mmcv.imnormalize(
+                results[key], self.mean, self.std, self.to_rgb)
+        results['img_norm_cfg'] = norm_cfg
+        return results
+
+    def __repr__(self):
+        """Return the class name followed by mean, std and to_rgb."""
+        return (f'{self.__class__.__name__}'
+                f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})')
+
+
+@PIPELINES.register_module()
+class RandomCrop:
+    """Random crop the image & bboxes & masks.
+
+    The absolute `crop_size` is sampled based on `crop_type` and `image_size`,
+    then the cropped results are generated.
+
+    Args:
+        crop_size (tuple): The relative ratio or absolute pixels of
+            height and width.
+        crop_type (str, optional): one of "relative_range", "relative",
+            "absolute", "absolute_range". "relative" randomly crops
+            (h * crop_size[0], w * crop_size[1]) part from an input of size
+            (h, w). "relative_range" uniformly samples relative crop size from
+            range [crop_size[0], 1] and [crop_size[1], 1] for height and width
+            respectively. "absolute" crops from an input with absolute size
+            (crop_size[0], crop_size[1]). "absolute_range" uniformly samples
+            crop_h in range [crop_size[0], min(h, crop_size[1])] and crop_w
+            in range [crop_size[0], min(w, crop_size[1])]. Default "absolute".
+        allow_negative_crop (bool, optional): Whether to allow a crop that does
+            not contain any bbox area. Default False.
+        recompute_bbox (bool, optional): Whether to re-compute the boxes based
+            on cropped instance masks. Default False.
+        bbox_clip_border (bool, optional): Whether clip the objects outside
+            the border of the image. Defaults to True.
+
+    Note:
+        - If the image is smaller than the absolute crop size, return the
+            original image.
+        - The keys for bboxes, labels and masks must be aligned. That is,
+          `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and
+          `gt_bboxes_ignore` corresponds to `gt_labels_ignore` and
+          `gt_masks_ignore`.
+        - If the crop does not contain any gt-bbox region and
+          `allow_negative_crop` is set to False, skip this image.
+    """
+
+    def __init__(self,
+                 crop_size,
+                 crop_type='absolute',
+                 allow_negative_crop=False,
+                 recompute_bbox=False,
+                 bbox_clip_border=True):
+        absolute_types = ('absolute', 'absolute_range')
+        relative_types = ('relative_range', 'relative')
+        if crop_type not in relative_types + absolute_types:
+            raise ValueError(f'Invalid crop_type {crop_type}.')
+        if crop_type in absolute_types:
+            # absolute sizes are positive integer pixel counts
+            assert crop_size[0] > 0 and crop_size[1] > 0
+            assert all(isinstance(crop_size[i], int) for i in (0, 1))
+        else:
+            assert 0 < crop_size[0] <= 1 and 0 < crop_size[1] <= 1
+        self.crop_size = crop_size
+        self.crop_type = crop_type
+        self.allow_negative_crop = allow_negative_crop
+        self.recompute_bbox = recompute_bbox
+        self.bbox_clip_border = bbox_clip_border
+        # Each bbox field is paired with a label field and a mask field.
+        self.bbox2label = dict(
+            gt_bboxes='gt_labels',
+            gt_bboxes_ignore='gt_labels_ignore',
+        )
+        self.bbox2mask = dict(
+            gt_bboxes='gt_masks',
+            gt_bboxes_ignore='gt_masks_ignore',
+        )
+
+    def _crop_data(self, results, crop_size, allow_negative_crop):
+        """Randomly crop images, bounding boxes, masks and semantic
+        segmentation maps. `results` is modified in place.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+            crop_size (tuple): Expected absolute size after cropping, (h, w).
+            allow_negative_crop (bool): Whether to allow a crop that does not
+                contain any `gt_bboxes` area.
+
+        Returns:
+            dict | None: Cropped results with 'img_shape' updated, or None
+                when no `gt_bboxes` is left and negative crops are refused.
+        """
+        assert crop_size[0] > 0 and crop_size[1] > 0
+        for key in results.get('img_fields', ['img']):
+            img = results[key]
+            margin_h = max(img.shape[0] - crop_size[0], 0)
+            margin_w = max(img.shape[1] - crop_size[1], 0)
+            offset_h = np.random.randint(0, margin_h + 1)
+            offset_w = np.random.randint(0, margin_w + 1)
+            crop_y1, crop_y2 = offset_h, offset_h + crop_size[0]
+            crop_x1, crop_x2 = offset_w, offset_w + crop_size[1]
+
+            # crop the image (offsets are drawn anew for each image field)
+            img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...]
+            img_shape = img.shape
+            results[key] = img
+        results['img_shape'] = img_shape
+
+        # shift bboxes by the last drawn offsets and clip to the crop border
+        for key in results.get('bbox_fields', []):
+            # e.g. gt_bboxes and gt_bboxes_ignore
+            bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h],
+                                   dtype=np.float32)
+            bboxes = results[key] - bbox_offset
+            if self.bbox_clip_border:
+                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
+                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
+            valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & (
+                bboxes[:, 3] > bboxes[:, 1])
+            # If the crop does not contain any gt-bbox area and
+            # allow_negative_crop is False, skip this image.
+            if (key == 'gt_bboxes' and not valid_inds.any()
+                    and not allow_negative_crop):
+                return None
+            results[key] = bboxes[valid_inds, :]
+            # label fields. e.g. gt_labels and gt_labels_ignore
+            label_key = self.bbox2label.get(key)
+            if label_key in results:
+                results[label_key] = results[label_key][valid_inds]
+
+            # mask fields, e.g. gt_masks and gt_masks_ignore
+            mask_key = self.bbox2mask.get(key)
+            if mask_key in results:
+                results[mask_key] = results[mask_key][
+                    valid_inds.nonzero()[0]].crop(
+                        np.asarray([crop_x1, crop_y1, crop_x2, crop_y2]))
+                if self.recompute_bbox:
+                    results[key] = results[mask_key].get_bboxes()
+
+        # crop semantic seg with the same window as the image
+        for key in results.get('seg_fields', []):
+            results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2]
+
+        return results
+
+    def _get_crop_size(self, image_size):
+        """Turn the configured `crop_size` into an absolute (h, w) crop size
+        for one image; the ``*_range`` crop types sample it at random.
+
+        Args:
+            image_size (tuple): (h, w).
+
+        Returns:
+            crop_size (tuple): (crop_h, crop_w) in absolute pixels.
+        """
+        img_h, img_w = image_size
+        kind, size = self.crop_type, self.crop_size
+        if kind == 'absolute':
+            return (min(size[0], img_h), min(size[1], img_w))
+        if kind == 'absolute_range':
+            assert size[0] <= size[1]
+            lower, upper = size[0], size[1]
+            # height is sampled before width, both capped by the image size
+            out_h = np.random.randint(min(img_h, lower), min(img_h, upper) + 1)
+            out_w = np.random.randint(min(img_w, lower), min(img_w, upper) + 1)
+            return out_h, out_w
+        if kind == 'relative':
+            frac_h, frac_w = size
+        elif kind == 'relative_range':
+            base = np.asarray(size, dtype=np.float32)
+            frac_h, frac_w = base + np.random.rand(2) * (1 - base)
+        else:
+            return None
+        return int(img_h * frac_h + 0.5), int(img_w * frac_w + 0.5)
+
+    def __call__(self, results):
+        """Randomly crop one sample.
+
+        The crop size is derived from the shape of ``results['img']``.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Randomly cropped results, 'img_shape' key in result dict is
+                updated according to crop size.
+        """
+        img_hw = results['img'].shape[:2]
+        target_size = self._get_crop_size(img_hw)
+        return self._crop_data(results, target_size, self.allow_negative_crop)
+
+    def __repr__(self):
+        """Return a string listing every constructor argument."""
+        cfg = (f'crop_size={self.crop_size}, crop_type={self.crop_type}, '
+               f'allow_negative_crop={self.allow_negative_crop}, '
+               f'recompute_bbox={self.recompute_bbox}, '
+               f'bbox_clip_border={self.bbox_clip_border}')
+        return f'{self.__class__.__name__}({cfg})'
+
+
+@PIPELINES.register_module()
+class SegRescale:
+    """Resize every semantic segmentation map by a fixed factor.
+
+    Args:
+        scale_factor (float): The scale factor of the final output.
+        backend (str): Image rescale backend, choices are 'cv2' and 'pillow'.
+            The two backends generate slightly different results. Defaults
+            to 'cv2'.
+    """
+
+    def __init__(self, scale_factor=1, backend='cv2'):
+        self.backend = backend
+        self.scale_factor = scale_factor
+
+    def __call__(self, results):
+        """Rescale the maps listed in ``results['seg_fields']``.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with semantic segmentation map scaled.
+        """
+
+        for seg_key in results.get('seg_fields', []):
+            # a factor of exactly 1 leaves the map untouched
+            if self.scale_factor == 1:
+                continue
+            results[seg_key] = mmcv.imrescale(
+                results[seg_key], self.scale_factor,
+                interpolation='nearest', backend=self.backend)
+        return results
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}(scale_factor={self.scale_factor})'
+
+
+@PIPELINES.register_module()
+class PhotoMetricDistortion:
+    """Apply photometric distortion to image sequentially, every transformation
+    is applied with a probability of 0.5. The position of random contrast is in
+    second or second to last.
+
+    1. random brightness
+    2. random contrast (mode 1)
+    3. convert color from BGR to HSV
+    4. random saturation
+    5. random hue
+    6. convert color from HSV to BGR
+    7. random contrast (mode 0)
+    8. randomly swap channels
+
+    Args:
+        brightness_delta (int): max absolute offset added to the image.
+        contrast_range (tuple): (lower, upper) of the contrast multiplier.
+        saturation_range (tuple): (lower, upper) of the saturation multiplier.
+        hue_delta (int): max absolute offset added to the hue channel.
+    """
+
+    def __init__(self,
+                 brightness_delta=32,
+                 contrast_range=(0.5, 1.5),
+                 saturation_range=(0.5, 1.5),
+                 hue_delta=18):
+        self.brightness_delta = brightness_delta
+        self.contrast_lower, self.contrast_upper = contrast_range
+        self.saturation_lower, self.saturation_upper = saturation_range
+        self.hue_delta = hue_delta
+
+    def __call__(self, results):
+        """Call function to perform photometric distortion on images.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with 'img' replaced by the distorted image.
+        """
+
+        if 'img_fields' in results:
+            assert results['img_fields'] == ['img'], \
+                'Only single img_fields is allowed'
+        img = results['img']
+        img = img.astype(np.float32)
+        # random brightness
+        if random.randint(2):
+            delta = random.uniform(-self.brightness_delta,
+                                   self.brightness_delta)
+            img += delta
+
+        # mode == 1 --> do random contrast first
+        # mode == 0 --> do random contrast last
+        mode = random.randint(2)
+        if mode == 1:
+            if random.randint(2):
+                alpha = random.uniform(self.contrast_lower,
+                                       self.contrast_upper)
+                img *= alpha
+
+        # convert color from BGR to HSV
+        img = mmcv.bgr2hsv(img)
+
+        # random saturation
+        if random.randint(2):
+            img[..., 1] *= random.uniform(self.saturation_lower,
+                                          self.saturation_upper)
+
+        # random hue, wrapped back into [0, 360]
+        if random.randint(2):
+            img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
+            img[..., 0][img[..., 0] > 360] -= 360
+            img[..., 0][img[..., 0] < 0] += 360
+
+        # convert color from HSV to BGR
+        img = mmcv.hsv2bgr(img)
+
+        # random contrast (mode 0)
+        if mode == 0:
+            if random.randint(2):
+                alpha = random.uniform(self.contrast_lower,
+                                       self.contrast_upper)
+                img *= alpha
+
+        # randomly swap channels
+        if random.randint(2):
+            img = img[..., random.permutation(3)]
+
+        results['img'] = img
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(\nbrightness_delta={self.brightness_delta},\n'
+        repr_str += 'contrast_range='
+        repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n'
+        repr_str += 'saturation_range='
+        repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n'
+        repr_str += f'hue_delta={self.hue_delta})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class Expand:
+    """Random expand the image & bboxes.
+
+    Randomly place the original image on a canvas of 'ratio' x original image
+    size filled with mean values. The ratio is in the range of ratio_range.
+
+    Args:
+        mean (tuple): mean value of dataset.
+        to_rgb (bool): if need to convert the order of mean to align with RGB.
+        ratio_range (tuple): range of expand ratio.
+        seg_ignore_label (int, optional): value used to fill the new border
+            of expanded semantic segmentation maps. Default None.
+        prob (float): probability of applying this transformation
+    """
+
+    def __init__(self,
+                 mean=(0, 0, 0),
+                 to_rgb=True,
+                 ratio_range=(1, 4),
+                 seg_ignore_label=None,
+                 prob=0.5):
+        self.to_rgb = to_rgb
+        self.ratio_range = ratio_range
+        # reverse the channel order of `mean` when `to_rgb` is set
+        self.mean = mean[::-1] if to_rgb else mean
+        self.min_ratio, self.max_ratio = ratio_range
+        self.seg_ignore_label = seg_ignore_label
+        self.prob = prob
+
+    def __call__(self, results):
+        """Call function to expand images, bounding boxes.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with images, bounding boxes, masks and
+                semantic segmentation maps expanded.
+        """
+
+        if random.uniform(0, 1) > self.prob:
+            return results
+
+        if 'img_fields' in results:
+            assert results['img_fields'] == ['img'], \
+                'Only single img_fields is allowed'
+        img = results['img']
+
+        h, w, c = img.shape
+        ratio = random.uniform(self.min_ratio, self.max_ratio)
+        exp_h, exp_w = int(h * ratio), int(w * ratio)
+        # speedup expand when meets large image: a uniform mean only needs a
+        # scalar fill. `mean` may be a tuple/list, so compare it as an array.
+        if np.all(np.asarray(self.mean) == self.mean[0]):
+            expand_img = np.empty((exp_h, exp_w, c), img.dtype)
+            expand_img.fill(self.mean[0])
+        else:
+            expand_img = np.full((exp_h, exp_w, c), self.mean, dtype=img.dtype)
+        left = int(random.uniform(0, w * ratio - w))
+        top = int(random.uniform(0, h * ratio - h))
+        expand_img[top:top + h, left:left + w] = img
+
+        results['img'] = expand_img
+        # expand bboxes
+        for key in results.get('bbox_fields', []):
+            results[key] = results[key] + np.tile(
+                (left, top), 2).astype(results[key].dtype)
+
+        # expand masks
+        for key in results.get('mask_fields', []):
+            results[key] = results[key].expand(exp_h, exp_w, top, left)
+
+        # expand segs
+        for key in results.get('seg_fields', []):
+            gt_seg = results[key]
+            expand_gt_seg = np.full((exp_h, exp_w),
+                                    self.seg_ignore_label,
+                                    dtype=gt_seg.dtype)
+            expand_gt_seg[top:top + h, left:left + w] = gt_seg
+            results[key] = expand_gt_seg
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(mean={self.mean}, to_rgb={self.to_rgb}, '
+        repr_str += f'ratio_range={self.ratio_range}, '
+        repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
+        repr_str += f'prob={self.prob})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class MinIoURandomCrop:
+    """Random crop the image & bboxes, the cropped patches have minimum IoU
+    requirement with original image & bboxes, the IoU threshold is randomly
+    selected from min_ious.
+
+    Args:
+        min_ious (tuple): minimum IoU threshold for all intersections with
+            bounding boxes
+        min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w,
+            where a >= min_crop_size).
+        bbox_clip_border (bool, optional): Whether clip the objects outside
+            the border of the image. Defaults to True.
+
+    Note:
+        The keys for bboxes, labels and masks should be paired. That is, \
+        `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \
+        `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`.
+    """
+
+    def __init__(self,
+                 min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+                 min_crop_size=0.3,
+                 bbox_clip_border=True):
+        # mode 1: keep the original image; mode 0: no IoU constraint
+        self.min_ious = min_ious
+        self.sample_mode = (1, *min_ious, 0)
+        self.min_crop_size = min_crop_size
+        self.bbox_clip_border = bbox_clip_border
+        self.bbox2label = {
+            'gt_bboxes': 'gt_labels',
+            'gt_bboxes_ignore': 'gt_labels_ignore'
+        }
+        self.bbox2mask = {
+            'gt_bboxes': 'gt_masks',
+            'gt_bboxes_ignore': 'gt_masks_ignore'
+        }
+
+    def __call__(self, results):
+        """Call function to crop images and bounding boxes with minimum IoU
+        constraint.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with images and bounding boxes cropped, \
+                'img_shape' key is updated.
+        """
+
+        if 'img_fields' in results:
+            assert results['img_fields'] == ['img'], \
+                'Only single img_fields is allowed'
+        img = results['img']
+        assert 'bbox_fields' in results
+        boxes = [results[key] for key in results['bbox_fields']]
+        boxes = np.concatenate(boxes, 0)
+        h, w, c = img.shape
+
+        def is_center_of_bboxes_in_patch(boxes, patch):
+            """Mask of the boxes whose center lies strictly inside patch."""
+            center = (boxes[:, :2] + boxes[:, 2:]) / 2
+            return ((center[:, 0] > patch[0]) *
+                    (center[:, 1] > patch[1]) *
+                    (center[:, 0] < patch[2]) *
+                    (center[:, 1] < patch[3]))
+
+        while True:
+            mode = random.choice(self.sample_mode)
+            self.mode = mode
+            if mode == 1:
+                return results
+
+            min_iou = mode
+            for i in range(50):
+                new_w = random.uniform(self.min_crop_size * w, w)
+                new_h = random.uniform(self.min_crop_size * h, h)
+
+                # h / w in [0.5, 2]
+                if new_h / new_w < 0.5 or new_h / new_w > 2:
+                    continue
+
+                # explicit lower bound: a lone argument would be read as `low`
+                left = random.uniform(0, w - new_w)
+                top = random.uniform(0, h - new_h)
+
+                patch = np.array(
+                    (int(left), int(top), int(left + new_w), int(top + new_h)))
+                # Line or point crop is not allowed
+                if patch[2] == patch[0] or patch[3] == patch[1]:
+                    continue
+                overlaps = bbox_overlaps(
+                    patch.reshape(-1, 4), boxes.reshape(-1, 4)).reshape(-1)
+                if len(overlaps) > 0 and overlaps.min() < min_iou:
+                    continue
+
+                # center of boxes should inside the crop img
+                # only adjust boxes and instance masks when the gt is not empty
+                if len(overlaps) > 0:
+                    mask = is_center_of_bboxes_in_patch(boxes, patch)
+                    if not mask.any():
+                        continue
+                    for key in results.get('bbox_fields', []):
+                        kept = results[key].copy()
+                        mask = is_center_of_bboxes_in_patch(kept, patch)
+                        kept = kept[mask]
+                        if self.bbox_clip_border:
+                            kept[:, 2:] = kept[:, 2:].clip(max=patch[2:])
+                            kept[:, :2] = kept[:, :2].clip(min=patch[:2])
+                        kept -= np.tile(patch[:2], 2)
+                        results[key] = kept
+                        # labels
+                        label_key = self.bbox2label.get(key)
+                        if label_key in results:
+                            results[label_key] = results[label_key][mask]
+
+                        # mask fields
+                        mask_key = self.bbox2mask.get(key)
+                        if mask_key in results:
+                            results[mask_key] = results[mask_key][
+                                mask.nonzero()[0]].crop(patch)
+                # adjust the img no matter whether the gt is empty before crop
+                img = img[patch[1]:patch[3], patch[0]:patch[2]]
+                results['img'] = img
+                results['img_shape'] = img.shape
+
+                # seg fields
+                for key in results.get('seg_fields', []):
+                    results[key] = results[key][patch[1]:patch[3],
+                                                patch[0]:patch[2]]
+                return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(min_ious={self.min_ious}, '
+        repr_str += f'min_crop_size={self.min_crop_size}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class Corrupt:
+    """Image corruption augmentation.
+
+    The corruptions themselves are provided by
+    `imagecorruptions <https://github.com/bethgelab/imagecorruptions>`_.
+
+    Args:
+        corruption (str): Corruption name.
+        severity (int, optional): The severity of corruption. Default: 1.
+    """
+
+    def __init__(self, corruption, severity=1):
+        self.severity = severity
+        self.corruption = corruption
+
+    def __call__(self, results):
+        """Corrupt ``results['img']`` with the configured corruption.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with images corrupted.
+        """
+
+        if corrupt is None:
+            raise RuntimeError('imagecorruptions is not installed')
+        only_img = 'img_fields' not in results or \
+            results['img_fields'] == ['img']
+        assert only_img, 'Only single img_fields is allowed'
+        # the corruption runs on the image cast to uint8
+        uint8_img = results['img'].astype(np.uint8)
+        results['img'] = corrupt(
+            uint8_img, corruption_name=self.corruption, severity=self.severity)
+        return results
+
+    def __repr__(self):
+        cls_name = self.__class__.__name__
+        settings = f'corruption={self.corruption}, '
+        settings += f'severity={self.severity}'
+        return f'{cls_name}({settings})'
+
+
+@PIPELINES.register_module()
+class Albu:
+    """Albumentation augmentation.
+
+    Wraps a pipeline of transforms from the Albumentations library, see
+    `https://albumentations.readthedocs.io` for more information.
+
+    An example of ``transforms`` is as follows:
+
+    .. code-block::
+
+        [
+            dict(
+                type='ShiftScaleRotate',
+                shift_limit=0.0625,
+                scale_limit=0.0,
+                rotate_limit=0,
+                interpolation=1,
+                p=0.5),
+            dict(
+                type='RandomBrightnessContrast',
+                brightness_limit=[0.1, 0.3],
+                contrast_limit=[0.1, 0.3],
+                p=0.2),
+            dict(type='ChannelShuffle', p=0.1),
+            dict(
+                type='OneOf',
+                transforms=[
+                    dict(type='Blur', blur_limit=3, p=1.0),
+                    dict(type='MedianBlur', blur_limit=3, p=1.0)
+                ],
+                p=0.1),
+        ]
+
+    Args:
+        transforms (list[dict]): A list of albu transformations
+        bbox_params (dict): Bbox_params for albumentation `Compose`
+        keymap (dict): Contains {'input key':'albumentation-style key'}
+        update_pad_shape (bool): Whether to update `pad_shape` after aug
+        skip_img_without_anno (bool): Whether to skip the image if no ann left
+            after aug
+    """
+
+    def __init__(self,
+                 transforms,
+                 bbox_params=None,
+                 keymap=None,
+                 update_pad_shape=False,
+                 skip_img_without_anno=False):
+        """Build the albumentations `Compose` described by the configs."""
+        if Compose is None:
+            raise RuntimeError('albumentations is not installed')
+
+        # The configs are edited below, so work on private deep copies
+        # (deep-copying None simply yields None).
+        transforms = copy.deepcopy(transforms)
+        bbox_params = copy.deepcopy(bbox_params)
+        keymap = copy.deepcopy(keymap)
+
+        self.transforms = transforms
+        self.update_pad_shape = update_pad_shape
+        self.skip_img_without_anno = skip_img_without_anno
+
+        # A simple workaround to remove masks without boxes: the label
+        # fields are swapped for an index field named 'idx_mapper'.
+        self.filter_lost_elements = (
+            isinstance(bbox_params, dict) and 'label_fields' in bbox_params
+            and 'filter_lost_elements' in bbox_params)
+        if self.filter_lost_elements:
+            self.origin_label_fields = bbox_params['label_fields']
+            bbox_params['label_fields'] = ['idx_mapper']
+            bbox_params.pop('filter_lost_elements')
+
+        self.bbox_params = None
+        if bbox_params:
+            self.bbox_params = self.albu_builder(bbox_params)
+        albu_transforms = [self.albu_builder(cfg) for cfg in self.transforms]
+        self.aug = Compose(albu_transforms, bbox_params=self.bbox_params)
+
+        # Key translation tables, to albumentations names and back.
+        default_keymap = {
+            'img': 'image', 'gt_masks': 'masks', 'gt_bboxes': 'bboxes'}
+        self.keymap_to_albu = keymap if keymap else default_keymap
+        self.keymap_back = {
+            albu_key: key for key, albu_key in self.keymap_to_albu.items()}
+
+    def albu_builder(self, cfg):
+        """Instantiate an albumentations object from a config dict.
+
+        It inherits some of :func:`build_from_cfg` logic. Nested
+        ``transforms`` entries are built recursively.
+
+        Args:
+            cfg (dict): Config dict. It should at least contain the key "type".
+
+        Returns:
+            obj: The constructed object.
+        """
+
+        assert isinstance(cfg, dict) and 'type' in cfg
+        kwargs = cfg.copy()
+        target = kwargs.pop('type')
+
+        # `type` is either the name of an albumentations attribute or a class
+        is_name = mmcv.is_str(target)
+        if not is_name and not inspect.isclass(target):
+            raise TypeError(
+                f'type must be a str or valid type, but got {type(target)}')
+        if is_name and albumentations is None:
+            raise RuntimeError('albumentations is not installed')
+        obj_cls = getattr(albumentations, target) if is_name else target
+
+        if 'transforms' in kwargs:
+            # composite transforms (e.g. OneOf) hold configs of their own
+            kwargs['transforms'] = [
+                self.albu_builder(sub_cfg) for sub_cfg in kwargs['transforms']
+            ]
+
+        return obj_cls(**kwargs)
+
+    @staticmethod
+    def mapper(d, keymap):
+        """Rename the keys of `d` through `keymap`; unmapped keys are kept.
+
+        Args:
+            d (dict): old dict
+            keymap (dict): {'old_key':'new_key'}
+
+        Returns:
+            dict: new dict.
+        """
+
+        return {
+            keymap.get(old_key, old_key): value
+            for old_key, value in d.items()
+        }
+
+    def __call__(self, results):
+        """Augment one result dict; None means the sample is to be skipped."""
+        # dict to albumentations format
+        results = self.mapper(results, self.keymap_to_albu)
+        # TODO: add bbox_fields
+        if 'bboxes' in results:
+            # ndarray -> list of per-box rows
+            if isinstance(results['bboxes'], np.ndarray):
+                results['bboxes'] = list(results['bboxes'])
+            # add pseudo-field for filtration
+            if self.filter_lost_elements:
+                results['idx_mapper'] = np.arange(len(results['bboxes']))
+
+        # TODO: Support mask structure in albu
+        if 'masks' in results:
+            ori_masks = results['masks']
+            if isinstance(ori_masks, PolygonMasks):
+                raise NotImplementedError(
+                    'Albu only supports BitMap masks now')
+            old_albu = albumentations.__version__ < '0.5'
+            raw_masks = ori_masks.masks
+            results['masks'] = raw_masks if old_albu else list(raw_masks)
+
+        results = self.aug(**results)
+
+        if 'bboxes' in results:
+            bboxes = results['bboxes']
+            if isinstance(bboxes, list):
+                bboxes = np.array(bboxes, dtype=np.float32)
+            results['bboxes'] = bboxes.reshape(-1, 4)
+
+            # keep only the labels / masks whose box survived the aug
+            if self.filter_lost_elements:
+                kept_idx = results['idx_mapper']
+                for label in self.origin_label_fields:
+                    results[label] = np.array(
+                        [results[label][i] for i in kept_idx])
+                if 'masks' in results:
+                    kept_masks = np.array(
+                        [results['masks'][i] for i in kept_idx])
+                    img_shape = results['image'].shape
+                    results['masks'] = ori_masks.__class__(
+                        kept_masks, img_shape[0], img_shape[1])
+
+                if not len(kept_idx) and self.skip_img_without_anno:
+                    return None
+
+        if 'gt_labels' in results:
+            labels = results['gt_labels']
+            if isinstance(labels, list):
+                labels = np.array(labels)
+            results['gt_labels'] = labels.astype(np.int64)
+
+        # back to the original format
+        results = self.mapper(results, self.keymap_back)
+
+        # update final shape
+        if self.update_pad_shape:
+            results['pad_shape'] = results['img'].shape
+
+        return results
+
+    def __repr__(self):
+        cls_name = self.__class__.__name__
+        return f'{cls_name}(transforms={self.transforms})'
+
+
+@PIPELINES.register_module()
+class RandomCenterCropPad:
+    """Random center crop and random around padding for CornerNet.
+
+    This operation generates randomly cropped image from the original image and
+    pads it simultaneously. Different from :class:`RandomCrop`, the output
+    shape may not equal to ``crop_size`` strictly. We choose a random value
+    from ``ratios`` and the output shape could be larger or smaller than
+    ``crop_size``. The padding operation is also different from :class:`Pad`,
+    here we use around padding instead of right-bottom padding.
+
+    The relation between output image (padding image) and original image:
+
+    .. code:: text
+
+                        output image
+
+               +----------------------------+
+               |          padded area       |
+        +------|----------------------------|----------+
+        |      |         cropped area       |          |
+        |      |         +---------------+  |          |
+        |      |         |    .   center |  |          | original image
+        |      |         |        range  |  |          |
+        |      |         +---------------+  |          |
+        +------|----------------------------|----------+
+               |          padded area       |
+               +----------------------------+
+
+    There are 5 main areas in the figure:
+
+    - output image: output image of this operation, also called padding
+      image in following instruction.
+    - original image: input image of this operation.
+    - padded area: non-intersect area of output image and original image.
+    - cropped area: the overlap of output image and original image.
+    - center range: a smaller area where random center chosen from.
+      center range is computed by ``border`` and original image's shape
+      to avoid our random center is too close to original image's border.
+
+    Also this operation act differently in train and test mode, the summary
+    pipeline is listed below.
+
+    Train pipeline:
+
+    1. Choose a ``random_ratio`` from ``ratios``, the shape of padding image
+       will be ``random_ratio * crop_size``.
+    2. Choose a ``random_center`` in center range.
+    3. Generate padding image with center matches the ``random_center``.
+    4. Initialize the padding image with pixel value equals to ``mean``.
+    5. Copy the cropped area to padding image.
+    6. Refine annotations.
+
+    Test pipeline:
+
+    1. Compute output shape according to ``test_pad_mode``.
+    2. Generate padding image with center matches the original image
+       center.
+    3. Initialize the padding image with pixel value equals to ``mean``.
+    4. Copy the ``cropped area`` to padding image.
+
+    Args:
+        crop_size (tuple | None): expected size after crop, final size will
+            computed according to ratio. Requires (h, w) in train mode, and
+            None in test mode.
+        ratios (tuple): random select a ratio from tuple and crop image to
+            (crop_size[0] * ratio) * (crop_size[1] * ratio).
+            Only available in train mode.
+        border (int): max distance from center select area to image border.
+            Only available in train mode.
+        mean (sequence): Mean values of 3 channels.
+        std (sequence): Std values of 3 channels.
+        to_rgb (bool): Whether to convert the image from BGR to RGB.
+        test_mode (bool): whether involve random variables in transform.
+            In train mode, crop_size is fixed, center coords and ratio is
+            random selected from predefined lists. In test mode, crop_size
+            is image's original shape, center coords and ratio is fixed.
+        test_pad_mode (tuple): padding method and padding shape value, only
+            available in test mode. Default is using 'logical_or' with
+            127 as padding shape value.
+
+            - 'logical_or': final_shape = input_shape | padding_shape_value
+            - 'size_divisor': final_shape = int(
+              ceil(input_shape / padding_shape_value) * padding_shape_value)
+        test_pad_add_pix (int): Extra padding pixel in test mode. Default 0.
+        bbox_clip_border (bool, optional): Whether clip the objects outside
+            the border of the image. Defaults to True.
+    """
+
+    def __init__(self,
+                 crop_size=None,
+                 ratios=(0.9, 1.0, 1.1),
+                 border=128,
+                 mean=None,
+                 std=None,
+                 to_rgb=None,
+                 test_mode=False,
+                 test_pad_mode=('logical_or', 127),
+                 test_pad_add_pix=0,
+                 bbox_clip_border=True):
+        """Validate the arguments for the selected mode and store them."""
+        # Each mode rejects the arguments that only apply to the other one.
+        if not test_mode:
+            assert isinstance(crop_size, (list, tuple))
+            assert crop_size[0] > 0 and crop_size[1] > 0, (
+                'crop_size must > 0 in train mode')
+            assert isinstance(ratios, (list, tuple))
+            assert test_pad_mode is None, (
+                'test_pad_mode must be None in train mode')
+        else:
+            assert crop_size is None, 'crop_size must be None in test mode'
+            assert ratios is None, 'ratios must be None in test mode'
+            assert border is None, 'border must be None in test mode'
+            assert isinstance(test_pad_mode, (list, tuple))
+            assert test_pad_mode[0] in ['logical_or', 'size_divisor']
+
+        self.crop_size, self.ratios, self.border = crop_size, ratios, border
+        # mean, std and to_rgb intentionally have no usable default: they
+        # are easy to forget yet can affect the performance, so they must be
+        # passed explicitly, with the same setting as used for Normalize.
+        assert mean is not None and std is not None and to_rgb is not None
+        self.to_rgb = to_rgb
+        # The values exactly as given are kept for ``__repr__``.
+        self.input_mean, self.input_std = mean, std
+        # With to_rgb set, the statistics are stored in reversed channel
+        # order (presumably because the image is still BGR here - confirm).
+        if to_rgb:
+            self.mean, self.std = mean[::-1], std[::-1]
+        else:
+            self.mean, self.std = mean, std
+        self.test_mode = test_mode
+        self.test_pad_mode = test_pad_mode
+        self.test_pad_add_pix = test_pad_add_pix
+        self.bbox_clip_border = bbox_clip_border
+
+    def _get_border(self, border, size):
+        """Shrink ``border`` until a center range fits inside ``size``.
+
+        The random crop center is drawn from the ``center range``, i.e. the
+        span between ``final_border`` and ``size - final_border``, so that it
+        is never too close to the image border. ``border`` is floor-divided
+        by a power of two chosen from the ratio ``2 * border / size``, which
+        leaves a positive border smaller than half of ``size`` unchanged.
+
+        Args:
+            border (int): The initial border, default is 128.
+            size (int): The width or height of original image.
+        Returns:
+            float: The final border, a whole number held in a float.
+        """
+        ratio = 2 * border / size
+        exponent = np.ceil(np.log2(np.ceil(ratio))) + (ratio == int(ratio))
+        return border // pow(2, exponent)
+
+    def _filter_boxes(self, patch, boxes):
+        """Flag the boxes whose center lies strictly inside the patch.
+
+        Args:
+            patch (list[int]): The cropped area, [left, top, right, bottom].
+            boxes (numpy array, (N x 4)): Ground truth boxes.
+
+        Returns:
+            mask (numpy array, (N,)): True where the box center is inside.
+        """
+        left, top, right, bottom = patch[0], patch[1], patch[2], patch[3]
+        centers = (boxes[:, :2] + boxes[:, 2:]) / 2
+        inside_x = (centers[:, 0] > left) * (centers[:, 0] < right)
+        inside_y = (centers[:, 1] > top) * (centers[:, 1] < bottom)
+        return inside_x * inside_y
+
+    def _crop_image_and_paste(self, image, center, size):
+        """Crop a window around ``center`` and paste it on a mean canvas.
+
+        A canvas of shape ``size`` filled with ``self.mean`` is laid over the
+        original image so that the canvas center coincides with ``center``.
+        Pixels are copied from the original image where the two overlap; the
+        rest of the canvas keeps the mean value. The window extends
+        ``size // 2`` on each side of the center, so for an odd size the last
+        canvas row/column is never overwritten.
+
+        Args:
+            image (np array, H x W x C): Original image.
+            center (list[int]): Target crop center coord, as [y, x].
+            size (list[int]): Target crop size. [target_h, target_w]
+
+        Returns:
+            cropped_img (np array, target_h x target_w x C): Cropped image.
+            border (np array, 4): Position of the copied area inside
+                ``cropped_img``, as float32 [top, bottom, left, right].
+            patch (np array, 4): The cropped area in the original image,
+                [left, top, right, bottom].
+        """
+        cy, cx = center
+        out_h, out_w = size
+        src_h, src_w, channels = image.shape
+        half_h, half_w = out_h // 2, out_w // 2
+
+        # Window in source coordinates, clamped to the image extent.
+        x0 = max(0, cx - half_w)
+        x1 = min(cx + half_w, src_w)
+        y0 = max(0, cy - half_h)
+        y1 = min(cy + half_h, src_h)
+        patch = np.array((int(x0), int(y0), int(x1), int(y1)))
+
+        # The same window in canvas coordinates, measured from its center.
+        dst_top = half_h - (cy - y0)
+        dst_bottom = half_h + (y1 - cy)
+        dst_left = half_w - (cx - x0)
+        dst_right = half_w + (x1 - cx)
+
+        # Start from zeros and add the per-channel mean to fill the canvas.
+        cropped_img = np.zeros((out_h, out_w, channels), dtype=image.dtype)
+        for ch in range(channels):
+            cropped_img[:, :, ch] += self.mean[ch]
+        cropped_img[dst_top:dst_bottom, dst_left:dst_right, :] = \
+            image[y0:y1, x0:x1, :]
+
+        border = np.array([dst_top, dst_bottom, dst_left, dst_right],
+                          dtype=np.float32)
+        return cropped_img, border, patch
+
+    def _train_aug(self, results):
+        """Randomly crop around a sampled center and pad with the mean.
+
+        A scale is drawn from ``ratios`` and up to 50 random centers are
+        tried for it. The first crop holding the center of at least one gt
+        box is used (any crop is fine when there is no gt box); if all 50
+        attempts fail, a new scale is drawn and the search starts again.
+
+        Args:
+            results (dict): Image information in the augment pipeline.
+
+        Returns:
+            results (dict): The updated dict.
+        """
+        img = results['img']
+        img_h, img_w, _ = img.shape
+        gt_boxes = results['gt_bboxes']
+        # The borders depend on the image size only, so compute them once.
+        h_border = self._get_border(self.border, img_h)
+        w_border = self._get_border(self.border, img_w)
+        while True:
+            scale = random.choice(self.ratios)
+            new_h = int(self.crop_size[0] * scale)
+            new_w = int(self.crop_size[1] * scale)
+            for _attempt in range(50):
+                center_x = random.randint(low=w_border, high=img_w - w_border)
+                center_y = random.randint(low=h_border, high=img_h - h_border)
+                cropped_img, _, patch = self._crop_image_and_paste(
+                    img, [center_y, center_x], [new_h, new_w])
+                hits = self._filter_boxes(patch, gt_boxes)
+                # if image do not have valid bbox, any crop patch is valid.
+                if not (hits.any() or len(gt_boxes) == 0):
+                    continue
+
+                results['img'] = cropped_img
+                results['img_shape'] = results['pad_shape'] = cropped_img.shape
+                # Offsets that move original coordinates onto the new canvas.
+                x0, y0 = patch[0], patch[1]
+                left_w, top_h = center_x - x0, center_y - y0
+                shift_x = new_w // 2 - left_w - x0
+                shift_y = new_h // 2 - top_h - y0
+
+                # crop bboxes accordingly and clip to the image boundary
+                for key in results.get('bbox_fields', []):
+                    in_patch = self._filter_boxes(patch, results[key])
+                    bboxes = results[key][in_patch]
+                    bboxes[:, 0:4:2] += shift_x
+                    bboxes[:, 1:4:2] += shift_y
+                    if self.bbox_clip_border:
+                        bboxes[:, 0:4:2] = np.clip(bboxes[:, 0:4:2], 0, new_w)
+                        bboxes[:, 1:4:2] = np.clip(bboxes[:, 1:4:2], 0, new_h)
+                    non_empty = (bboxes[:, 2] > bboxes[:, 0]) & (
+                        bboxes[:, 3] > bboxes[:, 1])
+                    results[key] = bboxes[non_empty]
+                    if key == 'gt_bboxes':
+                        if 'gt_labels' in results:
+                            labels = results['gt_labels'][in_patch]
+                            results['gt_labels'] = labels[non_empty]
+                        if 'gt_masks' in results:
+                            raise NotImplementedError(
+                                'RandomCenterCropPad only supports bbox.')
+
+                # crop semantic seg
+                for _key in results.get('seg_fields', []):
+                    raise NotImplementedError(
+                        'RandomCenterCropPad only supports bbox.')
+                return results
+
+    def _test_aug(self, results):
+        """Pad the original image around its center, without cropping.
+
+        The target shape is derived from ``test_pad_mode``.
+
+        Args:
+            results (dict): Image information in the augment pipeline.
+
+        Returns:
+            results (dict): The updated dict.
+        """
+        img = results['img']
+        height, width, _ = img.shape
+        results['img_shape'] = img.shape
+        pad_mode = self.test_pad_mode[0]
+        if pad_mode == 'logical_or':
+            # self.test_pad_add_pix is only used for centernet
+            pad_bits, extra = self.test_pad_mode[1], self.test_pad_add_pix
+            target_h = (height | pad_bits) + extra
+            target_w = (width | pad_bits) + extra
+        elif pad_mode == 'size_divisor':
+            divisor = self.test_pad_mode[1]
+            target_h = int(np.ceil(height / divisor)) * divisor
+            target_w = int(np.ceil(width / divisor)) * divisor
+        else:
+            raise NotImplementedError(
+                'RandomCenterCropPad only support two testing pad mode:'
+                'logical-or and size_divisor.')
+        padded_img, border, _ = self._crop_image_and_paste(
+            img, [height // 2, width // 2], [target_h, target_w])
+        results.update(
+            img=padded_img, pad_shape=padded_img.shape, border=border)
+        return results
+
+    def __call__(self, results):
+        """Check the input image, then run the train or test augmentation."""
+        image = results['img']
+        assert image.dtype == np.float32, (
+            'RandomCenterCropPad needs the input image of dtype np.float32,'
+            ' please set "to_float32=True" in "LoadImageFromFile" pipeline')
+        _, _, num_channels = image.shape
+        # One mean value is required per image channel.
+        assert num_channels == len(self.mean)
+        augment = self._test_aug if self.test_mode else self._train_aug
+        return augment(results)
+
+    def __repr__(self):
+        """Return the class name followed by the main constructor settings."""
+        # test_pad_add_pix is not part of the representation.
+        fields = (
+            f'crop_size={self.crop_size}', f'ratios={self.ratios}',
+            f'border={self.border}', f'mean={self.input_mean}',
+            f'std={self.input_std}', f'to_rgb={self.to_rgb}',
+            f'test_mode={self.test_mode}',
+            f'test_pad_mode={self.test_pad_mode}',
+            f'bbox_clip_border={self.bbox_clip_border}',
+        )
+        return self.__class__.__name__ + '(' + ', '.join(fields) + ')'
+
+
+@PIPELINES.register_module()
+class CutOut:
+    """CutOut operation.
+
+    Randomly drop some regions of image used in
+    `Cutout <https://arxiv.org/abs/1708.04552>`_.
+
+    Args:
+        n_holes (int | tuple[int, int] | list[int]): Number of regions to be
+            dropped. If it is given as a pair, number of holes will be randomly
+            selected from the closed interval [`n_holes[0]`, `n_holes[1]`].
+        cutout_shape (tuple[int, int] | list[tuple[int, int]]): The candidate
+            shape of dropped regions as (width, height) in pixels. It can be
+            `tuple[int, int]` to use a fixed cutout shape, or
+            `list[tuple[int, int]]` to randomly choose shape from the list.
+        cutout_ratio (tuple[float, float] | list[tuple[float, float]]): The
+            candidate (width, height) ratio of dropped regions to the image.
+            It can be `tuple[float, float]` to use a fixed ratio or
+            `list[tuple[float, float]]` to randomly choose ratio from the
+            list. Exactly one of `cutout_shape`/`cutout_ratio` must be given.
+        fill_in (tuple[float, float, float] | tuple[int, int, int]): The value
+            of pixel to fill in the dropped regions. Default: (0, 0, 0).
+    """
+
+    def __init__(self,
+                 n_holes,
+                 cutout_shape=None,
+                 cutout_ratio=None,
+                 fill_in=(0, 0, 0)):
+        # Exactly one of cutout_shape / cutout_ratio has to be provided.
+        assert (cutout_shape is None) ^ (cutout_ratio is None), \
+            'Either cutout_shape or cutout_ratio should be specified.'
+        assert (isinstance(cutout_shape, (list, tuple))
+                or isinstance(cutout_ratio, (list, tuple)))
+        if isinstance(n_holes, (list, tuple)):
+            assert len(n_holes) == 2 and 0 <= n_holes[0] < n_holes[1]
+        else:
+            n_holes = (n_holes, n_holes)
+        self.n_holes = tuple(n_holes)
+        self.fill_in = fill_in
+        self.with_ratio = cutout_ratio is not None
+        self.candidates = cutout_ratio if self.with_ratio else cutout_shape
+        if not isinstance(self.candidates, list):
+            self.candidates = [self.candidates]
+
+    def __call__(self, results):
+        """Call function to drop some regions of image."""
+        h, w, _ = results['img'].shape
+        n_holes = np.random.randint(self.n_holes[0], self.n_holes[1] + 1)
+        for _ in range(n_holes):
+            x1 = np.random.randint(0, w)
+            y1 = np.random.randint(0, h)
+            index = np.random.randint(0, len(self.candidates))
+            if not self.with_ratio:
+                cutout_w, cutout_h = self.candidates[index]
+            else:
+                cutout_w = int(self.candidates[index][0] * w)
+                cutout_h = int(self.candidates[index][1] * h)
+            # A hole reaching past the image is cut at the right/bottom edge.
+            x2 = np.clip(x1 + cutout_w, 0, w)
+            y2 = np.clip(y1 + cutout_h, 0, h)
+            results['img'][y1:y2, x1:x2, :] = self.fill_in
+
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(n_holes={self.n_holes}, '
+        repr_str += (f'cutout_ratio={self.candidates}, ' if self.with_ratio
+                     else f'cutout_shape={self.candidates}, ')
+        repr_str += f'fill_in={self.fill_in})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class Mosaic:
+    """Mosaic augmentation.
+
+    Given 4 images, mosaic transform combines them into
+    one output image. The output image is composed of the parts from each sub-
+    image.
+
+    .. code:: text
+
+                        mosaic transform
+                           center_x
+                +------------------------------+
+                |       pad        |  pad      |
+                |      +-----------+           |
+                |      |           |           |
+                |      |  image1   |--------+  |
+                |      |           |        |  |
+                |      |           | image2 |  |
+     center_y   |----+-------------+-----------|
+                |    |   cropped   |           |
+                |pad |   image3    |  image4   |
+                |    |             |           |
+                +----|-------------+-----------+
+                     |             |
+                     +-------------+
+
+     The mosaic transform steps are as follows:
+
+         1. Choose the mosaic center as the intersections of 4 images
+         2. Get the left top image according to the index, and randomly
+            sample another 3 images from the custom dataset.
+         3. Sub image will be cropped if image is larger than mosaic patch
+
+    Args:
+        img_scale (Sequence[int]): Image size after mosaic pipeline of single
+            image. The shape order should be (height, width).
+            Default to (640, 640).
+        center_ratio_range (Sequence[float]): Center ratio range of mosaic
+            output. Default to (0.5, 1.5).
+        min_bbox_size (int | float): The minimum pixel for filtering
+            invalid bboxes after the mosaic pipeline. Default to 0.
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        skip_filter (bool): Whether to skip filtering rules. If it
+            is True, the filter rule will not be applied, and the
+            `min_bbox_size` is invalid. Default to True.
+        pad_val (int): Pad value. Default to 114.
+        prob (float): Probability of applying this transformation.
+            Default to 1.0.
+    """
+
+    def __init__(self,
+                 img_scale=(640, 640),
+                 center_ratio_range=(0.5, 1.5),
+                 min_bbox_size=0,
+                 bbox_clip_border=True,
+                 skip_filter=True,
+                 pad_val=114,
+                 prob=1.0):
+        assert isinstance(img_scale, tuple)
+        assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. '\
+            f'got {prob}.'
+
+        log_img_scale(img_scale, skip_square=True)
+        self.img_scale = img_scale
+        self.center_ratio_range = center_ratio_range
+        self.min_bbox_size = min_bbox_size
+        self.bbox_clip_border = bbox_clip_border
+        self.skip_filter = skip_filter
+        self.pad_val = pad_val
+        self.prob = prob
+
+    def __call__(self, results):
+        """Call function to make a mosaic of image.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            dict: Result dict with mosaic transformed.
+        """
+
+        if random.uniform(0, 1) > self.prob:
+            return results
+
+        results = self._mosaic_transform(results)
+        return results
+
+    def get_indexes(self, dataset):
+        """Randomly sample the indexes of the 3 images to mix in.
+
+        Args:
+            dataset (:obj:`MultiImageMixDataset`): The dataset.
+
+        Returns:
+            list: indexes.
+        """
+
+        indexes = [random.randint(0, len(dataset)) for _ in range(3)]
+        return indexes
+
+    def _mosaic_transform(self, results):
+        """Mosaic transform function.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            dict: Updated result dict.
+        """
+
+        assert 'mix_results' in results
+        mosaic_labels = []
+        mosaic_bboxes = []
+        if len(results['img'].shape) == 3:
+            mosaic_img = np.full(
+                (int(self.img_scale[0] * 2), int(self.img_scale[1] * 2), 3),
+                self.pad_val,
+                dtype=results['img'].dtype)
+        else:
+            mosaic_img = np.full(
+                (int(self.img_scale[0] * 2), int(self.img_scale[1] * 2)),
+                self.pad_val,
+                dtype=results['img'].dtype)
+
+        # mosaic center x, y
+        center_x = int(
+            random.uniform(*self.center_ratio_range) * self.img_scale[1])
+        center_y = int(
+            random.uniform(*self.center_ratio_range) * self.img_scale[0])
+        center_position = (center_x, center_y)
+
+        loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right')
+        for i, loc in enumerate(loc_strs):
+            if loc == 'top_left':
+                results_patch = copy.deepcopy(results)
+            else:
+                results_patch = copy.deepcopy(results['mix_results'][i - 1])
+
+            img_i = results_patch['img']
+            h_i, w_i = img_i.shape[:2]
+            # keep_ratio resize
+            scale_ratio_i = min(self.img_scale[0] / h_i,
+                                self.img_scale[1] / w_i)
+            img_i = mmcv.imresize(
+                img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i)))
+
+            # compute the combine parameters
+            paste_coord, crop_coord = self._mosaic_combine(
+                loc, center_position, img_i.shape[:2][::-1])
+            x1_p, y1_p, x2_p, y2_p = paste_coord
+            x1_c, y1_c, x2_c, y2_c = crop_coord
+
+            # crop and paste image
+            mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c]
+
+            # adjust coordinate
+            gt_bboxes_i = results_patch['gt_bboxes']
+            gt_labels_i = results_patch['gt_labels']
+
+            if gt_bboxes_i.shape[0] > 0:
+                padw = x1_p - x1_c
+                padh = y1_p - y1_c
+                gt_bboxes_i[:, 0::2] = \
+                    scale_ratio_i * gt_bboxes_i[:, 0::2] + padw
+                gt_bboxes_i[:, 1::2] = \
+                    scale_ratio_i * gt_bboxes_i[:, 1::2] + padh
+
+            mosaic_bboxes.append(gt_bboxes_i)
+            mosaic_labels.append(gt_labels_i)
+
+        if len(mosaic_labels) > 0:
+            mosaic_bboxes = np.concatenate(mosaic_bboxes, 0)
+            mosaic_labels = np.concatenate(mosaic_labels, 0)
+
+            if self.bbox_clip_border:
+                mosaic_bboxes[:, 0::2] = np.clip(mosaic_bboxes[:, 0::2], 0,
+                                                 2 * self.img_scale[1])
+                mosaic_bboxes[:, 1::2] = np.clip(mosaic_bboxes[:, 1::2], 0,
+                                                 2 * self.img_scale[0])
+
+            if not self.skip_filter:
+                mosaic_bboxes, mosaic_labels = \
+                    self._filter_box_candidates(mosaic_bboxes, mosaic_labels)
+
+        # remove outside bboxes
+        inside_inds = find_inside_bboxes(mosaic_bboxes, 2 * self.img_scale[0],
+                                         2 * self.img_scale[1])
+        mosaic_bboxes = mosaic_bboxes[inside_inds]
+        mosaic_labels = mosaic_labels[inside_inds]
+
+        results['img'] = mosaic_img
+        results['img_shape'] = mosaic_img.shape
+        results['gt_bboxes'] = mosaic_bboxes
+        results['gt_labels'] = mosaic_labels
+
+        return results
+
+    def _mosaic_combine(self, loc, center_position_xy, img_shape_wh):
+        """Calculate global coordinate of mosaic image and local coordinate of
+        cropped sub-image.
+
+        Args:
+            loc (str): Index for the sub-image, loc in ('top_left',
+              'top_right', 'bottom_left', 'bottom_right').
+            center_position_xy (Sequence[float]): Mixing center for 4 images,
+                (x, y).
+            img_shape_wh (Sequence[int]): Width and height of sub-image
+
+        Returns:
+            tuple[tuple[float]]: Corresponding coordinate of pasting and
+                cropping
+                - paste_coord (tuple): paste corner coordinate in mosaic image.
+                - crop_coord (tuple): crop corner coordinate in the sub-image.
+        """
+        assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right')
+        if loc == 'top_left':
+            # index0 to top left part of image
+            x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \
+                             max(center_position_xy[1] - img_shape_wh[1], 0), \
+                             center_position_xy[0], \
+                             center_position_xy[1]
+            crop_coord = img_shape_wh[0] - (x2 - x1), img_shape_wh[1] - (
+                y2 - y1), img_shape_wh[0], img_shape_wh[1]
+
+        elif loc == 'top_right':
+            # index1 to top right part of image
+            x1, y1, x2, y2 = center_position_xy[0], \
+                             max(center_position_xy[1] - img_shape_wh[1], 0), \
+                             min(center_position_xy[0] + img_shape_wh[0],
+                                 self.img_scale[1] * 2), \
+                             center_position_xy[1]
+            crop_coord = 0, img_shape_wh[1] - (y2 - y1), min(
+                img_shape_wh[0], x2 - x1), img_shape_wh[1]
+
+        elif loc == 'bottom_left':
+            # index2 to bottom left part of image
+            x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \
+                             center_position_xy[1], \
+                             center_position_xy[0], \
+                             min(self.img_scale[0] * 2, center_position_xy[1] +
+                                 img_shape_wh[1])
+            crop_coord = img_shape_wh[0] - (x2 - x1), 0, img_shape_wh[0], min(
+                y2 - y1, img_shape_wh[1])
+
+        else:
+            # index3 to bottom right part of image
+            x1, y1, x2, y2 = center_position_xy[0], \
+                             center_position_xy[1], \
+                             min(center_position_xy[0] + img_shape_wh[0],
+                                 self.img_scale[1] * 2), \
+                             min(self.img_scale[0] * 2, center_position_xy[1] +
+                                 img_shape_wh[1])
+            crop_coord = 0, 0, min(img_shape_wh[0],
+                                   x2 - x1), min(y2 - y1, img_shape_wh[1])
+
+        paste_coord = x1, y1, x2, y2
+        return paste_coord, crop_coord
+
+    def _filter_box_candidates(self, bboxes, labels):
+        """Filter out bboxes too small after Mosaic."""
+        bbox_w = bboxes[:, 2] - bboxes[:, 0]
+        bbox_h = bboxes[:, 3] - bboxes[:, 1]
+        valid_inds = (bbox_w > self.min_bbox_size) & \
+                     (bbox_h > self.min_bbox_size)
+        valid_inds = np.nonzero(valid_inds)[0]
+        return bboxes[valid_inds], labels[valid_inds]
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(img_scale={self.img_scale}, '
+        repr_str += f'center_ratio_range={self.center_ratio_range}, '
+        repr_str += f'pad_val={self.pad_val}, '
+        repr_str += f'min_bbox_size={self.min_bbox_size}, '
+        repr_str += f'skip_filter={self.skip_filter})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class MixUp:
+    """MixUp data augmentation.
+
+    .. code:: text
+
+                         mixup transform
+                +------------------------------+
+                | mixup image   |              |
+                |      +--------|--------+     |
+                |      |        |        |     |
+                |---------------+        |     |
+                |      |                 |     |
+                |      |      image      |     |
+                |      |                 |     |
+                |      |                 |     |
+                |      |-----------------+     |
+                |             pad              |
+                +------------------------------+
+
+     The mixup transform steps are as follows:
+
+        1. Another random image is picked by dataset and embedded in
+           the top left patch(after padding and resizing)
+        2. The target of mixup transform is the weighted average of mixup
+           image and origin image.
+
+    Args:
+        img_scale (Sequence[int]): Image output size after mixup pipeline.
+            The shape order should be (height, width). Default: (640, 640).
+        ratio_range (Sequence[float]): Scale ratio of mixup image.
+            Default: (0.5, 1.5).
+        flip_ratio (float): Horizontal flip ratio of mixup image.
+            Default: 0.5.
+        pad_val (int): Pad value. Default: 114.
+        max_iters (int): The maximum number of iterations. If the number of
+            iterations is greater than `max_iters`, but gt_bbox is still
+            empty, then the iteration is terminated. Default: 15.
+        min_bbox_size (float): Width and height threshold to filter bboxes.
+            If the height or width of a box is smaller than this value, it
+            will be removed. Default: 5.
+        min_area_ratio (float): Threshold of area ratio between
+            original bboxes and wrapped bboxes. If smaller than this value,
+            the box will be removed. Default: 0.2.
+        max_aspect_ratio (float): Aspect ratio of width and height
+            threshold to filter bboxes. If max(h/w, w/h) larger than this
+            value, the box will be removed. Default: 20.
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        skip_filter (bool): Whether to skip filtering rules. If it
+            is True, the filter rule will not be applied, and the
+            `min_bbox_size` and `min_area_ratio` and `max_aspect_ratio`
+            is invalid. Default to True.
+    """
+
+    def __init__(self,
+                 img_scale=(640, 640),
+                 ratio_range=(0.5, 1.5),
+                 flip_ratio=0.5,
+                 pad_val=114,
+                 max_iters=15,
+                 min_bbox_size=5,
+                 min_area_ratio=0.2,
+                 max_aspect_ratio=20,
+                 bbox_clip_border=True,
+                 skip_filter=True):
+        assert isinstance(img_scale, tuple)
+        log_img_scale(img_scale, skip_square=True)
+        self.dynamic_scale = img_scale
+        self.ratio_range = ratio_range
+        self.flip_ratio = flip_ratio
+        self.pad_val = pad_val
+        self.max_iters = max_iters
+        self.min_bbox_size = min_bbox_size
+        self.min_area_ratio = min_area_ratio
+        self.max_aspect_ratio = max_aspect_ratio
+        self.bbox_clip_border = bbox_clip_border
+        self.skip_filter = skip_filter
+
+    def __call__(self, results):
+        """Call function to make a mixup of image.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            dict: Result dict with mixup transformed.
+        """
+
+        results = self._mixup_transform(results)
+        return results
+
+    def get_indexes(self, dataset):
+        """Call function to collect indexes.
+
+        Args:
+            dataset (:obj:`MultiImageMixDataset`): The dataset.
+
+        Returns:
+            list: indexes.
+        """
+
+        for i in range(self.max_iters):
+            index = random.randint(0, len(dataset))
+            gt_bboxes_i = dataset.get_ann_info(index)['bboxes']
+            if len(gt_bboxes_i) != 0:
+                break
+
+        return index
+
+    def _mixup_transform(self, results):
+        """Blend ``results['img']`` 50/50 with the ``mix_results`` image.
+
+        Args:
+            results (dict): Result dict holding exactly one ``mix_results``.
+
+        Returns:
+            dict: Updated result dict (unchanged if the mix has no bbox).
+        """
+        # Both asserts: exactly one auxiliary sample must be attached.
+        assert 'mix_results' in results
+        assert len(
+            results['mix_results']) == 1, 'MixUp only support 2 images now !'
+
+        if results['mix_results'][0]['gt_bboxes'].shape[0] == 0:
+            # empty bbox: nothing to mix in, results is returned untouched
+            return results
+
+        retrieve_results = results['mix_results'][0]
+        retrieve_img = retrieve_results['img']
+
+        jit_factor = random.uniform(*self.ratio_range)
+        is_filp = random.uniform(0, 1) < self.flip_ratio
+
+        if len(retrieve_img.shape) == 3:
+            out_img = np.ones(
+                (self.dynamic_scale[0], self.dynamic_scale[1], 3),
+                dtype=retrieve_img.dtype) * self.pad_val
+        else:
+            out_img = np.ones(
+                self.dynamic_scale, dtype=retrieve_img.dtype) * self.pad_val
+
+        # 1. keep_ratio resize so the mix image fits inside dynamic_scale
+        scale_ratio = min(self.dynamic_scale[0] / retrieve_img.shape[0],
+                          self.dynamic_scale[1] / retrieve_img.shape[1])
+        retrieve_img = mmcv.imresize(
+            retrieve_img, (int(retrieve_img.shape[1] * scale_ratio),
+                           int(retrieve_img.shape[0] * scale_ratio)))
+
+        # 2. paste into the top-left corner of the pad_val canvas
+        out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img
+
+        # 3. scale jit: resize the whole canvas; scale_ratio tracks the total
+        scale_ratio *= jit_factor
+        out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor),
+                                          int(out_img.shape[0] * jit_factor)))
+
+        # 4. flip horizontally (the indexing assumes a 3-dim image)
+        if is_filp:
+            out_img = out_img[:, ::-1, :]
+
+        # 5. random crop (canvas is always 3-channel uint8, zero-padded)
+        ori_img = results['img']
+        origin_h, origin_w = out_img.shape[:2]
+        target_h, target_w = ori_img.shape[:2]
+        padded_img = np.zeros(
+            (max(origin_h, target_h), max(origin_w,
+                                          target_w), 3)).astype(np.uint8)
+        padded_img[:origin_h, :origin_w] = out_img
+
+        x_offset, y_offset = 0, 0
+        if padded_img.shape[0] > target_h:
+            y_offset = random.randint(0, padded_img.shape[0] - target_h)
+        if padded_img.shape[1] > target_w:
+            x_offset = random.randint(0, padded_img.shape[1] - target_w)
+        padded_cropped_img = padded_img[y_offset:y_offset + target_h,
+                                        x_offset:x_offset + target_w]
+
+        # 6. adjust bbox (in place on the mix sample's ``gt_bboxes`` array)
+        retrieve_gt_bboxes = retrieve_results['gt_bboxes']
+        retrieve_gt_bboxes[:, 0::2] = retrieve_gt_bboxes[:, 0::2] * scale_ratio
+        retrieve_gt_bboxes[:, 1::2] = retrieve_gt_bboxes[:, 1::2] * scale_ratio
+        if self.bbox_clip_border:
+            retrieve_gt_bboxes[:, 0::2] = np.clip(retrieve_gt_bboxes[:, 0::2],
+                                                  0, origin_w)
+            retrieve_gt_bboxes[:, 1::2] = np.clip(retrieve_gt_bboxes[:, 1::2],
+                                                  0, origin_h)
+
+        if is_filp:
+            retrieve_gt_bboxes[:, 0::2] = (
+                origin_w - retrieve_gt_bboxes[:, 0::2][:, ::-1])
+
+        # 7. filter: shift a copy into crop coordinates for the checks below
+        cp_retrieve_gt_bboxes = retrieve_gt_bboxes.copy()
+        cp_retrieve_gt_bboxes[:, 0::2] = \
+            cp_retrieve_gt_bboxes[:, 0::2] - x_offset
+        cp_retrieve_gt_bboxes[:, 1::2] = \
+            cp_retrieve_gt_bboxes[:, 1::2] - y_offset
+        if self.bbox_clip_border:
+            cp_retrieve_gt_bboxes[:, 0::2] = np.clip(
+                cp_retrieve_gt_bboxes[:, 0::2], 0, target_w)
+            cp_retrieve_gt_bboxes[:, 1::2] = np.clip(
+                cp_retrieve_gt_bboxes[:, 1::2], 0, target_h)
+
+        # 8. mix up: equal-weight average computed in float32
+        ori_img = ori_img.astype(np.float32)
+        mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img.astype(np.float32)
+
+        retrieve_gt_labels = retrieve_results['gt_labels']
+        if not self.skip_filter:
+            keep_list = self._filter_box_candidates(retrieve_gt_bboxes.T,
+                                                    cp_retrieve_gt_bboxes.T)
+
+            retrieve_gt_labels = retrieve_gt_labels[keep_list]
+            cp_retrieve_gt_bboxes = cp_retrieve_gt_bboxes[keep_list]
+
+        mixup_gt_bboxes = np.concatenate(
+            (results['gt_bboxes'], cp_retrieve_gt_bboxes), axis=0)
+        mixup_gt_labels = np.concatenate(
+            (results['gt_labels'], retrieve_gt_labels), axis=0)
+
+        # remove outside bbox (as judged by find_inside_bboxes)
+        inside_inds = find_inside_bboxes(mixup_gt_bboxes, target_h, target_w)
+        mixup_gt_bboxes = mixup_gt_bboxes[inside_inds]
+        mixup_gt_labels = mixup_gt_labels[inside_inds]
+
+        results['img'] = mixup_img.astype(np.uint8)
+        results['img_shape'] = mixup_img.shape
+        results['gt_bboxes'] = mixup_gt_bboxes
+        results['gt_labels'] = mixup_gt_labels
+
+        return results
+
+    def _filter_box_candidates(self, bbox1, bbox2):
+        """Return a boolean mask of augmented boxes worth keeping.
+
+        ``bbox1``/``bbox2`` are the boxes before/after augmentation, indexed
+        as (x1, y1, x2, y2) along axis 0. Thresholds: ``min_bbox_size``
+        (pixels), ``min_area_ratio`` and ``max_aspect_ratio``.
+        """
+        eps = 1e-16  # avoids division by zero for degenerate boxes
+        old_w, old_h = bbox1[2] - bbox1[0], bbox1[3] - bbox1[1]
+        new_w, new_h = bbox2[2] - bbox2[0], bbox2[3] - bbox2[1]
+        aspect = np.maximum(new_w / (new_h + eps), new_h / (new_w + eps))
+        size_ok = (new_w > self.min_bbox_size) & (new_h > self.min_bbox_size)
+        area_ok = new_w * new_h / (old_w * old_h + eps) > self.min_area_ratio
+        return size_ok & area_ok & (aspect < self.max_aspect_ratio)
+
+    def __repr__(self):
+        """Format the transform as ``ClassName(param=value, ...)``."""
+        return (f'{self.__class__.__name__}('
+                f'dynamic_scale={self.dynamic_scale}, '
+                f'ratio_range={self.ratio_range}, '
+                f'flip_ratio={self.flip_ratio}, '
+                f'pad_val={self.pad_val}, '
+                f'max_iters={self.max_iters}, '
+                f'min_bbox_size={self.min_bbox_size}, '
+                f'min_area_ratio={self.min_area_ratio}, '
+                f'max_aspect_ratio={self.max_aspect_ratio}, '
+                f'skip_filter={self.skip_filter})')
+
+
+@PIPELINES.register_module()
+class RandomAffine:
+    """Random affine transform data augmentation.
+
+    This operation randomly generates an affine transform matrix which
+    includes rotation, translation, shear and scaling transforms.
+
+    Args:
+        max_rotate_degree (float): Maximum degrees of rotation transform.
+            Default: 10.
+        max_translate_ratio (float): Maximum ratio of translation.
+            Default: 0.1.
+        scaling_ratio_range (tuple[float]): Min and max ratio of
+            scaling transform. Default: (0.5, 1.5).
+        max_shear_degree (float): Maximum degrees of shear
+            transform. Default: 2.
+        border (tuple[int]): Distance from height and width sides of input
+            image to adjust output shape. Only used in mosaic dataset.
+            Default: (0, 0).
+        border_val (tuple[int]): Border padding values of 3 channels.
+            Default: (114, 114, 114).
+        min_bbox_size (float): Width and height threshold to filter bboxes.
+            If the height or width of a box is smaller than this value, it
+            will be removed. Default: 2.
+        min_area_ratio (float): Threshold of area ratio between
+            original bboxes and warped bboxes. If smaller than this value,
+            the box will be removed. Default: 0.2.
+        max_aspect_ratio (float): Aspect ratio of width and height
+            threshold to filter bboxes. If max(h/w, w/h) larger than this
+            value, the box will be removed. Default: 20.
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        skip_filter (bool): Whether to skip filtering rules. If it
+            is True, the filter rules will not be applied, and
+            `min_bbox_size`, `min_area_ratio` and `max_aspect_ratio`
+            are ignored. Defaults to True.
+    """
+
+    def __init__(self,
+                 max_rotate_degree=10.0,
+                 max_translate_ratio=0.1,
+                 scaling_ratio_range=(0.5, 1.5),
+                 max_shear_degree=2.0,
+                 border=(0, 0),
+                 border_val=(114, 114, 114),
+                 min_bbox_size=2,
+                 min_area_ratio=0.2,
+                 max_aspect_ratio=20,
+                 bbox_clip_border=True,
+                 skip_filter=True):
+        # Validate the ranges up front.
+        assert 0 <= max_translate_ratio <= 1
+        assert 0 < scaling_ratio_range[0] <= scaling_ratio_range[1]
+        self.max_rotate_degree = max_rotate_degree
+        self.max_translate_ratio = max_translate_ratio
+        self.scaling_ratio_range = scaling_ratio_range
+        self.max_shear_degree = max_shear_degree
+        self.border = border
+        self.border_val = border_val
+        self.min_bbox_size = min_bbox_size
+        self.min_area_ratio = min_area_ratio
+        self.max_aspect_ratio = max_aspect_ratio
+        self.bbox_clip_border = bbox_clip_border
+        self.skip_filter = skip_filter
+
+    def __call__(self, results):
+        """Warp the image and its bboxes with one random affine matrix.
+
+        The matrix is ``translate @ shear @ rotate @ scale``; each bbox is
+        replaced by the axis-aligned box around its four warped corners.
+
+        Args:
+            results (dict): Result dict; ``img`` is read and every key
+                listed in ``bbox_fields`` is transformed.
+
+        Returns:
+            dict: The same dict with ``img``, ``img_shape``, the bbox
+                fields and (for ``gt_bboxes``) ``gt_labels`` updated.
+
+        Raises:
+            NotImplementedError: If ``gt_masks`` is present while a
+                non-empty bbox field is being processed.
+        """
+        src = results['img']
+        out_h = src.shape[0] + self.border[0] * 2
+        out_w = src.shape[1] + self.border[1] * 2
+
+        # The six draws keep their order (rotation, scaling, shear x/y,
+        # translation x/y) so that seeded runs give the same samples.
+        max_rotate, max_shear = self.max_rotate_degree, self.max_shear_degree
+        max_shift = self.max_translate_ratio
+
+        rotation_degree = random.uniform(-max_rotate, max_rotate)
+        scaling_ratio = random.uniform(self.scaling_ratio_range[0],
+                                       self.scaling_ratio_range[1])
+        x_degree = random.uniform(-max_shear, max_shear)
+        y_degree = random.uniform(-max_shear, max_shear)
+        trans_x = random.uniform(-max_shift, max_shift) * out_w
+        trans_y = random.uniform(-max_shift, max_shift) * out_h
+
+        # Right-most factor acts first: scale, rotate, shear, translate.
+        warp_matrix = (
+            self._get_translation_matrix(trans_x, trans_y)
+            @ self._get_shear_matrix(x_degree, y_degree)
+            @ self._get_rotation_matrix(rotation_degree)
+            @ self._get_scaling_matrix(scaling_ratio))
+
+        warped = cv2.warpPerspective(
+            src, warp_matrix, dsize=(out_w, out_h),
+            borderValue=self.border_val)
+        results['img'] = warped
+        results['img_shape'] = warped.shape
+
+        for key in results.get('bbox_fields', []):
+            bboxes = results[key]
+            num_bboxes = len(bboxes)
+            if not num_bboxes:
+                continue
+
+            # homogeneous coordinates of the four corners of every box
+            xs = bboxes[:, [0, 0, 2, 2]].reshape(num_bboxes * 4)
+            ys = bboxes[:, [1, 3, 3, 1]].reshape(num_bboxes * 4)
+            corners = warp_matrix @ np.vstack([xs, ys, np.ones_like(xs)])
+            corners = corners[:2] / corners[2]
+            xs = corners[0].reshape(num_bboxes, 4)
+            ys = corners[1].reshape(num_bboxes, 4)
+
+            # axis-aligned box around the warped corners
+            warp_bboxes = np.vstack(
+                (xs.min(1), ys.min(1), xs.max(1), ys.max(1))).T
+
+            if self.bbox_clip_border:
+                warp_bboxes[:, [0, 2]] = warp_bboxes[:, [0, 2]].clip(0, out_w)
+                warp_bboxes[:, [1, 3]] = warp_bboxes[:, [1, 3]].clip(0, out_h)
+
+            # remove outside bbox
+            valid_index = find_inside_bboxes(warp_bboxes, out_h, out_w)
+            if not self.skip_filter:
+                valid_index = valid_index & self.filter_gt_bboxes(
+                    bboxes * scaling_ratio, warp_bboxes)
+
+            results[key] = warp_bboxes[valid_index]
+            if key == 'gt_bboxes' and 'gt_labels' in results:
+                results['gt_labels'] = results['gt_labels'][valid_index]
+            if 'gt_masks' in results:
+                raise NotImplementedError('RandomAffine only supports bbox.')
+        return results
+
+    def filter_gt_bboxes(self, origin_bboxes, wrapped_bboxes):
+        """Return a boolean mask of warped boxes passing all three filters."""
+        eps = 1e-16  # avoids division by zero for degenerate boxes
+        src_area = ((origin_bboxes[:, 2] - origin_bboxes[:, 0]) *
+                    (origin_bboxes[:, 3] - origin_bboxes[:, 1]))
+        new_w = wrapped_bboxes[:, 2] - wrapped_bboxes[:, 0]
+        new_h = wrapped_bboxes[:, 3] - wrapped_bboxes[:, 1]
+
+        size_ok = (new_w > self.min_bbox_size) & (new_h > self.min_bbox_size)
+        area_ok = new_w * new_h / (src_area + eps) > self.min_area_ratio
+        # max(w/h, h/w) so that tall and wide boxes are judged alike
+        aspect = np.maximum(new_w / (new_h + eps), new_h / (new_w + eps))
+        aspect_ok = aspect < self.max_aspect_ratio
+        return size_ok & area_ok & aspect_ok
+
+    def __repr__(self):
+        """Format the transform as ``ClassName(param=value, ...)``."""
+        return (f'{self.__class__.__name__}'
+                f'(max_rotate_degree={self.max_rotate_degree}, '
+                f'max_translate_ratio={self.max_translate_ratio}, '
+                f'scaling_ratio={self.scaling_ratio_range}, '
+                f'max_shear_degree={self.max_shear_degree}, '
+                f'border={self.border}, '
+                f'border_val={self.border_val}, '
+                f'min_bbox_size={self.min_bbox_size}, '
+                f'min_area_ratio={self.min_area_ratio}, '
+                f'max_aspect_ratio={self.max_aspect_ratio}, '
+                f'skip_filter={self.skip_filter})')
+
+    @staticmethod
+    def _get_rotation_matrix(rotate_degrees):
+        """Return the 3x3 rotation matrix for an angle given in degrees."""
+        radian = math.radians(rotate_degrees)
+        cos_a, sin_a = np.cos(radian), np.sin(radian)
+        return np.array(
+            [[cos_a, -sin_a, 0.], [sin_a, cos_a, 0.], [0., 0., 1.]],
+            dtype=np.float32)
+
+    @staticmethod
+    def _get_scaling_matrix(scale_ratio):
+        """Return the 3x3 matrix scaling x and y by ``scale_ratio``."""
+        return np.array(
+            [[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]],
+            dtype=np.float32)
+
+    @staticmethod
+    def _get_share_matrix(scale_ratio):
+        """Return a uniform scaling matrix (same as `_get_scaling_matrix`)."""
+        # NOTE(review): not called inside this class; the name looks like a
+        # typo of "shear" although the result is the scaling matrix.
+        return RandomAffine._get_scaling_matrix(scale_ratio)
+
+    @staticmethod
+    def _get_shear_matrix(x_shear_degrees, y_shear_degrees):
+        """Return the 3x3 shear matrix for x/y shear angles in degrees."""
+        tan_x = np.tan(math.radians(x_shear_degrees))
+        tan_y = np.tan(math.radians(y_shear_degrees))
+        return np.array(
+            [[1, tan_x, 0.], [tan_y, 1, 0.], [0., 0., 1.]],
+            dtype=np.float32)
+
+    @staticmethod
+    def _get_translation_matrix(x, y):
+        """Return the 3x3 matrix translating by ``(x, y)``."""
+        shift_rows = [[1, 0., x], [0., 1, y], [0., 0., 1.]]
+        return np.array(shift_rows, dtype=np.float32)
+
+
+@PIPELINES.register_module()
+class YOLOXHSVRandomAug:
+    """Apply HSV augmentation to image sequentially.
+
+    It is referenced from the YOLOX repository:
+    https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/data/data_augment.py#L21
+
+    Args:
+        hue_delta (int): delta of hue. Default: 5.
+        saturation_delta (int): delta of saturation. Default: 30.
+        value_delta (int): delta of value. Default: 30.
+    """
+
+    def __init__(self, hue_delta=5, saturation_delta=30, value_delta=30):
+        self.hue_delta = hue_delta
+        self.saturation_delta = saturation_delta
+        self.value_delta = value_delta
+
+    def __call__(self, results):
+        """Jitter hue, saturation and value of ``results['img']``."""
+        img = results['img']
+        deltas = [self.hue_delta, self.saturation_delta, self.value_delta]
+        gains = np.random.uniform(-1, 1, 3) * deltas
+        # random selection of h, s, v: each gain is kept or zeroed
+        gains *= np.random.randint(0, 2, 3)
+        # int16 gives headroom so the additions below cannot overflow
+        hue_gain, sat_gain, val_gain = gains.astype(np.int16)
+        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.int16)
+        hsv[..., 0] = (hsv[..., 0] + hue_gain) % 180
+        hsv[..., 1] = np.clip(hsv[..., 1] + sat_gain, 0, 255)
+        hsv[..., 2] = np.clip(hsv[..., 2] + val_gain, 0, 255)
+        # dst=img asks OpenCV to write the result into the input array
+        cv2.cvtColor(hsv.astype(img.dtype), cv2.COLOR_HSV2BGR, dst=img)
+        results['img'] = img
+        return results
+
+    def __repr__(self):
+        """Format the transform as ``ClassName(param=value, ...)``."""
+        return (f'{self.__class__.__name__}'
+                f'(hue_delta={self.hue_delta}, '
+                f'saturation_delta={self.saturation_delta}, '
+                f'value_delta={self.value_delta})')
+
+
+@PIPELINES.register_module()
+class CopyPaste:
+    """Simple Copy-Paste is a Strong Data Augmentation Method for Instance
+    Segmentation. The simple copy-paste transform steps are as follows:
+
+    1. The destination image is already resized with aspect ratio kept,
+       cropped and padded.
+    2. Randomly select a source image, which is also already resized
+       with aspect ratio kept, cropped and padded in a similar way
+       as the destination image.
+    3. Randomly select some objects from the source image.
+    4. Paste these source objects to the destination image directly,
+       because the source and destination images have the same size.
+    5. Update object masks of the destination image, for some origin objects
+       may be occluded.
+    6. Generate bboxes from the updated destination masks and
+       filter some objects which are totally occluded, and adjust bboxes
+       which are partly occluded.
+    7. Append selected source bboxes, masks, and labels.
+
+    Args:
+        max_num_pasted (int): The maximum number of pasted objects.
+            Default: 100.
+        bbox_occluded_thr (int): The threshold of occluded bbox.
+            Default: 10.
+        mask_occluded_thr (int): The threshold of occluded mask.
+            Default: 300.
+        selected (bool): Whether select objects or not. If select is False,
+            all objects of the source image will be pasted to the
+            destination image.
+            Default: True.
+    """
+
+    def __init__(self,
+                 max_num_pasted=100,
+                 bbox_occluded_thr=10,
+                 mask_occluded_thr=300,
+                 selected=True):
+        self.max_num_pasted = max_num_pasted
+        self.bbox_occluded_thr = bbox_occluded_thr
+        self.mask_occluded_thr = mask_occluded_thr
+        self.selected = selected
+        # True while the masks in use were synthesised from bboxes; such
+        # masks are dropped from the output again (reset on every call).
+        self.paste_by_box = False
+
+    def get_indexes(self, dataset):
+        """Draw the index of the source sample to paste from.
+
+        Args:
+            dataset (:obj:`MultiImageMixDataset`): The dataset.
+        Returns:
+            int: Result of ``random.randint(0, len(dataset))``.
+        """
+        return random.randint(0, len(dataset))
+
+    def gen_masks_from_bboxes(self, bboxes, img_shape):
+        """Generate gt_masks based on gt_bboxes.
+
+        Each box becomes a rectangular mask whose bounds are the box
+        coordinates truncated to int. Also sets ``self.paste_by_box``.
+
+        Args:
+            bboxes (ndarray): 2-D array, one (x1, y1, x2, y2) row per box.
+            img_shape (tuple): The shape of image; only ``[:2]`` is used.
+        Returns:
+            BitmapMasks: One binary mask per box.
+        """
+        self.paste_by_box = True
+        img_h, img_w = img_shape[:2]
+        gt_masks = np.zeros((len(bboxes), img_h, img_w), dtype=np.uint8)
+        # Row-wise scalars keep int() off 1-element arrays.
+        for mask, (x1, y1, x2, y2) in zip(gt_masks, bboxes[:, :4]):
+            mask[int(y1):int(y2), int(x1):int(x2)] = 1
+        return BitmapMasks(gt_masks, img_h, img_w)
+
+    def get_gt_masks(self, results):
+        """Get gt_masks originally or generated based on bboxes.
+
+        If gt_masks is not contained in results (or is None),
+        it will be generated based on gt_bboxes.
+        Args:
+            results (dict): Result dict.
+        Returns:
+            BitmapMasks: gt_masks, originally or generated based on bboxes.
+        """
+        gt_masks = results.get('gt_masks')
+        if gt_masks is not None:
+            return gt_masks
+        return self.gen_masks_from_bboxes(
+            results.get('gt_bboxes', []), results['img'].shape)
+
+    def __call__(self, results):
+        """Call function to make a copy-paste of image.
+
+        Args:
+            results (dict): Result dict; must hold one ``mix_results`` item.
+        Returns:
+            dict: Result dict with copy-paste transformed.
+        """
+        assert 'mix_results' in results
+        num_images = len(results['mix_results'])
+        assert num_images == 1, \
+            f'CopyPaste only supports processing 2 images, got {num_images}'
+
+        # Start every sample with a clean flag: it used to stay True forever
+        # once any sample had needed bbox-generated masks.
+        self.paste_by_box = False
+
+        # Get gt_masks originally or generated based on bboxes.
+        src_results = results['mix_results'][0]  # only one mix picture
+        results['gt_masks'] = self.get_gt_masks(results)
+        src_results['gt_masks'] = self.get_gt_masks(src_results)
+
+        if self.selected:
+            src_results = self._select_object(src_results)
+        return self._copy_paste(results, src_results)
+
+    def _select_object(self, results):
+        """Select some objects from the source results.
+
+        A random count below ``min(num_objects + 1, max_num_pasted)`` of
+        distinct objects is kept; ``results`` is updated in place.
+
+        Args:
+            results (dict): Source result dict.
+        Returns:
+            dict: The same dict, reduced to the selected objects.
+        """
+        num_objects = results['gt_bboxes'].shape[0]
+        max_num_pasted = min(num_objects + 1, self.max_num_pasted)
+        num_pasted = np.random.randint(0, max_num_pasted)
+        selected_inds = np.random.choice(num_objects, num_pasted, replace=False)
+        for key in ('gt_bboxes', 'gt_labels', 'gt_masks'):
+            results[key] = results[key][selected_inds]
+        return results
+
+    def _copy_paste(self, dst_results, src_results):
+        """CopyPaste transform function.
+
+        Args:
+            dst_results (dict): Result dict of the destination image.
+            src_results (dict): Result dict of the source image.
+        Returns:
+            dict: Updated result dict (``dst_results``, modified in place).
+        """
+        dst_img = dst_results['img']
+        dst_bboxes = dst_results['gt_bboxes']
+        dst_labels = dst_results['gt_labels']
+        dst_masks = dst_results['gt_masks']
+
+        src_img = src_results['img']
+        src_bboxes = src_results['gt_bboxes']
+        src_labels = src_results['gt_labels']
+        src_masks = src_results['gt_masks']
+
+        if len(src_bboxes) == 0:
+            # Nothing to paste; only drop masks that were made from bboxes.
+            if self.paste_by_box:
+                dst_results.pop('gt_masks')
+            return dst_results
+
+        # update masks and generate bboxes from updated masks
+        composed_mask = np.where(np.any(src_masks.masks, axis=0), 1, 0)
+        updated_dst_masks = self.get_updated_masks(dst_masks, composed_mask)
+        updated_dst_bboxes = updated_dst_masks.get_bboxes()
+        assert len(updated_dst_bboxes) == len(updated_dst_masks)
+
+        # filter totally occluded objects: keep a destination object when its
+        # bbox moved by at most bbox_occluded_thr on every side, or when more
+        # than mask_occluded_thr mask pixels remain visible
+        bbox_shift = np.abs(updated_dst_bboxes - dst_bboxes)
+        bboxes_inds = np.all(bbox_shift <= self.bbox_occluded_thr, axis=-1)
+        visible_area = updated_dst_masks.masks.sum(axis=(1, 2))
+        valid_inds = bboxes_inds | (visible_area > self.mask_occluded_thr)
+
+        # Paste source objects to destination image directly
+        paste_mask = composed_mask[..., np.newaxis]
+        img = dst_img * (1 - paste_mask) + src_img * paste_mask
+        bboxes = np.concatenate([updated_dst_bboxes[valid_inds], src_bboxes])
+        labels = np.concatenate([dst_labels[valid_inds], src_labels])
+        masks = np.concatenate(
+            [updated_dst_masks.masks[valid_inds], src_masks.masks])
+
+        dst_results['img'] = img
+        dst_results['gt_bboxes'] = bboxes
+        dst_results['gt_labels'] = labels
+        if self.paste_by_box:
+            dst_results.pop('gt_masks')
+        else:
+            dst_results['gt_masks'] = BitmapMasks(masks, masks.shape[1],
+                                                  masks.shape[2])
+        return dst_results
+
+    def get_updated_masks(self, masks, composed_mask):
+        """Clear ``masks.masks`` in place where ``composed_mask`` is set."""
+        assert masks.masks.shape[-2:] == composed_mask.shape[-2:], \
+            'Cannot compare two arrays of different size'
+        masks.masks = np.where(composed_mask, 0, masks.masks)
+        return masks
+
+    def __repr__(self):
+        """Format the transform as ``ClassName(param=value, ...)``."""
+        return (f'{self.__class__.__name__}('
+                f'max_num_pasted={self.max_num_pasted}, '
+                f'bbox_occluded_thr={self.bbox_occluded_thr}, '
+                f'mask_occluded_thr={self.mask_occluded_thr}, '
+                f'selected={self.selected})')
+
+@PIPELINES.register_module()
+class RGB2Gray(object):
+    """Convert RGB image to grayscale image.
+
+    This transform calculates the weighted mean of the input image channels
+    with ``weights`` and then expands the result to ``out_channels``
+    channels. When ``out_channels`` is None, the number of output channels
+    is the same as the number of input channels.
+
+    Args:
+        out_channels (int): Expected number of output channels after
+            transforming. Default: None.
+        weights (tuple[float]): The weights to calculate the weighted mean.
+            Default: (0.299, 0.587, 0.114).
+    """
+
+    def __init__(self, out_channels=None, weights=(0.299, 0.587, 0.114)):
+        # NOTE: validation relies on asserts, which ``python -O`` strips.
+        assert out_channels is None or out_channels > 0
+        self.out_channels = out_channels
+        assert isinstance(weights, tuple)
+        assert all(isinstance(item, (float, int)) for item in weights)
+        self.weights = weights
+
+    def __call__(self, results):
+        """Replace ``results['img']`` by its weighted channel mean.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with grayscale image and updated ``img_shape``.
+        """
+        img = results['img']
+        assert len(img.shape) == 3
+        num_weights = len(self.weights)
+        assert img.shape[2] == num_weights
+        weights = np.array(self.weights).reshape((1, 1, -1))
+        gray = (img * weights).sum(2, keepdims=True)
+        # Without an explicit out_channels the input channel count is kept.
+        num_out = self.out_channels
+        if num_out is None:
+            num_out = num_weights
+        gray = gray.repeat(num_out, axis=2)
+        results['img'] = gray
+        results['img_shape'] = gray.shape
+        return results
+
+    def __repr__(self):
+        """Format the transform as ``ClassName(param=value, ...)``."""
+        name = self.__class__.__name__
+        return (f'{name}(out_channels={self.out_channels}, '
+                f'weights={self.weights})')
\ No newline at end of file
diff --git a/mmdet/datasets/pipelines/transforms.py.bk b/mmdet/datasets/pipelines/transforms.py.bk
new file mode 100755
index 0000000..c08c03b
--- /dev/null
+++ b/mmdet/datasets/pipelines/transforms.py.bk
@@ -0,0 +1,2973 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import inspect
+import math
+import warnings
+
+import cv2
+import mmcv
+import numpy as np
+from numpy import random
+
+from mmdet.core import BitmapMasks, PolygonMasks, find_inside_bboxes
+from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
+from mmdet.utils import log_img_scale
+from ..builder import PIPELINES
+
+try:
+    from imagecorruptions import corrupt
+except ImportError:
+    corrupt = None
+
+try:
+    import albumentations
+    from albumentations import Compose
+except ImportError:
+    albumentations = None
+    Compose = None
+
+
+@PIPELINES.register_module()
+class Resize:
+    """Resize images & bbox & mask.
+
+    This transform resizes the input image to some scale. Bboxes and masks are
+    then resized with the same scale factor. If the input dict contains the key
+    "scale", then the scale in the input dict is used, otherwise the specified
+    scale in the init method is used. If the input dict contains the key
+    "scale_factor" (if MultiScaleFlipAug does not give img_scale but
+    scale_factor), the actual scale will be computed by image shape and
+    scale_factor.
+
+    `img_scale` can either be a tuple (single-scale) or a list of tuple
+    (multi-scale). There are 3 multiscale modes:
+
+    - ``ratio_range is not None``: randomly sample a ratio from the ratio \
+      range and multiply it with the image scale.
+    - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \
+      sample a scale from the multiscale range.
+    - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \
+      sample a scale from multiple scales.
+
+    Args:
+        img_scale (tuple or list[tuple]): Images scales for resizing.
+        multiscale_mode (str): Either "range" or "value".
+        ratio_range (tuple[float]): (min_ratio, max_ratio)
+        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
+            image.
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
+            These two backends generate slightly different results. Defaults
+            to 'cv2'.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend.
+        override (bool, optional): Whether to override `scale` and
+            `scale_factor` so as to call resize twice. Default False. If True,
+            after the first resizing, the existing `scale` and `scale_factor`
+            will be ignored so the second resizing can be allowed.
+            This option is a work-around for multiple times of resize in DETR.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 img_scale=None,
+                 multiscale_mode='range',
+                 ratio_range=None,
+                 keep_ratio=True,
+                 bbox_clip_border=True,
+                 backend='cv2',
+                 interpolation='bilinear',
+                 override=False):
+        if img_scale is None:
+            self.img_scale = None
+        else:
+            if isinstance(img_scale, list):
+                self.img_scale = img_scale
+            else:
+                self.img_scale = [img_scale]
+            assert mmcv.is_list_of(self.img_scale, tuple)
+
+        if ratio_range is not None:
+            # mode 1: given a scale and a range of image ratio
+            assert len(self.img_scale) == 1
+        else:
+            # mode 2: given multiple scales or a range of scales
+            assert multiscale_mode in ['value', 'range']
+
+        self.backend = backend
+        self.multiscale_mode = multiscale_mode
+        self.ratio_range = ratio_range
+        self.keep_ratio = keep_ratio
+        # TODO: refactor the override option in Resize
+        self.interpolation = interpolation
+        self.override = override
+        self.bbox_clip_border = bbox_clip_border
+
+    @staticmethod
+    def random_select(img_scales):
+        """Randomly select an img_scale from given candidates.
+
+        Args:
+            img_scales (list[tuple]): Images scales for selection.
+
+        Returns:
+            (tuple, int): Returns a tuple ``(img_scale, scale_idx)``, \
+                where ``img_scale`` is the selected image scale and \
+                ``scale_idx`` is the selected index in the given candidates.
+        """
+
+        assert mmcv.is_list_of(img_scales, tuple)
+        scale_idx = np.random.randint(len(img_scales))
+        img_scale = img_scales[scale_idx]
+        return img_scale, scale_idx
+
+    @staticmethod
+    def random_sample(img_scales):
+        """Randomly sample an img_scale when ``multiscale_mode=='range'``.
+
+        Args:
+            img_scales (list[tuple]): Images scale range for sampling.
+                There must be two tuples in img_scales, which specify the lower
+                and upper bound of image scales.
+
+        Returns:
+            (tuple, None): Returns a tuple ``(img_scale, None)``, where \
+                ``img_scale`` is sampled scale and None is just a placeholder \
+                to be consistent with :func:`random_select`.
+        """
+
+        assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
+        img_scale_long = [max(s) for s in img_scales]
+        img_scale_short = [min(s) for s in img_scales]
+        long_edge = np.random.randint(
+            min(img_scale_long),
+            max(img_scale_long) + 1)
+        short_edge = np.random.randint(
+            min(img_scale_short),
+            max(img_scale_short) + 1)
+        img_scale = (long_edge, short_edge)
+        return img_scale, None
+
+    @staticmethod
+    def random_sample_ratio(img_scale, ratio_range):
+        """Randomly sample an img_scale when ``ratio_range`` is specified.
+
+        A ratio will be randomly sampled from the range specified by
+        ``ratio_range``. Then it would be multiplied with ``img_scale`` to
+        generate sampled scale.
+
+        Args:
+            img_scale (tuple): Images scale base to multiply with ratio.
+            ratio_range (tuple[float]): The minimum and maximum ratio to scale
+                the ``img_scale``.
+
+        Returns:
+            (tuple, None): Returns a tuple ``(scale, None)``, where \
+                ``scale`` is sampled ratio multiplied with ``img_scale`` and \
+                None is just a placeholder to be consistent with \
+                :func:`random_select`.
+        """
+
+        assert isinstance(img_scale, tuple) and len(img_scale) == 2
+        min_ratio, max_ratio = ratio_range
+        assert min_ratio <= max_ratio
+        ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
+        scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
+        return scale, None
+
+    def _random_scale(self, results):
+        """Randomly sample an img_scale according to ``ratio_range`` and
+        ``multiscale_mode``.
+
+        If ``ratio_range`` is specified, a ratio will be sampled and be
+        multiplied with ``img_scale``.
+        If multiple scales are specified by ``img_scale``, a scale will be
+        sampled according to ``multiscale_mode``.
+        Otherwise, single scale will be used.
+
+        Args:
+            results (dict): Result dict from :obj:`dataset`.
+
+        Returns:
+            dict: Two new keys 'scale' and 'scale_idx' are added into \
+                ``results``, which would be used by subsequent pipelines.
+        """
+
+        if self.ratio_range is not None:
+            scale, scale_idx = self.random_sample_ratio(
+                self.img_scale[0], self.ratio_range)
+        elif len(self.img_scale) == 1:
+            scale, scale_idx = self.img_scale[0], 0
+        elif self.multiscale_mode == 'range':
+            scale, scale_idx = self.random_sample(self.img_scale)
+        elif self.multiscale_mode == 'value':
+            scale, scale_idx = self.random_select(self.img_scale)
+        else:
+            raise NotImplementedError
+
+        results['scale'] = scale
+        results['scale_idx'] = scale_idx
+
+    def _resize_img(self, results):
+        """Resize images with ``results['scale']``."""
+        for key in results.get('img_fields', ['img']):
+            if self.keep_ratio:
+                img, scale_factor = mmcv.imrescale(
+                    results[key],
+                    results['scale'],
+                    return_scale=True,
+                    interpolation=self.interpolation,
+                    backend=self.backend)
+                # the w_scale and h_scale have a minor difference
+                # a real fix should be done in the mmcv.imrescale in the future
+                new_h, new_w = img.shape[:2]
+                h, w = results[key].shape[:2]
+                w_scale = new_w / w
+                h_scale = new_h / h
+            else:
+                img, w_scale, h_scale = mmcv.imresize(
+                    results[key],
+                    results['scale'],
+                    return_scale=True,
+                    interpolation=self.interpolation,
+                    backend=self.backend)
+            results[key] = img
+
+            scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
+                                    dtype=np.float32)
+            results['img_shape'] = img.shape
+            # in case that there is no padding
+            results['pad_shape'] = img.shape
+            results['scale_factor'] = scale_factor
+            results['keep_ratio'] = self.keep_ratio
+
+    def _resize_bboxes(self, results):
+        """Resize bounding boxes with ``results['scale_factor']``."""
+        for key in results.get('bbox_fields', []):
+            bboxes = results[key] * results['scale_factor']
+            if self.bbox_clip_border:
+                img_shape = results['img_shape']
+                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
+                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
+            results[key] = bboxes
+
+    def _resize_masks(self, results):
+        """Resize masks with ``results['scale']``"""
+        for key in results.get('mask_fields', []):
+            if results[key] is None:
+                continue
+            if self.keep_ratio:
+                results[key] = results[key].rescale(results['scale'])
+            else:
+                results[key] = results[key].resize(results['img_shape'][:2])
+
+    def _resize_seg(self, results):
+        """Resize semantic segmentation map with ``results['scale']``."""
+        for key in results.get('seg_fields', []):
+            if self.keep_ratio:
+                gt_seg = mmcv.imrescale(
+                    results[key],
+                    results['scale'],
+                    interpolation='nearest',
+                    backend=self.backend)
+            else:
+                gt_seg = mmcv.imresize(
+                    results[key],
+                    results['scale'],
+                    interpolation='nearest',
+                    backend=self.backend)
+            results[key] = gt_seg
+
+    def __call__(self, results):
+        """Call function to resize images, bounding boxes, masks, semantic
+        segmentation map.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \
+                'keep_ratio' keys are added into result dict.
+        """
+
+        if 'scale' not in results:
+            if 'scale_factor' in results:
+                img_shape = results['img'].shape[:2]
+                scale_factor = results['scale_factor']
+                assert isinstance(scale_factor, float)
+                results['scale'] = tuple(
+                    [int(x * scale_factor) for x in img_shape][::-1])
+            else:
+                self._random_scale(results)
+        else:
+            if not self.override:
+                assert 'scale_factor' not in results, (
+                    'scale and scale_factor cannot be both set.')
+            else:
+                results.pop('scale')
+                if 'scale_factor' in results:
+                    results.pop('scale_factor')
+                self._random_scale(results)
+
+        self._resize_img(results)
+        self._resize_bboxes(results)
+        self._resize_masks(results)
+        self._resize_seg(results)
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(img_scale={self.img_scale}, '
+        repr_str += f'multiscale_mode={self.multiscale_mode}, '
+        repr_str += f'ratio_range={self.ratio_range}, '
+        repr_str += f'keep_ratio={self.keep_ratio}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class RandomFlip:
+    """Flip the image & bbox & mask.
+
+    If the input dict contains the key "flip", then the flag will be used,
+    otherwise it will be randomly decided by a ratio specified in the init
+    method.
+
+    When random flip is enabled, ``flip_ratio``/``direction`` can either be a
+    float/string or list of float/string. There are 3 flip modes:
+
+    - ``flip_ratio`` is float, ``direction`` is string: the image will be
+        ``direction``ly flipped with probability of ``flip_ratio`` .
+        E.g., ``flip_ratio=0.5``, ``direction='horizontal'``,
+        then image will be horizontally flipped with probability of 0.5.
+    - ``flip_ratio`` is float, ``direction`` is list of string: the image will
+        be ``direction[i]``ly flipped with probability of
+        ``flip_ratio/len(direction)``.
+        E.g., ``flip_ratio=0.5``, ``direction=['horizontal', 'vertical']``,
+        then image will be horizontally flipped with probability of 0.25,
+        vertically with probability of 0.25.
+    - ``flip_ratio`` is list of float, ``direction`` is list of string:
+        given ``len(flip_ratio) == len(direction)``, the image will
+        be ``direction[i]``ly flipped with probability of ``flip_ratio[i]``.
+        E.g., ``flip_ratio=[0.3, 0.5]``, ``direction=['horizontal',
+        'vertical']``, then image will be horizontally flipped with probability
+        of 0.3, vertically with probability of 0.5.
+
+    Args:
+        flip_ratio (float | list[float], optional): The flipping probability.
+            Default: None.
+        direction (str | list[str], optional): The flipping direction. Options
+            are 'horizontal', 'vertical', 'diagonal'. Default: 'horizontal'.
+            If input is a list, the length must equal ``flip_ratio``. Each
+            element in ``flip_ratio`` indicates the flip probability of
+            corresponding direction.
+    """
+
+    def __init__(self, flip_ratio=None, direction='horizontal'):
+        if isinstance(flip_ratio, list):
+            assert mmcv.is_list_of(flip_ratio, float)
+            assert 0 <= sum(flip_ratio) <= 1
+        elif isinstance(flip_ratio, float):
+            assert 0 <= flip_ratio <= 1
+        elif flip_ratio is None:
+            pass
+        else:
+            raise ValueError('flip_ratios must be None, float, '
+                             'or list of float')
+        self.flip_ratio = flip_ratio
+
+        valid_directions = ['horizontal', 'vertical', 'diagonal']
+        if isinstance(direction, str):
+            assert direction in valid_directions
+        elif isinstance(direction, list):
+            assert mmcv.is_list_of(direction, str)
+            assert set(direction).issubset(set(valid_directions))
+        else:
+            raise ValueError('direction must be either str or list of str')
+        self.direction = direction
+
+        if isinstance(flip_ratio, list):
+            assert len(self.flip_ratio) == len(self.direction)
+
+    def bbox_flip(self, bboxes, img_shape, direction):
+        """Flip bboxes in the given direction.
+
+        Args:
+            bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k)
+            img_shape (tuple[int]): Image shape (height, width)
+            direction (str): Flip direction. Options are 'horizontal',
+                'vertical', 'diagonal'.
+
+        Returns:
+            numpy.ndarray: Flipped bounding boxes.
+        """
+
+        assert bboxes.shape[-1] % 4 == 0
+        flipped = bboxes.copy()
+        if direction == 'horizontal':
+            w = img_shape[1]
+            flipped[..., 0::4] = w - bboxes[..., 2::4]
+            flipped[..., 2::4] = w - bboxes[..., 0::4]
+        elif direction == 'vertical':
+            h = img_shape[0]
+            flipped[..., 1::4] = h - bboxes[..., 3::4]
+            flipped[..., 3::4] = h - bboxes[..., 1::4]
+        elif direction == 'diagonal':
+            w = img_shape[1]
+            h = img_shape[0]
+            flipped[..., 0::4] = w - bboxes[..., 2::4]
+            flipped[..., 1::4] = h - bboxes[..., 3::4]
+            flipped[..., 2::4] = w - bboxes[..., 0::4]
+            flipped[..., 3::4] = h - bboxes[..., 1::4]
+        else:
+            raise ValueError(f"Invalid flipping direction '{direction}'")
+        return flipped
+
+    def __call__(self, results):
+        """Flip images, bboxes, masks and semantic segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Flipped results, 'flip', 'flip_direction' keys are added \
+                into result dict.
+        """
+        if 'flip' not in results:
+            # None means non-flip
+            if isinstance(self.direction, list):
+                direction_list = self.direction + [None]
+            else:
+                direction_list = [self.direction, None]
+
+            if isinstance(self.flip_ratio, list):
+                non_flip_ratio = 1 - sum(self.flip_ratio)
+                flip_ratio_list = self.flip_ratio + [non_flip_ratio]
+            else:
+                non_flip_ratio = 1 - self.flip_ratio
+                # exclude non-flip
+                single_ratio = self.flip_ratio / (len(direction_list) - 1)
+                flip_ratio_list = [single_ratio] * (len(direction_list) -
+                                                    1) + [non_flip_ratio]
+
+            cur_dir = np.random.choice(direction_list, p=flip_ratio_list)
+            results['flip'] = cur_dir is not None
+        else:
+            # 'flip' was preset: use the configured (first) direction.
+            fallback = self.direction
+            cur_dir = fallback if isinstance(fallback, str) else fallback[0]
+        if 'flip_direction' not in results:
+            results['flip_direction'] = cur_dir
+        if results['flip']:
+            # flip image
+            for key in results.get('img_fields', ['img']):
+                results[key] = mmcv.imflip(
+                    results[key], direction=results['flip_direction'])
+            # flip bboxes
+            for key in results.get('bbox_fields', []):
+                results[key] = self.bbox_flip(results[key],
+                                              results['img_shape'],
+                                              results['flip_direction'])
+            # flip masks
+            for key in results.get('mask_fields', []):
+                results[key] = results[key].flip(results['flip_direction'])
+
+            # flip segs
+            for key in results.get('seg_fields', []):
+                results[key] = mmcv.imflip(
+                    results[key], direction=results['flip_direction'])
+        return results
+
+    def __repr__(self):
+        return self.__class__.__name__ + f'(flip_ratio={self.flip_ratio})'
+
+
+@PIPELINES.register_module()
+class RandomShift:
+    """Shift the image and box given shift pixels and probability.
+
+    Args:
+        shift_ratio (float): Probability of shifts. Default 0.5.
+        max_shift_px (int): The max pixels for shifting. Default 32.
+        filter_thr_px (int): The width and height threshold for filtering.
+            The bbox and the rest of the targets below the width and
+            height threshold will be filtered. Default 1.
+    """
+
+    def __init__(self, shift_ratio=0.5, max_shift_px=32, filter_thr_px=1):
+        assert 0 <= shift_ratio <= 1
+        assert max_shift_px >= 0
+        self.shift_ratio = shift_ratio
+        self.max_shift_px = max_shift_px
+        self.filter_thr_px = int(filter_thr_px)
+        # The key correspondence from bboxes to labels.
+        self.bbox2label = {
+            'gt_bboxes': 'gt_labels',
+            'gt_bboxes_ignore': 'gt_labels_ignore'
+        }
+
+    def __call__(self, results):
+        """Call function to random shift images, bounding boxes.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Shift results.
+        """
+        if random.random() < self.shift_ratio:
+            img_shape = results['img'].shape[:2]
+            # randint's upper bound is exclusive; + 1 allows max_shift_px=0
+            random_shift_x = random.randint(-self.max_shift_px,
+                                            self.max_shift_px + 1)
+            random_shift_y = random.randint(-self.max_shift_px,
+                                            self.max_shift_px + 1)
+            new_x = max(0, random_shift_x)
+            ori_x = max(0, -random_shift_x)
+            new_y = max(0, random_shift_y)
+            ori_y = max(0, -random_shift_y)
+
+            # TODO: support mask and semantic segmentation maps.
+            # Stage the edits so the early return leaves results untouched.
+            shifted = {}
+            for key in results.get('bbox_fields', []):
+                bboxes = results[key].copy()
+                bboxes[..., 0::2] += random_shift_x
+                bboxes[..., 1::2] += random_shift_y
+                # clip border
+                bboxes[..., 0::2] = np.clip(bboxes[..., 0::2], 0, img_shape[1])
+                bboxes[..., 1::2] = np.clip(bboxes[..., 1::2], 0, img_shape[0])
+
+                # remove invalid bboxes
+                bbox_w = bboxes[..., 2] - bboxes[..., 0]
+                bbox_h = bboxes[..., 3] - bboxes[..., 1]
+                valid_inds = (bbox_w > self.filter_thr_px) & (
+                    bbox_h > self.filter_thr_px)
+                # No gt-bbox survives the shift: skip this image.
+                if key == 'gt_bboxes' and not valid_inds.any():
+                    return results
+                shifted[key] = bboxes[valid_inds]
+
+                # label fields. e.g. gt_labels and gt_labels_ignore
+                label_key = self.bbox2label.get(key)
+                if label_key in results:
+                    shifted[label_key] = results[label_key][valid_inds]
+            results.update(shifted)
+
+            for key in results.get('img_fields', ['img']):
+                img = results[key]
+                new_img = np.zeros_like(img)
+                img_h, img_w = img.shape[:2]
+                new_h = img_h - np.abs(random_shift_y)
+                new_w = img_w - np.abs(random_shift_x)
+                new_img[new_y:new_y + new_h, new_x:new_x + new_w] \
+                    = img[ori_y:ori_y + new_h, ori_x:ori_x + new_w]
+                results[key] = new_img
+
+        return results
+
+    def __repr__(self):
+        """Return the class name and ``max_shift_px`` as a string."""
+        # Close the parenthesis; the old output ended in a dangling ', '.
+        return f'{self.__class__.__name__}(max_shift_px={self.max_shift_px})'
+
+
+@PIPELINES.register_module()
+class Pad:
+    """Pad the image & masks & segmentation map.
+
+    There are two padding modes: (1) pad to a fixed size and (2) pad to the
+    minimum size that is divisible by some number.
+    Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor".
+
+    Args:
+        size (tuple, optional): Fixed padding size.
+        size_divisor (int, optional): The divisor of padded size.
+        pad_to_square (bool): Whether to pad the image into a square.
+            Currently only used for YOLOX. Default: False.
+        pad_val (dict, optional): A dict for padding value, the default
+            value is `dict(img=0, masks=0, seg=255)`.
+    """
+
+    def __init__(self,
+                 size=None,
+                 size_divisor=None,
+                 pad_to_square=False,
+                 pad_val=dict(img=0, masks=0, seg=255)):
+        self.size = size
+        self.size_divisor = size_divisor
+        if isinstance(pad_val, float) or isinstance(pad_val, int):
+            warnings.warn(
+                'pad_val of float type is deprecated now, '
+                f'please use pad_val=dict(img={pad_val}, '
+                f'masks={pad_val}, seg=255) instead.', DeprecationWarning)
+            pad_val = dict(img=pad_val, masks=pad_val, seg=255)
+        assert isinstance(pad_val, dict)
+        self.pad_val = pad_val
+        self.pad_to_square = pad_to_square
+
+        if pad_to_square:
+            assert size is None and size_divisor is None, \
+                'The size and size_divisor must be None ' \
+                'when pad2square is True'
+        else:
+            assert size is not None or size_divisor is not None, \
+                'only one of size and size_divisor should be valid'
+            assert size is None or size_divisor is None
+
+    def _pad_img(self, results):
+        """Pad images according to ``self.size``."""
+        pad_val = self.pad_val.get('img', 0)
+        for key in results.get('img_fields', ['img']):
+            if self.pad_to_square:
+                max_size = max(results[key].shape[:2])
+                self.size = (max_size, max_size)
+            if self.size is not None:
+                padded_img = mmcv.impad(
+                    results[key], shape=self.size, pad_val=pad_val)
+            elif self.size_divisor is not None:
+                padded_img = mmcv.impad_to_multiple(
+                    results[key], self.size_divisor, pad_val=pad_val)
+            results[key] = padded_img
+        results['pad_shape'] = padded_img.shape
+        results['pad_fixed_size'] = self.size
+        results['pad_size_divisor'] = self.size_divisor
+
+    def _pad_masks(self, results):
+        """Pad masks according to ``results['pad_shape']``."""
+        pad_shape = results['pad_shape'][:2]
+        pad_val = self.pad_val.get('masks', 0)
+        for key in results.get('mask_fields', []):
+            results[key] = results[key].pad(pad_shape, pad_val=pad_val)
+
+    def _pad_seg(self, results):
+        """Pad semantic segmentation map according to
+        ``results['pad_shape']``."""
+        pad_val = self.pad_val.get('seg', 255)
+        for key in results.get('seg_fields', []):
+            results[key] = mmcv.impad(
+                results[key], shape=results['pad_shape'][:2], pad_val=pad_val)
+
+    def __call__(self, results):
+        """Call function to pad images, masks, semantic segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Updated result dict.
+        """
+        self._pad_img(results)
+        self._pad_masks(results)
+        self._pad_seg(results)
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(size={self.size}, '
+        repr_str += f'size_divisor={self.size_divisor}, '
+        repr_str += f'pad_to_square={self.pad_to_square}, '
+        repr_str += f'pad_val={self.pad_val})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class Normalize:
+    """Normalize the image.
+
+    Added key is "img_norm_cfg".
+
+    Args:
+        mean (sequence): Mean values of 3 channels.
+        std (sequence): Std values of 3 channels.
+        to_rgb (bool): Whether to convert the image from BGR to RGB,
+            default is true.
+    """
+
+    def __init__(self, mean, std, to_rgb=True):
+        self.mean = np.array(mean, dtype=np.float32)
+        self.std = np.array(std, dtype=np.float32)
+        self.to_rgb = to_rgb
+
+    def __call__(self, results):
+        """Call function to normalize images.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Normalized results, 'img_norm_cfg' key is added into
+                result dict.
+        """
+        for key in results.get('img_fields', ['img']):
+            results[key] = mmcv.imnormalize(results[key], self.mean, self.std,
+                                            self.to_rgb)
+        results['img_norm_cfg'] = dict(
+            mean=self.mean, std=self.std, to_rgb=self.to_rgb)
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class RandomCrop:
+    """Random crop the image & bboxes & masks.
+
+    The absolute `crop_size` is sampled based on `crop_type` and `image_size`,
+    then the cropped results are generated.
+
+    Args:
+        crop_size (tuple): The relative ratio or absolute pixels of
+            height and width.
+        crop_type (str, optional): one of "relative_range", "relative",
+            "absolute", "absolute_range". "relative" randomly crops
+            (h * crop_size[0], w * crop_size[1]) part from an input of size
+            (h, w). "relative_range" uniformly samples relative crop size from
+            range [crop_size[0], 1] and [crop_size[1], 1] for height and width
+            respectively. "absolute" crops from an input with absolute size
+            (crop_size[0], crop_size[1]). "absolute_range" uniformly samples
+            crop_h in range [crop_size[0], min(h, crop_size[1])] and crop_w
+            in range [crop_size[0], min(w, crop_size[1])]. Default "absolute".
+        allow_negative_crop (bool, optional): Whether to allow a crop that does
+            not contain any bbox area. Default False.
+        recompute_bbox (bool, optional): Whether to re-compute the boxes based
+            on cropped instance masks. Default False.
+        bbox_clip_border (bool, optional): Whether clip the objects outside
+            the border of the image. Defaults to True.
+
+    Note:
+        - If the image is smaller than the absolute crop size, return the
+            original image.
+        - The keys for bboxes, labels and masks must be aligned. That is,
+          `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and
+          `gt_bboxes_ignore` corresponds to `gt_labels_ignore` and
+          `gt_masks_ignore`.
+        - If the crop does not contain any gt-bbox region and
+          `allow_negative_crop` is set to False, skip this image.
+    """
+
+    def __init__(self,
+                 crop_size,
+                 crop_type='absolute',
+                 allow_negative_crop=False,
+                 recompute_bbox=False,
+                 bbox_clip_border=True):
+        """Check ``crop_size`` against ``crop_type`` and store the options."""
+        if crop_type not in (
+                'relative_range', 'relative', 'absolute', 'absolute_range'):
+            raise ValueError(f'Invalid crop_type {crop_type}.')
+        uses_pixels = crop_type in ('absolute', 'absolute_range')
+        if not uses_pixels:
+            # Relative modes take fractions in (0, 1].
+            assert 0 < crop_size[0] <= 1 and 0 < crop_size[1] <= 1
+        else:
+            # Absolute modes take strictly positive integer sizes.
+            assert crop_size[0] > 0 and crop_size[1] > 0
+            assert isinstance(crop_size[0], int) and isinstance(
+                crop_size[1], int)
+        self.crop_size = crop_size
+        self.crop_type = crop_type
+        self.allow_negative_crop = allow_negative_crop
+        self.recompute_bbox = recompute_bbox
+        self.bbox_clip_border = bbox_clip_border
+        # Key of the label field that belongs to each bbox field.
+        self.bbox2label = dict(
+            gt_bboxes='gt_labels', gt_bboxes_ignore='gt_labels_ignore')
+        # Key of the mask field that belongs to each bbox field.
+        self.bbox2mask = dict(
+            gt_bboxes='gt_masks', gt_bboxes_ignore='gt_masks_ignore')
+
+    def _crop_data(self, results, crop_size, allow_negative_crop):
+        """Randomly crop images, bounding boxes, masks and semantic
+        segmentation maps to ``crop_size``.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+            crop_size (tuple): Expected absolute size after cropping, (h, w).
+            allow_negative_crop (bool): Whether to allow a crop that does not
+                contain any valid box of ``gt_bboxes``.
+
+        Returns:
+            dict | None: Cropped results with 'img_shape' updated, or None
+                when no ``gt_bboxes`` box is left and negatives are refused.
+        """
+        assert crop_size[0] > 0 and crop_size[1] > 0
+        for key in results.get('img_fields', ['img']):
+            img = results[key]
+            margin_h = max(img.shape[0] - crop_size[0], 0)
+            margin_w = max(img.shape[1] - crop_size[1], 0)
+            offset_h = np.random.randint(0, margin_h + 1)
+            offset_w = np.random.randint(0, margin_w + 1)
+            crop_y1, crop_y2 = offset_h, offset_h + crop_size[0]
+            crop_x1, crop_x2 = offset_w, offset_w + crop_size[1]
+
+            # crop the image; new offsets are drawn for every image field
+            img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...]
+            img_shape = img.shape
+            results[key] = img
+        results['img_shape'] = img_shape
+
+        # shift bboxes by the offsets drawn last and optionally clip them
+        for key in results.get('bbox_fields', []):
+            # e.g. gt_bboxes and gt_bboxes_ignore
+            bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h],
+                                   dtype=np.float32)
+            bboxes = results[key] - bbox_offset
+            if self.bbox_clip_border:
+                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
+                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
+            valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & (
+                bboxes[:, 3] > bboxes[:, 1])
+            # If no box of gt_bboxes keeps a positive width and height and
+            # allow_negative_crop is False, skip this image.
+            if (key == 'gt_bboxes' and not valid_inds.any()
+                    and not allow_negative_crop):
+                return None
+            results[key] = bboxes[valid_inds, :]
+            # label fields. e.g. gt_labels and gt_labels_ignore
+            label_key = self.bbox2label.get(key)
+            if label_key in results:
+                results[label_key] = results[label_key][valid_inds]
+
+            # mask fields, e.g. gt_masks and gt_masks_ignore
+            mask_key = self.bbox2mask.get(key)
+            if mask_key in results:
+                results[mask_key] = results[mask_key][
+                    valid_inds.nonzero()[0]].crop(
+                        np.asarray([crop_x1, crop_y1, crop_x2, crop_y2]))
+                if self.recompute_bbox:
+                    results[key] = results[mask_key].get_bboxes()
+
+        # crop semantic seg with the window of the last image field
+        for key in results.get('seg_fields', []):
+            results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2]
+
+        return results
+
+    def _get_crop_size(self, image_size):
+        """Draw the absolute crop size for an image of ``image_size``.
+
+        Args:
+            image_size (tuple): (h, w).
+
+        Returns:
+            crop_size (tuple): (crop_h, crop_w) in absolute pixels.
+        """
+        img_h, img_w = image_size
+        mode = self.crop_type
+        if mode == 'absolute':
+            size_h, size_w = self.crop_size[0], self.crop_size[1]
+            return (min(size_h, img_h), min(size_w, img_w))
+        if mode == 'absolute_range':
+            low, high = self.crop_size[0], self.crop_size[1]
+            assert low <= high
+            crop_h = np.random.randint(min(img_h, low), min(img_h, high) + 1)
+            crop_w = np.random.randint(min(img_w, low), min(img_w, high) + 1)
+            return crop_h, crop_w
+        if mode == 'relative':
+            frac_h, frac_w = self.crop_size
+        elif mode == 'relative_range':
+            # Draw each fraction uniformly between its lower bound and 1.
+            lower = np.asarray(self.crop_size, dtype=np.float32)
+            frac_h, frac_w = lower + np.random.rand(2) * (1 - lower)
+        else:
+            return None
+        return int(img_h * frac_h + 0.5), int(img_w * frac_w + 0.5)
+
+    def __call__(self, results):
+        """Crop ``results`` with one randomly drawn crop size.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: The output of ``_crop_data`` for the drawn size; its
+                'img_shape' key is updated according to crop size.
+        """
+        # The crop size is derived from the height and width of ``img`` only.
+        img_hw = results['img'].shape[:2]
+        target_size = self._get_crop_size(img_hw)
+        return self._crop_data(results, target_size,
+                               self.allow_negative_crop)
+
+    def __repr__(self):
+        """Return the class name followed by every constructor option."""
+        return (f'{self.__class__.__name__}(crop_size={self.crop_size}, '
+                f'crop_type={self.crop_type}, '
+                f'allow_negative_crop={self.allow_negative_crop}, '
+                f'recompute_bbox={self.recompute_bbox}, '
+                f'bbox_clip_border={self.bbox_clip_border})')
+
+
+@PIPELINES.register_module()
+class SegRescale:
+    """Resize every semantic segmentation map by a fixed factor.
+
+    Args:
+        scale_factor (float): The scale factor of the final output.
+        backend (str): Image rescale backend, choices are 'cv2' and 'pillow'.
+            These two backends generate slightly different results. Defaults
+            to 'cv2'.
+    """
+
+    def __init__(self, scale_factor=1, backend='cv2'):
+        self.backend = backend
+        self.scale_factor = scale_factor
+
+    def __call__(self, results):
+        """Rescale each map listed under ``results['seg_fields']``.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with semantic segmentation map scaled.
+        """
+        seg_keys = results.get('seg_fields', [])
+        if self.scale_factor != 1:
+            # A factor of 1 leaves the maps untouched, so it is skipped.
+            rescale_kwargs = dict(
+                interpolation='nearest', backend=self.backend)
+            for seg_key in seg_keys:
+                results[seg_key] = mmcv.imrescale(
+                    results[seg_key], self.scale_factor, **rescale_kwargs)
+        return results
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}(scale_factor={self.scale_factor})'
+
+
+@PIPELINES.register_module()
+class PhotoMetricDistortion:
+    """Apply photometric distortion to image sequentially, every transformation
+    is applied with a probability of 0.5. Random contrast runs either second
+    (mode 1) or second to last (mode 0); the mode is drawn on every call.
+
+    1. random brightness
+    2. random contrast (mode 1)
+    3. convert color from BGR to HSV
+    4. random saturation
+    5. random hue
+    6. convert color from HSV to BGR
+    7. random contrast (mode 0)
+    8. randomly swap channels
+
+    Args:
+        brightness_delta (int): delta of brightness.
+        contrast_range (tuple): range of contrast.
+        saturation_range (tuple): range of saturation.
+        hue_delta (int): delta of hue.
+    """
+
+    def __init__(self,
+                 brightness_delta=32,
+                 contrast_range=(0.5, 1.5),
+                 saturation_range=(0.5, 1.5),
+                 hue_delta=18):
+        self.brightness_delta = brightness_delta
+        self.hue_delta = hue_delta
+        self.contrast_lower, self.contrast_upper = contrast_range
+        self.saturation_lower, self.saturation_upper = saturation_range
+
+    def __call__(self, results):
+        """Distort ``results['img']`` photometrically.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with images distorted.
+        """
+
+        if 'img_fields' in results:
+            assert results['img_fields'] == ['img'], \
+                'Only single img_fields is allowed'
+        # ``astype`` returns a float32 copy, so the in-place arithmetic
+        # below does not write into the input array.
+        img = results['img'].astype(np.float32)
+
+        # 1. random brightness: add one offset to every value
+        if random.randint(2):
+            img += random.uniform(-self.brightness_delta,
+                                  self.brightness_delta)
+
+        # mode == 1 --> random contrast is applied here, before HSV
+        # mode == 0 --> random contrast is applied after the HSV steps
+        mode = random.randint(2)
+        if mode == 1 and random.randint(2):
+            alpha = random.uniform(self.contrast_lower, self.contrast_upper)
+            img *= alpha
+
+        # 3. convert color from BGR to HSV
+        img = mmcv.bgr2hsv(img)
+
+        # 4. random saturation: scale channel 1 of the HSV image
+        if random.randint(2):
+            img[..., 1] *= random.uniform(self.saturation_lower,
+                                          self.saturation_upper)
+
+        # 5. random hue: shift channel 0 and wrap values outside [0, 360]
+        if random.randint(2):
+            # ``hue`` is a view, so these updates land in ``img``.
+            hue = img[..., 0]
+            hue += random.uniform(-self.hue_delta, self.hue_delta)
+            hue[hue > 360] -= 360
+            hue[hue < 0] += 360
+
+        # 6. convert color from HSV to BGR
+        img = mmcv.hsv2bgr(img)
+
+        # 7. random contrast, only when it was not applied before HSV
+        if mode == 0 and random.randint(2):
+            alpha = random.uniform(self.contrast_lower, self.contrast_upper)
+            img *= alpha
+
+        # 8. randomly swap channels
+        if random.randint(2):
+            channel_order = random.permutation(3)
+            img = img[..., channel_order]
+
+        results['img'] = img
+        return results
+
+    def __repr__(self):
+        """Return the class name and the configured ranges, one per line."""
+        contrast_range = (self.contrast_lower, self.contrast_upper)
+        saturation_range = (self.saturation_lower, self.saturation_upper)
+        return (f'{self.__class__.__name__}'
+                f'(\nbrightness_delta={self.brightness_delta},\n'
+                f'contrast_range={contrast_range},\n'
+                f'saturation_range={saturation_range},\n'
+                f'hue_delta={self.hue_delta})')
+
+
+@PIPELINES.register_module()
+class Expand:
+    """Random expand the image & bboxes.
+
+    Randomly place the original image on a canvas of 'ratio' x original image
+    size filled with mean values. The ratio is in the range of ratio_range.
+
+    Args:
+        mean (tuple): mean value of dataset.
+        to_rgb (bool): if need to convert the order of mean to align with RGB.
+        ratio_range (tuple): range of expand ratio.
+        seg_ignore_label (int, optional): value that fills the new area of
+            the maps in ``seg_fields``. Default None.
+        prob (float): probability of applying this transformation
+    """
+
+    def __init__(self,
+                 mean=(0, 0, 0),
+                 to_rgb=True,
+                 ratio_range=(1, 4),
+                 seg_ignore_label=None,
+                 prob=0.5):
+        self.to_rgb = to_rgb
+        self.ratio_range = ratio_range
+        # Reverse the channel order of the mean when ``to_rgb`` is set.
+        self.mean = mean[::-1] if to_rgb else mean
+        self.min_ratio, self.max_ratio = ratio_range
+        self.seg_ignore_label = seg_ignore_label
+        self.prob = prob
+
+    def __call__(self, results):
+        """Call function to expand images, bounding boxes.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with image, bboxes, masks and segs expanded,
+                or the input unchanged when the transform is skipped.
+        """
+
+        if random.uniform(0, 1) > self.prob:
+            return results
+
+        if 'img_fields' in results:
+            assert results['img_fields'] == ['img'], \
+                'Only single img_fields is allowed'
+        img = results['img']
+
+        h, w, c = img.shape
+        ratio = random.uniform(self.min_ratio, self.max_ratio)
+        expand_h, expand_w = int(h * ratio), int(w * ratio)
+        # A uniform mean is written as one scalar, which is faster on large
+        # images. The mean is converted to an array first because comparing
+        # a plain tuple or list with a scalar is always False.
+        if np.all(np.asarray(self.mean) == self.mean[0]):
+            fill_value = self.mean[0]
+        else:
+            fill_value = self.mean
+        expand_img = np.full((expand_h, expand_w, c), fill_value, img.dtype)
+        left = int(random.uniform(0, w * ratio - w))
+        top = int(random.uniform(0, h * ratio - h))
+        expand_img[top:top + h, left:left + w] = img
+
+        results['img'] = expand_img
+        # shift bboxes by the position of the image on the canvas
+        for key in results.get('bbox_fields', []):
+            results[key] = results[key] + np.tile(
+                (left, top), 2).astype(results[key].dtype)
+
+        # expand masks
+        for key in results.get('mask_fields', []):
+            results[key] = results[key].expand(expand_h, expand_w, top, left)
+
+        # expand segs; the new area is filled with ``seg_ignore_label``
+        for key in results.get('seg_fields', []):
+            gt_seg = results[key]
+            expand_gt_seg = np.full((expand_h, expand_w),
+                                    self.seg_ignore_label,
+                                    dtype=gt_seg.dtype)
+            expand_gt_seg[top:top + h, left:left + w] = gt_seg
+            results[key] = expand_gt_seg
+        return results
+
+    def __repr__(self):
+        # ``prob`` is included so the repr lists every constructor option.
+        return (f'{self.__class__.__name__}(mean={self.mean}, '
+                f'to_rgb={self.to_rgb}, ratio_range={self.ratio_range}, '
+                f'seg_ignore_label={self.seg_ignore_label}, '
+                f'prob={self.prob})')
+
+
+@PIPELINES.register_module()
+class MinIoURandomCrop:
+    """Random crop the image & bboxes, the cropped patches have minimum IoU
+    requirement with original image & bboxes, the IoU threshold is randomly
+    selected from min_ious.
+
+    Args:
+        min_ious (tuple): minimum IoU threshold for all intersections with
+            bounding boxes
+        min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w,
+            where a >= min_crop_size).
+        bbox_clip_border (bool, optional): Whether to clip the objects
+            outside the border of the image. Defaults to True.
+
+    Note:
+        The keys for bboxes, labels and masks should be paired. That is, \
+        `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \
+        `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`.
+    """
+
+    def __init__(self,
+                 min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+                 min_crop_size=0.3,
+                 bbox_clip_border=True):
+        # mode 1 returns the input as is; mode 0 crops with no IoU constraint
+        self.min_ious = min_ious
+        self.sample_mode = (1, *min_ious, 0)
+        self.min_crop_size = min_crop_size
+        self.bbox_clip_border = bbox_clip_border
+        self.bbox2label = {
+            'gt_bboxes': 'gt_labels',
+            'gt_bboxes_ignore': 'gt_labels_ignore'
+        }
+        self.bbox2mask = {
+            'gt_bboxes': 'gt_masks',
+            'gt_bboxes_ignore': 'gt_masks_ignore'
+        }
+
+    def __call__(self, results):
+        """Call function to crop images and bounding boxes with minimum IoU
+        constraint.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with images and bounding boxes cropped and
+                'img_shape' updated, or the input unchanged in mode 1.
+        """
+
+        if 'img_fields' in results:
+            assert results['img_fields'] == ['img'], \
+                'Only single img_fields is allowed'
+        img = results['img']
+        assert 'bbox_fields' in results
+        boxes = [results[key] for key in results['bbox_fields']]
+        boxes = np.concatenate(boxes, 0)
+        h, w, c = img.shape
+        while True:
+            mode = random.choice(self.sample_mode)
+            self.mode = mode
+            if mode == 1:
+                return results
+
+            min_iou = mode
+            for _ in range(50):
+                new_w = random.uniform(self.min_crop_size * w, w)
+                new_h = random.uniform(self.min_crop_size * h, h)
+
+                # h / w in [0.5, 2]
+                if new_h / new_w < 0.5 or new_h / new_w > 2:
+                    continue
+                # pass both bounds so the offset is drawn from [0, margin]
+                left = random.uniform(0, w - new_w)
+                top = random.uniform(0, h - new_h)
+
+                patch = np.array(
+                    (int(left), int(top), int(left + new_w), int(top + new_h)))
+                # Line or point crop is not allowed
+                if patch[2] == patch[0] or patch[3] == patch[1]:
+                    continue
+                overlaps = bbox_overlaps(
+                    patch.reshape(-1, 4), boxes.reshape(-1, 4)).reshape(-1)
+                if len(overlaps) > 0 and overlaps.min() < min_iou:
+                    continue
+
+                # center of boxes should inside the crop img
+                # only adjust boxes and instance masks when the gt is not empty
+                if len(overlaps) > 0:
+                    # adjust boxes
+                    def is_center_of_bboxes_in_patch(boxes, patch):
+                        center = (boxes[:, :2] + boxes[:, 2:]) / 2
+                        mask = ((center[:, 0] > patch[0]) *
+                                (center[:, 1] > patch[1]) *
+                                (center[:, 0] < patch[2]) *
+                                (center[:, 1] < patch[3]))
+                        return mask
+
+                    mask = is_center_of_bboxes_in_patch(boxes, patch)
+                    if not mask.any():
+                        continue
+                    for key in results.get('bbox_fields', []):
+                        field_boxes = results[key].copy()
+                        keep = is_center_of_bboxes_in_patch(field_boxes, patch)
+                        field_boxes = field_boxes[keep]
+                        if self.bbox_clip_border:
+                            field_boxes[:, 2:] = field_boxes[:, 2:].clip(
+                                max=patch[2:])
+                            field_boxes[:, :2] = field_boxes[:, :2].clip(
+                                min=patch[:2])
+                        field_boxes -= np.tile(patch[:2], 2)
+                        results[key] = field_boxes
+                        # labels
+                        label_key = self.bbox2label.get(key)
+                        if label_key in results:
+                            results[label_key] = results[label_key][keep]
+                        # mask fields
+                        mask_key = self.bbox2mask.get(key)
+                        if mask_key in results:
+                            results[mask_key] = results[mask_key][
+                                keep.nonzero()[0]].crop(patch)
+                # adjust the img no matter whether the gt is empty before crop
+                img = img[patch[1]:patch[3], patch[0]:patch[2]]
+                results['img'] = img
+                results['img_shape'] = img.shape
+
+                # seg fields
+                for key in results.get('seg_fields', []):
+                    results[key] = results[key][patch[1]:patch[3],
+                                                patch[0]:patch[2]]
+                return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(min_ious={self.min_ious}, '
+        repr_str += f'min_crop_size={self.min_crop_size}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class Corrupt:
+    """Corruption augmentation.
+
+    Applies one named corruption from
+    `imagecorruptions <https://github.com/bethgelab/imagecorruptions>`_.
+
+    Args:
+        corruption (str): Corruption name.
+        severity (int, optional): The severity of corruption. Default: 1.
+    """
+
+    def __init__(self, corruption, severity=1):
+        self.severity = severity
+        self.corruption = corruption
+
+    def __call__(self, results):
+        """Replace ``results['img']`` with its corrupted version.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with images corrupted.
+        """
+
+        if corrupt is None:
+            raise RuntimeError('imagecorruptions is not installed')
+        if 'img_fields' in results:
+            assert results['img_fields'] == ['img'], \
+                'Only single img_fields is allowed'
+        # The image is cast to uint8 before it is handed to ``corrupt``.
+        image_u8 = results['img'].astype(np.uint8)
+        results['img'] = corrupt(
+            image_u8, corruption_name=self.corruption, severity=self.severity)
+        return results
+
+    def __repr__(self):
+        """Return the class name with the corruption name and severity."""
+        cls_name = self.__class__.__name__
+        options = f'corruption={self.corruption}, severity={self.severity}'
+        return f'{cls_name}({options})'
+
+
+@PIPELINES.register_module()
+class Albu:
+    """Albumentation augmentation.
+
+    Adds custom transformations from Albumentations library.
+    Please, visit `https://albumentations.readthedocs.io`
+    to get more information.
+
+    An example of ``transforms`` is as follows:
+
+    .. code-block::
+
+        [
+            dict(
+                type='ShiftScaleRotate',
+                shift_limit=0.0625,
+                scale_limit=0.0,
+                rotate_limit=0,
+                interpolation=1,
+                p=0.5),
+            dict(
+                type='RandomBrightnessContrast',
+                brightness_limit=[0.1, 0.3],
+                contrast_limit=[0.1, 0.3],
+                p=0.2),
+            dict(type='ChannelShuffle', p=0.1),
+            dict(
+                type='OneOf',
+                transforms=[
+                    dict(type='Blur', blur_limit=3, p=1.0),
+                    dict(type='MedianBlur', blur_limit=3, p=1.0)
+                ],
+                p=0.1),
+        ]
+
+    Args:
+        transforms (list[dict]): A list of albu transformations
+        bbox_params (dict): Bbox_params for albumentation `Compose`
+        keymap (dict): Contains {'input key':'albumentation-style key'}
+        update_pad_shape (bool): Whether to set ``pad_shape`` to the shape of
+            the augmented image. Default False.
+        skip_img_without_anno (bool): Whether to skip the image if no ann left
+            after aug
+    """
+
+    def __init__(self,
+                 transforms,
+                 bbox_params=None,
+                 keymap=None,
+                 update_pad_shape=False,
+                 skip_img_without_anno=False):
+        if Compose is None:
+            raise RuntimeError('albumentations is not installed')
+
+        # The configs are modified below, so work on deep copies.
+        transforms = copy.deepcopy(transforms)
+        bbox_params = copy.deepcopy(bbox_params)
+        keymap = copy.deepcopy(keymap)
+        self.transforms = transforms
+        self.update_pad_shape = update_pad_shape
+        self.skip_img_without_anno = skip_img_without_anno
+
+        # A simple workaround to remove masks without boxes: the label
+        # fields are swapped for an index field that __call__ maps back.
+        use_idx_mapper = (
+            isinstance(bbox_params, dict) and 'label_fields' in bbox_params
+            and 'filter_lost_elements' in bbox_params)
+        self.filter_lost_elements = use_idx_mapper
+        if use_idx_mapper:
+            self.origin_label_fields = bbox_params['label_fields']
+            bbox_params['label_fields'] = ['idx_mapper']
+            del bbox_params['filter_lost_elements']
+
+        if bbox_params:
+            self.bbox_params = self.albu_builder(bbox_params)
+        else:
+            self.bbox_params = None
+        built = [self.albu_builder(cfg) for cfg in self.transforms]
+        self.aug = Compose(built, bbox_params=self.bbox_params)
+
+        # Fall back to the default key names when no keymap is given.
+        self.keymap_to_albu = keymap or {
+            'img': 'image', 'gt_masks': 'masks', 'gt_bboxes': 'bboxes'}
+        self.keymap_back = {v: k for k, v in self.keymap_to_albu.items()}
+
+    def albu_builder(self, cfg):
+        """Build an albumentations object from a config dict.
+
+        It inherits some of :func:`build_from_cfg` logic.
+
+        Args:
+            cfg (dict): Config dict. It should at least contain the key "type".
+
+        Returns:
+            obj: The constructed object.
+        """
+
+        assert isinstance(cfg, dict) and 'type' in cfg
+        kwargs = cfg.copy()
+        obj_type = kwargs.pop('type')
+
+        if mmcv.is_str(obj_type):
+            # Names are looked up on the albumentations module.
+            if albumentations is None:
+                raise RuntimeError('albumentations is not installed')
+            obj_cls = getattr(albumentations, obj_type)
+        elif inspect.isclass(obj_type):
+            obj_cls = obj_type
+        else:
+            raise TypeError(
+                f'type must be a str or valid type, but got {type(obj_type)}')
+
+        # Nested transform configs are built before their parent.
+        if 'transforms' in kwargs:
+            kwargs['transforms'] = list(
+                map(self.albu_builder, kwargs['transforms']))
+
+        return obj_cls(**kwargs)
+
+    @staticmethod
+    def mapper(d, keymap):
+        """Dictionary mapper. Renames keys according to keymap provided.
+
+        Args:
+            d (dict): old dict
+            keymap (dict): {'old_key':'new_key'}
+        Returns:
+            dict: new dict.
+        """
+
+        # Keys without an entry in ``keymap`` are kept under their own name.
+        return {
+            keymap.get(old_key, old_key): value
+            for old_key, value in d.items()
+        }
+
+    def __call__(self, results):
+        # dict to albumentations format
+        results = self.mapper(results, self.keymap_to_albu)
+        # TODO: add bbox_fields
+        if 'bboxes' in results:
+            # an array of boxes is turned into a list with one entry per box
+            if isinstance(results['bboxes'], np.ndarray):
+                results['bboxes'] = list(results['bboxes'])
+            # add pseudo-field for filtration
+            if self.filter_lost_elements:
+                results['idx_mapper'] = np.arange(len(results['bboxes']))
+
+        # TODO: Support mask structure in albu
+        if 'masks' in results:
+            ori_masks = results['masks']
+            if isinstance(ori_masks, PolygonMasks):
+                raise NotImplementedError('Albu only supports BitMap masks now')
+            raw_masks = ori_masks.masks
+            if albumentations.__version__ < '0.5':
+                results['masks'] = raw_masks
+            else:
+                results['masks'] = list(raw_masks)
+
+        results = self.aug(**results)
+
+        if 'bboxes' in results:
+            bboxes = results['bboxes']
+            if isinstance(bboxes, list):
+                bboxes = np.array(bboxes, dtype=np.float32)
+            results['bboxes'] = bboxes.reshape(-1, 4)
+
+            # filter label_fields
+            if self.filter_lost_elements:
+                kept = results['idx_mapper']
+                for label in self.origin_label_fields:
+                    results[label] = np.array(
+                        [results[label][i] for i in kept])
+                if 'masks' in results:
+                    kept_masks = np.array(
+                        [results['masks'][i] for i in kept])
+                    img_h, img_w = results['image'].shape[:2]
+                    results['masks'] = ori_masks.__class__(
+                        kept_masks, img_h, img_w)
+
+                if not len(kept) and self.skip_img_without_anno:
+                    return None
+
+        if 'gt_labels' in results:
+            gt_labels = results['gt_labels']
+            if isinstance(gt_labels, list):
+                gt_labels = np.array(gt_labels)
+            results['gt_labels'] = gt_labels.astype(np.int64)
+
+        # back to the original format
+        results = self.mapper(results, self.keymap_back)
+
+        # update final shape
+        if self.update_pad_shape:
+            results['pad_shape'] = results['img'].shape
+
+        return results
+
+    def __repr__(self):
+        # Only the transform configs are reported.
+        return f'{self.__class__.__name__}(transforms={self.transforms})'
+
+
+@PIPELINES.register_module()
+class RandomCenterCropPad:
+    """Random center crop and random around padding for CornerNet.
+
+    This operation generates randomly cropped image from the original image and
+    pads it simultaneously. Different from :class:`RandomCrop`, the output
+    shape may not equal to ``crop_size`` strictly. We choose a random value
+    from ``ratios`` and the output shape could be larger or smaller than
+    ``crop_size``. The padding operation is also different from :class:`Pad`,
+    here we use around padding instead of right-bottom padding.
+
+    The relation between output image (padding image) and original image:
+
+    .. code:: text
+
+                        output image
+
+               +----------------------------+
+               |          padded area       |
+        +------|----------------------------|----------+
+        |      |         cropped area       |          |
+        |      |         +---------------+  |          |
+        |      |         |    .   center |  |          | original image
+        |      |         |        range  |  |          |
+        |      |         +---------------+  |          |
+        +------|----------------------------|----------+
+               |          padded area       |
+               +----------------------------+
+
+    There are 5 main areas in the figure:
+
+    - output image: output image of this operation, also called padding
+      image in following instruction.
+    - original image: input image of this operation.
+    - padded area: non-intersect area of output image and original image.
+    - cropped area: the overlap of output image and original image.
+    - center range: a smaller area where random center chosen from.
+      center range is computed by ``border`` and original image's shape
+      to avoid our random center is too close to original image's border.
+
+    This operation also acts differently in train and test mode; the two
+    pipelines are summarized below.
+
+    Train pipeline:
+
+    1. Choose a ``random_ratio`` from ``ratios``, the shape of padding image
+       will be ``random_ratio * crop_size``.
+    2. Choose a ``random_center`` in center range.
+    3. Generate padding image with center matches the ``random_center``.
+    4. Initialize the padding image with pixel value equals to ``mean``.
+    5. Copy the cropped area to padding image.
+    6. Refine annotations.
+
+    Test pipeline:
+
+    1. Compute output shape according to ``test_pad_mode``.
+    2. Generate padding image with center matches the original image
+       center.
+    3. Initialize the padding image with pixel value equals to ``mean``.
+    4. Copy the ``cropped area`` to padding image.
+
+    Args:
+        crop_size (tuple | None): expected size after crop, final size will
+            computed according to ratio. Requires (h, w) in train mode, and
+            None in test mode.
+        ratios (tuple): random select a ratio from tuple and crop image to
+            (crop_size[0] * ratio) * (crop_size[1] * ratio).
+            Only available in train mode.
+        border (int): max distance from center select area to image border.
+            Only available in train mode.
+        mean (sequence): Mean values of 3 channels.
+        std (sequence): Std values of 3 channels.
+        to_rgb (bool): Whether to convert the image from BGR to RGB.
+        test_mode (bool): whether involve random variables in transform.
+            In train mode, crop_size is fixed, center coords and ratio is
+            random selected from predefined lists. In test mode, crop_size
+            is image's original shape, center coords and ratio is fixed.
+        test_pad_mode (tuple): padding method and padding shape value, only
+            available in test mode. Default is using 'logical_or' with
+            127 as padding shape value.
+
+            - 'logical_or': final_shape = input_shape | padding_shape_value
+            - 'size_divisor': final_shape = int(
+              ceil(input_shape / padding_shape_value) * padding_shape_value)
+        test_pad_add_pix (int): Extra padding pixel in test mode. Default 0.
+        bbox_clip_border (bool, optional): Whether clip the objects outside
+            the border of the image. Defaults to True.
+    """
+
+    def __init__(self,
+                 crop_size=None,
+                 ratios=(0.9, 1.0, 1.1),
+                 border=128,
+                 mean=None,
+                 std=None,
+                 to_rgb=None,
+                 test_mode=False,
+                 test_pad_mode=('logical_or', 127),
+                 test_pad_add_pix=0,
+                 bbox_clip_border=True):
+        if not test_mode:
+            # Train mode needs a positive (h, w) crop size and a ratio list;
+            # the test-only padding option has to be switched off.
+            assert isinstance(crop_size, (list, tuple))
+            assert crop_size[0] > 0 and crop_size[1] > 0, (
+                'crop_size must > 0 in train mode')
+            assert isinstance(ratios, (list, tuple))
+            assert test_pad_mode is None, (
+                'test_pad_mode must be None in train mode')
+        else:
+            # Test mode involves no random crop, so the crop options must be
+            # None and a supported padding rule has to be given.
+            assert crop_size is None, 'crop_size must be None in test mode'
+            assert ratios is None, 'ratios must be None in test mode'
+            assert border is None, 'border must be None in test mode'
+            assert isinstance(test_pad_mode, (list, tuple))
+            assert test_pad_mode[0] in ['logical_or', 'size_divisor']
+        # mean, std and to_rgb intentionally have no usable default: they
+        # are easy to forget but could affect the performance. Use the same
+        # setting as Normalize.
+        assert mean is not None and std is not None and to_rgb is not None
+
+        self.test_mode = test_mode
+        self.crop_size = crop_size
+        self.ratios, self.border = ratios, border
+        self.to_rgb = to_rgb
+        # The values as passed in are kept because ``__repr__`` prints them.
+        self.input_mean, self.input_std = mean, std
+        # The working mean/std are reversed when ``to_rgb`` is set.
+        self.mean = mean[::-1] if to_rgb else mean
+        self.std = std[::-1] if to_rgb else std
+        self.test_pad_mode = test_pad_mode
+        self.test_pad_add_pix = test_pad_add_pix
+        self.bbox_clip_border = bbox_clip_border
+
+    def _get_border(self, border, size):
+        """Get final border for the target size.
+
+        The area between ``final_border`` and ``size - final_border`` is the
+        ``center range`` that random crop centers are drawn from, so that a
+        center is never too close to the image border. ``border`` is divided
+        by a power of two chosen so that ``2 * final_border < size``, which
+        keeps the ``center range`` non-empty.
+
+        Args:
+            border (int): The initial border, default is 128.
+            size (int): The width or height of original image.
+        Returns:
+            int: The final border.
+        """
+        k = 2 * border / size
+        i = pow(2, np.ceil(np.log2(np.ceil(k))) + (k == int(k)))
+        return int(border // i)  # ``i`` is a float, keep the documented int
+
+    def _filter_boxes(self, patch, boxes):
+        """Find the boxes whose center point lies strictly inside the patch.
+
+        Args:
+            patch (list[int]): The cropped area, [left, top, right, bottom].
+            boxes (numpy array, (N x 4)): Ground truth boxes.
+
+        Returns:
+            mask (numpy array, (N,)): True where the box center is inside.
+        """
+        centers = (boxes[:, :2] + boxes[:, 2:]) / 2
+        center_x, center_y = centers[:, 0], centers[:, 1]
+        inside_x = (patch[0] < center_x) & (center_x < patch[2])
+        inside_y = (patch[1] < center_y) & (center_y < patch[3])
+        return inside_x & inside_y
+
+    def _crop_image_and_paste(self, image, center, size):
+        """Paste the part of ``image`` around ``center`` onto a mean canvas.
+
+        A canvas of shape ``size`` is created and filled with the
+        per-channel ``mean``. Its center is aligned with ``center`` on the
+        original image, the part of the original image that falls under
+        the canvas is copied onto it, and everything else keeps the mean
+        value.
+
+        Args:
+            image (np array, H x W x C): Original image.
+            center (list[int]): Target crop center coord on the original
+                image, [center_y, center_x].
+            size (list[int]): Target crop size. [target_h, target_w]
+
+        Returns:
+            cropped_img (np array, target_h x target_w x C): Cropped image.
+            border (np array, 4): Bounds of the pasted area inside
+                ``cropped_img``, [top, bottom, left, right], as float32.
+            patch (np array, 4): The cropped area of the original image,
+                [left, top, right, bottom].
+        """
+        cy, cx = center
+        out_h, out_w = size
+        src_h, src_w, channels = image.shape
+        half_h, half_w = out_h // 2, out_w // 2
+
+        # Source window: the requested crop clipped to the image.
+        x0 = max(0, cx - half_w)
+        x1 = min(cx + half_w, src_w)
+        y0 = max(0, cy - half_h)
+        y1 = min(cy + half_h, src_h)
+        patch = np.array((int(x0), int(y0), int(x1), int(y1)))
+
+        # Destination window: same extents measured from the canvas center.
+        dst_top, dst_bottom = half_h - (cy - y0), half_h + (y1 - cy)
+        dst_left, dst_right = half_w - (cx - x0), half_w + (x1 - cx)
+
+        # Fill a zero canvas by adding the mean of every channel.
+        cropped_img = np.zeros((out_h, out_w, channels), dtype=image.dtype)
+        for ch in range(channels):
+            cropped_img[:, :, ch] += self.mean[ch]
+        cropped_img[dst_top:dst_bottom, dst_left:dst_right, :] = \
+            image[y0:y1, x0:x1, :]
+
+        # Bounds of the pasted area, [top, bottom, left, right].
+        border = np.array([dst_top, dst_bottom, dst_left, dst_right],
+                          dtype=np.float32)
+
+        return cropped_img, border, patch
+
+    def _train_aug(self, results):
+        """Randomly crop around a sampled center and pad with the mean.
+
+        Args:
+            results (dict): Pipeline dict; 'img' and 'gt_bboxes' are read.
+
+        Returns:
+            results (dict): The updated dict.
+        """
+        img = results['img']
+        h, w, c = img.shape
+        boxes = results['gt_bboxes']
+        while True:  # draw a new scale whenever all 50 center tries fail
+            scale = random.choice(self.ratios)
+            new_h = int(self.crop_size[0] * scale)
+            new_w = int(self.crop_size[1] * scale)
+            h_border = self._get_border(self.border, h)
+            w_border = self._get_border(self.border, w)
+
+            for i in range(50):
+                center_x = random.randint(low=w_border, high=w - w_border)
+                center_y = random.randint(low=h_border, high=h - h_border)
+
+                cropped_img, border, patch = self._crop_image_and_paste(
+                    img, [center_y, center_x], [new_h, new_w])
+
+                mask = self._filter_boxes(patch, boxes)
+                # Retry when boxes exist but none has its center in the patch.
+                if not mask.any() and len(boxes) > 0:
+                    continue
+
+                results['img'] = cropped_img
+                results['img_shape'] = cropped_img.shape
+                results['pad_shape'] = cropped_img.shape
+
+                x0, y0, x1, y1 = patch
+
+                left_w, top_h = center_x - x0, center_y - y0
+                cropped_center_x, cropped_center_y = new_w // 2, new_h // 2
+
+                # shift kept boxes into the crop's frame and optionally clip
+                for key in results.get('bbox_fields', []):
+                    mask = self._filter_boxes(patch, results[key])
+                    bboxes = results[key][mask]
+                    bboxes[:, 0:4:2] += cropped_center_x - left_w - x0
+                    bboxes[:, 1:4:2] += cropped_center_y - top_h - y0
+                    if self.bbox_clip_border:
+                        bboxes[:, 0:4:2] = np.clip(bboxes[:, 0:4:2], 0, new_w)
+                        bboxes[:, 1:4:2] = np.clip(bboxes[:, 1:4:2], 0, new_h)
+                    keep = (bboxes[:, 2] > bboxes[:, 0]) & (
+                        bboxes[:, 3] > bboxes[:, 1])
+                    bboxes = bboxes[keep]
+                    results[key] = bboxes
+                    if key in ['gt_bboxes']:
+                        if 'gt_labels' in results:
+                            labels = results['gt_labels'][mask]
+                            labels = labels[keep]
+                            results['gt_labels'] = labels
+                        if 'gt_masks' in results:
+                            raise NotImplementedError(
+                                'RandomCenterCropPad only supports bbox.')
+
+                # semantic segmentation maps are not supported
+                for key in results.get('seg_fields', []):
+                    raise NotImplementedError(
+                        'RandomCenterCropPad only supports bbox.')
+                return results
+
+    def _test_aug(self, results):
+        """Pad the whole image around its center, without any cropping.
+
+        The output size is derived from ``test_pad_mode``.
+
+        Args:
+            results (dict): Result dict of the pipeline; 'img' is read.
+
+        Returns:
+            results (dict): Same dict with the padded image and 'border'.
+        """
+        image = results['img']
+        height, width, _ = image.shape
+        results['img_shape'] = image.shape
+        pad_mode = self.test_pad_mode[0]
+        if pad_mode == 'logical_or':
+            # test_pad_add_pix is only used for centernet
+            bits, extra = self.test_pad_mode[1], self.test_pad_add_pix
+            out_size = [(height | bits) + extra, (width | bits) + extra]
+        elif pad_mode == 'size_divisor':
+            step = self.test_pad_mode[1]
+            out_size = [int(np.ceil(s / step)) * step for s in (height, width)]
+        else:
+            raise NotImplementedError(
+                'RandomCenterCropPad only support two testing pad mode:'
+                'logical-or and size_divisor.')
+
+        padded, border, _ = self._crop_image_and_paste(
+            image, [height // 2, width // 2], out_size)
+        results['img'] = padded
+        results['pad_shape'] = padded.shape
+        results['border'] = border
+        return results
+
+    def __call__(self, results):
+        """Run the test-mode or train-mode augmentation on ``results``."""
+        image = results['img']
+        assert image.dtype == np.float32, (
+            'RandomCenterCropPad needs the input image of dtype np.float32,'
+            ' please set "to_float32=True" in "LoadImageFromFile" pipeline')
+        # The image must be 3-D with one channel per configured mean value.
+        _, _, channels = image.shape
+        assert channels == len(self.mean)
+        augment = self._test_aug if self.test_mode else self._train_aug
+        return augment(results)
+
+    def __repr__(self):
+        """Return the class name followed by every configured option."""
+        repr_str = f'{self.__class__.__name__}(crop_size={self.crop_size}, '
+        repr_str += f'ratios={self.ratios}, border={self.border}, '
+        repr_str += f'mean={self.input_mean}, '
+        repr_str += f'std={self.input_std}, '
+        repr_str += f'to_rgb={self.to_rgb}, '
+        repr_str += f'test_mode={self.test_mode}, '
+        repr_str += f'test_pad_mode={self.test_pad_mode}, '
+        repr_str += f'test_pad_add_pix={self.test_pad_add_pix}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class CutOut:
+    """CutOut operation.
+
+    Randomly drop some regions of image used in
+    `Cutout <https://arxiv.org/abs/1708.04552>`_.
+
+    Args:
+        n_holes (int | Sequence[int]): Number of regions to be dropped.
+            If it is given as a tuple or list, number of holes will be randomly
+            selected from the closed interval [`n_holes[0]`, `n_holes[1]`].
+        cutout_shape (tuple[int, int] | list[tuple[int, int]]): The candidate
+            shape of dropped regions. It can be `tuple[int, int]` to use a
+            fixed cutout shape, or `list[tuple[int, int]]` to randomly choose
+            shape from the list.
+        cutout_ratio (tuple[float, float] | list[tuple[float, float]]): The
+            candidate ratio of dropped regions. It can be `tuple[float, float]`
+            to use a fixed ratio or `list[tuple[float, float]]` to randomly
+            choose ratio from the list. Please note that `cutout_shape`
+            and `cutout_ratio` cannot be both given at the same time.
+        fill_in (tuple[float, float, float] | tuple[int, int, int]): The value
+            of pixel to fill in the dropped regions. Default: (0, 0, 0).
+    """
+
+    def __init__(self,
+                 n_holes,
+                 cutout_shape=None,
+                 cutout_ratio=None,
+                 fill_in=(0, 0, 0)):
+        # Exactly one of cutout_shape / cutout_ratio must be given.
+        assert (cutout_shape is None) ^ (cutout_ratio is None), \
+            'Either cutout_shape or cutout_ratio should be specified.'
+        assert (isinstance(cutout_shape, (list, tuple))
+                or isinstance(cutout_ratio, (list, tuple)))
+        if isinstance(n_holes, (list, tuple)):
+            assert len(n_holes) == 2 and 0 <= n_holes[0] < n_holes[1]
+        else:
+            n_holes = (n_holes, n_holes)
+        self.n_holes = tuple(n_holes)
+        self.fill_in = fill_in
+        self.with_ratio = cutout_ratio is not None
+        self.candidates = cutout_ratio if self.with_ratio else cutout_shape
+        if not isinstance(self.candidates, list):
+            self.candidates = [self.candidates]
+
+    def __call__(self, results):
+        """Fill random regions of ``results['img']`` with ``fill_in``."""
+        h, w, c = results['img'].shape
+        n_holes = np.random.randint(self.n_holes[0], self.n_holes[1] + 1)
+        for _ in range(n_holes):
+            x1 = np.random.randint(0, w)
+            y1 = np.random.randint(0, h)
+            index = np.random.randint(0, len(self.candidates))
+            if not self.with_ratio:
+                cutout_w, cutout_h = self.candidates[index]
+            else:
+                cutout_w = int(self.candidates[index][0] * w)
+                cutout_h = int(self.candidates[index][1] * h)
+            # Clip the region so that it stays inside the image.
+            x2 = np.clip(x1 + cutout_w, 0, w)
+            y2 = np.clip(y1 + cutout_h, 0, h)
+            results['img'][y1:y2, x1:x2, :] = self.fill_in
+
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(n_holes={self.n_holes}, '
+        repr_str += (f'cutout_ratio={self.candidates}, ' if self.with_ratio
+                     else f'cutout_shape={self.candidates}, ')
+        repr_str += f'fill_in={self.fill_in})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class Mosaic:
+    """Mosaic augmentation.
+
+    Given 4 images, mosaic transform combines them into
+    one output image. The output image is composed of the parts from each sub-
+    image.
+
+    .. code:: text
+
+                        mosaic transform
+                           center_x
+                +------------------------------+
+                |       pad        |  pad      |
+                |      +-----------+           |
+                |      |           |           |
+                |      |  image1   |--------+  |
+                |      |           |        |  |
+                |      |           | image2 |  |
+     center_y   |----+-------------+-----------|
+                |    |   cropped   |           |
+                |pad |   image3    |  image4   |
+                |    |             |           |
+                +----|-------------+-----------+
+                     |             |
+                     +-------------+
+
+     The mosaic transform steps are as follows:
+
+         1. Choose the mosaic center as the intersections of 4 images
+         2. Get the left top image according to the index, and randomly
+            sample another 3 images from the custom dataset.
+         3. Sub image will be cropped if image is larger than mosaic patch
+
+    Args:
+        img_scale (Sequence[int]): Image size after mosaic pipeline of single
+            image. The shape order should be (height, width).
+            Default to (640, 640).
+        center_ratio_range (Sequence[float]): Center ratio range of mosaic
+            output. Default to (0.5, 1.5).
+        min_bbox_size (int | float): The minimum pixel for filtering
+            invalid bboxes after the mosaic pipeline. Default to 0.
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        skip_filter (bool): Whether to skip filtering rules. If it
+            is True, the filter rule will not be applied, and the
+            `min_bbox_size` is invalid. Default to True.
+        pad_val (int): Pad value. Default to 114.
+        prob (float): Probability of applying this transformation.
+            Default to 1.0.
+    """
+
+    def __init__(self,
+                 img_scale=(640, 640),
+                 center_ratio_range=(0.5, 1.5),
+                 min_bbox_size=0,
+                 bbox_clip_border=True,
+                 skip_filter=True,
+                 pad_val=114,
+                 prob=1.0):
+        assert isinstance(img_scale, tuple)
+        assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. '\
+            f'got {prob}.'
+
+        log_img_scale(img_scale, skip_square=True)
+        self.img_scale = img_scale
+        self.center_ratio_range = center_ratio_range
+        self.min_bbox_size = min_bbox_size
+        self.bbox_clip_border = bbox_clip_border
+        self.skip_filter = skip_filter
+        self.pad_val = pad_val
+        self.prob = prob
+
+    def __call__(self, results):
+        """Call function to make a mosaic of image.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            dict: Result dict with mosaic transformed.
+        """
+
+        if random.uniform(0, 1) > self.prob:
+            return results
+
+        results = self._mosaic_transform(results)
+        return results
+
+    def get_indexes(self, dataset):
+        """Sample the indexes of the 3 images to mix with the current one.
+
+        Args:
+            dataset (:obj:`MultiImageMixDataset`): The dataset.
+
+        Returns:
+            list: indexes.
+        """
+
+        indexes = [random.randint(0, len(dataset)) for _ in range(3)]
+        return indexes
+
+    def _mosaic_transform(self, results):
+        """Mosaic transform function.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            dict: Updated result dict.
+        """
+
+        assert 'mix_results' in results
+        mosaic_labels = []
+        mosaic_bboxes = []
+        if len(results['img'].shape) == 3:
+            mosaic_img = np.full(
+                (int(self.img_scale[0] * 2), int(self.img_scale[1] * 2), 3),
+                self.pad_val,
+                dtype=results['img'].dtype)
+        else:
+            mosaic_img = np.full(
+                (int(self.img_scale[0] * 2), int(self.img_scale[1] * 2)),
+                self.pad_val,
+                dtype=results['img'].dtype)
+
+        # mosaic center x, y
+        center_x = int(
+            random.uniform(*self.center_ratio_range) * self.img_scale[1])
+        center_y = int(
+            random.uniform(*self.center_ratio_range) * self.img_scale[0])
+        center_position = (center_x, center_y)
+
+        loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right')
+        for i, loc in enumerate(loc_strs):
+            if loc == 'top_left':
+                results_patch = copy.deepcopy(results)
+            else:
+                results_patch = copy.deepcopy(results['mix_results'][i - 1])
+
+            img_i = results_patch['img']
+            h_i, w_i = img_i.shape[:2]
+            # keep_ratio resize
+            scale_ratio_i = min(self.img_scale[0] / h_i,
+                                self.img_scale[1] / w_i)
+            img_i = mmcv.imresize(
+                img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i)))
+
+            # compute the combine parameters
+            paste_coord, crop_coord = self._mosaic_combine(
+                loc, center_position, img_i.shape[:2][::-1])
+            x1_p, y1_p, x2_p, y2_p = paste_coord
+            x1_c, y1_c, x2_c, y2_c = crop_coord
+
+            # crop and paste image
+            mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c]
+
+            # adjust coordinate
+            gt_bboxes_i = results_patch['gt_bboxes']
+            gt_labels_i = results_patch['gt_labels']
+
+            if gt_bboxes_i.shape[0] > 0:
+                padw = x1_p - x1_c
+                padh = y1_p - y1_c
+                gt_bboxes_i[:, 0::2] = \
+                    scale_ratio_i * gt_bboxes_i[:, 0::2] + padw
+                gt_bboxes_i[:, 1::2] = \
+                    scale_ratio_i * gt_bboxes_i[:, 1::2] + padh
+
+            mosaic_bboxes.append(gt_bboxes_i)
+            mosaic_labels.append(gt_labels_i)
+
+        if len(mosaic_labels) > 0:
+            mosaic_bboxes = np.concatenate(mosaic_bboxes, 0)
+            mosaic_labels = np.concatenate(mosaic_labels, 0)
+
+            if self.bbox_clip_border:
+                mosaic_bboxes[:, 0::2] = np.clip(mosaic_bboxes[:, 0::2], 0,
+                                                 2 * self.img_scale[1])
+                mosaic_bboxes[:, 1::2] = np.clip(mosaic_bboxes[:, 1::2], 0,
+                                                 2 * self.img_scale[0])
+
+            if not self.skip_filter:
+                mosaic_bboxes, mosaic_labels = \
+                    self._filter_box_candidates(mosaic_bboxes, mosaic_labels)
+
+        # remove outside bboxes
+        inside_inds = find_inside_bboxes(mosaic_bboxes, 2 * self.img_scale[0],
+                                         2 * self.img_scale[1])
+        mosaic_bboxes = mosaic_bboxes[inside_inds]
+        mosaic_labels = mosaic_labels[inside_inds]
+
+        results['img'] = mosaic_img
+        results['img_shape'] = mosaic_img.shape
+        results['gt_bboxes'] = mosaic_bboxes
+        results['gt_labels'] = mosaic_labels
+
+        return results
+
+    def _mosaic_combine(self, loc, center_position_xy, img_shape_wh):
+        """Calculate global coordinate of mosaic image and local coordinate of
+        cropped sub-image.
+
+        Args:
+            loc (str): Index for the sub-image, loc in ('top_left',
+              'top_right', 'bottom_left', 'bottom_right').
+            center_position_xy (Sequence[float]): Mixing center for 4 images,
+                (x, y).
+            img_shape_wh (Sequence[int]): Width and height of sub-image
+
+        Returns:
+            tuple[tuple[float]]: Corresponding coordinate of pasting and
+                cropping
+                - paste_coord (tuple): paste corner coordinate in mosaic image.
+                - crop_coord (tuple): crop corner coordinate in the sub-image.
+        """
+        assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right')
+        if loc == 'top_left':
+            # index0 to top left part of image
+            x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \
+                             max(center_position_xy[1] - img_shape_wh[1], 0), \
+                             center_position_xy[0], \
+                             center_position_xy[1]
+            crop_coord = img_shape_wh[0] - (x2 - x1), img_shape_wh[1] - (
+                y2 - y1), img_shape_wh[0], img_shape_wh[1]
+
+        elif loc == 'top_right':
+            # index1 to top right part of image
+            x1, y1, x2, y2 = center_position_xy[0], \
+                             max(center_position_xy[1] - img_shape_wh[1], 0), \
+                             min(center_position_xy[0] + img_shape_wh[0],
+                                 self.img_scale[1] * 2), \
+                             center_position_xy[1]
+            crop_coord = 0, img_shape_wh[1] - (y2 - y1), min(
+                img_shape_wh[0], x2 - x1), img_shape_wh[1]
+
+        elif loc == 'bottom_left':
+            # index2 to bottom left part of image
+            x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \
+                             center_position_xy[1], \
+                             center_position_xy[0], \
+                             min(self.img_scale[0] * 2, center_position_xy[1] +
+                                 img_shape_wh[1])
+            crop_coord = img_shape_wh[0] - (x2 - x1), 0, img_shape_wh[0], min(
+                y2 - y1, img_shape_wh[1])
+
+        else:
+            # index3 to bottom right part of image
+            x1, y1, x2, y2 = center_position_xy[0], \
+                             center_position_xy[1], \
+                             min(center_position_xy[0] + img_shape_wh[0],
+                                 self.img_scale[1] * 2), \
+                             min(self.img_scale[0] * 2, center_position_xy[1] +
+                                 img_shape_wh[1])
+            crop_coord = 0, 0, min(img_shape_wh[0],
+                                   x2 - x1), min(y2 - y1, img_shape_wh[1])
+
+        paste_coord = x1, y1, x2, y2
+        return paste_coord, crop_coord
+
+    def _filter_box_candidates(self, bboxes, labels):
+        """Filter out bboxes too small after Mosaic."""
+        bbox_w = bboxes[:, 2] - bboxes[:, 0]
+        bbox_h = bboxes[:, 3] - bboxes[:, 1]
+        valid_inds = (bbox_w > self.min_bbox_size) & \
+                     (bbox_h > self.min_bbox_size)
+        valid_inds = np.nonzero(valid_inds)[0]
+        return bboxes[valid_inds], labels[valid_inds]
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(img_scale={self.img_scale}, '
+        repr_str += f'center_ratio_range={self.center_ratio_range}, '
+        repr_str += f'pad_val={self.pad_val}, '
+        repr_str += f'min_bbox_size={self.min_bbox_size}, '
+        repr_str += f'skip_filter={self.skip_filter})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class MixUp:
+    """MixUp data augmentation.
+
+    .. code:: text
+
+                         mixup transform
+                +------------------------------+
+                | mixup image   |              |
+                |      +--------|--------+     |
+                |      |        |        |     |
+                |---------------+        |     |
+                |      |                 |     |
+                |      |      image      |     |
+                |      |                 |     |
+                |      |                 |     |
+                |      |-----------------+     |
+                |             pad              |
+                +------------------------------+
+
+     The mixup transform steps are as follows:
+
+        1. Another random image is picked by dataset and embedded in
+           the top left patch(after padding and resizing)
+        2. The target of mixup transform is the weighted average of mixup
+           image and origin image.
+
+    Args:
+        img_scale (Sequence[int]): Image output size after mixup pipeline.
+            The shape order should be (height, width). Default: (640, 640).
+        ratio_range (Sequence[float]): Scale ratio of mixup image.
+            Default: (0.5, 1.5).
+        flip_ratio (float): Horizontal flip ratio of mixup image.
+            Default: 0.5.
+        pad_val (int): Pad value. Default: 114.
+        max_iters (int): The maximum number of iterations. If the number of
+            iterations is greater than `max_iters`, but gt_bbox is still
+            empty, then the iteration is terminated. Default: 15.
+        min_bbox_size (float): Width and height threshold to filter bboxes.
+            If the height or width of a box is smaller than this value, it
+            will be removed. Default: 5.
+        min_area_ratio (float): Threshold of area ratio between
+            original bboxes and wrapped bboxes. If smaller than this value,
+            the box will be removed. Default: 0.2.
+        max_aspect_ratio (float): Aspect ratio of width and height
+            threshold to filter bboxes. If max(h/w, w/h) larger than this
+            value, the box will be removed. Default: 20.
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        skip_filter (bool): Whether to skip filtering rules. If it
+            is True, the filter rule will not be applied, and the
+            `min_bbox_size` and `min_area_ratio` and `max_aspect_ratio`
+            is invalid. Default to True.
+    """
+
+    def __init__(self,
+                 img_scale=(640, 640),
+                 ratio_range=(0.5, 1.5),
+                 flip_ratio=0.5,
+                 pad_val=114,
+                 max_iters=15,
+                 min_bbox_size=5,
+                 min_area_ratio=0.2,
+                 max_aspect_ratio=20,
+                 bbox_clip_border=True,
+                 skip_filter=True):
+        assert isinstance(img_scale, tuple)
+        log_img_scale(img_scale, skip_square=True)
+        # The output size is stored under the name ``dynamic_scale``.
+        self.dynamic_scale = img_scale
+        self.ratio_range, self.flip_ratio = ratio_range, flip_ratio
+        self.pad_val, self.max_iters = pad_val, max_iters
+        # Box filtering thresholds, not applied when ``skip_filter`` is True.
+        self.min_bbox_size = min_bbox_size
+        self.min_area_ratio = min_area_ratio
+        self.max_aspect_ratio = max_aspect_ratio
+        self.skip_filter = skip_filter
+        self.bbox_clip_border = bbox_clip_border
+
+    def __call__(self, results):
+        """Apply the mixup transform to a result dict.
+
+        Thin wrapper that forwards to ``_mixup_transform``.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            dict: Result dict with mixup transformed.
+        """
+        return self._mixup_transform(results)
+
+    def get_indexes(self, dataset):
+        """Pick the index of the image to mix in, preferring one with boxes.
+
+        Args:
+            dataset (:obj:`MultiImageMixDataset`): The dataset.
+
+        Returns:
+            int: A sampled index; the last draw if no try found any bbox.
+        """
+        # At most ``max_iters`` draws; ``index`` keeps the most recent one.
+        for _ in range(self.max_iters):
+            index = random.randint(0, len(dataset))
+            candidate_boxes = dataset.get_ann_info(index)['bboxes']
+            # Stop at the first sample that has at least one annotated box.
+            if len(candidate_boxes) > 0:
+                break
+        return index
+
+    def _mixup_transform(self, results):
+        """Blend the image in ``results['mix_results'][0]`` into the sample.
+
+        Args:
+            results (dict): Result dict holding 'img', 'gt_bboxes',
+                'gt_labels' and a one-element 'mix_results' list.
+
+        Returns:
+            dict: ``results``, updated unless the mixed image has no boxes.
+        """
+        assert 'mix_results' in results
+        assert len(
+            results['mix_results']) == 1, 'MixUp only support 2 images now !'
+
+        if results['mix_results'][0]['gt_bboxes'].shape[0] == 0:
+            # nothing to mix in: the retrieved image has no boxes
+            return results
+
+        retrieve_results = results['mix_results'][0]
+        retrieve_img = retrieve_results['img']
+        # random scale jitter, then a flip with probability ``flip_ratio``
+        jit_factor = random.uniform(*self.ratio_range)
+        is_flip = random.uniform(0, 1) < self.flip_ratio
+
+        if len(retrieve_img.shape) == 3:
+            out_img = np.ones(
+                (self.dynamic_scale[0], self.dynamic_scale[1], 3),
+                dtype=retrieve_img.dtype) * self.pad_val
+        else:
+            out_img = np.ones(
+                self.dynamic_scale, dtype=retrieve_img.dtype) * self.pad_val
+
+        # 1. keep_ratio resize
+        scale_ratio = min(self.dynamic_scale[0] / retrieve_img.shape[0],
+                          self.dynamic_scale[1] / retrieve_img.shape[1])
+        retrieve_img = mmcv.imresize(
+            retrieve_img, (int(retrieve_img.shape[1] * scale_ratio),
+                           int(retrieve_img.shape[0] * scale_ratio)))
+
+        # 2. paste
+        out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img
+
+        # 3. scale jit
+        scale_ratio *= jit_factor
+        out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor),
+                                          int(out_img.shape[0] * jit_factor)))
+
+        # 4. flip
+        if is_flip:
+            out_img = out_img[:, ::-1, :]
+
+        # 5. random crop
+        ori_img = results['img']
+        origin_h, origin_w = out_img.shape[:2]
+        target_h, target_w = ori_img.shape[:2]
+        padded_img = np.zeros(
+            (max(origin_h, target_h), max(origin_w,
+                                          target_w), 3)).astype(np.uint8)
+        padded_img[:origin_h, :origin_w] = out_img
+
+        x_offset, y_offset = 0, 0
+        if padded_img.shape[0] > target_h:
+            y_offset = random.randint(0, padded_img.shape[0] - target_h)
+        if padded_img.shape[1] > target_w:
+            x_offset = random.randint(0, padded_img.shape[1] - target_w)
+        padded_cropped_img = padded_img[y_offset:y_offset + target_h,
+                                        x_offset:x_offset + target_w]
+
+        # 6. adjust bbox (rescales the retrieved 'gt_bboxes' array in place)
+        retrieve_gt_bboxes = retrieve_results['gt_bboxes']
+        retrieve_gt_bboxes[:, 0::2] = retrieve_gt_bboxes[:, 0::2] * scale_ratio
+        retrieve_gt_bboxes[:, 1::2] = retrieve_gt_bboxes[:, 1::2] * scale_ratio
+        if self.bbox_clip_border:
+            retrieve_gt_bboxes[:, 0::2] = np.clip(retrieve_gt_bboxes[:, 0::2],
+                                                  0, origin_w)
+            retrieve_gt_bboxes[:, 1::2] = np.clip(retrieve_gt_bboxes[:, 1::2],
+                                                  0, origin_h)
+
+        if is_flip:
+            retrieve_gt_bboxes[:, 0::2] = (
+                origin_w - retrieve_gt_bboxes[:, 0::2][:, ::-1])
+
+        # 7. shift the boxes into the crop window (on a copy)
+        cp_retrieve_gt_bboxes = retrieve_gt_bboxes.copy()
+        cp_retrieve_gt_bboxes[:, 0::2] = \
+            cp_retrieve_gt_bboxes[:, 0::2] - x_offset
+        cp_retrieve_gt_bboxes[:, 1::2] = \
+            cp_retrieve_gt_bboxes[:, 1::2] - y_offset
+        if self.bbox_clip_border:
+            cp_retrieve_gt_bboxes[:, 0::2] = np.clip(
+                cp_retrieve_gt_bboxes[:, 0::2], 0, target_w)
+            cp_retrieve_gt_bboxes[:, 1::2] = np.clip(
+                cp_retrieve_gt_bboxes[:, 1::2], 0, target_h)
+
+        # 8. mix up
+        ori_img = ori_img.astype(np.float32)
+        mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img.astype(np.float32)
+
+        retrieve_gt_labels = retrieve_results['gt_labels']
+        if not self.skip_filter:
+            keep_list = self._filter_box_candidates(retrieve_gt_bboxes.T,
+                                                    cp_retrieve_gt_bboxes.T)
+
+            retrieve_gt_labels = retrieve_gt_labels[keep_list]
+            cp_retrieve_gt_bboxes = cp_retrieve_gt_bboxes[keep_list]
+
+        mixup_gt_bboxes = np.concatenate(
+            (results['gt_bboxes'], cp_retrieve_gt_bboxes), axis=0)
+        mixup_gt_labels = np.concatenate(
+            (results['gt_labels'], retrieve_gt_labels), axis=0)
+
+        # remove outside bbox
+        inside_inds = find_inside_bboxes(mixup_gt_bboxes, target_h, target_w)
+        mixup_gt_bboxes = mixup_gt_bboxes[inside_inds]
+        mixup_gt_labels = mixup_gt_labels[inside_inds]
+
+        results['img'] = mixup_img.astype(np.uint8)
+        results['img_shape'] = mixup_img.shape
+        results['gt_bboxes'] = mixup_gt_bboxes
+        results['gt_labels'] = mixup_gt_labels
+
+        return results
+
+    def _filter_box_candidates(self, bbox1, bbox2):
+        """Return a boolean mask of the augmented boxes worth keeping.
+
+        ``bbox1`` (before augment) and ``bbox2`` (after augment) are indexed
+        as rows x1, y1, x2, y2. A box passes when its new width and height
+        exceed ``min_bbox_size``, its area ratio to the original exceeds
+        ``min_area_ratio`` and its aspect ratio is below ``max_aspect_ratio``.
+        """
+        old_w, old_h = bbox1[2] - bbox1[0], bbox1[3] - bbox1[1]
+        new_w, new_h = bbox2[2] - bbox2[0], bbox2[3] - bbox2[1]
+        aspect = np.maximum(new_w / (new_h + 1e-16), new_h / (new_w + 1e-16))
+        size_ok = (new_w > self.min_bbox_size) & (new_h > self.min_bbox_size)
+        area_ok = new_w * new_h / (old_w * old_h + 1e-16) > self.min_area_ratio
+        return size_ok & area_ok & (aspect < self.max_aspect_ratio)
+
+    def __repr__(self):
+        """Return ``ClassName(key=value, ...)`` listing the settings."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(dynamic_scale={self.dynamic_scale}, '
+        repr_str += f'ratio_range={self.ratio_range}, '
+        repr_str += f'flip_ratio={self.flip_ratio}, '
+        repr_str += f'pad_val={self.pad_val}, '
+        repr_str += f'max_iters={self.max_iters}, '
+        repr_str += f'min_bbox_size={self.min_bbox_size}, '
+        repr_str += f'min_area_ratio={self.min_area_ratio}, '
+        repr_str += f'max_aspect_ratio={self.max_aspect_ratio}, '
+        return repr_str + f'skip_filter={self.skip_filter})'
+
+
+@PIPELINES.register_module()
+class RandomAffine:
+    """Random affine transform data augmentation.
+
+    This operation randomly generates an affine transform matrix composed of
+    rotation, translation, shear and scaling transforms.
+
+    Args:
+        max_rotate_degree (float): Maximum degrees of rotation transform.
+            Default: 10.
+        max_translate_ratio (float): Maximum ratio of translation.
+            Default: 0.1.
+        scaling_ratio_range (tuple[float]): Min and max ratio of
+            scaling transform. Default: (0.5, 1.5).
+        max_shear_degree (float): Maximum degrees of shear
+            transform. Default: 2.
+        border (tuple[int]): Distance from height and width sides of input
+            image to adjust output shape. Only used in mosaic dataset.
+            Default: (0, 0).
+        border_val (tuple[int]): Border padding values of 3 channels.
+            Default: (114, 114, 114).
+        min_bbox_size (float): Width and height threshold to filter bboxes.
+            If the height or width of a box is smaller than this value, it
+            will be removed. Default: 2.
+        min_area_ratio (float): Threshold of area ratio between
+            original bboxes and warped bboxes. If smaller than this value,
+            the box will be removed. Default: 0.2.
+        max_aspect_ratio (float): Aspect ratio of width and height
+            threshold to filter bboxes. If max(h/w, w/h) larger than this
+            value, the box will be removed. Default: 20.
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        skip_filter (bool): Whether to skip filtering rules. If it
+            is True, the filter rule will not be applied, and the
+            `min_bbox_size`, `min_area_ratio` and `max_aspect_ratio`
+            are ignored. Default to True.
+    """
+
+    def __init__(self,
+                 max_rotate_degree=10.0,
+                 max_translate_ratio=0.1,
+                 scaling_ratio_range=(0.5, 1.5),
+                 max_shear_degree=2.0,
+                 border=(0, 0),
+                 border_val=(114, 114, 114),
+                 min_bbox_size=2,
+                 min_area_ratio=0.2,
+                 max_aspect_ratio=20,
+                 bbox_clip_border=True,
+                 skip_filter=True):
+        assert 0 <= max_translate_ratio <= 1
+        assert scaling_ratio_range[0] <= scaling_ratio_range[1]
+        assert scaling_ratio_range[0] > 0
+        self.max_rotate_degree = max_rotate_degree
+        self.max_translate_ratio = max_translate_ratio
+        self.scaling_ratio_range = scaling_ratio_range
+        self.max_shear_degree = max_shear_degree
+        self.border = border
+        self.border_val = border_val
+        self.min_bbox_size = min_bbox_size
+        self.min_area_ratio = min_area_ratio
+        self.max_aspect_ratio = max_aspect_ratio
+        self.bbox_clip_border = bbox_clip_border
+        self.skip_filter = skip_filter
+
+    def __call__(self, results):
+        img = results['img']
+        height = img.shape[0] + self.border[0] * 2
+        width = img.shape[1] + self.border[1] * 2
+
+        # Rotation
+        rotation_degree = random.uniform(-self.max_rotate_degree,
+                                         self.max_rotate_degree)
+        rotation_matrix = self._get_rotation_matrix(rotation_degree)
+
+        # Scaling
+        scaling_ratio = random.uniform(self.scaling_ratio_range[0],
+                                       self.scaling_ratio_range[1])
+        scaling_matrix = self._get_scaling_matrix(scaling_ratio)
+
+        # Shear
+        x_degree = random.uniform(-self.max_shear_degree,
+                                  self.max_shear_degree)
+        y_degree = random.uniform(-self.max_shear_degree,
+                                  self.max_shear_degree)
+        shear_matrix = self._get_shear_matrix(x_degree, y_degree)
+
+        # Translation
+        trans_x = random.uniform(-self.max_translate_ratio,
+                                 self.max_translate_ratio) * width
+        trans_y = random.uniform(-self.max_translate_ratio,
+                                 self.max_translate_ratio) * height
+        translate_matrix = self._get_translation_matrix(trans_x, trans_y)
+        # applied to column vectors: scale first, translation last
+        warp_matrix = (
+            translate_matrix @ shear_matrix @ rotation_matrix @ scaling_matrix)
+
+        img = cv2.warpPerspective(
+            img,
+            warp_matrix,
+            dsize=(width, height),
+            borderValue=self.border_val)
+        results['img'] = img
+        results['img_shape'] = img.shape
+
+        for key in results.get('bbox_fields', []):
+            bboxes = results[key]
+            num_bboxes = len(bboxes)
+            if num_bboxes:
+                # homogeneous coordinates
+                xs = bboxes[:, [0, 0, 2, 2]].reshape(num_bboxes * 4)
+                ys = bboxes[:, [1, 3, 3, 1]].reshape(num_bboxes * 4)
+                ones = np.ones_like(xs)
+                points = np.vstack([xs, ys, ones])
+
+                warp_points = warp_matrix @ points
+                warp_points = warp_points[:2] / warp_points[2]
+                xs = warp_points[0].reshape(num_bboxes, 4)
+                ys = warp_points[1].reshape(num_bboxes, 4)
+
+                warp_bboxes = np.vstack(
+                    (xs.min(1), ys.min(1), xs.max(1), ys.max(1))).T
+
+                if self.bbox_clip_border:
+                    warp_bboxes[:, [0, 2]] = \
+                        warp_bboxes[:, [0, 2]].clip(0, width)
+                    warp_bboxes[:, [1, 3]] = \
+                        warp_bboxes[:, [1, 3]].clip(0, height)
+
+                # remove outside bbox
+                valid_index = find_inside_bboxes(warp_bboxes, height, width)
+                if not self.skip_filter:
+                    # filter bboxes
+                    filter_index = self.filter_gt_bboxes(
+                        bboxes * scaling_ratio, warp_bboxes)
+                    valid_index = valid_index & filter_index
+
+                results[key] = warp_bboxes[valid_index]
+                if key in ['gt_bboxes']:
+                    if 'gt_labels' in results:
+                        results['gt_labels'] = results['gt_labels'][
+                            valid_index]
+
+                if 'gt_masks' in results:
+                    raise NotImplementedError(
+                        'RandomAffine only supports bbox.')
+        return results
+
+    def filter_gt_bboxes(self, origin_bboxes, wrapped_bboxes):
+        origin_w = origin_bboxes[:, 2] - origin_bboxes[:, 0]
+        origin_h = origin_bboxes[:, 3] - origin_bboxes[:, 1]
+        wrapped_w = wrapped_bboxes[:, 2] - wrapped_bboxes[:, 0]
+        wrapped_h = wrapped_bboxes[:, 3] - wrapped_bboxes[:, 1]
+        aspect_ratio = np.maximum(wrapped_w / (wrapped_h + 1e-16),
+                                  wrapped_h / (wrapped_w + 1e-16))
+
+        wh_valid_idx = (wrapped_w > self.min_bbox_size) & \
+                       (wrapped_h > self.min_bbox_size)
+        area_valid_idx = wrapped_w * wrapped_h / (origin_w * origin_h +
+                                                  1e-16) > self.min_area_ratio
+        aspect_ratio_valid_idx = aspect_ratio < self.max_aspect_ratio
+        return wh_valid_idx & area_valid_idx & aspect_ratio_valid_idx
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(max_rotate_degree={self.max_rotate_degree}, '
+        repr_str += f'max_translate_ratio={self.max_translate_ratio}, '
+        repr_str += f'scaling_ratio={self.scaling_ratio_range}, '
+        repr_str += f'max_shear_degree={self.max_shear_degree}, '
+        repr_str += f'border={self.border}, '
+        repr_str += f'border_val={self.border_val}, '
+        repr_str += f'min_bbox_size={self.min_bbox_size}, '
+        repr_str += f'min_area_ratio={self.min_area_ratio}, '
+        repr_str += f'max_aspect_ratio={self.max_aspect_ratio}, '
+        repr_str += f'skip_filter={self.skip_filter})'
+        return repr_str
+
+    @staticmethod
+    def _get_rotation_matrix(rotate_degrees):
+        radian = math.radians(rotate_degrees)
+        rotation_matrix = np.array(
+            [[np.cos(radian), -np.sin(radian), 0.],
+             [np.sin(radian), np.cos(radian), 0.], [0., 0., 1.]],
+            dtype=np.float32)
+        return rotation_matrix
+
+    @staticmethod
+    def _get_scaling_matrix(scale_ratio):
+        scaling_matrix = np.array(
+            [[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]],
+            dtype=np.float32)
+        return scaling_matrix
+
+    @staticmethod
+    def _get_share_matrix(scale_ratio):
+        scaling_matrix = np.array(
+            [[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]],
+            dtype=np.float32)
+        return scaling_matrix
+
+    @staticmethod
+    def _get_shear_matrix(x_shear_degrees, y_shear_degrees):
+        x_radian = math.radians(x_shear_degrees)
+        y_radian = math.radians(y_shear_degrees)
+        shear_matrix = np.array([[1, np.tan(x_radian), 0.],
+                                 [np.tan(y_radian), 1, 0.], [0., 0., 1.]],
+                                dtype=np.float32)
+        return shear_matrix
+
+    @staticmethod
+    def _get_translation_matrix(x, y):
+        translation_matrix = np.array([[1, 0., x], [0., 1, y], [0., 0., 1.]],
+                                      dtype=np.float32)
+        return translation_matrix
+
+
+@PIPELINES.register_module()
+class YOLOXHSVRandomAug:
+    """Randomly shift the hue, saturation and value of the image.
+
+    Referenced from https://github.com/Megvii-BaseDetection/YOLOX/blob/main/
+    yolox/data/data_augment.py#L21.
+
+    Args:
+        hue_delta (int): delta of hue. Default: 5.
+        saturation_delta (int): delta of saturation. Default: 30.
+        value_delta (int): delta of value. Default: 30.
+    """
+
+    def __init__(self, hue_delta=5, saturation_delta=30, value_delta=30):
+        self.hue_delta = hue_delta
+        self.saturation_delta = saturation_delta
+        self.value_delta = value_delta
+
+    def __call__(self, results):
+        img = results['img']
+        deltas = [self.hue_delta, self.saturation_delta, self.value_delta]
+        gains = np.random.uniform(-1, 1, 3) * deltas
+        # zero out a random subset of the h, s, v gains
+        gains *= np.random.randint(0, 2, 3)
+        # work in int16 to prevent overflow when the gains are added
+        gains = gains.astype(np.int16)
+        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.int16)
+
+        hsv[..., 0] = (hsv[..., 0] + gains[0]) % 180
+        for channel in (1, 2):
+            hsv[..., channel] = np.clip(hsv[..., channel] + gains[channel], 0,
+                                        255)
+        cv2.cvtColor(hsv.astype(img.dtype), cv2.COLOR_HSV2BGR, dst=img)
+
+        results['img'] = img
+        return results
+
+    def __repr__(self):
+        return (f'{self.__class__.__name__}('
+                f'hue_delta={self.hue_delta}, '
+                f'saturation_delta={self.saturation_delta}, '
+                f'value_delta={self.value_delta})')
+
+
+@PIPELINES.register_module()
+class CopyPaste:
+    """Simple Copy-Paste is a Strong Data Augmentation Method for Instance
+    Segmentation. The simple copy-paste transform steps are as follows:
+
+    1. The destination image is already resized with aspect ratio kept,
+       cropped and padded.
+    2. Randomly select a source image, which is also already resized
+       with aspect ratio kept, cropped and padded in a similar way
+       as the destination image.
+    3. Randomly select some objects from the source image.
+    4. Paste these source objects to the destination image directly,
+       due to the source and destination image have the same size.
+    5. Update object masks of the destination image, for some origin objects
+       may be occluded.
+    6. Generate bboxes from the updated destination masks and
+       filter some objects which are totally occluded, and adjust bboxes
+       which are partly occluded.
+    7. Append selected source bboxes, masks, and labels.
+
+    Args:
+        max_num_pasted (int): The maximum number of pasted objects.
+            Default: 100.
+        bbox_occluded_thr (int): The threshold of occluded bbox.
+            Default: 10.
+        mask_occluded_thr (int): The threshold of occluded mask.
+            Default: 300.
+        selected (bool): Whether select objects or not. If select is False,
+            all objects of the source image will be pasted to the
+            destination image.
+            Default: True.
+    """
+
+    def __init__(
+        self,
+        max_num_pasted=100,
+        bbox_occluded_thr=10,
+        mask_occluded_thr=300,
+        selected=True,
+    ):
+        self.max_num_pasted = max_num_pasted
+        self.bbox_occluded_thr = bbox_occluded_thr
+        self.mask_occluded_thr = mask_occluded_thr
+        self.selected = selected
+
+    def get_indexes(self, dataset):
+        """Call function to collect the index of the source image.
+
+        Args:
+            dataset (:obj:`MultiImageMixDataset`): The dataset.
+        Returns:
+            int: Index drawn by ``random.randint(0, len(dataset))``.
+        """
+        return random.randint(0, len(dataset))
+
+    def __call__(self, results):
+        """Call function to make a copy-paste of image.
+
+        Args:
+            results (dict): Result dict.
+        Returns:
+            dict: Result dict with copy-paste transformed.
+        """
+
+        assert 'mix_results' in results
+        num_images = len(results['mix_results'])
+        assert num_images == 1, \
+            f'CopyPaste only supports processing 2 images, got {num_images}'
+        if self.selected:
+            selected_results = self._select_object(results['mix_results'][0])
+        else:
+            selected_results = results['mix_results'][0]
+        return self._copy_paste(results, selected_results)
+
+    def _select_object(self, results):
+        """Select some objects from the source results."""
+        bboxes = results['gt_bboxes']
+        labels = results['gt_labels']
+        masks = results['gt_masks']
+        max_num_pasted = min(bboxes.shape[0] + 1, self.max_num_pasted)
+        num_pasted = np.random.randint(0, max_num_pasted)
+        selected_inds = np.random.choice(
+            bboxes.shape[0], size=num_pasted, replace=False)
+
+        selected_bboxes = bboxes[selected_inds]
+        selected_labels = labels[selected_inds]
+        selected_masks = masks[selected_inds]
+
+        results['gt_bboxes'] = selected_bboxes
+        results['gt_labels'] = selected_labels
+        results['gt_masks'] = selected_masks
+        return results
+
+    def _copy_paste(self, dst_results, src_results):
+        """CopyPaste transform function.
+
+        Args:
+            dst_results (dict): Result dict of the destination image.
+            src_results (dict): Result dict of the source image.
+        Returns:
+            dict: Updated result dict.
+        """
+        dst_img = dst_results['img']
+        dst_bboxes = dst_results['gt_bboxes']
+        dst_labels = dst_results['gt_labels']
+        dst_masks = dst_results['gt_masks']
+
+        src_img = src_results['img']
+        src_bboxes = src_results['gt_bboxes']
+        src_labels = src_results['gt_labels']
+        src_masks = src_results['gt_masks']
+
+        if len(src_bboxes) == 0:
+            return dst_results
+
+        # update masks and generate bboxes from updated masks
+        composed_mask = np.where(np.any(src_masks.masks, axis=0), 1, 0)
+        updated_dst_masks = self.get_updated_masks(dst_masks, composed_mask)
+        updated_dst_bboxes = updated_dst_masks.get_bboxes()
+        assert len(updated_dst_bboxes) == len(updated_dst_masks)
+
+        # filter totally occluded objects
+        bboxes_inds = np.all(
+            np.abs(
+                (updated_dst_bboxes - dst_bboxes)) <= self.bbox_occluded_thr,
+            axis=-1)
+        masks_inds = updated_dst_masks.masks.sum(
+            axis=(1, 2)) > self.mask_occluded_thr
+        valid_inds = bboxes_inds | masks_inds
+
+        # Paste source objects to destination image directly
+        img = dst_img * (1 - composed_mask[..., np.newaxis]
+                         ) + src_img * composed_mask[..., np.newaxis]
+        bboxes = np.concatenate([updated_dst_bboxes[valid_inds], src_bboxes])
+        labels = np.concatenate([dst_labels[valid_inds], src_labels])
+        masks = np.concatenate(
+            [updated_dst_masks.masks[valid_inds], src_masks.masks])
+
+        dst_results['img'] = img
+        dst_results['gt_bboxes'] = bboxes
+        dst_results['gt_labels'] = labels
+        dst_results['gt_masks'] = BitmapMasks(masks, masks.shape[1],
+                                              masks.shape[2])
+
+        return dst_results
+
+    def get_updated_masks(self, masks, composed_mask):
+        assert masks.masks.shape[-2:] == composed_mask.shape[-2:], \
+            'Cannot compare two arrays of different size'
+        masks.masks = np.where(composed_mask, 0, masks.masks)
+        return masks
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(max_num_pasted={self.max_num_pasted}, '
+        repr_str += f'bbox_occluded_thr={self.bbox_occluded_thr}, '
+        repr_str += f'mask_occluded_thr={self.mask_occluded_thr}, '
+        repr_str += f'selected={self.selected})'
+        return repr_str
+
+@PIPELINES.register_module()
+class RGB2Gray(object):
+    """Collapse the image channels into one gray channel, then replicate it.
+
+    Each pixel becomes the sum of its channel values multiplied by
+    ``weights``. The resulting single channel is repeated ``out_channels``
+    times, or as many times as there are weights when ``out_channels`` is
+    None, which keeps the number of channels of the input.
+
+    Args:
+        out_channels (int): Expected number of output channels after
+            transforming. Default: None.
+        weights (tuple[float]): Per-channel weights of the weighted sum.
+            Default: (0.299, 0.587, 0.114).
+    """
+
+    def __init__(self, out_channels=None, weights=(0.299, 0.587, 0.114)):
+        assert out_channels is None or out_channels > 0
+        assert isinstance(weights, tuple)
+        assert all(isinstance(item, (float, int)) for item in weights)
+        self.out_channels = out_channels
+        self.weights = weights
+
+    def __call__(self, results):
+        """Replace ``results['img']`` with its grayscale version.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with 'img' and 'img_shape' updated.
+        """
+        img = results['img']
+        assert len(img.shape) == 3
+        assert img.shape[2] == len(self.weights)
+        weights = np.array(self.weights).reshape((1, 1, -1))
+        # weighted sum over the channel axis, kept as a single channel
+        gray = (img * weights).sum(2, keepdims=True)
+        num_channels = self.out_channels
+        if num_channels is None:
+            num_channels = weights.shape[2]
+        img = gray.repeat(num_channels, axis=2)
+
+        results['img'] = img
+        results['img_shape'] = img.shape
+
+        return results
+
+    def __repr__(self):
+        """Return ``ClassName(out_channels=..., weights=...)``."""
+        name = self.__class__.__name__
+        return (f'{name}(out_channels={self.out_channels}, '
+                f'weights={self.weights})')
\ No newline at end of file
diff --git a/mmdet/datasets/samplers/__init__.py b/mmdet/datasets/samplers/__init__.py
new file mode 100755
index 0000000..a4c7ea1
--- /dev/null
+++ b/mmdet/datasets/samplers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .class_aware_sampler import ClassAwareSampler
+from .distributed_sampler import DistributedSampler
+from .group_sampler import DistributedGroupSampler, GroupSampler
+from .infinite_sampler import InfiniteBatchSampler, InfiniteGroupBatchSampler
+
+__all__ = [
+    'DistributedSampler', 'DistributedGroupSampler', 'GroupSampler',
+    'InfiniteGroupBatchSampler', 'InfiniteBatchSampler', 'ClassAwareSampler'
+]
diff --git a/mmdet/datasets/samplers/class_aware_sampler.py b/mmdet/datasets/samplers/class_aware_sampler.py
new file mode 100755
index 0000000..c52708e
--- /dev/null
+++ b/mmdet/datasets/samplers/class_aware_sampler.py
@@ -0,0 +1,176 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+from mmcv.runner import get_dist_info
+from torch.utils.data import Sampler
+
+from mmdet.core.utils import sync_random_seed
+
+
+class ClassAwareSampler(Sampler):
+    r"""Sampler that restricts data loading to the label of the dataset.
+
+    A class-aware sampling strategy to effectively tackle the
+    non-uniform class distribution. The length of the training data is
+    consistent with source data. Simple improvements based on `Relay
+    Backpropagation for Effective Learning of Deep Convolutional
+    Neural Networks <https://arxiv.org/abs/1512.05830>`_
+
+    The implementation logic is referred to
+    https://github.com/Sense-X/TSD/blob/master/mmdet/datasets/samplers/distributed_classaware_sampler.py
+
+    Args:
+        dataset: Dataset used for sampling; it must provide ``get_cat2imgs``.
+        samples_per_gpu (int): When model is :obj:`DistributedDataParallel`,
+            it is the number of training samples on each GPU.
+            When model is :obj:`DataParallel`, it is
+            `num_gpus * samples_per_gpu`.
+            Default : 1.
+        num_replicas (optional): Number of processes participating in
+            distributed training.
+        rank (optional): Rank of the current process within num_replicas.
+        seed (int, optional): random seed used to shuffle the sampler
+            (combined with the epoch). This number should be identical
+            across all processes in the distributed group. Default: 0.
+        num_sample_class (int): The number of samples taken from each
+            per-label list. Default: 1
+    """
+
+    def __init__(self,
+                 dataset,
+                 samples_per_gpu=1,
+                 num_replicas=None,
+                 rank=None,
+                 seed=0,
+                 num_sample_class=1):
+        _rank, _num_replicas = get_dist_info()
+        if num_replicas is None:
+            num_replicas = _num_replicas
+        if rank is None:
+            rank = _rank
+
+        self.dataset = dataset
+        self.num_replicas = num_replicas
+        self.samples_per_gpu = samples_per_gpu
+        self.rank = rank
+        self.epoch = 0
+        # Must be the same across all workers. If None, will use a
+        # random seed shared among workers
+        # (require synchronization among all workers)
+        self.seed = sync_random_seed(seed)
+
+        # The number of samples taken from each per-label list
+        assert isinstance(num_sample_class, int) and num_sample_class > 0
+        self.num_sample_class = num_sample_class
+        # Get per-label image list from dataset
+        assert hasattr(dataset, 'get_cat2imgs'), \
+            'dataset must have `get_cat2imgs` function'
+        self.cat_dict = dataset.get_cat2imgs()
+
+        self.num_samples = int(
+            math.ceil(
+                len(self.dataset) * 1.0 / self.num_replicas /
+                self.samples_per_gpu)) * self.samples_per_gpu
+        self.total_size = self.num_samples * self.num_replicas
+
+        # get number of images containing each category
+        self.num_cat_imgs = [len(x) for x in self.cat_dict.values()]
+        # labels with at least one image; `__iter__` uses them as cat_dict keys
+        self.valid_cat_inds = [
+            i for i, length in enumerate(self.num_cat_imgs) if length != 0
+        ]
+        self.num_classes = len(self.valid_cat_inds)
+
+    def __iter__(self):
+        # deterministically shuffle based on epoch
+        g = torch.Generator()
+        g.manual_seed(self.epoch + self.seed)
+
+        # initialize label list
+        label_iter_list = RandomCycleIter(self.valid_cat_inds, generator=g)
+        # initialize each per-label image list
+        data_iter_dict = dict()
+        for i in self.valid_cat_inds:
+            data_iter_dict[i] = RandomCycleIter(self.cat_dict[i], generator=g)
+
+        def gen_cat_img_inds(cls_list, data_dict, num_sample_cls):
+            """Traverse the categories and extract `num_sample_cls` image
+            indexes of the corresponding categories one by one."""
+            id_indices = []
+            for _ in range(len(cls_list)):
+                cls_idx = next(cls_list)
+                for _ in range(num_sample_cls):
+                    img_idx = next(data_dict[cls_idx])
+                    id_indices.append(img_idx)
+            return id_indices
+
+        # number of passes over the label list needed to reach total_size
+        num_bins = int(
+            math.ceil(self.total_size * 1.0 / self.num_classes /
+                      self.num_sample_class))
+        indices = []
+        for _ in range(num_bins):
+            indices += gen_cat_img_inds(label_iter_list, data_iter_dict,
+                                        self.num_sample_class)
+
+        # fix extra samples to make it evenly divisible
+        if len(indices) >= self.total_size:
+            indices = indices[:self.total_size]
+        else:
+            indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+
+        # subsample
+        offset = self.num_samples * self.rank
+        indices = indices[offset:offset + self.num_samples]
+        assert len(indices) == self.num_samples
+
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samples
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+
+
+class RandomCycleIter:
+    """Endless iterator that yields ``data`` in a reshuffled order.
+
+    Every time all items have been yielded a new random permutation is
+    drawn. The implementation logic is referred to
+    https://github.com/wutong16/DistributionBalancedLoss/blob/master/mllt/datasets/loader/sampler.py
+
+    Example:
+        >>> label_list = [0, 1, 2, 4, 5]
+        >>> g = torch.Generator()
+        >>> g.manual_seed(0)
+        >>> label_iter_list = RandomCycleIter(label_list, generator=g)
+        >>> index = next(label_iter_list)
+
+    Args:
+        data (list or ndarray): The data that needs to be shuffled.
+        generator: A torch.Generator object passed to ``torch.randperm``.
+    """  # noqa: W605
+
+    def __init__(self, data, generator=None):
+        self.data, self.length = data, len(data)
+        self.generator = generator
+        self.index = torch.randperm(self.length, generator=generator).numpy()
+        self.i = 0
+
+    def __iter__(self):
+        return self
+
+    def __len__(self):
+        return len(self.data)
+
+    def __next__(self):
+        if self.i == self.length:
+            # exhausted: reshuffle and start over
+            fresh = torch.randperm(self.length, generator=self.generator)
+            self.index, self.i = fresh.numpy(), 0
+        item = self.data[self.index[self.i]]
+        self.i += 1
+        return item
diff --git a/mmdet/datasets/samplers/distributed_sampler.py b/mmdet/datasets/samplers/distributed_sampler.py
new file mode 100755
index 0000000..1bc8b7c
--- /dev/null
+++ b/mmdet/datasets/samplers/distributed_sampler.py
@@ -0,0 +1,54 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+from torch.utils.data import DistributedSampler as _DistributedSampler
+
+from mmdet.core.utils import sync_random_seed
+from mmdet.utils import get_device
+
+
+class DistributedSampler(_DistributedSampler):
+
+    def __init__(self,
+                 dataset,
+                 num_replicas=None,
+                 rank=None,
+                 shuffle=True,
+                 seed=0):
+        super().__init__(
+            dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
+
+        # In distributed sampling, different ranks should sample
+        # non-overlapped data in the dataset. Therefore, this function
+        # is used to make sure that each rank shuffles the data indices
+        # in the same order based on the same seed. Then different ranks
+        # could use different indices to select non-overlapped data from the
+        # same data list.
+        device = get_device()
+        self.seed = sync_random_seed(seed, device)
+
+    def __iter__(self):
+        # deterministically shuffle based on epoch
+        if self.shuffle:
+            g = torch.Generator()
+            # When :attr:`shuffle=True`, this ensures all replicas
+            # use a different random ordering for each epoch.
+            # Otherwise, the next iteration of this sampler will
+            # yield the same ordering.
+            g.manual_seed(self.epoch + self.seed)
+            indices = torch.randperm(len(self.dataset), generator=g).tolist()
+        else:
+            indices = torch.arange(len(self.dataset)).tolist()
+
+        # add extra samples to make it evenly divisible
+        # in case that indices is shorter than half of total_size
+        indices = (indices *
+                   math.ceil(self.total_size / len(indices)))[:self.total_size]
+        assert len(indices) == self.total_size
+
+        # subsample
+        indices = indices[self.rank:self.total_size:self.num_replicas]
+        assert len(indices) == self.num_samples
+
+        return iter(indices)
diff --git a/mmdet/datasets/samplers/group_sampler.py b/mmdet/datasets/samplers/group_sampler.py
new file mode 100755
index 0000000..783d2b2
--- /dev/null
+++ b/mmdet/datasets/samplers/group_sampler.py
@@ -0,0 +1,148 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import numpy as np
+import torch
+from mmcv.runner import get_dist_info
+from torch.utils.data import Sampler
+
+
+class GroupSampler(Sampler):
+
+    def __init__(self, dataset, samples_per_gpu=1):
+        assert hasattr(dataset, 'flag')
+        self.dataset = dataset
+        self.samples_per_gpu = samples_per_gpu
+        self.flag = dataset.flag.astype(np.int64)
+        self.group_sizes = np.bincount(self.flag)
+        self.num_samples = 0
+        for i, size in enumerate(self.group_sizes):
+            self.num_samples += int(np.ceil(
+                size / self.samples_per_gpu)) * self.samples_per_gpu
+
+    def __iter__(self):
+        indices = []
+        for i, size in enumerate(self.group_sizes):
+            if size == 0:
+                continue
+            indice = np.where(self.flag == i)[0]
+            assert len(indice) == size
+            np.random.shuffle(indice)
+            num_extra = int(np.ceil(size / self.samples_per_gpu)
+                            ) * self.samples_per_gpu - len(indice)
+            indice = np.concatenate(
+                [indice, np.random.choice(indice, num_extra)])
+            indices.append(indice)
+        indices = np.concatenate(indices)
+        indices = [
+            indices[i * self.samples_per_gpu:(i + 1) * self.samples_per_gpu]
+            for i in np.random.permutation(
+                range(len(indices) // self.samples_per_gpu))
+        ]
+        indices = np.concatenate(indices)
+        indices = indices.astype(np.int64).tolist()
+        assert len(indices) == self.num_samples
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samples
+
+
+class DistributedGroupSampler(Sampler):
+    """Sampler that restricts data loading to a subset of the dataset.
+
+    It is especially useful in conjunction with
+    :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
+    process can pass a DistributedSampler instance as a DataLoader sampler,
+    and load a subset of the original dataset that is exclusive to it.
+
+    .. note::
+        Dataset is assumed to be of constant size.
+
+    Arguments:
+        dataset: Dataset used for sampling.
+        num_replicas (optional): Number of processes participating in
+            distributed training.
+        rank (optional): Rank of the current process within num_replicas.
+        seed (int, optional): random seed used to shuffle the indices.
+            This number should be identical across all processes in the
+            distributed group. Default: 0.
+    """
+
+    def __init__(self,
+                 dataset,
+                 samples_per_gpu=1,
+                 num_replicas=None,
+                 rank=None,
+                 seed=0):
+        _rank, _num_replicas = get_dist_info()
+        if num_replicas is None:
+            num_replicas = _num_replicas
+        if rank is None:
+            rank = _rank
+        self.dataset = dataset
+        self.samples_per_gpu = samples_per_gpu
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+        self.seed = seed if seed is not None else 0
+
+        assert hasattr(self.dataset, 'flag')
+        self.flag = self.dataset.flag
+        self.group_sizes = np.bincount(self.flag)
+
+        self.num_samples = 0
+        for i, j in enumerate(self.group_sizes):
+            self.num_samples += int(
+                math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
+                          self.num_replicas)) * self.samples_per_gpu
+        self.total_size = self.num_samples * self.num_replicas
+
+    def __iter__(self):
+        # deterministically shuffle based on epoch
+        g = torch.Generator()
+        g.manual_seed(self.epoch + self.seed)
+
+        indices = []
+        for i, size in enumerate(self.group_sizes):
+            if size > 0:
+                indice = np.where(self.flag == i)[0]
+                assert len(indice) == size
+                # add .numpy() to avoid bug when selecting indice in parrots.
+                # TODO: check whether torch.randperm() can be replaced by
+                # numpy.random.permutation().
+                indice = indice[list(
+                    torch.randperm(int(size), generator=g).numpy())].tolist()
+                extra = int(
+                    math.ceil(
+                        size * 1.0 / self.samples_per_gpu / self.num_replicas)
+                ) * self.samples_per_gpu * self.num_replicas - len(indice)
+                # pad indice
+                tmp = indice.copy()
+                for _ in range(extra // size):
+                    indice.extend(tmp)
+                indice.extend(tmp[:extra % size])
+                indices.extend(indice)
+
+        assert len(indices) == self.total_size
+
+        indices = [
+            indices[j] for i in list(
+                torch.randperm(
+                    len(indices) // self.samples_per_gpu, generator=g))
+            for j in range(i * self.samples_per_gpu, (i + 1) *
+                           self.samples_per_gpu)
+        ]
+
+        # subsample
+        offset = self.num_samples * self.rank
+        indices = indices[offset:offset + self.num_samples]
+        assert len(indices) == self.num_samples
+
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samples
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
diff --git a/mmdet/datasets/samplers/infinite_sampler.py b/mmdet/datasets/samplers/infinite_sampler.py
new file mode 100755
index 0000000..d42487e
--- /dev/null
+++ b/mmdet/datasets/samplers/infinite_sampler.py
@@ -0,0 +1,186 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import itertools
+
+import numpy as np
+import torch
+from mmcv.runner import get_dist_info
+from torch.utils.data.sampler import Sampler
+
+from mmdet.core.utils import sync_random_seed
+
+
+class InfiniteGroupBatchSampler(Sampler):
+    """Similar to `BatchSampler` wrapping a `GroupSampler`. It is designed for
+    iteration-based runners like `IterBasedRunner` and yields a mini-batch
+    indices each time, all indices in a batch should be in the same group.
+
+    The implementation logic is referred to
+    https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/samplers/grouped_batch_sampler.py
+
+    Args:
+        dataset (object): The dataset.
+        batch_size (int): When model is :obj:`DistributedDataParallel`,
+            it is the number of training samples on each GPU.
+            When model is :obj:`DataParallel`, it is
+            `num_gpus * samples_per_gpu`.
+            Default : 1.
+        world_size (int, optional): Number of processes participating in
+            distributed training. Default: None.
+        rank (int, optional): Rank of current process. Default: None.
+        seed (int): Random seed. Default: 0.
+        shuffle (bool): Whether to shuffle the indices of a dummy `epoch`.
+            Note that `shuffle=False` can not guarantee that the generated
+            indices are sequential, because it needs to ensure that all
+            indices in a batch are in the same group. Default: True.
+    """  # noqa: W605
+
+    def __init__(self,
+                 dataset,
+                 batch_size=1,
+                 world_size=None,
+                 rank=None,
+                 seed=0,
+                 shuffle=True):
+        _rank, _world_size = get_dist_info()
+        if world_size is None:
+            world_size = _world_size
+        if rank is None:
+            rank = _rank
+        self.rank = rank
+        self.world_size = world_size
+        self.dataset = dataset
+        self.batch_size = batch_size
+        # In distributed sampling, different ranks should sample
+        # non-overlapped data in the dataset. Therefore, this function
+        # is used to make sure that each rank shuffles the data indices
+        # in the same order based on the same seed. Then different ranks
+        # could use different indices to select non-overlapped data from the
+        # same data list.
+        self.seed = sync_random_seed(seed)
+        self.shuffle = shuffle
+
+        assert hasattr(self.dataset, 'flag')
+        self.flag = self.dataset.flag
+        self.group_sizes = np.bincount(self.flag)
+        # buffer used to save indices of each group
+        self.buffer_per_group = {k: [] for k in range(len(self.group_sizes))}
+
+        self.size = len(dataset)
+        self.indices = self._indices_of_rank()
+
+    def _infinite_indices(self):
+        """Infinitely yield a sequence of indices."""
+        g = torch.Generator()
+        g.manual_seed(self.seed)
+        while True:
+            if self.shuffle:
+                yield from torch.randperm(self.size, generator=g).tolist()
+
+            else:
+                yield from torch.arange(self.size).tolist()
+
+    def _indices_of_rank(self):
+        """Slice the infinite indices by rank."""
+        yield from itertools.islice(self._infinite_indices(), self.rank, None,
+                                    self.world_size)
+
+    def __iter__(self):
+        # once batch size is reached, yield the indices
+        for idx in self.indices:
+            flag = self.flag[idx]
+            group_buffer = self.buffer_per_group[flag]
+            group_buffer.append(idx)
+            if len(group_buffer) == self.batch_size:
+                yield group_buffer[:]
+                del group_buffer[:]
+
+    def __len__(self):
+        """Length of base dataset."""
+        return self.size
+
+    def set_epoch(self, epoch):
+        """Not supported in `IterationBased` runner."""
+        raise NotImplementedError
+
+
+class InfiniteBatchSampler(Sampler):
+    """Similar to `BatchSampler` wrapping a `DistributedSampler`. It is
+    designed for iteration-based runners like `IterBasedRunner` and yields
+    a mini-batch of indices each time.
+
+    The implementation logic is referred to
+    https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/samplers/grouped_batch_sampler.py
+
+    Args:
+        dataset (object): The dataset.
+        batch_size (int): When model is :obj:`DistributedDataParallel`,
+            it is the number of training samples on each GPU,
+            When model is :obj:`DataParallel`, it is
+            `num_gpus * samples_per_gpu`.
+            Default : 1.
+        world_size (int, optional): Number of processes participating in
+            distributed training. Default: None.
+        rank (int, optional): Rank of current process. Default: None.
+        seed (int): Random seed. Default: 0.
+        shuffle (bool): Whether shuffle the dataset or not. Default: True.
+    """  # noqa: W605
+
+    def __init__(self,
+                 dataset,
+                 batch_size=1,
+                 world_size=None,
+                 rank=None,
+                 seed=0,
+                 shuffle=True):
+        _rank, _world_size = get_dist_info()
+        if world_size is None:
+            world_size = _world_size
+        if rank is None:
+            rank = _rank
+        self.rank = rank
+        self.world_size = world_size
+        self.dataset = dataset
+        self.batch_size = batch_size
+        # In distributed sampling, different ranks should sample
+        # non-overlapped data in the dataset. Therefore, this function
+        # is used to make sure that each rank shuffles the data indices
+        # in the same order based on the same seed. Then different ranks
+        # could use different indices to select non-overlapped data from the
+        # same data list.
+        self.seed = sync_random_seed(seed)
+        self.shuffle = shuffle
+        self.size = len(dataset)
+        self.indices = self._indices_of_rank()
+
+    def _infinite_indices(self):
+        """Infinitely yield a sequence of indices."""
+        g = torch.Generator()
+        g.manual_seed(self.seed)
+        while True:
+            if self.shuffle:
+                yield from torch.randperm(self.size, generator=g).tolist()
+
+            else:
+                yield from torch.arange(self.size).tolist()
+
+    def _indices_of_rank(self):
+        """Slice the infinite indices by rank."""
+        yield from itertools.islice(self._infinite_indices(), self.rank, None,
+                                    self.world_size)
+
+    def __iter__(self):
+        # once batch size is reached, yield the indices
+        batch_buffer = []
+        for idx in self.indices:
+            batch_buffer.append(idx)
+            if len(batch_buffer) == self.batch_size:
+                yield batch_buffer
+                batch_buffer = []
+
+    def __len__(self):
+        """Length of base dataset."""
+        return self.size
+
+    def set_epoch(self, epoch):
+        """Not supported in `IterationBased` runner."""
+        raise NotImplementedError
diff --git a/mmdet/datasets/utils.py b/mmdet/datasets/utils.py
new file mode 100755
index 0000000..26e922d
--- /dev/null
+++ b/mmdet/datasets/utils.py
@@ -0,0 +1,166 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+
+from mmcv.cnn import VGG
+from mmcv.runner.hooks import HOOKS, Hook
+
+from mmdet.datasets.builder import PIPELINES
+from mmdet.datasets.pipelines import (LoadAnnotations, LoadImageFromFile,
+                                      LoadPanopticAnnotations)
+from mmdet.models.dense_heads import GARPNHead, RPNHead
+from mmdet.models.roi_heads.mask_heads import FusedSemanticHead
+
+
+def replace_ImageToTensor(pipelines):
+    """Replace the ImageToTensor transform in a data pipeline to
+    DefaultFormatBundle, which is normally useful in batch inference.
+
+    Args:
+        pipelines (list[dict]): Data pipeline configs.
+
+    Returns:
+        list: The new pipeline list with all ImageToTensor replaced by
+            DefaultFormatBundle.
+
+    Examples:
+        >>> pipelines = [
+        ...    dict(type='LoadImageFromFile'),
+        ...    dict(
+        ...        type='MultiScaleFlipAug',
+        ...        img_scale=(1333, 800),
+        ...        flip=False,
+        ...        transforms=[
+        ...            dict(type='Resize', keep_ratio=True),
+        ...            dict(type='RandomFlip'),
+        ...            dict(type='Normalize', mean=[0, 0, 0], std=[1, 1, 1]),
+        ...            dict(type='Pad', size_divisor=32),
+        ...            dict(type='ImageToTensor', keys=['img']),
+        ...            dict(type='Collect', keys=['img']),
+        ...        ])
+        ...    ]
+        >>> expected_pipelines = [
+        ...    dict(type='LoadImageFromFile'),
+        ...    dict(
+        ...        type='MultiScaleFlipAug',
+        ...        img_scale=(1333, 800),
+        ...        flip=False,
+        ...        transforms=[
+        ...            dict(type='Resize', keep_ratio=True),
+        ...            dict(type='RandomFlip'),
+        ...            dict(type='Normalize', mean=[0, 0, 0], std=[1, 1, 1]),
+        ...            dict(type='Pad', size_divisor=32),
+        ...            dict(type='DefaultFormatBundle'),
+        ...            dict(type='Collect', keys=['img']),
+        ...        ])
+        ...    ]
+        >>> assert expected_pipelines == replace_ImageToTensor(pipelines)
+    """
+    pipelines = copy.deepcopy(pipelines)
+    for i, pipeline in enumerate(pipelines):
+        if pipeline['type'] == 'MultiScaleFlipAug':
+            assert 'transforms' in pipeline
+            pipeline['transforms'] = replace_ImageToTensor(
+                pipeline['transforms'])
+        elif pipeline['type'] == 'ImageToTensor':
+            warnings.warn(
+                '"ImageToTensor" pipeline is replaced by '
+                '"DefaultFormatBundle" for batch inference. It is '
+                'recommended to manually replace it in the test '
+                'data pipeline in your config file.', UserWarning)
+            pipelines[i] = {'type': 'DefaultFormatBundle'}
+    return pipelines
+
+
+def get_loading_pipeline(pipeline):
+    """Only keep loading image and annotations related configuration.
+
+    Args:
+        pipeline (list[dict]): Data pipeline configs.
+
+    Returns:
+        list[dict]: The new pipeline list with only keep
+            loading image and annotations related configuration.
+
+    Examples:
+        >>> pipelines = [
+        ...    dict(type='LoadImageFromFile'),
+        ...    dict(type='LoadAnnotations', with_bbox=True),
+        ...    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+        ...    dict(type='RandomFlip', flip_ratio=0.5),
+        ...    dict(type='Normalize', **img_norm_cfg),
+        ...    dict(type='Pad', size_divisor=32),
+        ...    dict(type='DefaultFormatBundle'),
+        ...    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+        ...    ]
+        >>> expected_pipelines = [
+        ...    dict(type='LoadImageFromFile'),
+        ...    dict(type='LoadAnnotations', with_bbox=True)
+        ...    ]
+        >>> assert expected_pipelines ==\
+        ...        get_loading_pipeline(pipelines)
+    """
+    loading_pipeline_cfg = []
+    for cfg in pipeline:
+        obj_cls = PIPELINES.get(cfg['type'])
+        # TODO: use a more elegant way to distinguish loading modules
+        if obj_cls is not None and obj_cls in (LoadImageFromFile,
+                                               LoadAnnotations,
+                                               LoadPanopticAnnotations):
+            loading_pipeline_cfg.append(cfg)
+    assert len(loading_pipeline_cfg) == 2, \
+        'The data pipeline in your config file must include ' \
+        'loading image and annotations related pipeline.'
+    return loading_pipeline_cfg
+
+
+@HOOKS.register_module()
+class NumClassCheckHook(Hook):
+
+    def _check_head(self, runner):
+        """Check whether the `num_classes` of every head in the model matches
+        the length of `CLASSES` in the dataset of the runner's data loader.
+
+        Args:
+            runner (:obj:`EpochBasedRunner`): Epoch based Runner.
+        """
+        model = runner.model
+        dataset = runner.data_loader.dataset
+        if dataset.CLASSES is None:
+            runner.logger.warning(
+                f'Please set `CLASSES` '
+                f'in the {dataset.__class__.__name__} and '
+                f'check if it is consistent with the `num_classes` '
+                f'of head')
+        else:
+            assert not isinstance(dataset.CLASSES, str), \
+                (f'`CLASSES` in {dataset.__class__.__name__} '
+                 f'should be a tuple of str. '
+                 f'Add comma if number of classes is 1 as '
+                 f'CLASSES = ({dataset.CLASSES},)')
+            for name, module in model.named_modules():
+                if hasattr(module, 'num_classes') and not isinstance(
+                        module, (RPNHead, VGG, FusedSemanticHead, GARPNHead)):
+                    assert module.num_classes == len(dataset.CLASSES), \
+                        (f'The `num_classes` ({module.num_classes}) in '
+                         f'{module.__class__.__name__} of '
+                         f'{model.__class__.__name__} does not match '
+                         f'the length of `CLASSES` ('
+                         f'{len(dataset.CLASSES)}) in '
+                         f'{dataset.__class__.__name__}')
+
+    def before_train_epoch(self, runner):
+        """Check whether the training dataset is compatible with head.
+
+        Args:
+            runner (:obj:`EpochBasedRunner`): Epoch based Runner.
+        """
+        self._check_head(runner)
+
+    def before_val_epoch(self, runner):
+        """Check whether the dataset in val epoch is compatible with head.
+
+        Args:
+            runner (:obj:`EpochBasedRunner`): Epoch based Runner.
+        """
+        self._check_head(runner)
diff --git a/mmdet/datasets/voc.py b/mmdet/datasets/voc.py
new file mode 100755
index 0000000..0a3ea7a
--- /dev/null
+++ b/mmdet/datasets/voc.py
@@ -0,0 +1,112 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import OrderedDict
+
+from mmcv.utils import print_log
+
+from mmdet.core import eval_map, eval_recalls
+from .builder import DATASETS
+from .xml_style import XMLDataset
+
+
+@DATASETS.register_module()
+class VOCDataset(XMLDataset):
+
+    CLASSES = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+               'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+               'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+               'tvmonitor')
+
+    PALETTE = [(106, 0, 228), (119, 11, 32), (165, 42, 42), (0, 0, 192),
+               (197, 226, 255), (0, 60, 100), (0, 0, 142), (255, 77, 255),
+               (153, 69, 1), (120, 166, 157), (0, 182, 199), (0, 226, 252),
+               (182, 182, 255), (0, 0, 230), (220, 20, 60), (163, 255, 0),
+               (0, 82, 0), (3, 95, 161), (0, 80, 100), (183, 130, 88)]
+
+    def __init__(self, **kwargs):
+        super(VOCDataset, self).__init__(**kwargs)
+        if 'VOC2007' in self.img_prefix:
+            self.year = 2007
+        elif 'VOC2012' in self.img_prefix:
+            self.year = 2012
+        else:
+            raise ValueError('Cannot infer dataset year from img_prefix')
+
+    def evaluate(self,
+                 results,
+                 metric='mAP',
+                 logger=None,
+                 proposal_nums=(100, 300, 1000),
+                 iou_thr=0.5,
+                 scale_ranges=None):
+        """Evaluate in VOC protocol.
+
+        Args:
+            results (list[list | tuple]): Testing results of the dataset.
+            metric (str | list[str]): Metrics to be evaluated. Options are
+                'mAP', 'recall'.
+            logger (logging.Logger | str, optional): Logger used for printing
+                related information during evaluation. Default: None.
+            proposal_nums (Sequence[int]): Proposal number used for evaluating
+                recalls, such as recall@100, recall@1000.
+                Default: (100, 300, 1000).
+            iou_thr (float | list[float]): IoU threshold. Default: 0.5.
+            scale_ranges (list[tuple], optional): Scale ranges for evaluating
+                mAP. If not specified, all bounding boxes would be included in
+                evaluation. Default: None.
+
+        Returns:
+            dict[str, float]: AP/recall metrics.
+        """
+
+        if not isinstance(metric, str):
+            assert len(metric) == 1
+            metric = metric[0]
+        allowed_metrics = ['mAP', 'recall']
+        if metric not in allowed_metrics:
+            raise KeyError(f'metric {metric} is not supported')
+        annotations = [self.get_ann_info(i) for i in range(len(self))]
+        eval_results = OrderedDict()
+        iou_thrs = [iou_thr] if isinstance(iou_thr, float) else iou_thr
+        if metric == 'mAP':
+            assert isinstance(iou_thrs, list)
+            if self.year == 2007:
+                ds_name = 'voc07'
+            else:
+                ds_name = self.CLASSES
+            mean_aps = []
+            for iou_thr in iou_thrs:
+                print_log(f'\n{"-" * 15}iou_thr: {iou_thr}{"-" * 15}')
+                # Follow the official implementation,
+                # http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCdevkit_18-May-2011.tar
+                # we should use the legacy coordinate system in mmdet 1.x,
+                # which means w, h should be computed as `x2 - x1 + 1` and
+                # `y2 - y1 + 1`
+                mean_ap, _ = eval_map(
+                    results,
+                    annotations,
+                    scale_ranges=None,
+                    iou_thr=iou_thr,
+                    dataset=ds_name,
+                    logger=logger,
+                    use_legacy_coordinate=True)
+                mean_aps.append(mean_ap)
+                eval_results[f'AP{int(iou_thr * 100):02d}'] = round(mean_ap, 3)
+            eval_results['mAP'] = sum(mean_aps) / len(mean_aps)
+            eval_results.move_to_end('mAP', last=False)
+        elif metric == 'recall':
+            gt_bboxes = [ann['bboxes'] for ann in annotations]
+            recalls = eval_recalls(
+                gt_bboxes,
+                results,
+                proposal_nums,
+                iou_thrs,
+                logger=logger,
+                use_legacy_coordinate=True)
+            for i, num in enumerate(proposal_nums):
+                for j, iou_thr in enumerate(iou_thrs):
+                    eval_results[f'recall@{num}@{iou_thr}'] = recalls[i, j]
+            if recalls.shape[1] > 1:
+                ar = recalls.mean(axis=1)
+                for i, num in enumerate(proposal_nums):
+                    eval_results[f'AR@{num}'] = ar[i]
+        return eval_results
diff --git a/mmdet/datasets/wider_face.py b/mmdet/datasets/wider_face.py
new file mode 100755
index 0000000..85a5fdc
--- /dev/null
+++ b/mmdet/datasets/wider_face.py
@@ -0,0 +1,54 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import xml.etree.ElementTree as ET
+
+import mmcv
+
+from .builder import DATASETS
+from .xml_style import XMLDataset
+
+
+@DATASETS.register_module()
+class WIDERFaceDataset(XMLDataset):
+    """Reader for the WIDER Face dataset in PASCAL VOC format.
+
+    Conversion scripts can be found in
+    https://github.com/sovrasov/wider-face-pascal-voc-annotations
+    """
+    CLASSES = ('face', )
+
+    PALETTE = [(0, 255, 0)]
+
+    def __init__(self, **kwargs):
+        super(WIDERFaceDataset, self).__init__(**kwargs)
+
+    def load_annotations(self, ann_file):
+        """Load annotation from WIDERFace XML style annotation file.
+
+        Args:
+            ann_file (str): Path of XML file.
+
+        Returns:
+            list[dict]: Annotation info from XML file.
+        """
+
+        data_infos = []
+        img_ids = mmcv.list_from_file(ann_file)
+        for img_id in img_ids:
+            filename = f'{img_id}.jpg'
+            xml_path = osp.join(self.img_prefix, 'Annotations',
+                                f'{img_id}.xml')
+            tree = ET.parse(xml_path)
+            root = tree.getroot()
+            size = root.find('size')
+            width = int(size.find('width').text)
+            height = int(size.find('height').text)
+            folder = root.find('folder').text
+            data_infos.append(
+                dict(
+                    id=img_id,
+                    filename=osp.join(folder, filename),
+                    width=width,
+                    height=height))
+
+        return data_infos
diff --git a/mmdet/datasets/xml_style.py b/mmdet/datasets/xml_style.py
new file mode 100755
index 0000000..c66e875
--- /dev/null
+++ b/mmdet/datasets/xml_style.py
@@ -0,0 +1,186 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import xml.etree.ElementTree as ET
+
+import mmcv
+import numpy as np
+from PIL import Image
+
+from .builder import DATASETS
+from .custom import CustomDataset
+
+
+@DATASETS.register_module()
+class XMLDataset(CustomDataset):
+    """XML dataset for detection.
+
+    Args:
+        min_size (int | float, optional): The minimum size of bounding
+            boxes in the images. If the width or height of a bounding box
+            is less than ``min_size``, it is added to the ignored field.
+        img_subdir (str): Subdir where images are stored. Default: JPEGImages.
+        ann_subdir (str): Subdir where annotations are. Default: Annotations.
+    """
+
+    def __init__(self,
+                 min_size=None,
+                 img_subdir='JPEGImages',
+                 ann_subdir='Annotations',
+                 **kwargs):
+        assert self.CLASSES or kwargs.get(
+            'classes', None), 'CLASSES in `XMLDataset` can not be None.'
+        # Set before super().__init__, which presumably loads annotations.
+        self.img_subdir, self.ann_subdir = img_subdir, ann_subdir
+        super().__init__(**kwargs)
+        self.cat2label = {cat: i for i, cat in enumerate(self.CLASSES)}
+        self.min_size = min_size
+
+    def load_annotations(self, ann_file):
+        """Load image infos for the ids listed in ``ann_file``.
+
+        Args:
+            ann_file (str): Text file listing one image id per line.
+
+        Returns:
+            list[dict]: One dict per image with keys ``id``, ``filename``,
+                ``width`` and ``height``.
+        """
+
+        data_infos = []
+        for img_id in mmcv.list_from_file(ann_file):
+            filename = osp.join(self.img_subdir, f'{img_id}.jpg')
+            xml_path = osp.join(self.img_prefix, self.ann_subdir,
+                                f'{img_id}.xml')
+            size = ET.parse(xml_path).getroot().find('size')
+            if size is not None:
+                width = int(size.find('width').text)
+                height = int(size.find('height').text)
+            else:
+                # No <size> in the XML: read the size from the image file
+                # and close it right away instead of leaking the handle.
+                img_path = osp.join(self.img_prefix, filename)
+                with Image.open(img_path) as img:
+                    width, height = img.size
+            data_infos.append(
+                dict(id=img_id, filename=filename, width=width, height=height))
+
+        return data_infos
+
+    def _filter_imgs(self, min_size=32):
+        """Return indices of images that pass the size and gt filters.
+
+        An image is dropped if its shorter side is below ``min_size`` or, with
+        ``filter_empty_gt`` set, if none of its objects is in ``CLASSES``."""
+        valid_inds = []
+        for i, img_info in enumerate(self.data_infos):
+            if min(img_info['width'], img_info['height']) < min_size:
+                continue
+            if self.filter_empty_gt:
+                xml_path = osp.join(self.img_prefix, self.ann_subdir,
+                                    f"{img_info['id']}.xml")
+                objects = ET.parse(xml_path).getroot().findall('object')
+                keep = any(
+                    obj.find('name').text in self.CLASSES for obj in objects)
+            else:
+                keep = True
+            if keep:
+                valid_inds.append(i)
+        return valid_inds
+
+    def get_ann_info(self, idx):
+        """Get annotation from XML file by index.
+
+        Objects whose name is not in ``CLASSES`` are skipped. Objects
+        flagged ``difficult`` or smaller than ``min_size`` (training only,
+        asserted) go to the ``*_ignore`` fields instead of the regular ones.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            dict: Annotation info of specified index with keys
+                ``bboxes`` (float32, shape (n, 4)), ``labels`` (int64,
+                shape (n, )), ``bboxes_ignore`` and ``labels_ignore``.
+        """
+
+        img_id = self.data_infos[idx]['id']
+        xml_path = osp.join(self.img_prefix, self.ann_subdir, f'{img_id}.xml')
+        tree = ET.parse(xml_path)
+        root = tree.getroot()
+        bboxes = []
+        labels = []
+        bboxes_ignore = []
+        labels_ignore = []
+        for obj in root.findall('object'):
+            name = obj.find('name').text
+            if name not in self.CLASSES:
+                continue
+            label = self.cat2label[name]
+            difficult = obj.find('difficult')
+            difficult = 0 if difficult is None else int(difficult.text)
+            bnd_box = obj.find('bndbox')
+            # TODO: check whether it is necessary to use int
+            # Coordinates may be float type
+            bbox = [
+                int(float(bnd_box.find('xmin').text)),
+                int(float(bnd_box.find('ymin').text)),
+                int(float(bnd_box.find('xmax').text)),
+                int(float(bnd_box.find('ymax').text))
+            ]
+            ignore = False
+            if self.min_size:
+                assert not self.test_mode
+                w = bbox[2] - bbox[0]
+                h = bbox[3] - bbox[1]
+                if w < self.min_size or h < self.min_size:
+                    ignore = True
+            if difficult or ignore:
+                bboxes_ignore.append(bbox)
+                labels_ignore.append(label)
+            else:
+                bboxes.append(bbox)
+                labels.append(label)
+        # Convert to arrays; ``- 1`` shifts every coordinate by one pixel
+        # (presumably because VOC-style coordinates are 1-based).
+        if not bboxes:
+            bboxes = np.zeros((0, 4))
+            labels = np.zeros((0, ))
+        else:
+            bboxes = np.array(bboxes, ndmin=2) - 1
+            labels = np.array(labels)
+        if not bboxes_ignore:
+            bboxes_ignore = np.zeros((0, 4))
+            labels_ignore = np.zeros((0, ))
+        else:
+            bboxes_ignore = np.array(bboxes_ignore, ndmin=2) - 1
+            labels_ignore = np.array(labels_ignore)
+        ann = dict(
+            bboxes=bboxes.astype(np.float32),
+            labels=labels.astype(np.int64),
+            bboxes_ignore=bboxes_ignore.astype(np.float32),
+            labels_ignore=labels_ignore.astype(np.int64))
+        return ann
+
+    def get_cat_ids(self, idx):
+        """Get category ids in XML file by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            list[int]: Label of every object in the image whose name is in
+                ``CLASSES``, in document order (duplicates are kept).
+        """
+
+        cat_ids = []
+        img_id = self.data_infos[idx]['id']
+        xml_path = osp.join(self.img_prefix, self.ann_subdir, f'{img_id}.xml')
+        tree = ET.parse(xml_path)
+        root = tree.getroot()
+        for obj in root.findall('object'):
+            name = obj.find('name').text
+            # Names outside ``CLASSES`` have no label and are skipped.
+            if name in self.CLASSES:
+                cat_ids.append(self.cat2label[name])
+
+        return cat_ids
diff --git a/mmdet/models/__init__.py b/mmdet/models/__init__.py
new file mode 100755
index 0000000..12efb01
--- /dev/null
+++ b/mmdet/models/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .backbones import *  # noqa: F401,F403
+from .builder import (BACKBONES, DETECTORS, HEADS, LOSSES, NECKS,
+                      ROI_EXTRACTORS, SHARED_HEADS, build_backbone,
+                      build_detector, build_head, build_loss, build_neck,
+                      build_roi_extractor, build_shared_head)
+from .dense_heads import *  # noqa: F401,F403
+from .detectors import *  # noqa: F401,F403
+from .losses import *  # noqa: F401,F403
+from .necks import *  # noqa: F401,F403
+from .plugins import *  # noqa: F401,F403
+from .roi_heads import *  # noqa: F401,F403
+from .seg_heads import *  # noqa: F401,F403
+
+__all__ = [
+    'BACKBONES', 'NECKS', 'ROI_EXTRACTORS', 'SHARED_HEADS', 'HEADS', 'LOSSES',
+    'DETECTORS', 'build_backbone', 'build_neck', 'build_roi_extractor',
+    'build_shared_head', 'build_head', 'build_loss', 'build_detector'
+]
diff --git a/mmdet/models/backbones/__init__.py b/mmdet/models/backbones/__init__.py
new file mode 100755
index 0000000..f87c5c0
--- /dev/null
+++ b/mmdet/models/backbones/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .csp_darknet import CSPDarknet
+from .darknet import Darknet
+from .detectors_resnet import DetectoRS_ResNet
+from .detectors_resnext import DetectoRS_ResNeXt
+from .efficientnet import EfficientNet
+from .hourglass import HourglassNet
+from .hrnet import HRNet
+from .mobilenet_v2 import MobileNetV2
+from .pvt import PyramidVisionTransformer, PyramidVisionTransformerV2
+from .regnet import RegNet
+from .res2net import Res2Net
+from .resnest import ResNeSt
+from .resnet import ResNet, ResNetV1d
+from .resnext import ResNeXt
+from .ssd_vgg import SSDVGG
+from .swin import SwinTransformer
+from .trident_resnet import TridentResNet
+from .resnetclip import ModifiedResNet
+
+__all__ = [
+    'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet',
+    'MobileNetV2', 'Res2Net', 'HourglassNet', 'DetectoRS_ResNet',
+    'DetectoRS_ResNeXt', 'Darknet', 'ResNeSt', 'TridentResNet', 'CSPDarknet',
+    'SwinTransformer', 'PyramidVisionTransformer',
+    'PyramidVisionTransformerV2', 'EfficientNet', 'ModifiedResNet'
+]
diff --git a/mmdet/models/backbones/csp_darknet.py b/mmdet/models/backbones/csp_darknet.py
new file mode 100755
index 0000000..2bbf396
--- /dev/null
+++ b/mmdet/models/backbones/csp_darknet.py
@@ -0,0 +1,284 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmcv.runner import BaseModule
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from ..builder import BACKBONES
+from ..utils import CSPLayer
+
+
+class Focus(nn.Module):
+    """Focus width and height information into channel space.
+
+    Every 2x2 pixel neighbourhood is moved into the channel dimension
+    (4x channels, half the height and width) before one ConvModule.
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        kernel_size (int): The kernel size of the convolution. Default: 1
+        stride (int): The stride of the convolution. Default: 1
+        conv_cfg (dict): Config dict for convolution layer. Default: None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN', momentum=0.03, eps=0.001).
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='Swish').
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=1,
+                 stride=1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+                 act_cfg=dict(type='Swish')):
+        super().__init__()
+        # Padding keeps the spatial size for odd kernels at stride 1.
+        self.conv = ConvModule(
+            in_channels * 4,
+            out_channels,
+            kernel_size,
+            stride,
+            padding=(kernel_size - 1) // 2,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+    def forward(self, x):
+        # x: (b, c, h, w) -> (b, 4c, h/2, w/2), assuming NCHW input.
+        even, odd = slice(None, None, 2), slice(1, None, 2)
+        # Order matters (top-left, bottom-left, top-right, bottom-right):
+        # it fixes the input channel layout seen by ``self.conv``.
+        patches = [
+            x[..., even, even],
+            x[..., odd, even],
+            x[..., even, odd],
+            x[..., odd, odd],
+        ]
+        return self.conv(torch.cat(patches, dim=1))
+
+
+class SPPBottleneck(BaseModule):
+    """Spatial pyramid pooling layer used in YOLOv3-SPP.
+
+    The input is reduced to half its channels by a 1x1 conv, max-pooled
+    with every kernel size (stride 1, so the spatial size is kept for odd
+    kernels), and the concatenation of the reduced map and all pooled maps
+    is fused by a second 1x1 conv.
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        kernel_sizes (tuple[int]): Sequential of kernel sizes of pooling
+            layers. Default: (5, 9, 13).
+        conv_cfg (dict): Config dict for convolution layer. Default: None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN', momentum=0.03, eps=0.001).
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='Swish').
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_sizes=(5, 9, 13),
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+                 act_cfg=dict(type='Swish'),
+                 init_cfg=None):
+        super().__init__(init_cfg)
+        # Both 1x1 convs share the same conv/norm/activation configuration.
+        shared_cfg = dict(
+            conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+        mid_channels = in_channels // 2
+        self.conv1 = ConvModule(
+            in_channels, mid_channels, 1, stride=1, **shared_cfg)
+        # stride 1 with padding ks // 2 keeps the spatial size for odd ks.
+        self.poolings = nn.ModuleList([
+            nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
+            for ks in kernel_sizes
+        ])
+        # conv2 sees the un-pooled map plus one pooled map per kernel size.
+        conv2_channels = mid_channels * (len(kernel_sizes) + 1)
+        self.conv2 = ConvModule(
+            conv2_channels, out_channels, 1, **shared_cfg)
+
+    def forward(self, x):
+        """Apply conv1, concat the result with its pooled maps, apply conv2."""
+        x = self.conv1(x)
+        # Keep the un-pooled map first, then pooled maps in kernel order.
+        pooled = [pooling(x) for pooling in self.poolings]
+        return self.conv2(torch.cat([x] + pooled, dim=1))
+
+
+@BACKBONES.register_module()
+class CSPDarknet(BaseModule):
+    """CSP-Darknet backbone used in YOLOv5 and YOLOX.
+
+    Args:
+        arch (str): Architecture of CSP-Darknet, from {P5, P6}.
+            Default: P5.
+        deepen_factor (float): Depth multiplier, multiply number of
+            blocks in CSP layer by this amount. Default: 1.0.
+        widen_factor (float): Width multiplier, multiply number of
+            channels in each layer by this amount. Default: 1.0.
+        out_indices (Sequence[int]): Output from which stages.
+            Default: (2, 3, 4).
+        frozen_stages (int): Stages to be frozen (stop grad and set eval
+            mode). -1 means not freezing any parameters. Default: -1.
+        use_depthwise (bool): Whether to use depthwise separable convolution.
+            Default: False.
+        arch_ovewrite (list): Overwrite default arch settings. Default: None.
+        spp_kernal_sizes (tuple[int]): Sequential of kernel sizes of SPP
+            layers. Default: (5, 9, 13).
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: dict(type='BN', momentum=0.03, eps=0.001).
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='Swish').
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Default: False.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: Kaiming-uniform init of ``Conv2d`` layers.
+    Example:
+        >>> from mmdet.models import CSPDarknet
+        >>> import torch
+        >>> self = CSPDarknet()
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 416, 416)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        ...
+        (1, 256, 52, 52)
+        (1, 512, 26, 26)
+        (1, 1024, 13, 13)
+    """
+    # From left to right:
+    # in_channels, out_channels, num_blocks, add_identity, use_spp
+    arch_settings = {
+        'P5': [[64, 128, 3, True, False], [128, 256, 9, True, False],
+               [256, 512, 9, True, False], [512, 1024, 3, False, True]],
+        'P6': [[64, 128, 3, True, False], [128, 256, 9, True, False],
+               [256, 512, 9, True, False], [512, 768, 3, True, False],
+               [768, 1024, 3, False, True]]
+    }
+
+    def __init__(self,
+                 arch='P5',
+                 deepen_factor=1.0,
+                 widen_factor=1.0,
+                 out_indices=(2, 3, 4),
+                 frozen_stages=-1,
+                 use_depthwise=False,
+                 arch_ovewrite=None,
+                 spp_kernal_sizes=(5, 9, 13),
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+                 act_cfg=dict(type='Swish'),
+                 norm_eval=False,
+                 init_cfg=dict(
+                     type='Kaiming',
+                     layer='Conv2d',
+                     a=math.sqrt(5),
+                     distribution='uniform',
+                     mode='fan_in',
+                     nonlinearity='leaky_relu')):
+        super().__init__(init_cfg)
+        default_setting = self.arch_settings[arch]
+        arch_setting = arch_ovewrite if arch_ovewrite else default_setting
+        # Index 0 is the stem, so valid indices run up to len(arch_setting).
+        num_stages = len(arch_setting)
+        assert set(out_indices).issubset(range(num_stages + 1))
+        if frozen_stages not in range(-1, num_stages + 1):
+            raise ValueError('frozen_stages must be in range(-1, '
+                             'len(arch_setting) + 1). But received '
+                             f'{frozen_stages}')
+
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.use_depthwise = use_depthwise
+        self.norm_eval = norm_eval
+        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
+        # Layer options shared by the stem and every stage.
+        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+
+        self.stem = Focus(
+            3,
+            int(arch_setting[0][0] * widen_factor),
+            kernel_size=3,
+            **cfg)
+        self.layers = ['stem']
+
+        for i, (in_channels, out_channels, num_blocks, add_identity,
+                use_spp) in enumerate(arch_setting):
+            in_channels = int(in_channels * widen_factor)
+            out_channels = int(out_channels * widen_factor)
+            num_blocks = max(round(num_blocks * deepen_factor), 1)
+            # Each stage: stride-2 conv, optional SPP, then a CSP layer.
+            stage = [
+                conv(
+                    in_channels,
+                    out_channels,
+                    3,
+                    stride=2,
+                    padding=1,
+                    **cfg)
+            ]
+            if use_spp:
+                stage.append(
+                    SPPBottleneck(
+                        out_channels,
+                        out_channels,
+                        kernel_sizes=spp_kernal_sizes,
+                        **cfg))
+            stage.append(
+                CSPLayer(
+                    out_channels,
+                    out_channels,
+                    num_blocks=num_blocks,
+                    add_identity=add_identity,
+                    use_depthwise=use_depthwise,
+                    **cfg))
+            layer_name = f'stage{i + 1}'
+            self.add_module(layer_name, nn.Sequential(*stage))
+            self.layers.append(layer_name)
+
+    def _freeze_stages(self):
+        """Freeze the stem and the first ``frozen_stages`` stages, if any."""
+        if self.frozen_stages < 0:
+            return
+        for idx in range(self.frozen_stages + 1):
+            frozen = getattr(self, self.layers[idx])
+            frozen.eval()
+            for weight in frozen.parameters():
+                weight.requires_grad = False
+
+    def train(self, mode=True):
+        """Set mode; frozen stages (and BN if ``norm_eval``) stay in eval."""
+        super().train(mode)
+        self._freeze_stages()
+        if not (mode and self.norm_eval):
+            return
+        for layer in self.modules():
+            if isinstance(layer, _BatchNorm):
+                layer.eval()
+
+    def forward(self, x):
+        """Run stem and stages in order; return maps at ``out_indices``."""
+        outs = []
+        for idx, name in enumerate(self.layers):
+            x = getattr(self, name)(x)
+            if idx in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
diff --git a/mmdet/models/backbones/darknet.py b/mmdet/models/backbones/darknet.py
new file mode 100755
index 0000000..adfb115
--- /dev/null
+++ b/mmdet/models/backbones/darknet.py
@@ -0,0 +1,213 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Copyright (c) 2019 Western Digital Corporation or its affiliates.
+
+import warnings
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from ..builder import BACKBONES
+
+
+class ResBlock(BaseModule):
+    """The basic residual block used in Darknet.
+
+    Each ResBlock consists of two ConvModules whose output is added to the
+    block input. With the default configs each ConvModule is Conv + BN +
+    LeakyReLU. The first one is a 1x1 conv with half as many filters as the
+    second one, which is a 3x3 conv.
+
+    Args:
+        in_channels (int): The input channels. Must be even.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: dict(type='BN', requires_grad=True)
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='LeakyReLU', negative_slope=0.1).
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
+                 init_cfg=None):
+        super().__init__(init_cfg)
+        assert in_channels % 2 == 0  # ensure the in_channels is even
+        squeezed = in_channels // 2
+
+        # Both ConvModules share the same conv/norm/activation settings.
+        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+
+        # 1x1 conv halves the channels, 3x3 conv restores them.
+        self.conv1 = ConvModule(in_channels, squeezed, 1, **cfg)
+        self.conv2 = ConvModule(squeezed, in_channels, 3, padding=1, **cfg)
+
+    def forward(self, x):
+        """Apply both convs and add the input back (residual connection).
+
+        Returns ``conv2(conv1(x)) + x``.
+        """
+        branch = self.conv2(self.conv1(x))
+        return branch + x
+
+
+@BACKBONES.register_module()
+class Darknet(BaseModule):
+    """Darknet backbone.
+
+    Args:
+        depth (int): Depth of Darknet. Currently only 53 is supported.
+        out_indices (Sequence[int]): Indices of the layers whose output is
+            returned; 0 is ``conv1``. Default: (3, 4, 5).
+        frozen_stages (int): Number of leading layers, ``conv1`` included, to
+            freeze (stop grad and set eval mode). Values <= 0 freeze
+            nothing. Default: -1.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: dict(type='BN', requires_grad=True)
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='LeakyReLU', negative_slope=0.1).
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Default: True.
+        pretrained (str, optional): Deprecated checkpoint path; use
+            ``init_cfg`` instead. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+
+    Example:
+        >>> from mmdet.models import Darknet
+        >>> import torch
+        >>> self = Darknet(depth=53)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 416, 416)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        ...
+        (1, 256, 52, 52)
+        (1, 512, 26, 26)
+        (1, 1024, 13, 13)
+    """
+
+    # Dict(depth: (layers, channels))
+    arch_settings = {
+        53: ((1, 2, 8, 8, 4), ((32, 64), (64, 128), (128, 256), (256, 512),
+                               (512, 1024)))
+    }
+
+    def __init__(self,
+                 depth=53,
+                 out_indices=(3, 4, 5),
+                 frozen_stages=-1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
+                 norm_eval=True,
+                 pretrained=None,
+                 init_cfg=None):
+        super().__init__(init_cfg)
+        if depth not in self.arch_settings:
+            raise KeyError(f'invalid depth {depth} for darknet')
+
+        self.depth = depth
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.norm_eval = norm_eval
+        self.layers, self.channels = self.arch_settings[depth]
+
+        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+        self.conv1 = ConvModule(3, 32, 3, padding=1, **cfg)
+        # Layer names in forward order; index 0 is the stem conv.
+        self.cr_blocks = ['conv1']
+        for i, n_layers in enumerate(self.layers):
+            in_c, out_c = self.channels[i]
+            block = self.make_conv_res_block(in_c, out_c, n_layers, **cfg)
+            layer_name = f'conv_res_block{i + 1}'
+            self.add_module(layer_name, block)
+            self.cr_blocks.append(layer_name)
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if pretrained is not None and not isinstance(pretrained, str):
+            raise TypeError('pretrained must be a str or None')
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif init_cfg is None:
+            # Default init when neither init_cfg nor pretrained is given.
+            self.init_cfg = [
+                dict(type='Kaiming', layer='Conv2d'),
+                dict(
+                    type='Constant',
+                    val=1,
+                    layer=['_BatchNorm', 'GroupNorm'])
+            ]
+
+    def forward(self, x):
+        """Run all layers in order; return the outputs at ``out_indices``."""
+        outs = []
+        for idx, name in enumerate(self.cr_blocks):
+            x = getattr(self, name)(x)
+            if idx in self.out_indices:
+                outs.append(x)
+
+        return tuple(outs)
+
+    def _freeze_stages(self):
+        """Freeze the first ``frozen_stages`` entries of ``cr_blocks``."""
+        for idx in range(max(self.frozen_stages, 0)):
+            frozen = getattr(self, self.cr_blocks[idx])
+            frozen.eval()
+            for weight in frozen.parameters():
+                weight.requires_grad = False
+
+    def train(self, mode=True):
+        """Set mode; frozen layers (and BN if ``norm_eval``) stay in eval."""
+        super().train(mode)
+        self._freeze_stages()
+        if not (mode and self.norm_eval):
+            return
+        for layer in self.modules():
+            if isinstance(layer, _BatchNorm):
+                layer.eval()
+
+    @staticmethod
+    def make_conv_res_block(in_channels,
+                            out_channels,
+                            res_repeat,
+                            conv_cfg=None,
+                            norm_cfg=dict(type='BN', requires_grad=True),
+                            act_cfg=dict(type='LeakyReLU',
+                                         negative_slope=0.1)):
+        """Build a stride-2 3x3 conv followed by ``res_repeat`` ResBlocks.
+
+        Args:
+            in_channels (int): The number of input channels.
+            out_channels (int): The number of output channels.
+            res_repeat (int): The number of ResBlocks.
+            conv_cfg (dict): Config dict for convolution layer. Default: None.
+            norm_cfg (dict): Dictionary to construct and config norm layer.
+                Default: dict(type='BN', requires_grad=True)
+            act_cfg (dict): Config dict for activation layer.
+                Default: dict(type='LeakyReLU', negative_slope=0.1).
+
+        Returns:
+            nn.Sequential: Modules named ``conv``, ``res0``, ``res1``, ...
+        """
+        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+        model = nn.Sequential()
+        # Stride-2 conv first, then ``res_repeat`` residual blocks.
+        downsample = ConvModule(
+            in_channels, out_channels, 3, stride=2, padding=1, **cfg)
+        model.add_module('conv', downsample)
+        for idx in range(res_repeat):
+            model.add_module('res{}'.format(idx),
+                             ResBlock(out_channels, **cfg))
+        return model
diff --git a/mmdet/models/backbones/detectors_resnet.py b/mmdet/models/backbones/detectors_resnet.py
new file mode 100755
index 0000000..a3c0d40
--- /dev/null
+++ b/mmdet/models/backbones/detectors_resnet.py
@@ -0,0 +1,353 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init,
+                      kaiming_init)
+from mmcv.runner import Sequential, load_checkpoint
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet.utils import get_root_logger
+from ..builder import BACKBONES
+from .resnet import BasicBlock
+from .resnet import Bottleneck as _Bottleneck
+from .resnet import ResNet
+
+
+class Bottleneck(_Bottleneck):
+    r"""Bottleneck for the ResNet backbone in `DetectoRS
+    <https://arxiv.org/pdf/2006.02334.pdf>`_.
+
+    Optionally rebuilds ``conv2`` from a SAC (Switchable Atrous Convolution)
+    config and adds a 1x1 conv for RFP (Recursive Feature Pyramid) features.
+
+    Args:
+        inplanes (int): The number of input channels.
+        planes (int): The number of output channels before expansion.
+        rfp_inplanes (int, optional): The number of channels from RFP.
+            Default: None. If truthy, a 1x1 ``rfp_conv`` (with bias) is added
+            for ``rfp_feat``; without ``init_cfg`` it gets a ``Constant``
+            (val=0) init config. Otherwise same structure as the base class.
+        sac (dict, optional): Config used to rebuild ``conv2``. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None. Extra ``**kwargs`` go to the base class.
+    """
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 rfp_inplanes=None,
+                 sac=None,
+                 init_cfg=None,
+                 **kwargs):
+        super(Bottleneck, self).__init__(
+            inplanes, planes, init_cfg=init_cfg, **kwargs)
+
+        assert sac is None or isinstance(sac, dict)
+        self.sac = sac
+        self.with_sac = sac is not None
+        if self.with_sac:  # swap conv2 for a layer built from the SAC config
+            self.conv2 = build_conv_layer(
+                self.sac,
+                planes,
+                planes,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=self.dilation,
+                dilation=self.dilation,
+                bias=False)
+
+        self.rfp_inplanes = rfp_inplanes
+        if self.rfp_inplanes:
+            self.rfp_conv = build_conv_layer(
+                None,
+                self.rfp_inplanes,
+                planes * self.expansion,
+                1,
+                stride=1,
+                bias=True)
+            if init_cfg is None:
+                self.init_cfg = dict(
+                    type='Constant', val=0, override=dict(name='rfp_conv'))
+
+    def rfp_forward(self, x, rfp_feat):
+        """The forward function that also takes the RFP features as input."""
+
+        def _inner_forward(x):
+            identity = x
+
+            out = self.conv1(x)
+            out = self.norm1(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv1_plugin_names)
+
+            out = self.conv2(out)
+            out = self.norm2(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv2_plugin_names)
+
+            out = self.conv3(out)
+            out = self.norm3(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv3_plugin_names)
+
+            if self.downsample is not None:
+                identity = self.downsample(x)
+
+            out += identity
+
+            return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        if self.rfp_inplanes:  # add the projected RFP feature before ReLU
+            rfp_feat = self.rfp_conv(rfp_feat)
+            out = out + rfp_feat
+
+        out = self.relu(out)
+
+        return out
+
+
+class ResLayer(Sequential):
+    """ResLayer to build ResNet style backbone for RFP in DetectoRS.
+
+    The difference between this module and base class is that we pass
+    ``rfp_inplanes`` to the first block.
+
+    Args:
+        block (nn.Module): block used to build ResLayer.
+        inplanes (int): inplanes of block.
+        planes (int): planes of block.
+        num_blocks (int): number of blocks.
+        stride (int): stride of the first block. Default: 1
+        avg_down (bool): Use AvgPool instead of stride conv when
+            downsampling in the bottleneck. Default: False
+        conv_cfg (dict): dictionary to construct and config conv layer.
+            Default: None
+        norm_cfg (dict): dictionary to construct and config norm layer.
+            Default: dict(type='BN')
+        downsample_first (bool): Must be True; an assertion rejects False
+            because it is not supported in DetectoRS. Default: True
+        rfp_inplanes (int, optional): The number of channels from RFP.
+            Default: None. It is passed to the first block only; the
+            remaining blocks are built without it. Extra ``**kwargs`` are
+            forwarded to every block.
+    """
+
+    def __init__(self,
+                 block,
+                 inplanes,
+                 planes,
+                 num_blocks,
+                 stride=1,
+                 avg_down=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 downsample_first=True,
+                 rfp_inplanes=None,
+                 **kwargs):
+        self.block = block
+        assert downsample_first, f'downsample_first={downsample_first} is ' \
+                                 'not supported in DetectoRS'
+
+        downsample = None
+        if stride != 1 or inplanes != planes * block.expansion:
+            downsample = []
+            conv_stride = stride
+            if avg_down and stride != 1:  # pool instead of a strided conv
+                conv_stride = 1
+                downsample.append(
+                    nn.AvgPool2d(
+                        kernel_size=stride,
+                        stride=stride,
+                        ceil_mode=True,
+                        count_include_pad=False))
+            downsample.extend([
+                build_conv_layer(
+                    conv_cfg,
+                    inplanes,
+                    planes * block.expansion,
+                    kernel_size=1,
+                    stride=conv_stride,
+                    bias=False),
+                build_norm_layer(norm_cfg, planes * block.expansion)[1]
+            ])
+            downsample = nn.Sequential(*downsample)
+
+        layers = []
+        layers.append(
+            block(
+                inplanes=inplanes,
+                planes=planes,
+                stride=stride,
+                downsample=downsample,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                rfp_inplanes=rfp_inplanes,
+                **kwargs))
+        inplanes = planes * block.expansion
+        for _ in range(1, num_blocks):
+            layers.append(
+                block(
+                    inplanes=inplanes,
+                    planes=planes,
+                    stride=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    **kwargs))
+
+        super(ResLayer, self).__init__(*layers)
+
+
+@BACKBONES.register_module()
+class DetectoRS_ResNet(ResNet):
+    """ResNet backbone for DetectoRS.
+
+    Args:
+        sac (dict, optional): Dictionary to construct SAC (Switchable Atrous
+            Convolution). Default: None.
+        stage_with_sac (tuple[bool]): Whether each stage uses ``sac``.
+            Default: (False, False, False, False).
+        rfp_inplanes (int, optional): The number of channels from RFP, passed
+            to every stage except the first. Default: None.
+        output_img (bool): If ``True``, the input image will be inserted into
+            the starting position of output. Default: False.
+        pretrained (str, optional): Checkpoint to load. Default: None.
+        init_cfg (dict, optional): ``type='Pretrained'`` dict. Default: None.
+    """
+
+    arch_settings = {
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3))
+    }
+
+    def __init__(self,
+                 sac=None,
+                 stage_with_sac=(False, False, False, False),
+                 rfp_inplanes=None,
+                 output_img=False,
+                 pretrained=None,
+                 init_cfg=None,
+                 **kwargs):
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        self.pretrained = pretrained
+        if init_cfg is not None:
+            assert isinstance(init_cfg, dict), \
+                f'init_cfg must be a dict, but got {type(init_cfg)}'
+            if 'type' in init_cfg:
+                assert init_cfg.get('type') == 'Pretrained', \
+                    'Only can initialize module by loading a pretrained model'
+            else:
+                raise KeyError('`init_cfg` must contain the key "type"')
+            self.pretrained = init_cfg.get('checkpoint')
+        self.sac = sac
+        self.stage_with_sac = stage_with_sac
+        self.rfp_inplanes = rfp_inplanes
+        self.output_img = output_img
+        super(DetectoRS_ResNet, self).__init__(**kwargs)
+
+        self.inplanes = self.stem_channels  # build stages with sac / RFP
+        self.res_layers = []
+        for i, num_blocks in enumerate(self.stage_blocks):
+            stride = self.strides[i]
+            dilation = self.dilations[i]
+            dcn = self.dcn if self.stage_with_dcn[i] else None
+            sac = self.sac if self.stage_with_sac[i] else None
+            if self.plugins is not None:
+                stage_plugins = self.make_stage_plugins(self.plugins, i)
+            else:
+                stage_plugins = None
+            planes = self.base_channels * 2**i
+            res_layer = self.make_res_layer(
+                block=self.block,
+                inplanes=self.inplanes,
+                planes=planes,
+                num_blocks=num_blocks,
+                stride=stride,
+                dilation=dilation,
+                style=self.style,
+                avg_down=self.avg_down,
+                with_cp=self.with_cp,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg,
+                dcn=dcn,
+                sac=sac,
+                rfp_inplanes=rfp_inplanes if i > 0 else None,
+                plugins=stage_plugins)
+            self.inplanes = planes * self.block.expansion
+            layer_name = f'layer{i + 1}'
+            self.add_module(layer_name, res_layer)
+            self.res_layers.append(layer_name)
+
+        self._freeze_stages()
+
+    # In order to be properly initialized by RFP
+    def init_weights(self):
+        # The parent ``init_weights`` is deliberately not called here because
+        # calling it causes a parameter initialization exception.
+
+        if isinstance(self.pretrained, str):
+            logger = get_root_logger()
+            load_checkpoint(self, self.pretrained, strict=False, logger=logger)
+        elif self.pretrained is None:
+            for m in self.modules():
+                if isinstance(m, nn.Conv2d):
+                    kaiming_init(m)
+                elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
+                    constant_init(m, 1)
+
+            if self.dcn is not None:
+                for m in self.modules():
+                    if isinstance(m, Bottleneck) and hasattr(
+                            m.conv2, 'conv_offset'):
+                        constant_init(m.conv2.conv_offset, 0)
+
+            if self.zero_init_residual:
+                for m in self.modules():
+                    if isinstance(m, Bottleneck):
+                        constant_init(m.norm3, 0)
+                    elif isinstance(m, BasicBlock):
+                        constant_init(m.norm2, 0)
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+    def make_res_layer(self, **kwargs):
+        """Pack all blocks in a stage into a ``ResLayer`` for DetectoRS."""
+        return ResLayer(**kwargs)
+
+    def forward(self, x):
+        """Forward function."""
+        outs = list(super(DetectoRS_ResNet, self).forward(x))
+        if self.output_img:
+            outs.insert(0, x)
+        return tuple(outs)
+
+    def rfp_forward(self, x, rfp_feats):
+        """Forward function for RFP."""
+        if self.deep_stem:
+            x = self.stem(x)
+        else:
+            x = self.conv1(x)
+            x = self.norm1(x)
+            x = self.relu(x)
+        x = self.maxpool(x)
+        outs = []
+        for i, layer_name in enumerate(self.res_layers):
+            res_layer = getattr(self, layer_name)
+            rfp_feat = rfp_feats[i] if i > 0 else None  # no RFP at stage 0
+            for layer in res_layer:
+                x = layer.rfp_forward(x, rfp_feat)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
diff --git a/mmdet/models/backbones/detectors_resnext.py b/mmdet/models/backbones/detectors_resnext.py
new file mode 100755
index 0000000..5e8b20a
--- /dev/null
+++ b/mmdet/models/backbones/detectors_resnext.py
@@ -0,0 +1,123 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+from mmcv.cnn import build_conv_layer, build_norm_layer
+
+from ..builder import BACKBONES
+from .detectors_resnet import Bottleneck as _Bottleneck
+from .detectors_resnet import DetectoRS_ResNet
+
+
+class Bottleneck(_Bottleneck):
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 groups=1,
+                 base_width=4,
+                 base_channels=64,
+                 **kwargs):
+        """Bottleneck block for ResNeXt.
+
+        Rebuilds convs/norms for the grouped ``width``; ``conv2`` uses ``sac``
+        if set, else ``dcn`` unless ``fallback_on_stride``, else ``conv_cfg``.
+        """
+        super(Bottleneck, self).__init__(inplanes, planes, **kwargs)
+
+        if groups == 1:
+            width = self.planes
+        else:
+            width = math.floor(self.planes *
+                               (base_width / base_channels)) * groups
+
+        self.norm1_name, norm1 = build_norm_layer(
+            self.norm_cfg, width, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(
+            self.norm_cfg, width, postfix=2)
+        self.norm3_name, norm3 = build_norm_layer(
+            self.norm_cfg, self.planes * self.expansion, postfix=3)
+
+        self.conv1 = build_conv_layer(
+            self.conv_cfg,
+            self.inplanes,
+            width,
+            kernel_size=1,
+            stride=self.conv1_stride,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+        fallback_on_stride = False
+        self.with_modulated_dcn = False
+        if self.with_dcn:
+            fallback_on_stride = self.dcn.pop('fallback_on_stride', False)
+        if self.with_sac:  # SAC takes precedence over DCN
+            self.conv2 = build_conv_layer(
+                self.sac,
+                width,
+                width,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=self.dilation,
+                dilation=self.dilation,
+                groups=groups,
+                bias=False)
+        elif not self.with_dcn or fallback_on_stride:
+            self.conv2 = build_conv_layer(
+                self.conv_cfg,
+                width,
+                width,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=self.dilation,
+                dilation=self.dilation,
+                groups=groups,
+                bias=False)
+        else:
+            assert self.conv_cfg is None, 'conv_cfg must be None for DCN'
+            self.conv2 = build_conv_layer(
+                self.dcn,
+                width,
+                width,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=self.dilation,
+                dilation=self.dilation,
+                groups=groups,
+                bias=False)
+
+        self.add_module(self.norm2_name, norm2)
+        self.conv3 = build_conv_layer(
+            self.conv_cfg,
+            width,
+            self.planes * self.expansion,
+            kernel_size=1,
+            bias=False)
+        self.add_module(self.norm3_name, norm3)
+
+
+@BACKBONES.register_module()
+class DetectoRS_ResNeXt(DetectoRS_ResNet):
+    """DetectoRS backbone built from grouped (ResNeXt) bottlenecks.
+
+    Args:
+        groups (int): Group count handed to every bottleneck. Default: 1.
+        base_width (int): Base width handed to every bottleneck. Default: 4.
+    """
+
+    arch_settings = {
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3)),
+    }
+
+    def __init__(self, groups=1, base_width=4, **kwargs):
+        # Stored first: the parent constructor already calls make_res_layer.
+        self.groups, self.base_width = groups, base_width
+        super().__init__(**kwargs)
+
+    def make_res_layer(self, **kwargs):
+        """Build a DetectoRS ``ResLayer`` with the ResNeXt block options."""
+        resnext_cfg = dict(
+            groups=self.groups, base_width=self.base_width,
+            base_channels=self.base_channels)
+        return super().make_res_layer(**resnext_cfg, **kwargs)
diff --git a/mmdet/models/backbones/efficientnet.py b/mmdet/models/backbones/efficientnet.py
new file mode 100755
index 0000000..7ee3595
--- /dev/null
+++ b/mmdet/models/backbones/efficientnet.py
@@ -0,0 +1,417 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn.bricks import ConvModule, DropPath
+from mmcv.runner import BaseModule, Sequential
+
+from ..builder import BACKBONES
+from ..utils import InvertedResidual, SELayer, make_divisible
+
+
+class EdgeResidual(BaseModule):
+    """Edge Residual Block.
+
+    Args:
+        in_channels (int): The input channels of this module.
+        out_channels (int): The output channels of this module.
+        mid_channels (int): The input channels of the second convolution.
+        kernel_size (int): Kernel size of the first convolution. Defaults to 3.
+        stride (int): The stride of the second convolution. Defaults to 1.
+        se_cfg (dict, optional): SELayer kwargs. Defaults to None (no SE).
+        with_residual (bool): Use residual connection; it is only applied
+            when stride is 1 and in/out channels match. Defaults to True.
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Defaults to None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Defaults to ``dict(type='BN')``.
+        act_cfg (dict): Config dict for activation layer.
+            Defaults to ``dict(type='ReLU')``.
+        drop_path_rate (float): stochastic depth rate. Defaults to 0.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Defaults to False.
+        init_cfg (dict | list[dict], optional): Initialization config dict.
+        **kwargs: Accepted for call compatibility and ignored.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 mid_channels,
+                 kernel_size=3,
+                 stride=1,
+                 se_cfg=None,
+                 with_residual=True,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU'),
+                 drop_path_rate=0.,
+                 with_cp=False,
+                 init_cfg=None,
+                 **kwargs):
+        super(EdgeResidual, self).__init__(init_cfg=init_cfg)
+        assert stride in [1, 2]
+        self.with_cp = with_cp
+        self.drop_path = DropPath(
+            drop_path_rate) if drop_path_rate > 0 else nn.Identity()
+        self.with_se = se_cfg is not None
+        self.with_residual = (
+            stride == 1 and in_channels == out_channels and with_residual)
+
+        if self.with_se:
+            assert isinstance(se_cfg, dict)
+
+        self.conv1 = ConvModule(
+            in_channels=in_channels,
+            out_channels=mid_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=kernel_size // 2,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+        if self.with_se:
+            self.se = SELayer(**se_cfg)
+
+        self.conv2 = ConvModule(
+            in_channels=mid_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=stride,
+            padding=0,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=None)
+
+    def forward(self, x):
+        """Forward function, optionally run under checkpointing."""
+        def _inner_forward(x):
+            out = x
+            out = self.conv1(out)
+
+            if self.with_se:
+                out = self.se(out)
+
+            out = self.conv2(out)
+
+            if self.with_residual:
+                return x + self.drop_path(out)
+            else:
+                return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        return out
+
+
+def model_scaling(layer_setting, arch_setting):
+    """Return a width- and depth-scaled copy of ``layer_setting``, using
+    ``arch_setting[0]`` (width) and ``arch_setting[1]`` (depth) factors."""
+    # scale width: out_channel * arch_setting[0], through make_divisible(., 8)
+    new_layer_setting = copy.deepcopy(layer_setting)
+    for layer_cfg in new_layer_setting:
+        for block_cfg in layer_cfg:
+            block_cfg[1] = make_divisible(block_cfg[1] * arch_setting[0], 8)
+
+    # scale depth: split every middle layer into runs of equal out_channel
+    split_layer_setting = [new_layer_setting[0]]
+    for layer_cfg in new_layer_setting[1:-1]:
+        tmp_index = [0]
+        for i in range(len(layer_cfg) - 1):
+            if layer_cfg[i + 1][1] != layer_cfg[i][1]:
+                tmp_index.append(i + 1)
+        tmp_index.append(len(layer_cfg))
+        for i in range(len(tmp_index) - 1):
+            split_layer_setting.append(layer_cfg[tmp_index[i]:tmp_index[i +
+                                                                        1]])
+    split_layer_setting.append(new_layer_setting[-1])
+
+    num_of_layers = [len(layer_cfg) for layer_cfg in split_layer_setting[1:-1]]
+    new_layers = [
+        int(math.ceil(arch_setting[1] * num)) for num in num_of_layers
+    ]
+
+    merge_layer_setting = [split_layer_setting[0]]
+    for i, layer_cfg in enumerate(split_layer_setting[1:-1]):
+        if new_layers[i] <= num_of_layers[i]:  # shrink: keep the first blocks
+            tmp_layer_cfg = layer_cfg[:new_layers[i]]
+        else:  # grow: repeat the last block cfg
+            tmp_layer_cfg = copy.deepcopy(layer_cfg) + [layer_cfg[-1]] * (
+                new_layers[i] - num_of_layers[i])
+        if tmp_layer_cfg[0][3] == 1 and i != 0:  # stride 1: extend last layer
+            merge_layer_setting[-1] += tmp_layer_cfg.copy()
+        else:
+            merge_layer_setting.append(tmp_layer_cfg.copy())
+    merge_layer_setting.append(split_layer_setting[-1])
+
+    return merge_layer_setting
+
+
+@BACKBONES.register_module()
+class EfficientNet(BaseModule):
+    """EfficientNet backbone.
+
+    Args:
+        arch (str): Architecture of efficientnet. Defaults to b0.
+        drop_path_rate (float): Max stochastic depth rate. Defaults to 0.
+        out_indices (Sequence[int]): Output stage indices. Defaults to (6, ).
+        frozen_stages (int): Number of leading stages to freeze.
+            Defaults to 0, which means not freezing any parameters.
+        conv_cfg (dict): Config dict for convolution layer.
+            Defaults to dict(type='Conv2dAdaptivePadding').
+        norm_cfg (dict): Config dict for normalization layer.
+            Defaults to dict(type='BN', eps=1e-3).
+        act_cfg (dict): Config dict for activation layer.
+            Defaults to dict(type='Swish').
+        norm_eval (bool): Whether to keep ``_BatchNorm`` layers (BN and its
+            variants) in eval mode while training. Defaults to False.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Defaults to False.
+        init_cfg (dict | list[dict]): Initialization config dict.
+    """
+
+    # Parameters to build layers.
+    # 'b' represents the architecture of normal EfficientNet family includes
+    # 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8'.
+    # 'e' represents the architecture of EfficientNet-EdgeTPU including 'es',
+    # 'em', 'el'.
+    # 6 parameters are needed to construct a layer, From left to right:
+    # - kernel_size: The kernel size of the block
+    # - out_channel: The number of out_channels of the block
+    # - se_ratio: The squeeze ratio of SELayer.
+    # - stride: The stride of the block
+    # - expand_ratio: The expand_ratio of the mid_channels
+    # - block_type: -1: Not a block, 0: InvertedResidual, 1: EdgeResidual
+    layer_settings = {
+        'b': [[[3, 32, 0, 2, 0, -1]],
+              [[3, 16, 4, 1, 1, 0]],
+              [[3, 24, 4, 2, 6, 0],
+               [3, 24, 4, 1, 6, 0]],
+              [[5, 40, 4, 2, 6, 0],
+               [5, 40, 4, 1, 6, 0]],
+              [[3, 80, 4, 2, 6, 0],
+               [3, 80, 4, 1, 6, 0],
+               [3, 80, 4, 1, 6, 0],
+               [5, 112, 4, 1, 6, 0],
+               [5, 112, 4, 1, 6, 0],
+               [5, 112, 4, 1, 6, 0]],
+              [[5, 192, 4, 2, 6, 0],
+               [5, 192, 4, 1, 6, 0],
+               [5, 192, 4, 1, 6, 0],
+               [5, 192, 4, 1, 6, 0],
+               [3, 320, 4, 1, 6, 0]],
+              [[1, 1280, 0, 1, 0, -1]]
+              ],
+        'e': [[[3, 32, 0, 2, 0, -1]],
+              [[3, 24, 0, 1, 3, 1]],
+              [[3, 32, 0, 2, 8, 1],
+               [3, 32, 0, 1, 8, 1]],
+              [[3, 48, 0, 2, 8, 1],
+               [3, 48, 0, 1, 8, 1],
+               [3, 48, 0, 1, 8, 1],
+               [3, 48, 0, 1, 8, 1]],
+              [[5, 96, 0, 2, 8, 0],
+               [5, 96, 0, 1, 8, 0],
+               [5, 96, 0, 1, 8, 0],
+               [5, 96, 0, 1, 8, 0],
+               [5, 96, 0, 1, 8, 0],
+               [5, 144, 0, 1, 8, 0],
+               [5, 144, 0, 1, 8, 0],
+               [5, 144, 0, 1, 8, 0],
+               [5, 144, 0, 1, 8, 0]],
+              [[5, 192, 0, 2, 8, 0],
+               [5, 192, 0, 1, 8, 0]],
+              [[1, 1280, 0, 1, 0, -1]]
+              ]
+    }  # yapf: disable
+
+    # Parameters to build different kinds of architecture. From left to right:
+    # scaling factor for width, scaling factor for depth, resolution.
+    arch_settings = {
+        'b0': (1.0, 1.0, 224),
+        'b1': (1.0, 1.1, 240),
+        'b2': (1.1, 1.2, 260),
+        'b3': (1.2, 1.4, 300),
+        'b4': (1.4, 1.8, 380),
+        'b5': (1.6, 2.2, 456),
+        'b6': (1.8, 2.6, 528),
+        'b7': (2.0, 3.1, 600),
+        'b8': (2.2, 3.6, 672),
+        'es': (1.0, 1.0, 224),
+        'em': (1.0, 1.1, 240),
+        'el': (1.2, 1.4, 300)
+    }
+
+    def __init__(self,
+                 arch='b0',
+                 drop_path_rate=0.,
+                 out_indices=(6, ),
+                 frozen_stages=0,
+                 conv_cfg=dict(type='Conv2dAdaptivePadding'),
+                 norm_cfg=dict(type='BN', eps=1e-3),
+                 act_cfg=dict(type='Swish'),
+                 norm_eval=False,
+                 with_cp=False,
+                 init_cfg=[
+                     dict(type='Kaiming', layer='Conv2d'),
+                     dict(
+                         type='Constant',
+                         layer=['_BatchNorm', 'GroupNorm'],
+                         val=1)
+                 ]):
+        super(EfficientNet, self).__init__(init_cfg)
+        assert arch in self.arch_settings, \
+            f'"{arch}" is not one of the arch_settings ' \
+            f'({", ".join(self.arch_settings.keys())})'
+        self.arch_setting = self.arch_settings[arch]
+        self.layer_setting = self.layer_settings[arch[:1]]
+        for index in out_indices:
+            if index not in range(0, len(self.layer_setting)):
+                raise ValueError('the item in out_indices must in '
+                                 f'range(0, {len(self.layer_setting)}). '
+                                 f'But received {index}')
+
+        if frozen_stages not in range(len(self.layer_setting) + 1):
+            raise ValueError('frozen_stages must be in range(0, '
+                             f'{len(self.layer_setting) + 1}). '
+                             f'But received {frozen_stages}')
+        self.drop_path_rate = drop_path_rate
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.norm_eval = norm_eval
+        self.with_cp = with_cp
+
+        self.layer_setting = model_scaling(self.layer_setting,
+                                           self.arch_setting)
+        block_cfg_0 = self.layer_setting[0][0]
+        block_cfg_last = self.layer_setting[-1][0]
+        self.in_channels = make_divisible(block_cfg_0[1], 8)
+        self.out_channels = block_cfg_last[1]
+        self.layers = nn.ModuleList()
+        self.layers.append(
+            ConvModule(
+                in_channels=3,
+                out_channels=self.in_channels,
+                kernel_size=block_cfg_0[0],
+                stride=block_cfg_0[3],
+                padding=block_cfg_0[0] // 2,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg,
+                act_cfg=self.act_cfg))
+        self.make_layer()
+        # Avoid building unused layers in mmdetection.
+        if len(self.layers) < max(self.out_indices) + 1:
+            self.layers.append(
+                ConvModule(
+                    in_channels=self.in_channels,
+                    out_channels=self.out_channels,
+                    kernel_size=block_cfg_last[0],
+                    stride=block_cfg_last[3],
+                    padding=block_cfg_last[0] // 2,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg))
+
+    def make_layer(self):
+        """Build the stages between the stem conv and the final conv."""
+        layer_setting = self.layer_setting[1:-1]
+
+        total_num_blocks = sum(len(x) for x in layer_setting)
+        block_idx = 0
+        dpr = [
+            x.item()
+            for x in torch.linspace(0, self.drop_path_rate, total_num_blocks)
+        ]  # stochastic depth decay rule
+
+        for i, layer_cfg in enumerate(layer_setting):
+            # Avoid building unused layers in mmdetection.
+            if i > max(self.out_indices) - 1:
+                break
+            layer = []
+            for j, block_cfg in enumerate(layer_cfg):
+                (kernel_size, out_channels, se_ratio, stride, expand_ratio,
+                 block_type) = block_cfg
+
+                mid_channels = int(self.in_channels * expand_ratio)
+                out_channels = make_divisible(out_channels, 8)
+                if se_ratio <= 0:
+                    se_cfg = None
+                else:
+                    # In mmdetection, the `divisor` is deleted to align
+                    # the logic of SELayer with mmcls.
+                    se_cfg = dict(
+                        channels=mid_channels,
+                        ratio=expand_ratio * se_ratio,
+                        act_cfg=(self.act_cfg, dict(type='Sigmoid')))
+                if block_type == 1:  # edge tpu
+                    if j > 0 and expand_ratio == 3:
+                        with_residual = False
+                        expand_ratio = 4
+                    else:
+                        with_residual = True
+                    mid_channels = int(self.in_channels * expand_ratio)
+                    if se_cfg is not None:
+                        # Rebuilt with the updated mid_channels and ratio.
+                        se_cfg = dict(
+                            channels=mid_channels,
+                            ratio=se_ratio * expand_ratio,
+                            act_cfg=(self.act_cfg, dict(type='Sigmoid')))
+                    block = partial(EdgeResidual, with_residual=with_residual)
+                else:
+                    block = InvertedResidual
+                layer.append(
+                    block(
+                        in_channels=self.in_channels,
+                        out_channels=out_channels,
+                        mid_channels=mid_channels,
+                        kernel_size=kernel_size,
+                        stride=stride,
+                        se_cfg=se_cfg,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg,
+                        drop_path_rate=dpr[block_idx],
+                        with_cp=self.with_cp,
+                        # In mmdetection, `with_expand_conv` is set to align
+                        # the logic of InvertedResidual with mmcls.
+                        with_expand_conv=(mid_channels != self.in_channels)))
+                self.in_channels = out_channels
+                block_idx += 1
+            self.layers.append(Sequential(*layer))
+
+    def forward(self, x):
+        """Run all layers and return the outputs listed in ``out_indices``."""
+        outs = []
+        for i, layer in enumerate(self.layers):
+            x = layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
+
+    def _freeze_stages(self):
+        """Freeze the first ``frozen_stages`` built layers (eval, no grad)."""
+        for m in self.layers[:self.frozen_stages]:
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def train(self, mode=True):
+        """Set the training mode, keeping frozen stages and norms in eval."""
+        super(EfficientNet, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                if isinstance(m, nn.modules.batchnorm._BatchNorm):
+                    m.eval()
+        return self
diff --git a/mmdet/models/backbones/hourglass.py b/mmdet/models/backbones/hourglass.py
new file mode 100755
index 0000000..f0dfb43
--- /dev/null
+++ b/mmdet/models/backbones/hourglass.py
@@ -0,0 +1,222 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule
+
+from ..builder import BACKBONES
+from ..utils import ResLayer
+from .resnet import BasicBlock
+
+
+class HourglassModule(BaseModule):
+    """Hourglass Module for HourglassNet backbone.
+
+    Generate module recursively and use BasicBlock as the base unit. Nested
+    modules are built with the same ``norm_cfg`` and ``upsample_cfg``.
+
+    Args:
+        depth (int): Depth of current HourglassModule.
+        stage_channels (list[int]): Feature channels of sub-modules in current
+            and follow-up HourglassModule.
+        stage_blocks (list[int]): Number of sub-modules stacked in current and
+            follow-up HourglassModule.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+        upsample_cfg (dict, optional): Config dict for interpolate layer.
+            Default: `dict(mode='nearest')`
+    """
+
+    def __init__(self,
+                 depth,
+                 stage_channels,
+                 stage_blocks,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 init_cfg=None,
+                 upsample_cfg=dict(mode='nearest')):
+        super(HourglassModule, self).__init__(init_cfg)
+
+        self.depth = depth
+        cur_block, next_block = stage_blocks[0], stage_blocks[1]
+        cur_channel, next_channel = stage_channels[0], stage_channels[1]
+
+        # Skip branch: keeps the current channel width.
+        self.up1 = ResLayer(
+            BasicBlock, cur_channel, cur_channel, cur_block, norm_cfg=norm_cfg)
+
+        # Down path: stride-2 layer switching to the next channel width.
+        self.low1 = ResLayer(
+            BasicBlock,
+            cur_channel,
+            next_channel,
+            cur_block,
+            stride=2,
+            norm_cfg=norm_cfg)
+
+        if self.depth > 1:
+            # Hand the configs down; otherwise inner levels silently fall
+            # back to the default norm and upsample settings.
+            self.low2 = HourglassModule(
+                depth - 1, stage_channels[1:], stage_blocks[1:],
+                norm_cfg=norm_cfg, upsample_cfg=upsample_cfg)
+        else:
+            self.low2 = ResLayer(
+                BasicBlock,
+                next_channel,
+                next_channel,
+                next_block,
+                norm_cfg=norm_cfg)
+
+        self.low3 = ResLayer(
+            BasicBlock,
+            next_channel,
+            cur_channel,
+            cur_block,
+            norm_cfg=norm_cfg,
+            downsample_first=False)
+
+        self.up2 = F.interpolate
+        self.upsample_cfg = upsample_cfg
+
+    def forward(self, x):
+        """Return the skip branch plus the upsampled low-resolution branch."""
+        up1 = self.up1(x)
+        low3 = self.low3(self.low2(self.low1(x)))
+        # A fixed `scale_factor` (e.g. 2) is common for upsampling, but it
+        # can miss the size of `up1`; without it, resize to that size.
+        if 'scale_factor' in self.upsample_cfg:
+            up2 = self.up2(low3, **self.upsample_cfg)
+        else:
+            shape = up1.shape[2:]
+            up2 = self.up2(low3, size=shape, **self.upsample_cfg)
+        return up1 + up2
+
+
+@BACKBONES.register_module()
+class HourglassNet(BaseModule):
+    """HourglassNet backbone.
+
+    Stacked Hourglass Networks for Human Pose Estimation.
+    More details can be found in the `paper
+    <https://arxiv.org/abs/1603.06937>`_ .
+
+    Args:
+        downsample_times (int): Downsample times in a HourglassModule.
+        num_stacks (int): Number of HourglassModule modules stacked,
+            1 for Hourglass-52, 2 for Hourglass-104.
+        stage_channels (list[int]): Feature channel of each sub-module in a
+            HourglassModule.
+        stage_blocks (list[int]): Number of sub-modules stacked in a
+            HourglassModule.
+        feat_channel (int): Feature channel of conv after a HourglassModule.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        pretrained (str, optional): Unused by this class. Default: None
+        init_cfg (dict or list[dict], optional): Must be None, any other value
+            fails an assertion. Default: None
+
+    Example:
+        >>> from mmdet.models import HourglassNet
+        >>> import torch
+        >>> self = HourglassNet()
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 511, 511)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_output in level_outputs:
+        ...     print(tuple(level_output.shape))
+        (1, 256, 128, 128)
+        (1, 256, 128, 128)
+    """
+
+    def __init__(self,
+                 downsample_times=5,
+                 num_stacks=2,
+                 stage_channels=(256, 256, 384, 384, 384, 512),
+                 stage_blocks=(2, 2, 2, 2, 2, 4),
+                 feat_channel=256,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 pretrained=None,
+                 init_cfg=None):
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super(HourglassNet, self).__init__(init_cfg)
+
+        self.num_stacks = num_stacks
+        assert self.num_stacks >= 1
+        assert len(stage_channels) == len(stage_blocks)
+        assert len(stage_channels) > downsample_times
+
+        cur_channel = stage_channels[0]
+
+        self.stem = nn.Sequential(
+            ConvModule(
+                3, cur_channel // 2, 7, padding=3, stride=2,
+                norm_cfg=norm_cfg),
+            ResLayer(
+                BasicBlock,
+                cur_channel // 2,
+                cur_channel,
+                1,
+                stride=2,
+                norm_cfg=norm_cfg))
+
+        # The stacks use the same norm layer config as the rest of the net.
+        self.hourglass_modules = nn.ModuleList([
+            HourglassModule(
+                downsample_times, stage_channels, stage_blocks,
+                norm_cfg=norm_cfg) for _ in range(num_stacks)
+        ])
+
+        self.inters = ResLayer(
+            BasicBlock,
+            cur_channel,
+            cur_channel,
+            num_stacks - 1,
+            norm_cfg=norm_cfg)
+
+        self.conv1x1s = nn.ModuleList([
+            ConvModule(
+                cur_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None)
+            for _ in range(num_stacks - 1)
+        ])
+
+        self.out_convs = nn.ModuleList([
+            ConvModule(
+                cur_channel, feat_channel, 3, padding=1, norm_cfg=norm_cfg)
+            for _ in range(num_stacks)
+        ])
+
+        self.remap_convs = nn.ModuleList([
+            ConvModule(
+                feat_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None)
+            for _ in range(num_stacks - 1)
+        ])
+
+        self.relu = nn.ReLU(inplace=True)
+
+    def init_weights(self):
+        """Init module weights, then reset every Conv2d to its defaults."""
+        # Training Centripetal Model needs to reset parameters for Conv2d
+        super(HourglassNet, self).init_weights()
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                m.reset_parameters()
+
+    def forward(self, x):
+        """Return a list with one feature map per stack, in stack order."""
+        inter_feat = self.stem(x)
+        out_feats = []
+        for ind in range(self.num_stacks):
+            hourglass_feat = self.hourglass_modules[ind](inter_feat)
+            out_feat = self.out_convs[ind](hourglass_feat)
+            out_feats.append(out_feat)
+
+            if ind < self.num_stacks - 1:
+                # Next stack input: projected current input plus the remapped
+                # output of this stack, passed through ReLU and `inters[ind]`.
+                inter_feat = (
+                    self.conv1x1s[ind](inter_feat) +
+                    self.remap_convs[ind](out_feat))
+                inter_feat = self.inters[ind](self.relu(inter_feat))
+
+        return out_feats
diff --git a/mmdet/models/backbones/hrnet.py b/mmdet/models/backbones/hrnet.py
new file mode 100755
index 0000000..06c210a
--- /dev/null
+++ b/mmdet/models/backbones/hrnet.py
@@ -0,0 +1,589 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch.nn as nn
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmcv.runner import BaseModule, ModuleList, Sequential
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from ..builder import BACKBONES
+from .resnet import BasicBlock, Bottleneck
+
+
+class HRModule(BaseModule):
+    """High-Resolution Module for HRNet.
+
+    Every branch stacks ``num_blocks[i]`` BasicBlocks/Bottlenecks; the
+    fusion/exchange between the branches also happens in this module.
+    """
+
+    def __init__(self,
+                 num_branches,
+                 blocks,
+                 num_blocks,
+                 in_channels,
+                 num_channels,
+                 multiscale_output=True,
+                 with_cp=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 block_init_cfg=None,
+                 init_cfg=None):
+        super(HRModule, self).__init__(init_cfg)
+        self._check_branches(num_branches, num_blocks, in_channels,
+                             num_channels)
+        self.num_branches = num_branches
+        self.in_channels = in_channels
+        self.multiscale_output = multiscale_output
+        self.block_init_cfg = block_init_cfg
+        self.with_cp = with_cp
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        # Branches go first: building them rewrites ``self.in_channels``,
+        # which the fuse layers then read.
+        self.branches = self._make_branches(num_branches, blocks, num_blocks,
+                                            num_channels)
+        self.fuse_layers = self._make_fuse_layers()
+        self.relu = nn.ReLU(inplace=False)
+
+    def _check_branches(self, num_branches, num_blocks, in_channels,
+                        num_channels):
+        """Validate that every per-branch setting has one entry per branch.
+
+        Raises:
+            ValueError: If the length of ``num_blocks``, ``num_channels`` or
+                ``in_channels`` differs from ``num_branches``.
+        """
+        # Checked in this order, so the first mismatch is the one reported.
+        per_branch = (('NUM_BLOCKS', num_blocks),
+                      ('NUM_CHANNELS', num_channels),
+                      ('NUM_INCHANNELS', in_channels))
+        for label, values in per_branch:
+            if num_branches != len(values):
+                raise ValueError(f'NUM_BRANCHES({num_branches}) '
+                                 f'!= {label}({len(values)})')
+
+    def _make_one_branch(self,
+                         branch_index,
+                         block,
+                         num_blocks,
+                         num_channels,
+                         stride=1):
+        """Stack ``num_blocks[branch_index]`` blocks for a single branch.
+
+        The first block takes ``stride`` and, when the stride or the channel
+        count changes, a 1x1 conv + norm shortcut. Afterwards
+        ``self.in_channels[branch_index]`` is overwritten with the branch
+        output width.
+        """
+        in_planes = self.in_channels[branch_index]
+        planes = num_channels[branch_index]
+        out_planes = planes * block.expansion
+        # Keyword arguments common to every block of the branch.
+        shared_kwargs = dict(
+            with_cp=self.with_cp,
+            norm_cfg=self.norm_cfg,
+            conv_cfg=self.conv_cfg,
+            init_cfg=self.block_init_cfg)
+
+        downsample = None
+        if stride != 1 or in_planes != out_planes:
+            downsample = nn.Sequential(
+                build_conv_layer(
+                    self.conv_cfg,
+                    in_planes,
+                    out_planes,
+                    kernel_size=1,
+                    stride=stride,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, out_planes)[1])
+
+        layers = [
+            block(in_planes, planes, stride, downsample=downsample,
+                  **shared_kwargs)
+        ]
+        # Side effect: writes into the list that was passed to __init__.
+        self.in_channels[branch_index] = out_planes
+        for _ in range(1, num_blocks[branch_index]):
+            layers.append(block(out_planes, planes, **shared_kwargs))
+
+        return Sequential(*layers)
+
+    def _make_branches(self, num_branches, block, num_blocks, num_channels):
+        """Build one block stack per branch, wrapped in a ModuleList."""
+        # Branches are created in index order.
+        branches = [
+            self._make_one_branch(idx, block, num_blocks, num_channels)
+            for idx in range(num_branches)
+        ]
+        return ModuleList(branches)
+
+    def _make_fuse_layers(self):
+        """Build the modules that exchange features between branches.
+
+        Returns:
+            nn.ModuleList | None: ``None`` for a single branch. Otherwise
+            entry ``[i][j]`` maps the output of branch ``j`` to branch ``i``:
+            1x1 conv + norm + nearest upsampling when ``j > i``, ``None``
+            when ``j == i`` and a chain of ``i - j`` stride-2 3x3 convs when
+            ``j < i``.
+        """
+        if self.num_branches == 1:
+            return None
+
+        widths = self.in_channels
+        # Only the first output is fused unless multi-scale output is on.
+        num_outputs = self.num_branches if self.multiscale_output else 1
+        fuse_layers = []
+        for dst in range(num_outputs):
+            row = []
+            for src in range(self.num_branches):
+                if src == dst:
+                    row.append(None)
+                    continue
+                if src > dst:
+                    row.append(
+                        nn.Sequential(
+                            build_conv_layer(
+                                self.conv_cfg,
+                                widths[src],
+                                widths[dst],
+                                kernel_size=1,
+                                stride=1,
+                                padding=0,
+                                bias=False),
+                            build_norm_layer(self.norm_cfg, widths[dst])[1],
+                            nn.Upsample(
+                                scale_factor=2**(src - dst), mode='nearest')))
+                    continue
+                # src < dst: only the last step changes width and has no ReLU.
+                steps = []
+                for k in range(dst - src):
+                    is_last = k == dst - src - 1
+                    out_width = widths[dst] if is_last else widths[src]
+                    parts = [
+                        build_conv_layer(
+                            self.conv_cfg,
+                            widths[src],
+                            out_width,
+                            kernel_size=3,
+                            stride=2,
+                            padding=1,
+                            bias=False),
+                        build_norm_layer(self.norm_cfg, out_width)[1]
+                    ]
+                    if not is_last:
+                        parts.append(nn.ReLU(inplace=False))
+                    steps.append(nn.Sequential(*parts))
+                row.append(nn.Sequential(*steps))
+            fuse_layers.append(nn.ModuleList(row))
+
+        return nn.ModuleList(fuse_layers)
+
+    def forward(self, x):
+        """Run each branch on its input, then fuse across branches.
+
+        ``x`` holds one input per branch; with several branches its entries
+        are overwritten in place with the branch outputs.
+        """
+        if self.num_branches == 1:
+            return [self.branches[0](x[0])]
+
+        for idx, branch in enumerate(self.branches):
+            x[idx] = branch(x[idx])
+        x_fuse = []
+        for i, fuse_row in enumerate(self.fuse_layers):
+            y = 0  # start from 0 so the in-place adds never touch ``x``
+            for j, fuse in enumerate(fuse_row):
+                y += x[j] if i == j else fuse(x[j])
+            x_fuse.append(self.relu(y))
+        return x_fuse
+
+
+@BACKBONES.register_module()
+class HRNet(BaseModule):
+    """HRNet backbone.
+
+    `High-Resolution Representations for Labeling Pixels and Regions
+    arXiv: <https://arxiv.org/abs/1904.04514>`_.
+
+    Args:
+        extra (dict): Detailed configuration for each stage of HRNet.
+            There must be 4 stages, the configuration for each stage must have
+            5 keys:
+
+                - num_modules(int): The number of HRModule in this stage.
+                - num_branches(int): The number of branches in the HRModule.
+                - block(str): The type of convolution block.
+                - num_blocks(tuple): The number of blocks in each branch.
+                    The length must be equal to num_branches.
+                - num_channels(tuple): The number of channels in each branch.
+                    The length must be equal to num_branches.
+        in_channels (int): Number of input image channels. Default: 3.
+        conv_cfg (dict): Dictionary to construct and config conv layer.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Default: True.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+        zero_init_residual (bool): Whether to use zero init for last norm layer
+            in resblocks to let them behave as identity. Only applied when
+            neither `pretrained` nor `init_cfg` is given. Default: False.
+        multiscale_output (bool): Whether to output multi-level features
+            produced by multiple branches. If False, only the first level
+            feature will be output. Default: True.
+        pretrained (str, optional): Model pretrained path. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+
+    Example:
+        >>> from mmdet.models import HRNet
+        >>> import torch
+        >>> extra = dict(
+        ...     stage1=dict(
+        ...         num_modules=1,
+        ...         num_branches=1,
+        ...         block='BOTTLENECK',
+        ...         num_blocks=(4, ),
+        ...         num_channels=(64, )),
+        ...     stage2=dict(
+        ...         num_modules=1,
+        ...         num_branches=2,
+        ...         block='BASIC',
+        ...         num_blocks=(4, 4),
+        ...         num_channels=(32, 64)),
+        ...     stage3=dict(
+        ...         num_modules=4,
+        ...         num_branches=3,
+        ...         block='BASIC',
+        ...         num_blocks=(4, 4, 4),
+        ...         num_channels=(32, 64, 128)),
+        ...     stage4=dict(
+        ...         num_modules=3,
+        ...         num_branches=4,
+        ...         block='BASIC',
+        ...         num_blocks=(4, 4, 4, 4),
+        ...         num_channels=(32, 64, 128, 256)))
+        >>> self = HRNet(extra, in_channels=1)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 1, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 32, 8, 8)
+        (1, 64, 4, 4)
+        (1, 128, 2, 2)
+        (1, 256, 1, 1)
+    """
+
+    blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck}
+
+    def __init__(self,
+                 extra,
+                 in_channels=3,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 norm_eval=True,
+                 with_cp=False,
+                 zero_init_residual=False,
+                 multiscale_output=True,
+                 pretrained=None,
+                 init_cfg=None):
+        super(HRNet, self).__init__(init_cfg)
+
+        self.pretrained = pretrained
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+        else:
+            raise TypeError('pretrained must be a str or None')
+        # Zero-init of the blocks' last norm is only used together with the
+        # default initialization, i.e. without `pretrained` and `init_cfg`.
+        self._zero_init_blocks = (
+            zero_init_residual and pretrained is None and init_cfg is None)
+
+        # Assert configurations of 4 stages are in extra
+        assert 'stage1' in extra and 'stage2' in extra \
+               and 'stage3' in extra and 'stage4' in extra
+        # Assert whether the length of `num_blocks` and `num_channels` are
+        # equal to `num_branches`
+        for i in range(4):
+            cfg = extra[f'stage{i + 1}']
+            assert len(cfg['num_blocks']) == cfg['num_branches'] and \
+                   len(cfg['num_channels']) == cfg['num_branches']
+
+        self.extra = extra
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.norm_eval = norm_eval
+        self.with_cp = with_cp
+        self.zero_init_residual = zero_init_residual
+
+        # stem net
+        self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2)
+
+        self.conv1 = build_conv_layer(
+            self.conv_cfg,
+            in_channels,
+            64,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            bias=False)
+
+        self.add_module(self.norm1_name, norm1)
+        self.conv2 = build_conv_layer(
+            self.conv_cfg,
+            64,
+            64,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            bias=False)
+
+        self.add_module(self.norm2_name, norm2)
+        self.relu = nn.ReLU(inplace=True)
+
+        # stage 1
+        self.stage1_cfg = self.extra['stage1']
+        num_channels = self.stage1_cfg['num_channels'][0]
+        block_type = self.stage1_cfg['block']
+        num_blocks = self.stage1_cfg['num_blocks'][0]
+
+        block = self.blocks_dict[block_type]
+        stage1_out_channels = num_channels * block.expansion
+        self.layer1 = self._make_layer(block, 64, num_channels, num_blocks)
+
+        # stage 2
+        self.stage2_cfg = self.extra['stage2']
+        num_channels = self.stage2_cfg['num_channels']
+        block_type = self.stage2_cfg['block']
+
+        block = self.blocks_dict[block_type]
+        num_channels = [channel * block.expansion for channel in num_channels]
+        self.transition1 = self._make_transition_layer([stage1_out_channels],
+                                                       num_channels)
+        self.stage2, pre_stage_channels = self._make_stage(
+            self.stage2_cfg, num_channels)
+
+        # stage 3
+        self.stage3_cfg = self.extra['stage3']
+        num_channels = self.stage3_cfg['num_channels']
+        block_type = self.stage3_cfg['block']
+
+        block = self.blocks_dict[block_type]
+        num_channels = [channel * block.expansion for channel in num_channels]
+        self.transition2 = self._make_transition_layer(pre_stage_channels,
+                                                       num_channels)
+        self.stage3, pre_stage_channels = self._make_stage(
+            self.stage3_cfg, num_channels)
+
+        # stage 4
+        self.stage4_cfg = self.extra['stage4']
+        num_channels = self.stage4_cfg['num_channels']
+        block_type = self.stage4_cfg['block']
+
+        block = self.blocks_dict[block_type]
+        num_channels = [channel * block.expansion for channel in num_channels]
+        self.transition3 = self._make_transition_layer(pre_stage_channels,
+                                                       num_channels)
+        self.stage4, pre_stage_channels = self._make_stage(
+            self.stage4_cfg, num_channels, multiscale_output=multiscale_output)
+
+    @property
+    def norm1(self):
+        """nn.Module: the normalization layer named "norm1" """
+        return getattr(self, self.norm1_name)
+
+    @property
+    def norm2(self):
+        """nn.Module: the normalization layer named "norm2" """
+        return getattr(self, self.norm2_name)
+
+    def _block_init_cfg(self, block):
+        """Return the init config that zeroes the last norm layer of ``block``.
+
+        Returns ``None`` when zero-init is disabled or ``block`` is neither
+        BasicBlock nor Bottleneck.
+        """
+        if not self._zero_init_blocks:
+            return None
+        last_norm = {BasicBlock: 'norm2', Bottleneck: 'norm3'}.get(block)
+        if last_norm is None:
+            return None
+        return dict(type='Constant', val=0, override=dict(name=last_norm))
+
+    def _make_transition_layer(self, num_channels_pre_layer,
+                               num_channels_cur_layer):
+        """Build the per-branch adapters between two consecutive stages.
+
+        Existing branches get a 3x3 conv + norm + ReLU if their width changes
+        and ``None`` otherwise; each new branch is a chain of stride-2 3x3
+        convs whose input width is that of the last previous branch.
+        """
+        num_branches_cur = len(num_channels_cur_layer)
+        num_branches_pre = len(num_channels_pre_layer)
+        transition_layers = []
+        for i in range(num_branches_cur):
+            if i < num_branches_pre:
+                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
+                    transition_layers.append(
+                        nn.Sequential(
+                            build_conv_layer(
+                                self.conv_cfg,
+                                num_channels_pre_layer[i],
+                                num_channels_cur_layer[i],
+                                kernel_size=3,
+                                stride=1,
+                                padding=1,
+                                bias=False),
+                            build_norm_layer(self.norm_cfg,
+                                             num_channels_cur_layer[i])[1],
+                            nn.ReLU(inplace=True)))
+                else:
+                    transition_layers.append(None)
+            else:
+                conv_downsamples = []
+                for j in range(i + 1 - num_branches_pre):
+                    in_channels = num_channels_pre_layer[-1]
+                    out_channels = num_channels_cur_layer[i] \
+                        if j == i - num_branches_pre else in_channels
+                    conv_downsamples.append(
+                        nn.Sequential(
+                            build_conv_layer(
+                                self.conv_cfg,
+                                in_channels,
+                                out_channels,
+                                kernel_size=3,
+                                stride=2,
+                                padding=1,
+                                bias=False),
+                            build_norm_layer(self.norm_cfg, out_channels)[1],
+                            nn.ReLU(inplace=True)))
+                transition_layers.append(nn.Sequential(*conv_downsamples))
+
+        return nn.ModuleList(transition_layers)
+
+    def _make_layer(self, block, inplanes, planes, blocks, stride=1):
+        """Stack ``blocks`` blocks; only the first takes stride/downsample."""
+        downsample = None
+        if stride != 1 or inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                build_conv_layer(
+                    self.conv_cfg,
+                    inplanes,
+                    planes * block.expansion,
+                    kernel_size=1,
+                    stride=stride,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, planes * block.expansion)[1])
+
+        layers = []
+        block_init_cfg = self._block_init_cfg(block)
+        layers.append(
+            block(
+                inplanes,
+                planes,
+                stride,
+                downsample=downsample,
+                with_cp=self.with_cp,
+                norm_cfg=self.norm_cfg,
+                conv_cfg=self.conv_cfg,
+                init_cfg=block_init_cfg,
+            ))
+        inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(
+                block(
+                    inplanes,
+                    planes,
+                    with_cp=self.with_cp,
+                    norm_cfg=self.norm_cfg,
+                    conv_cfg=self.conv_cfg,
+                    init_cfg=block_init_cfg))
+
+        return Sequential(*layers)
+
+    def _make_stage(self, layer_config, in_channels, multiscale_output=True):
+        """Build the stage's HRModules; return them and ``in_channels``."""
+        num_modules = layer_config['num_modules']
+        num_branches = layer_config['num_branches']
+        num_blocks = layer_config['num_blocks']
+        num_channels = layer_config['num_channels']
+        block = self.blocks_dict[layer_config['block']]
+        block_init_cfg = self._block_init_cfg(block)
+
+        hr_modules = []
+        for i in range(num_modules):
+            # multiscale_output is only used for the last module
+            reset_multiscale_output = (
+                bool(multiscale_output) or i != num_modules - 1)
+            hr_modules.append(
+                HRModule(
+                    num_branches,
+                    block,
+                    num_blocks,
+                    in_channels,
+                    num_channels,
+                    reset_multiscale_output,
+                    with_cp=self.with_cp,
+                    norm_cfg=self.norm_cfg,
+                    conv_cfg=self.conv_cfg,
+                    block_init_cfg=block_init_cfg))
+
+        return Sequential(*hr_modules), in_channels
+
+    @staticmethod
+    def _apply_transition(transition, feats, num_branches):
+        """Build the inputs of the next stage from the outputs ``feats``.
+
+        An existing branch is adapted from its own output (or passed through
+        if it has no transition layer); a new branch is computed from the
+        last existing branch.
+        """
+        inputs = []
+        for i in range(num_branches):
+            src = feats[i] if i < len(feats) else feats[-1]
+            layer = transition[i]
+            inputs.append(src if layer is None else layer(src))
+        return inputs
+
+    def forward(self, x):
+        """Return the list of feature maps produced by the last stage."""
+        x = self.relu(self.norm1(self.conv1(x)))
+        x = self.relu(self.norm2(self.conv2(x)))
+        x = self.layer1(x)
+
+        x_list = self._apply_transition(self.transition1, [x],
+                                        self.stage2_cfg['num_branches'])
+        y_list = self.stage2(x_list)
+        x_list = self._apply_transition(self.transition2, y_list,
+                                        self.stage3_cfg['num_branches'])
+        y_list = self.stage3(x_list)
+        x_list = self._apply_transition(self.transition3, y_list,
+                                        self.stage4_cfg['num_branches'])
+        y_list = self.stage4(x_list)
+
+        return y_list
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keeping the
+        normalization layers frozen."""
+        super(HRNet, self).train(mode)
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval() only has an effect on BatchNorm here
+                if isinstance(m, _BatchNorm):
+                    m.eval()
diff --git a/mmdet/models/backbones/mobilenet_v2.py b/mmdet/models/backbones/mobilenet_v2.py
new file mode 100755
index 0000000..8c6fcfa
--- /dev/null
+++ b/mmdet/models/backbones/mobilenet_v2.py
@@ -0,0 +1,197 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from ..builder import BACKBONES
+from ..utils import InvertedResidual, make_divisible
+
+
+@BACKBONES.register_module()
+class MobileNetV2(BaseModule):
+    """MobileNetV2 backbone.
+
+    Args:
+        widen_factor (float): Width multiplier, multiply number of
+            channels in each layer by this amount. Default: 1.0.
+        out_indices (Sequence[int], optional): Indices (0-7) of the layers
+            whose outputs are returned. Default: (1, 2, 4, 7).
+        frozen_stages (int): Stages to be frozen (all param fixed).
+            Default: -1, which means not freezing any parameters.
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='ReLU6').
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Default: False.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+        pretrained (str, optional): Deprecated, use init_cfg. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    # Parameters to build layers. 4 parameters are needed to construct a
+    # layer, from left to right: expand_ratio, channel, num_blocks, stride.
+    arch_settings = [[1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2],
+                     [6, 64, 4, 2], [6, 96, 3, 1], [6, 160, 3, 2],
+                     [6, 320, 1, 1]]
+
+    def __init__(self,
+                 widen_factor=1.,
+                 out_indices=(1, 2, 4, 7),
+                 frozen_stages=-1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU6'),
+                 norm_eval=False,
+                 with_cp=False,
+                 pretrained=None,
+                 init_cfg=None):
+        super(MobileNetV2, self).__init__(init_cfg)
+
+        self.pretrained = pretrained
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        self.widen_factor = widen_factor
+        # out_indices index self.layers: layer1..layer7 then conv2 (0-7).
+        if not set(out_indices).issubset(set(range(0, 8))):
+            raise ValueError('out_indices must be a subset of range'
+                             f'(0, 8). But received {out_indices}')
+
+        if frozen_stages not in range(-1, 8):
+            raise ValueError('frozen_stages must be in range(-1, 8). '
+                             f'But received {frozen_stages}')
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.norm_eval = norm_eval
+        self.with_cp = with_cp
+
+        self.in_channels = make_divisible(32 * widen_factor, 8)
+
+        self.conv1 = ConvModule(
+            in_channels=3,
+            out_channels=self.in_channels,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+
+        self.layers = []
+
+        for i, layer_cfg in enumerate(self.arch_settings):
+            expand_ratio, channel, num_blocks, stride = layer_cfg
+            out_channels = make_divisible(channel * widen_factor, 8)
+            inverted_res_layer = self.make_layer(
+                out_channels=out_channels,
+                num_blocks=num_blocks,
+                stride=stride,
+                expand_ratio=expand_ratio)
+            layer_name = f'layer{i + 1}'
+            self.add_module(layer_name, inverted_res_layer)
+            self.layers.append(layer_name)
+        # Only widen_factor > 1 scales the 1280 channels of the last conv.
+        if widen_factor > 1.0:
+            self.out_channel = int(1280 * widen_factor)
+        else:
+            self.out_channel = 1280
+
+        layer = ConvModule(
+            in_channels=self.in_channels,
+            out_channels=self.out_channel,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+        self.add_module('conv2', layer)
+        self.layers.append('conv2')
+
+    def make_layer(self, out_channels, num_blocks, stride, expand_ratio):
+        """Stack InvertedResidual blocks to build a layer for MobileNetV2.
+
+        Args:
+            out_channels (int): out_channels of every block in the layer.
+            num_blocks (int): number of blocks.
+            stride (int): stride of the first block; later blocks use 1.
+            expand_ratio (int): ``mid_channels`` / input-channel ratio.
+
+        Returns:
+            nn.Sequential: the stacked blocks.
+        """
+        layers = []
+        for i in range(num_blocks):
+            layers.append(
+                InvertedResidual(
+                    self.in_channels,
+                    out_channels,
+                    mid_channels=int(round(self.in_channels * expand_ratio)),
+                    stride=stride if i == 0 else 1,
+                    with_expand_conv=expand_ratio != 1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg,
+                    with_cp=self.with_cp))
+            self.in_channels = out_channels
+
+        return nn.Sequential(*layers)
+
+    def _freeze_stages(self):
+        """Disable grads for conv1; eval + disable grads for frozen layers."""
+        if self.frozen_stages >= 0:
+            for param in self.conv1.parameters():
+                param.requires_grad = False
+        for i in range(1, self.frozen_stages + 1):
+            layer = getattr(self, f'layer{i}').eval()
+            for param in layer.parameters():
+                param.requires_grad = False
+
+    def forward(self, x):
+        """Return a tuple of the outputs selected by ``out_indices``."""
+        x = self.conv1(x)
+        outs = []
+        # Index 0-6 are layer1..layer7 and index 7 is conv2.
+        for i, layer_name in enumerate(self.layers):
+            x = getattr(self, layer_name)(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
+
+    def train(self, mode=True):
+        """Set train/eval mode, re-apply stage freezing and, if ``norm_eval``
+        is set, keep every ``_BatchNorm`` module in eval mode."""
+        super(MobileNetV2, self).train(mode)
+        self._freeze_stages()
+        if not (mode and self.norm_eval):
+            return
+        for m in self.modules():
+            if isinstance(m, _BatchNorm):
+                m.eval()
diff --git a/mmdet/models/backbones/pvt.py b/mmdet/models/backbones/pvt.py
new file mode 100755
index 0000000..8b7d5d5
--- /dev/null
+++ b/mmdet/models/backbones/pvt.py
@@ -0,0 +1,591 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+import warnings
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import (Conv2d, build_activation_layer, build_norm_layer,
+                      constant_init, normal_init, trunc_normal_init)
+from mmcv.cnn.bricks.drop import build_dropout
+from mmcv.cnn.bricks.transformer import MultiheadAttention
+from mmcv.cnn.utils.weight_init import trunc_normal_
+from mmcv.runner import (BaseModule, ModuleList, Sequential, _load_checkpoint,
+                         load_state_dict)
+from torch.nn.modules.utils import _pair as to_2tuple
+
+from ...utils import get_root_logger
+from ..builder import BACKBONES
+from ..utils import PatchEmbed, nchw_to_nlc, nlc_to_nchw, pvt_convert
+
+
+class MixFFN(BaseModule):
+    """Convolutional feed-forward network used by PVT.
+
+    Compared with a plain FFN:
+        1. Both projections are 1x1 convolutions instead of Linear layers.
+        2. An optional 3x3 depth-wise conv encodes positional information.
+
+    Args:
+        embed_dims (int): Channels of the input and output feature. Same
+            as `MultiheadAttention`.
+        feedforward_channels (int): Channels of the hidden feature.
+        act_cfg (dict, optional): Config of the activation layer.
+            Default: dict(type='GELU').
+        ffn_drop (float, optional): Dropout probability inside the FFN.
+            Default: 0.0.
+        dropout_layer (obj:`ConfigDict`): Config of the dropout applied to
+            the FFN output before the shortcut is added. A falsy value
+            means identity. Default: None.
+        use_conv (bool): If True, insert the 3x3 depth-wise conv after the
+            first 1x1 conv. Default: False.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 feedforward_channels,
+                 act_cfg=dict(type='GELU'),
+                 ffn_drop=0.,
+                 dropout_layer=None,
+                 use_conv=False,
+                 init_cfg=None):
+        super(MixFFN, self).__init__(init_cfg=init_cfg)
+
+        self.embed_dims = embed_dims
+        self.feedforward_channels = feedforward_channels
+        self.act_cfg = act_cfg
+        activate = build_activation_layer(act_cfg)
+
+        stack = [
+            Conv2d(
+                in_channels=embed_dims,
+                out_channels=feedforward_channels,
+                kernel_size=1,
+                stride=1,
+                bias=True)
+        ]
+        if use_conv:
+            # 3x3 depth wise conv to provide positional encode information
+            stack.append(
+                Conv2d(
+                    in_channels=feedforward_channels,
+                    out_channels=feedforward_channels,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=True,
+                    groups=feedforward_channels))
+        project = Conv2d(
+            in_channels=feedforward_channels,
+            out_channels=embed_dims,
+            kernel_size=1,
+            stride=1,
+            bias=True)
+        drop = nn.Dropout(ffn_drop)
+        stack += [activate, drop, project, drop]
+        self.layers = Sequential(*stack)
+        self.dropout_layer = torch.nn.Identity() if not dropout_layer \
+            else build_dropout(dropout_layer)
+
+    def forward(self, x, hw_shape, identity=None):
+        """Run the conv stack on the NCHW view of ``x``; add the shortcut."""
+        shortcut = x if identity is None else identity
+        # The Conv2d stack runs on NCHW, so convert to it and back around it.
+        feat = nlc_to_nchw(x, hw_shape)
+        feat = nchw_to_nlc(self.layers(feat))
+        return shortcut + self.dropout_layer(feat)
+
+
+class SpatialReductionAttention(MultiheadAttention):
+    """An implementation of Spatial Reduction Attention of PVT.
+
+    This module is modified from MultiheadAttention which is a module from
+    mmcv.cnn.bricks.transformer.
+
+    Args:
+        embed_dims (int): The embedding dimension.
+        num_heads (int): Parallel attention heads.
+        attn_drop (float): A Dropout layer on attn_output_weights.
+            Default: 0.0.
+        proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
+            Default: 0.0.
+        dropout_layer (obj:`ConfigDict`): The dropout_layer used
+            when adding the shortcut. Default: None.
+        batch_first (bool): Key, Query and Value are shape of
+            (batch, n, embed_dim)
+            or (n, batch, embed_dim). Default: True.
+        qkv_bias (bool): enable bias for qkv if True. Default: True.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='LN').
+        sr_ratio (int): The ratio of spatial reduction of Spatial Reduction
+            Attention of PVT. Default: 1.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 dropout_layer=None,
+                 batch_first=True,
+                 qkv_bias=True,
+                 norm_cfg=dict(type='LN'),
+                 sr_ratio=1,
+                 init_cfg=None):
+        super().__init__(
+            embed_dims,
+            num_heads,
+            attn_drop,
+            proj_drop,
+            batch_first=batch_first,
+            dropout_layer=dropout_layer,
+            bias=qkv_bias,
+            init_cfg=init_cfg)
+
+        self.sr_ratio = sr_ratio
+        if sr_ratio > 1:
+            self.sr = Conv2d(
+                in_channels=embed_dims,
+                out_channels=embed_dims,
+                kernel_size=sr_ratio,
+                stride=sr_ratio)
+            # The ret[0] of build_norm_layer is norm name.
+            self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
+
+        # handle the BC-breaking from https://github.com/open-mmlab/mmcv/pull/1418 # noqa
+        from mmdet import digit_version, mmcv_version
+        if mmcv_version < digit_version('1.3.17'):
+            warnings.warn('The legacy version of forward function in '
+                          'SpatialReductionAttention is deprecated in '
+                          'mmcv>=1.3.17 and will no longer be supported in '
+                          'the future. Please upgrade your mmcv.')
+            self.forward = self.legacy_forward
+
+    def forward(self, x, hw_shape, identity=None):
+        """Attend from ``x`` to its (optionally reduced) key/value tokens."""
+        x_q = x
+        if self.sr_ratio > 1:
+            x_kv = nlc_to_nchw(x, hw_shape)
+            x_kv = self.sr(x_kv)
+            x_kv = nchw_to_nlc(x_kv)
+            x_kv = self.norm(x_kv)
+        else:
+            x_kv = x
+
+        if identity is None:
+            identity = x_q
+
+        # Because the dataflow('key', 'query', 'value') of
+        # ``torch.nn.MultiheadAttention`` is (num_query, batch,
+        # embed_dims), We should adjust the shape of dataflow from
+        # batch_first (batch, num_query, embed_dims) to num_query_first
+        # (num_query ,batch, embed_dims), and recover ``attn_output``
+        # from num_query_first to batch_first.
+        if self.batch_first:
+            x_q = x_q.transpose(0, 1)
+            x_kv = x_kv.transpose(0, 1)
+
+        out = self.attn(query=x_q, key=x_kv, value=x_kv)[0]
+
+        if self.batch_first:
+            out = out.transpose(0, 1)
+
+        return identity + self.dropout_layer(self.proj_drop(out))
+
+    def legacy_forward(self, x, hw_shape, identity=None):
+        """multi head attention forward in mmcv version < 1.3.17."""
+        x_q = x
+        if self.sr_ratio > 1:
+            x_kv = nlc_to_nchw(x, hw_shape)
+            x_kv = self.sr(x_kv)
+            x_kv = nchw_to_nlc(x_kv)
+            x_kv = self.norm(x_kv)
+        else:
+            x_kv = x
+
+        if identity is None:
+            identity = x_q
+
+        out = self.attn(query=x_q, key=x_kv, value=x_kv)[0]
+
+        return identity + self.dropout_layer(self.proj_drop(out))
+
+
+class PVTEncoderLayer(BaseModule):
+    """One PVT encoder layer: normed attention followed by a normed FFN.
+
+    Args:
+        embed_dims (int): The feature dimension.
+        num_heads (int): Parallel attention heads.
+        feedforward_channels (int): The hidden dimension for FFNs.
+        drop_rate (float): Dropout probability used after the attention
+            projection and inside the FFN. Default: 0.0.
+        attn_drop_rate (float): The drop out rate for attention layer.
+            Default: 0.0.
+        drop_path_rate (float): stochastic depth rate. Default: 0.0.
+        qkv_bias (bool): enable bias for qkv if True.
+            Default: True.
+        act_cfg (dict): The activation config for FFNs.
+            Default: dict(type='GELU').
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='LN').
+        sr_ratio (int): The ratio of spatial reduction of Spatial Reduction
+            Attention of PVT. Default: 1.
+        use_conv_ffn (bool): If True, use Convolutional FFN to replace FFN.
+            Default: False.
+        init_cfg (dict, optional): Initialization config dict.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 feedforward_channels,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 qkv_bias=True,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='LN'),
+                 sr_ratio=1,
+                 use_conv_ffn=False,
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+
+        # Pre-attention norm; build_norm_layer returns (name, layer).
+        self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
+
+        self.attn = SpatialReductionAttention(
+            embed_dims,
+            num_heads,
+            attn_drop=attn_drop_rate,
+            proj_drop=drop_rate,
+            dropout_layer={'type': 'DropPath', 'drop_prob': drop_path_rate},
+            qkv_bias=qkv_bias,
+            norm_cfg=norm_cfg,
+            sr_ratio=sr_ratio)
+
+        # Pre-FFN norm, built from the same config as norm1.
+        self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
+
+        self.ffn = MixFFN(
+            embed_dims,
+            feedforward_channels,
+            act_cfg=act_cfg,
+            ffn_drop=drop_rate,
+            dropout_layer={'type': 'DropPath', 'drop_prob': drop_path_rate},
+            use_conv=use_conv_ffn)
+
+    def forward(self, x, hw_shape):
+        """Apply normed attention, then the normed FFN, each with shortcut."""
+        attended = self.attn(self.norm1(x), hw_shape, identity=x)
+        # The FFN shortcut is the attention output, not the original input.
+        return self.ffn(self.norm2(attended), hw_shape, identity=attended)
+
+
+class AbsolutePositionEmbedding(BaseModule):
+    """Learnable absolute position embedding used by PVT.
+
+    The embedding is stored for a fixed ``pos_shape`` grid and is
+    interpolated to the requested size every time it is applied.
+
+    Args:
+        pos_shape (int | tuple[int]): Height and width of the stored
+            grid; an int or 1-tuple is expanded to a square grid.
+        pos_dim (int): The dimension of the absolute position embedding.
+        drop_rate (float): Probability of an element to be zeroed.
+            Default: 0.0.
+    """
+
+    def __init__(self, pos_shape, pos_dim, drop_rate=0., init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+
+        # Normalise an int or 1-tuple to a 2-tuple.
+        if isinstance(pos_shape, int):
+            pos_shape = to_2tuple(pos_shape)
+        elif isinstance(pos_shape, tuple):
+            pos_shape = to_2tuple(pos_shape[0]) if len(pos_shape) == 1 \
+                else pos_shape
+            assert len(pos_shape) == 2, \
+                f'The size of image should have length 1 or 2, ' \
+                f'but got {len(pos_shape)}'
+        self.pos_shape = pos_shape
+        self.pos_dim = pos_dim
+        num_pos = pos_shape[0] * pos_shape[1]
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_pos, pos_dim))
+        self.drop = nn.Dropout(p=drop_rate)
+
+    def init_weights(self):
+        """Fill ``pos_embed`` from a truncated normal with std 0.02."""
+        trunc_normal_(self.pos_embed, std=0.02)
+
+    def resize_pos_embed(self, pos_embed, input_shape, mode='bilinear'):
+        """Interpolate ``pos_embed`` to a new spatial size.
+
+        Args:
+            pos_embed (torch.Tensor): Position embedding weights, [B, L, C].
+            input_shape (tuple): Target (height, width), passed to
+                ``F.interpolate`` as ``size``.
+            mode (str): Interpolation mode passed to ``F.interpolate``.
+                Default: ``'bilinear'``.
+
+        Return:
+            torch.Tensor: The resized pos_embed of shape [1, L_new, C].
+        """
+        assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]'
+        grid_h, grid_w = self.pos_shape
+        # Keep the trailing grid_h * grid_w tokens and view them as a map.
+        grid = pos_embed[:, (-1 * grid_h * grid_w):]
+        grid = grid.reshape(1, grid_h, grid_w, self.pos_dim)
+        grid = grid.permute(0, 3, 1, 2).contiguous()
+        grid = F.interpolate(grid, size=input_shape, mode=mode)
+        # [1, C, H, W] -> [1, H * W, C]
+        return torch.flatten(grid, 2).transpose(1, 2).contiguous()
+
+    def forward(self, x, hw_shape, mode='bilinear'):
+        """Add the embedding resized to ``hw_shape`` and apply dropout."""
+        resized = self.resize_pos_embed(self.pos_embed, hw_shape, mode)
+        return self.drop(x + resized)
+
+
+@BACKBONES.register_module()
+class PyramidVisionTransformer(BaseModule):
+    """Pyramid Vision Transformer (PVT)
+
+    Implementation of `Pyramid Vision Transformer: A Versatile Backbone for
+    Dense Prediction without Convolutions
+    <https://arxiv.org/pdf/2102.12122.pdf>`_.
+
+    Args:
+        pretrain_img_size (int | tuple[int]): The size of input image when
+            pretrain. Defaults: 224.
+        in_channels (int): Number of input channels. Default: 3.
+        embed_dims (int): Embedding dimension per attention head. Default: 64.
+        num_stages (int): The num of stages. Default: 4.
+        num_layers (Sequence[int]): The layer number of each transformer encode
+            layer. Default: [3, 4, 6, 3].
+        num_heads (Sequence[int]): The attention heads of each transformer
+            encode layer. Default: [1, 2, 5, 8].
+        patch_sizes (Sequence[int]): The patch_size of each patch embedding.
+            Default: [4, 2, 2, 2].
+        strides (Sequence[int]): The stride of each patch embedding.
+            Default: [4, 2, 2, 2].
+        paddings (Sequence[int]): The padding of each patch embedding.
+            Default: [0, 0, 0, 0].
+        sr_ratios (Sequence[int]): The spatial reduction rate of each
+            transformer encode layer. Default: [8, 4, 2, 1].
+        out_indices (Sequence[int]): Output from which stages.
+            Default: (0, 1, 2, 3).
+        mlp_ratios (Sequence[int]): Ratio of the FFN hidden dim to the
+            embedding dim in each stage. Default: [8, 8, 4, 4].
+        qkv_bias (bool): Enable bias for qkv if True. Default: True.
+        drop_rate (float): Probability of an element to be zeroed.
+            Default 0.0.
+        attn_drop_rate (float): The drop out rate for attention layer.
+            Default 0.0.
+        drop_path_rate (float): stochastic depth rate. Default 0.1.
+        use_abs_pos_embed (bool): If True, add absolute position embedding to
+            the patch embedding. Defaults: True.
+        norm_after_stage (bool): If True, apply a norm layer at the end of
+            each stage. Default: False.
+        use_conv_ffn (bool): If True, use Convolutional FFN to replace FFN.
+            Default: False.
+        act_cfg (dict): The activation config for FFNs.
+            Default: dict(type='GELU').
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='LN', eps=1e-6).
+        pretrained (str, optional): model pretrained path. Default: None.
+        convert_weights (bool): Whether to convert the keys of a checkpoint
+            from the original repo with ``pvt_convert`` before loading.
+            Default: True.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+    """
+
+    def __init__(self,
+                 pretrain_img_size=224,
+                 in_channels=3,
+                 embed_dims=64,
+                 num_stages=4,
+                 num_layers=[3, 4, 6, 3],
+                 num_heads=[1, 2, 5, 8],
+                 patch_sizes=[4, 2, 2, 2],
+                 strides=[4, 2, 2, 2],
+                 paddings=[0, 0, 0, 0],
+                 sr_ratios=[8, 4, 2, 1],
+                 out_indices=(0, 1, 2, 3),
+                 mlp_ratios=[8, 8, 4, 4],
+                 qkv_bias=True,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.1,
+                 use_abs_pos_embed=True,
+                 norm_after_stage=False,
+                 use_conv_ffn=False,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='LN', eps=1e-6),
+                 pretrained=None,
+                 convert_weights=True,
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+
+        self.convert_weights = convert_weights
+        if isinstance(pretrain_img_size, int):
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+        elif isinstance(pretrain_img_size, tuple):
+            if len(pretrain_img_size) == 1:
+                pretrain_img_size = to_2tuple(pretrain_img_size[0])
+            assert len(pretrain_img_size) == 2, \
+                f'The size of image should have length 1 or 2, ' \
+                f'but got {len(pretrain_img_size)}'
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be setting at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            self.init_cfg = init_cfg
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        self.embed_dims = embed_dims
+
+        self.num_stages = num_stages
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.patch_sizes = patch_sizes
+        self.strides = strides
+        self.sr_ratios = sr_ratios
+        assert num_stages == len(num_layers) == len(num_heads) \
+               == len(patch_sizes) == len(strides) == len(sr_ratios)
+
+        self.out_indices = out_indices
+        assert max(out_indices) < self.num_stages
+        self.pretrained = pretrained
+
+        # transformer encoder
+        dpr = [
+            x.item()
+            for x in torch.linspace(0, drop_path_rate, sum(num_layers))
+        ]  # stochastic num_layer decay rule
+
+        cur = 0
+        self.layers = ModuleList()
+        for i, num_layer in enumerate(num_layers):
+            embed_dims_i = embed_dims * num_heads[i]
+            patch_embed = PatchEmbed(
+                in_channels=in_channels,
+                embed_dims=embed_dims_i,
+                kernel_size=patch_sizes[i],
+                stride=strides[i],
+                padding=paddings[i],
+                bias=True,
+                norm_cfg=norm_cfg)
+            # Stage body: optional position embedding, then encoder layers.
+            layers = ModuleList()
+            if use_abs_pos_embed:
+                pos_shape = pretrain_img_size // np.prod(patch_sizes[:i + 1])
+                pos_embed = AbsolutePositionEmbedding(
+                    pos_shape=pos_shape,
+                    pos_dim=embed_dims_i,
+                    drop_rate=drop_rate)
+                layers.append(pos_embed)
+            layers.extend([
+                PVTEncoderLayer(
+                    embed_dims=embed_dims_i,
+                    num_heads=num_heads[i],
+                    feedforward_channels=mlp_ratios[i] * embed_dims_i,
+                    drop_rate=drop_rate,
+                    attn_drop_rate=attn_drop_rate,
+                    drop_path_rate=dpr[cur + idx],
+                    qkv_bias=qkv_bias,
+                    act_cfg=act_cfg,
+                    norm_cfg=norm_cfg,
+                    sr_ratio=sr_ratios[i],
+                    use_conv_ffn=use_conv_ffn) for idx in range(num_layer)
+            ])
+            in_channels = embed_dims_i
+            # The ret[0] of build_norm_layer is norm name.
+            if norm_after_stage:
+                norm = build_norm_layer(norm_cfg, embed_dims_i)[1]
+            else:
+                norm = nn.Identity()
+            self.layers.append(ModuleList([patch_embed, layers, norm]))
+            cur += num_layer
+
+    def init_weights(self):
+        """Initialize from scratch, or load the ``init_cfg`` checkpoint."""
+        logger = get_root_logger()
+        if self.init_cfg is None:
+            logger.warning(f'No pre-trained weights for '
+                           f'{self.__class__.__name__}, '
+                           f'training start from scratch')
+            for m in self.modules():
+                if isinstance(m, nn.Linear):
+                    trunc_normal_init(m, std=.02, bias=0.)
+                elif isinstance(m, nn.LayerNorm):
+                    constant_init(m, 1.0)
+                elif isinstance(m, nn.Conv2d):
+                    k_h, k_w = m.kernel_size
+                    fan_out = k_h * k_w * m.out_channels // m.groups
+                    normal_init(m, 0, math.sqrt(2.0 / fan_out))
+                elif isinstance(m, AbsolutePositionEmbedding):
+                    m.init_weights()
+        else:
+            assert 'checkpoint' in self.init_cfg, \
+                f'Only support specify `Pretrained` in `init_cfg` in ' \
+                f'{self.__class__.__name__} '
+            # Item access works for a plain dict as well as a ConfigDict.
+            checkpoint = _load_checkpoint(
+                self.init_cfg['checkpoint'], logger=logger,
+                map_location='cpu')
+            logger.warning(f'Load pre-trained model for '
+                           f'{self.__class__.__name__} from original repo')
+            if 'state_dict' in checkpoint:
+                state_dict = checkpoint['state_dict']
+            elif 'model' in checkpoint:
+                state_dict = checkpoint['model']
+            else:
+                state_dict = checkpoint
+            if self.convert_weights:
+                # Checkpoint keys from the original PVT repo are renamed
+                # to match this implementation.
+                state_dict = pvt_convert(state_dict)
+            load_state_dict(self, state_dict, strict=False, logger=logger)
+
+    def forward(self, x):
+        """Return the list of NCHW feature maps picked by ``out_indices``."""
+        outs = []
+        for i, layer in enumerate(self.layers):
+            # layer holds [patch_embed, encoder blocks, norm].
+            x, hw_shape = layer[0](x)
+            for block in layer[1]:
+                x = block(x, hw_shape)
+            x = layer[2](x)
+            x = nlc_to_nchw(x, hw_shape)
+            if i in self.out_indices:
+                outs.append(x)
+
+        return outs
+
+
+@BACKBONES.register_module()
+class PyramidVisionTransformerV2(PyramidVisionTransformer):
+    """PVTv2 backbone, see `PVTv2: Improved Baselines with Pyramid Vision
+    Transformer <https://arxiv.org/pdf/2106.13797.pdf>`_."""
+
+    def __init__(self, **kwargs):
+        # Fixed PVTv2 settings; passing any of them in ``kwargs`` as well is
+        # a duplicate-keyword TypeError.
+        super().__init__(
+            patch_sizes=[7, 3, 3, 3], paddings=[3, 1, 1, 1],
+            use_abs_pos_embed=False, norm_after_stage=True,
+            use_conv_ffn=True,
+            **kwargs)
diff --git a/mmdet/models/backbones/regnet.py b/mmdet/models/backbones/regnet.py
new file mode 100755
index 0000000..63adc3c
--- /dev/null
+++ b/mmdet/models/backbones/regnet.py
@@ -0,0 +1,356 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import numpy as np
+import torch.nn as nn
+from mmcv.cnn import build_conv_layer, build_norm_layer
+
+from ..builder import BACKBONES
+from .resnet import ResNet
+from .resnext import Bottleneck
+
+
+@BACKBONES.register_module()
+class RegNet(ResNet):
+    """RegNet backbone.
+
+    More details can be found in `paper <https://arxiv.org/abs/2003.13678>`_ .
+
+    Args:
+        arch (dict): The parameter of RegNets.
+
+            - w0 (int): initial width
+            - wa (float): slope of width
+            - wm (float): quantization parameter to quantize the width
+            - depth (int): depth of the backbone
+            - group_w (int): width of group
+            - bot_mul (float): bottleneck ratio, i.e. expansion of bottleneck.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        base_channels (int): Base channels after stem layer.
+        in_channels (int): Number of input image channels. Default: 3.
+        dilations (Sequence[int]): Dilation of each stage.
+        out_indices (Sequence[int]): Output from which stages.
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
+            not freezing any parameters.
+        norm_cfg (dict): dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+        zero_init_residual (bool): whether to use zero init for last norm layer
+            in resblocks to let them behave as identity.
+        pretrained (str, optional): model pretrained path. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+
+    Example:
+        >>> from mmdet.models import RegNet
+        >>> import torch
+        >>> self = RegNet(
+                arch=dict(
+                    w0=88,
+                    wa=26.31,
+                    wm=2.25,
+                    group_w=48,
+                    depth=25,
+                    bot_mul=1.0))
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 96, 8, 8)
+        (1, 192, 4, 4)
+        (1, 432, 2, 2)
+        (1, 1008, 1, 1)
+    """
+    arch_settings = {
+        'regnetx_400mf':
+        dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
+        'regnetx_800mf':
+        dict(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16, bot_mul=1.0),
+        'regnetx_1.6gf':
+        dict(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18, bot_mul=1.0),
+        'regnetx_3.2gf':
+        dict(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25, bot_mul=1.0),
+        'regnetx_4.0gf':
+        dict(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23, bot_mul=1.0),
+        'regnetx_6.4gf':
+        dict(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17, bot_mul=1.0),
+        'regnetx_8.0gf':
+        dict(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23, bot_mul=1.0),
+        'regnetx_12gf':
+        dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, bot_mul=1.0),
+    }
+
+    def __init__(self,
+                 arch,
+                 in_channels=3,
+                 stem_channels=32,
+                 base_channels=32,
+                 strides=(2, 2, 2, 2),
+                 dilations=(1, 1, 1, 1),
+                 out_indices=(0, 1, 2, 3),
+                 style='pytorch',
+                 deep_stem=False,
+                 avg_down=False,
+                 frozen_stages=-1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 norm_eval=True,
+                 dcn=None,
+                 stage_with_dcn=(False, False, False, False),
+                 plugins=None,
+                 with_cp=False,
+                 zero_init_residual=True,
+                 pretrained=None,
+                 init_cfg=None):
+        """Derive the stage layout from ``arch`` and build stem and stages."""
+        super(ResNet, self).__init__(init_cfg)  # ResNet.__init__ is skipped
+        # Generate RegNet parameters first
+        if isinstance(arch, str):
+            assert arch in self.arch_settings, \
+                f'"arch": "{arch}" is not one of the' \
+                ' arch_settings'
+            arch = self.arch_settings[arch]
+        elif not isinstance(arch, dict):
+            raise ValueError('Expect "arch" to be either a string '
+                             f'or a dict, got {type(arch)}')
+
+        widths, num_stages = self.generate_regnet(
+            arch['w0'],
+            arch['wa'],
+            arch['wm'],
+            arch['depth'],
+        )
+        # Convert to per stage format
+        stage_widths, stage_blocks = self.get_stages_from_blocks(widths)
+        # Generate group widths and bot muls
+        group_widths = [arch['group_w'] for _ in range(num_stages)]
+        self.bottleneck_ratio = [arch['bot_mul'] for _ in range(num_stages)]
+        # Adjust the compatibility of stage_widths and group_widths
+        stage_widths, group_widths = self.adjust_width_group(
+            stage_widths, self.bottleneck_ratio, group_widths)
+
+        # Group params by stage
+        self.stage_widths = stage_widths
+        self.group_widths = group_widths
+        self.depth = sum(stage_blocks)
+        self.stem_channels = stem_channels
+        self.base_channels = base_channels
+        self.num_stages = num_stages
+        assert num_stages >= 1 and num_stages <= 4
+        self.strides = strides
+        self.dilations = dilations
+        assert len(strides) == len(dilations) == num_stages
+        self.out_indices = out_indices
+        assert max(out_indices) < num_stages
+        self.style = style
+        self.deep_stem = deep_stem
+        self.avg_down = avg_down
+        self.frozen_stages = frozen_stages
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.with_cp = with_cp
+        self.norm_eval = norm_eval
+        self.dcn = dcn
+        self.stage_with_dcn = stage_with_dcn
+        if dcn is not None:
+            assert len(stage_with_dcn) == num_stages
+        self.plugins = plugins
+        self.zero_init_residual = zero_init_residual
+        self.block = Bottleneck
+        expansion_bak = self.block.expansion  # restored at the end of __init__
+        self.block.expansion = 1  # NOTE: set on the Bottleneck class itself
+        self.stage_blocks = stage_blocks[:num_stages]
+
+        self._make_stem_layer(in_channels, stem_channels)
+
+        block_init_cfg = None
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+                if self.zero_init_residual:
+                    block_init_cfg = dict(
+                        type='Constant', val=0, override=dict(name='norm3'))
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        self.inplanes = stem_channels
+        self.res_layers = []
+        for i, num_blocks in enumerate(self.stage_blocks):
+            stride = self.strides[i]
+            dilation = self.dilations[i]
+            group_width = self.group_widths[i]
+            width = int(round(self.stage_widths[i] * self.bottleneck_ratio[i]))
+            stage_groups = width // group_width
+
+            dcn = self.dcn if self.stage_with_dcn[i] else None
+            if self.plugins is not None:
+                stage_plugins = self.make_stage_plugins(self.plugins, i)
+            else:
+                stage_plugins = None
+
+            res_layer = self.make_res_layer(
+                block=self.block,
+                inplanes=self.inplanes,
+                planes=self.stage_widths[i],
+                num_blocks=num_blocks,
+                stride=stride,
+                dilation=dilation,
+                style=self.style,
+                avg_down=self.avg_down,
+                with_cp=self.with_cp,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg,
+                dcn=dcn,
+                plugins=stage_plugins,
+                groups=stage_groups,
+                base_width=group_width,
+                base_channels=self.stage_widths[i],
+                init_cfg=block_init_cfg)
+            self.inplanes = self.stage_widths[i]  # input width of next stage
+            layer_name = f'layer{i + 1}'
+            self.add_module(layer_name, res_layer)
+            self.res_layers.append(layer_name)
+
+        self._freeze_stages()
+
+        self.feat_dim = stage_widths[-1]  # width of the last stage
+        self.block.expansion = expansion_bak
+
+    def _make_stem_layer(self, in_channels, base_channels):
+        self.conv1 = build_conv_layer(
+            self.conv_cfg,
+            in_channels,
+            base_channels,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            bias=False)
+        self.norm1_name, norm1 = build_norm_layer(
+            self.norm_cfg, base_channels, postfix=1)
+        self.add_module(self.norm1_name, norm1)
+        self.relu = nn.ReLU(inplace=True)
+
+    def generate_regnet(self,
+                        initial_width,
+                        width_slope,
+                        width_parameter,
+                        depth,
+                        divisor=8):
+        """Generates per block width from RegNet parameters.
+
+        Args:
+            initial_width ([int]): Initial width of the backbone
+            width_slope ([float]): Slope of the quantized linear function
+            width_parameter ([int]): Parameter used to quantize the width.
+            depth ([int]): Depth of the backbone.
+            divisor (int, optional): The divisor of channels. Defaults to 8.
+
+        Returns:
+            list, int: return a list of widths of each stage and the number \
+                of stages
+        """
+        assert width_slope >= 0
+        assert initial_width > 0
+        assert width_parameter > 1
+        assert initial_width % divisor == 0
+        widths_cont = np.arange(depth) * width_slope + initial_width
+        ks = np.round(
+            np.log(widths_cont / initial_width) / np.log(width_parameter))
+        widths = initial_width * np.power(width_parameter, ks)
+        widths = np.round(np.divide(widths, divisor)) * divisor
+        num_stages = len(np.unique(widths))
+        widths, widths_cont = widths.astype(int).tolist(), widths_cont.tolist()
+        return widths, num_stages
+
+    @staticmethod
+    def quantize_float(number, divisor):
+        """Converts a float to the closest int divisible by divisor (may be 0).
+
+        Args:
+            number (float): Original number to be quantized.
+            divisor (int): Divisor used to quantize the number.
+
+        Returns:
+            int: quantized number that is divisible by divisor.
+        """
+        return int(round(number / divisor) * divisor)
+
+    def adjust_width_group(self, widths, bottleneck_ratio, groups):
+        """Adjusts the compatibility of widths and groups.
+
+        Args:
+            widths (list[int]): Width of each stage.
+            bottleneck_ratio (list[float]): Bottleneck ratio of each stage.
+            groups (list[int]): Group width of each stage.
+
+        Returns:
+            tuple(list): The adjusted widths and groups of each stage.
+        """
+        bottleneck_width = [
+            int(w * b) for w, b in zip(widths, bottleneck_ratio)
+        ]
+        groups = [min(g, w_bot) for g, w_bot in zip(groups, bottleneck_width)]
+        bottleneck_width = [
+            self.quantize_float(w_bot, g)
+            for w_bot, g in zip(bottleneck_width, groups)
+        ]
+        widths = [
+            int(w_bot / b)
+            for w_bot, b in zip(bottleneck_width, bottleneck_ratio)
+        ]
+        return widths, groups
+
+    def get_stages_from_blocks(self, widths):
+        """Gets widths/stage_blocks of network at each stage.
+
+        Args:
+            widths (list[int]): Width of each block.
+
+        Returns:
+            tuple(list): width of each stage and number of blocks per stage
+        """
+        width_diff = [
+            width != width_prev
+            for width, width_prev in zip(widths + [0], [0] + widths)
+        ]
+        stage_widths = [
+            width for width, diff in zip(widths, width_diff[:-1]) if diff
+        ]
+        stage_blocks = np.diff([
+            depth for depth, diff in zip(range(len(width_diff)), width_diff)
+            if diff
+        ]).tolist()
+        return stage_widths, stage_blocks
+
+    def forward(self, x):
+        """Run stem and stages; return a tuple of the out_indices outputs."""
+        x = self.conv1(x)
+        x = self.norm1(x)
+        x = self.relu(x)
+
+        outs = []
+        for i, layer_name in enumerate(self.res_layers):
+            res_layer = getattr(self, layer_name)
+            x = res_layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
diff --git a/mmdet/models/backbones/res2net.py b/mmdet/models/backbones/res2net.py
new file mode 100755
index 0000000..96afb2f
--- /dev/null
+++ b/mmdet/models/backbones/res2net.py
@@ -0,0 +1,327 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmcv.runner import Sequential
+
+from ..builder import BACKBONES
+from .resnet import Bottleneck as _Bottleneck
+from .resnet import ResNet
+
+
+class Bottle2neck(_Bottleneck):
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 scales=4,
+                 base_width=26,
+                 base_channels=64,
+                 stage_type='normal',
+                 **kwargs):
+        """Bottle2neck block for Res2Net.
+
+        If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
+        it is "caffe", the stride-two layer is the first 1x1 conv layer.
+        """
+        super(Bottle2neck, self).__init__(inplanes, planes, **kwargs)
+        assert scales > 1, 'Res2Net degenerates to ResNet when scales = 1.'
+        width = int(math.floor(self.planes * (base_width / base_channels)))
+
+        self.norm1_name, norm1 = build_norm_layer(
+            self.norm_cfg, width * scales, postfix=1)
+        self.norm3_name, norm3 = build_norm_layer(
+            self.norm_cfg, self.planes * self.expansion, postfix=3)
+
+        self.conv1 = build_conv_layer(
+            self.conv_cfg,
+            self.inplanes,
+            width * scales,
+            kernel_size=1,
+            stride=self.conv1_stride,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+
+        if stage_type == 'stage' and self.conv2_stride != 1:
+            self.pool = nn.AvgPool2d(
+                kernel_size=3, stride=self.conv2_stride, padding=1)
+        convs = []
+        bns = []
+
+        fallback_on_stride = False
+        if self.with_dcn:
+            fallback_on_stride = self.dcn.pop('fallback_on_stride', False)
+        if not self.with_dcn or fallback_on_stride:
+            for i in range(scales - 1):
+                convs.append(
+                    build_conv_layer(
+                        self.conv_cfg,
+                        width,
+                        width,
+                        kernel_size=3,
+                        stride=self.conv2_stride,
+                        padding=self.dilation,
+                        dilation=self.dilation,
+                        bias=False))
+                bns.append(
+                    build_norm_layer(self.norm_cfg, width, postfix=i + 1)[1])
+            self.convs = nn.ModuleList(convs)
+            self.bns = nn.ModuleList(bns)
+        else:
+            assert self.conv_cfg is None, 'conv_cfg must be None for DCN'
+            for i in range(scales - 1):
+                convs.append(
+                    build_conv_layer(
+                        self.dcn,
+                        width,
+                        width,
+                        kernel_size=3,
+                        stride=self.conv2_stride,
+                        padding=self.dilation,
+                        dilation=self.dilation,
+                        bias=False))
+                bns.append(
+                    build_norm_layer(self.norm_cfg, width, postfix=i + 1)[1])
+            self.convs = nn.ModuleList(convs)
+            self.bns = nn.ModuleList(bns)
+
+        self.conv3 = build_conv_layer(
+            self.conv_cfg,
+            width * scales,
+            self.planes * self.expansion,
+            kernel_size=1,
+            bias=False)
+        self.add_module(self.norm3_name, norm3)
+
+        self.stage_type = stage_type
+        self.scales = scales
+        self.width = width
+        delattr(self, 'conv2')
+        delattr(self, self.norm2_name)
+
+    def forward(self, x):
+        """Forward function (checkpointed if with_cp and x requires grad)."""
+
+        def _inner_forward(x):
+            identity = x
+
+            out = self.conv1(x)
+            out = self.norm1(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv1_plugin_names)
+
+            spx = torch.split(out, self.width, 1)  # width-sized channel chunks
+            sp = self.convs[0](spx[0].contiguous())
+            sp = self.relu(self.bns[0](sp))
+            out = sp
+            for i in range(1, self.scales - 1):
+                if self.stage_type == 'stage':
+                    sp = spx[i]  # no carry-over from the previous split
+                else:
+                    sp = sp + spx[i]  # add the previous split's output
+                sp = self.convs[i](sp.contiguous())
+                sp = self.relu(self.bns[i](sp))
+                out = torch.cat((out, sp), 1)
+            # The last split bypasses the convs: appended as is, or pooled.
+            if self.stage_type == 'normal' or self.conv2_stride == 1:
+                out = torch.cat((out, spx[self.scales - 1]), 1)
+            elif self.stage_type == 'stage':
+                out = torch.cat((out, self.pool(spx[self.scales - 1])), 1)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv2_plugin_names)
+
+            out = self.conv3(out)
+            out = self.norm3(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv3_plugin_names)
+
+            if self.downsample is not None:
+                identity = self.downsample(x)
+
+            out += identity
+
+            return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        out = self.relu(out)
+
+        return out
+
+
+class Res2Layer(Sequential):
+    """Res2Layer to build Res2Net style backbone.
+
+    Args:
+        block (nn.Module): block used to build ResLayer.
+        inplanes (int): inplanes of block.
+        planes (int): planes of block.
+        num_blocks (int): number of blocks.
+        stride (int): stride of the first block. Default: 1
+        avg_down (bool): Not read by this class; the shortcut, when built,
+            always uses AvgPool2d followed by a 1x1 conv. Default: True
+        conv_cfg (dict): dictionary to construct and config conv layer.
+            Default: None
+        norm_cfg (dict): dictionary to construct and config norm layer.
+            Default: dict(type='BN')
+        scales (int): Scales used in Res2Net. Default: 4
+        base_width (int): Basic width of each scale. Default: 26
+    """
+
+    def __init__(self,
+                 block,
+                 inplanes,
+                 planes,
+                 num_blocks,
+                 stride=1,
+                 avg_down=True,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 scales=4,
+                 base_width=26,
+                 **kwargs):
+        self.block = block
+
+        downsample = None
+        if stride != 1 or inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.AvgPool2d(
+                    kernel_size=stride,
+                    stride=stride,
+                    ceil_mode=True,
+                    count_include_pad=False),
+                build_conv_layer(
+                    conv_cfg,
+                    inplanes,
+                    planes * block.expansion,
+                    kernel_size=1,
+                    stride=1,
+                    bias=False),
+                build_norm_layer(norm_cfg, planes * block.expansion)[1],
+            )
+
+        layers = []
+        layers.append(
+            block(
+                inplanes=inplanes,
+                planes=planes,
+                stride=stride,
+                downsample=downsample,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                scales=scales,
+                base_width=base_width,
+                stage_type='stage',  # only the first block gets 'stage'
+                **kwargs))
+        inplanes = planes * block.expansion
+        for i in range(1, num_blocks):  # remaining blocks: stride 1
+            layers.append(
+                block(
+                    inplanes=inplanes,
+                    planes=planes,
+                    stride=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    scales=scales,
+                    base_width=base_width,
+                    **kwargs))
+        super(Res2Layer, self).__init__(*layers)
+
+
+@BACKBONES.register_module()
+class Res2Net(ResNet):
+    """Res2Net backbone.
+
+    Args:
+        scales (int): Scales used in Res2Net. Default: 4
+        base_width (int): Basic width of each scale. Default: 26
+        depth (int): Depth of res2net, from {50, 101, 152}.
+        in_channels (int): Number of input image channels. Default: 3.
+        num_stages (int): Res2net stages. Default: 4.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        dilations (Sequence[int]): Dilation of each stage.
+        out_indices (Sequence[int]): Output from which stages.
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv
+        avg_down (bool): Use AvgPool instead of stride conv when
+            downsampling in the bottle2neck.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        plugins (list[dict]): List of plugins for stages, each dict contains:
+
+            - cfg (dict, required): Cfg dict to build plugin.
+            - position (str, required): Position inside block to insert
+              plugin, options are 'after_conv1', 'after_conv2', 'after_conv3'.
+            - stages (tuple[bool], optional): Stages to apply plugin, length
+              should be same as 'num_stages'.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+        zero_init_residual (bool): Whether to use zero init for last norm layer
+            in resblocks to let them behave as identity.
+        pretrained (str, optional): model pretrained path. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+
+    Example:
+        >>> from mmdet.models import Res2Net
+        >>> import torch
+        >>> self = Res2Net(depth=50, scales=4, base_width=26)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 256, 8, 8)
+        (1, 512, 4, 4)
+        (1, 1024, 2, 2)
+        (1, 2048, 1, 1)
+    """
+
+    arch_settings = {
+        50: (Bottle2neck, (3, 4, 6, 3)),
+        101: (Bottle2neck, (3, 4, 23, 3)),
+        152: (Bottle2neck, (3, 8, 36, 3))
+    }
+
+    def __init__(self,
+                 scales=4,
+                 base_width=26,
+                 style='pytorch',
+                 deep_stem=True,
+                 avg_down=True,
+                 pretrained=None,
+                 init_cfg=None,
+                 **kwargs):
+        self.scales = scales
+        self.base_width = base_width
+        super(Res2Net, self).__init__(
+            style='pytorch',
+            deep_stem=True,
+            avg_down=True,
+            pretrained=pretrained,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def make_res_layer(self, **kwargs):
+        return Res2Layer(
+            scales=self.scales,
+            base_width=self.base_width,
+            base_channels=self.base_channels,
+            **kwargs)
diff --git a/mmdet/models/backbones/resnest.py b/mmdet/models/backbones/resnest.py
new file mode 100755
index 0000000..69629b9
--- /dev/null
+++ b/mmdet/models/backbones/resnest.py
@@ -0,0 +1,322 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmcv.runner import BaseModule
+
+from ..builder import BACKBONES
+from ..utils import ResLayer
+from .resnet import Bottleneck as _Bottleneck
+from .resnet import ResNetV1d
+
+
+class RSoftmax(nn.Module):
+    """Radix Softmax module in ``SplitAttentionConv2d``.
+
+    Args:
+        radix (int): Radix of input.
+        groups (int): Groups of input.
+    """
+
+    def __init__(self, radix, groups):
+        super().__init__()
+        self.radix = radix
+        self.groups = groups
+
+    def forward(self, x):
+        batch = x.size(0)
+        if self.radix > 1:
+            x = x.view(batch, self.groups, self.radix, -1).transpose(1, 2)
+            x = F.softmax(x, dim=1)
+            x = x.reshape(batch, -1)
+        else:
+            x = torch.sigmoid(x)
+        return x
+
+
+class SplitAttentionConv2d(BaseModule):
+    """Split-Attention Conv2d in ResNeSt.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        channels (int): Number of intermediate channels.
+        kernel_size (int | tuple[int]): Size of the convolution kernel.
+        stride (int | tuple[int]): Stride of the convolution.
+        padding (int | tuple[int]): Zero-padding added to both sides of
+        dilation (int | tuple[int]): Spacing between kernel elements.
+        groups (int): Number of blocked connections from input channels to
+            output channels.
+        groups (int): Same as nn.Conv2d.
+        radix (int): Radix of SpltAtConv2d. Default: 2
+        reduction_factor (int): Reduction factor of inter_channels. Default: 4.
+        conv_cfg (dict): Config dict for convolution layer. Default: None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        dcn (dict): Config dict for DCN. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 radix=2,
+                 reduction_factor=4,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 dcn=None,
+                 init_cfg=None):
+        super(SplitAttentionConv2d, self).__init__(init_cfg)
+        inter_channels = max(in_channels * radix // reduction_factor, 32)
+        self.radix = radix
+        self.groups = groups
+        self.channels = channels
+        self.with_dcn = dcn is not None
+        self.dcn = dcn
+        fallback_on_stride = False
+        if self.with_dcn:
+            fallback_on_stride = self.dcn.pop('fallback_on_stride', False)
+        if self.with_dcn and not fallback_on_stride:
+            assert conv_cfg is None, 'conv_cfg must be None for DCN'
+            conv_cfg = dcn
+        self.conv = build_conv_layer(
+            conv_cfg,
+            in_channels,
+            channels * radix,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups * radix,
+            bias=False)
+        # To be consistent with original implementation, starting from 0
+        self.norm0_name, norm0 = build_norm_layer(
+            norm_cfg, channels * radix, postfix=0)
+        self.add_module(self.norm0_name, norm0)
+        self.relu = nn.ReLU(inplace=True)
+        self.fc1 = build_conv_layer(
+            None, channels, inter_channels, 1, groups=self.groups)
+        self.norm1_name, norm1 = build_norm_layer(
+            norm_cfg, inter_channels, postfix=1)
+        self.add_module(self.norm1_name, norm1)
+        self.fc2 = build_conv_layer(
+            None, inter_channels, channels * radix, 1, groups=self.groups)
+        self.rsoftmax = RSoftmax(radix, groups)
+
+    @property
+    def norm0(self):
+        """nn.Module: the normalization layer named "norm0" """
+        return getattr(self, self.norm0_name)
+
+    @property
+    def norm1(self):
+        """nn.Module: the normalization layer named "norm1" """
+        return getattr(self, self.norm1_name)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.norm0(x)
+        x = self.relu(x)
+
+        batch, rchannel = x.shape[:2]
+        batch = x.size(0)
+        if self.radix > 1:
+            splits = x.view(batch, self.radix, -1, *x.shape[2:])
+            gap = splits.sum(dim=1)
+        else:
+            gap = x
+        gap = F.adaptive_avg_pool2d(gap, 1)
+        gap = self.fc1(gap)
+
+        gap = self.norm1(gap)
+        gap = self.relu(gap)
+
+        atten = self.fc2(gap)
+        atten = self.rsoftmax(atten).view(batch, -1, 1, 1)
+
+        if self.radix > 1:
+            attens = atten.view(batch, self.radix, -1, *atten.shape[2:])
+            out = torch.sum(attens * splits, dim=1)
+        else:
+            out = atten * x
+        return out.contiguous()
+
+
+class Bottleneck(_Bottleneck):
+    """Bottleneck block for ResNeSt.
+
+    Args:
+        inplanes (int): Input planes of this block.
+        planes (int): Middle planes of this block.
+        groups (int): Groups of conv2. Default: 1.
+        base_width (int): Base of width in terms of base channels. Default: 4.
+        base_channels (int): Base of channels for calculating width.
+            Default: 64.
+        radix (int): Radix of SplitAttentionConv2d. Default: 2.
+        reduction_factor (int): Reduction factor of inter_channels in
+            SplitAttentionConv2d. Default: 4.
+        avg_down_stride (bool): Whether to use average pool for stride in
+            Bottleneck. Default: True.
+        kwargs (dict): Key word arguments for base class.
+    """
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 groups=1,
+                 base_width=4,
+                 base_channels=64,
+                 radix=2,
+                 reduction_factor=4,
+                 avg_down_stride=True,
+                 **kwargs):
+        """Rebuild conv1/conv2/conv3 of the base block for ResNeSt."""
+        super(Bottleneck, self).__init__(inplanes, planes, **kwargs)
+
+        if groups == 1:
+            width = self.planes
+        else:
+            width = math.floor(self.planes *
+                               (base_width / base_channels)) * groups
+
+        self.avg_down_stride = avg_down_stride and self.conv2_stride > 1
+
+        self.norm1_name, norm1 = build_norm_layer(
+            self.norm_cfg, width, postfix=1)
+        self.norm3_name, norm3 = build_norm_layer(
+            self.norm_cfg, self.planes * self.expansion, postfix=3)
+
+        self.conv1 = build_conv_layer(
+            self.conv_cfg,
+            self.inplanes,
+            width,
+            kernel_size=1,
+            stride=self.conv1_stride,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+        self.with_modulated_dcn = False
+        self.conv2 = SplitAttentionConv2d(
+            width,
+            width,
+            kernel_size=3,
+            stride=1 if self.avg_down_stride else self.conv2_stride,
+            padding=self.dilation,
+            dilation=self.dilation,
+            groups=groups,
+            radix=radix,
+            reduction_factor=reduction_factor,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            dcn=self.dcn)
+        delattr(self, self.norm2_name)  # forward() here never calls norm2
+
+        if self.avg_down_stride:
+            self.avd_layer = nn.AvgPool2d(3, self.conv2_stride, padding=1)
+
+        self.conv3 = build_conv_layer(
+            self.conv_cfg,
+            width,
+            self.planes * self.expansion,
+            kernel_size=1,
+            bias=False)
+        self.add_module(self.norm3_name, norm3)
+
+    def forward(self, x):
+        """Forward function."""
+        def _inner_forward(x):
+            identity = x
+
+            out = self.conv1(x)
+            out = self.norm1(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv1_plugin_names)
+
+            out = self.conv2(out)
+
+            if self.avg_down_stride:  # downsample by pooling instead of conv2
+                out = self.avd_layer(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv2_plugin_names)
+
+            out = self.conv3(out)
+            out = self.norm3(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv3_plugin_names)
+
+            if self.downsample is not None:
+                identity = self.downsample(x)
+
+            out += identity
+
+            return out
+
+        if self.with_cp and x.requires_grad:  # gradient checkpointing
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        out = self.relu(out)
+
+        return out
+
+
+@BACKBONES.register_module()
+class ResNeSt(ResNetV1d):
+    """ResNeSt backbone.
+
+    Args:
+        groups (int): Number of groups of Bottleneck. Default: 1
+        base_width (int): Base width of Bottleneck. Default: 4
+        radix (int): Radix of SplitAttentionConv2d. Default: 2
+        reduction_factor (int): Reduction factor of inter_channels in
+            SplitAttentionConv2d. Default: 4.
+        avg_down_stride (bool): Whether to use average pool for stride in
+            Bottleneck. Default: True.
+        kwargs (dict): Keyword arguments forwarded to ``ResNetV1d``.
+    """
+
+    arch_settings = {
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3)),
+        200: (Bottleneck, (3, 24, 36, 3))
+    }
+
+    def __init__(self,
+                 groups=1,
+                 base_width=4,
+                 radix=2,
+                 reduction_factor=4,
+                 avg_down_stride=True,
+                 **kwargs):
+        self.groups = groups
+        self.base_width = base_width
+        self.radix = radix
+        self.reduction_factor = reduction_factor
+        self.avg_down_stride = avg_down_stride
+        super(ResNeSt, self).__init__(**kwargs)  # reads the attrs set above
+
+    def make_res_layer(self, **kwargs):
+        """Build one stage as a ``ResLayer`` with the ResNeSt arguments."""
+        return ResLayer(
+            groups=self.groups,
+            base_width=self.base_width,
+            base_channels=self.base_channels,
+            radix=self.radix,
+            reduction_factor=self.reduction_factor,
+            avg_down_stride=self.avg_down_stride,
+            **kwargs)
diff --git a/mmdet/models/backbones/resnet.py b/mmdet/models/backbones/resnet.py
new file mode 100755
index 0000000..1eaaae6
--- /dev/null
+++ b/mmdet/models/backbones/resnet.py
@@ -0,0 +1,672 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_conv_layer, build_norm_layer, build_plugin_layer
+from mmcv.runner import BaseModule
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from ..builder import BACKBONES
+from ..utils import ResLayer
+
+
+class BasicBlock(BaseModule):
+    expansion = 1  # the block outputs ``planes * expansion`` channels
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 dilation=1,
+                 downsample=None,
+                 style='pytorch',
+                 with_cp=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 dcn=None,
+                 plugins=None,
+                 init_cfg=None):
+        super(BasicBlock, self).__init__(init_cfg)
+        assert dcn is None, 'Not implemented yet.'
+        assert plugins is None, 'Not implemented yet.'
+
+        self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2)
+
+        self.conv1 = build_conv_layer(
+            conv_cfg,
+            inplanes,
+            planes,
+            3,
+            stride=stride,
+            padding=dilation,
+            dilation=dilation,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+        self.conv2 = build_conv_layer(
+            conv_cfg, planes, planes, 3, padding=1, bias=False)  # undilated
+        self.add_module(self.norm2_name, norm2)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+        self.dilation = dilation
+        self.with_cp = with_cp
+
+    @property
+    def norm1(self):
+        """nn.Module: normalization layer after the first convolution layer"""
+        return getattr(self, self.norm1_name)
+
+    @property
+    def norm2(self):
+        """nn.Module: normalization layer after the second convolution layer"""
+        return getattr(self, self.norm2_name)
+
+    def forward(self, x):
+        """Run conv-norm-ReLU, conv-norm, add the shortcut, then ReLU."""
+
+        def _inner_forward(x):
+            identity = x
+
+            out = self.conv1(x)
+            out = self.norm1(out)
+            out = self.relu(out)
+
+            out = self.conv2(out)
+            out = self.norm2(out)
+
+            if self.downsample is not None:
+                identity = self.downsample(x)
+
+            out += identity
+
+            return out
+
+        if self.with_cp and x.requires_grad:  # gradient checkpointing
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(BaseModule):
+    expansion = 4  # output channels of the block are ``planes * expansion``
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 dilation=1,
+                 downsample=None,
+                 style='pytorch',
+                 with_cp=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 dcn=None,
+                 plugins=None,
+                 init_cfg=None):
+        """Bottleneck block for ResNet.
+
+        If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
+        it is "caffe", the stride-two layer is the first 1x1 conv layer.
+        """
+        super(Bottleneck, self).__init__(init_cfg)
+        assert style in ['pytorch', 'caffe']
+        assert dcn is None or isinstance(dcn, dict)
+        assert plugins is None or isinstance(plugins, list)
+        if plugins is not None:
+            allowed_position = ['after_conv1', 'after_conv2', 'after_conv3']
+            assert all(p['position'] in allowed_position for p in plugins)
+
+        self.inplanes = inplanes
+        self.planes = planes
+        self.stride = stride
+        self.dilation = dilation
+        self.style = style
+        self.with_cp = with_cp
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.dcn = dcn
+        self.with_dcn = dcn is not None
+        self.plugins = plugins
+        self.with_plugins = plugins is not None
+
+        if self.with_plugins:
+            # collect plugins for conv1/conv2/conv3
+            self.after_conv1_plugins = [
+                plugin['cfg'] for plugin in plugins
+                if plugin['position'] == 'after_conv1'
+            ]
+            self.after_conv2_plugins = [
+                plugin['cfg'] for plugin in plugins
+                if plugin['position'] == 'after_conv2'
+            ]
+            self.after_conv3_plugins = [
+                plugin['cfg'] for plugin in plugins
+                if plugin['position'] == 'after_conv3'
+            ]
+
+        if self.style == 'pytorch':
+            self.conv1_stride = 1
+            self.conv2_stride = stride
+        else:
+            self.conv1_stride = stride
+            self.conv2_stride = 1
+
+        self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2)
+        self.norm3_name, norm3 = build_norm_layer(
+            norm_cfg, planes * self.expansion, postfix=3)
+
+        self.conv1 = build_conv_layer(
+            conv_cfg,
+            inplanes,
+            planes,
+            kernel_size=1,
+            stride=self.conv1_stride,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+        # Plain conv unless ``dcn`` is given without ``fallback_on_stride``.
+        conv2_cfg = conv_cfg
+        if self.with_dcn:
+            # Pop the flag from a copy: ResNet hands the same ``dcn`` dict
+            # object to every DCN stage, so popping in place would consume
+            # the flag in the first block built and silently drop it for
+            # every block built afterwards.
+            dcn = dcn.copy()
+            fallback_on_stride = dcn.pop('fallback_on_stride', False)
+            # Keep ``self.dcn`` free of the flag, exactly as it was after the
+            # former in-place pop, for subclasses that reuse ``self.dcn``.
+            self.dcn = dcn
+            if not fallback_on_stride:
+                assert self.conv_cfg is None, 'conv_cfg must be None for DCN'
+                conv2_cfg = dcn
+        self.conv2 = build_conv_layer(
+            conv2_cfg,
+            planes,
+            planes,
+            kernel_size=3,
+            stride=self.conv2_stride,
+            padding=dilation,
+            dilation=dilation,
+            bias=False)
+
+        self.add_module(self.norm2_name, norm2)
+        self.conv3 = build_conv_layer(
+            conv_cfg,
+            planes,
+            planes * self.expansion,
+            kernel_size=1,
+            bias=False)
+        self.add_module(self.norm3_name, norm3)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+
+        if self.with_plugins:
+            self.after_conv1_plugin_names = self.make_block_plugins(
+                planes, self.after_conv1_plugins)
+            self.after_conv2_plugin_names = self.make_block_plugins(
+                planes, self.after_conv2_plugins)
+            self.after_conv3_plugin_names = self.make_block_plugins(
+                planes * self.expansion, self.after_conv3_plugins)
+
+    def make_block_plugins(self, in_channels, plugins):
+        """make plugins for block.
+
+        Args:
+            in_channels (int): Input channels of plugin.
+            plugins (list[dict]): List of plugins cfg to build.
+
+        Returns:
+            list[str]: List of the names of plugin.
+        """
+        assert isinstance(plugins, list)
+        plugin_names = []
+        for plugin in plugins:
+            plugin = plugin.copy()  # 'postfix' is popped below; keep caller's
+            name, layer = build_plugin_layer(
+                plugin,
+                in_channels=in_channels,
+                postfix=plugin.pop('postfix', ''))
+            assert not hasattr(self, name), f'duplicate plugin {name}'
+            self.add_module(name, layer)
+            plugin_names.append(name)
+        return plugin_names
+
+    def forward_plugin(self, x, plugin_names):
+        """Apply the plugins named in ``plugin_names`` to ``x`` in order."""
+        for name in plugin_names:
+            x = getattr(self, name)(x)
+        return x
+
+    @property
+    def norm1(self):
+        """nn.Module: normalization layer after the first convolution layer"""
+        return getattr(self, self.norm1_name)
+
+    @property
+    def norm2(self):
+        """nn.Module: normalization layer after the second convolution layer"""
+        return getattr(self, self.norm2_name)
+
+    @property
+    def norm3(self):
+        """nn.Module: normalization layer after the third convolution layer"""
+        return getattr(self, self.norm3_name)
+
+    def forward(self, x):
+        """Run the three conv stages, add the shortcut and apply ReLU."""
+
+        def _inner_forward(x):
+            identity = x
+            out = self.conv1(x)
+            out = self.norm1(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv1_plugin_names)
+
+            out = self.conv2(out)
+            out = self.norm2(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv2_plugin_names)
+
+            out = self.conv3(out)
+            out = self.norm3(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv3_plugin_names)
+
+            if self.downsample is not None:
+                identity = self.downsample(x)
+
+            out += identity
+
+            return out
+
+        if self.with_cp and x.requires_grad:  # gradient checkpointing
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        out = self.relu(out)
+
+        return out
+
+
+@BACKBONES.register_module()
+class ResNet(BaseModule):
+    """ResNet backbone.
+
+    Args:
+        depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
+        stem_channels (int | None): Number of stem channels. If not specified,
+            it will be the same as `base_channels`. Default: None.
+        base_channels (int): Number of base channels of res layer. Default: 64.
+        in_channels (int): Number of input image channels. Default: 3.
+        num_stages (int): Resnet stages, between 1 and 4. Default: 4.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        dilations (Sequence[int]): Dilation of each stage.
+        out_indices (Sequence[int]): Output from which stages.
+        style (str): `pytorch` puts the stride-two layer on the 3x3 conv of
+            the bottleneck, `caffe` puts it on the first 1x1 conv.
+        deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv.
+        avg_down (bool): Use AvgPool instead of stride conv when
+            downsampling in the bottleneck.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        conv_cfg (dict | None): Config passed to ``build_conv_layer``.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to keep BatchNorm layers (and variants) in
+            eval mode during training, i.e. freeze their running stats.
+        dcn (dict | None): DCN config handed to the stages selected below.
+        stage_with_dcn (Sequence[bool]): Which stages receive ``dcn``.
+        plugins (list[dict]): List of plugins for stages, each dict contains:
+
+            - cfg (dict, required): Cfg dict to build plugin.
+            - position (str, required): Position inside block to insert
+              plugin, options are 'after_conv1', 'after_conv2', 'after_conv3'.
+            - stages (tuple[bool], optional): Stages to apply plugin, length
+              should be same as 'num_stages'.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+        zero_init_residual (bool): Whether to use zero init for last norm layer
+            in resblocks to let them behave as identity.
+        pretrained (str, optional): Deprecated checkpoint path; use init_cfg.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+
+    Example:
+        >>> from mmdet.models import ResNet
+        >>> import torch
+        >>> self = ResNet(depth=18)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 64, 8, 8)
+        (1, 128, 4, 4)
+        (1, 256, 2, 2)
+        (1, 512, 1, 1)
+    """
+
+    arch_settings = {
+        18: (BasicBlock, (2, 2, 2, 2)),
+        34: (BasicBlock, (3, 4, 6, 3)),
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3))
+    }
+
+    def __init__(self,
+                 depth,
+                 in_channels=3,
+                 stem_channels=None,
+                 base_channels=64,
+                 num_stages=4,
+                 strides=(1, 2, 2, 2),
+                 dilations=(1, 1, 1, 1),
+                 out_indices=(0, 1, 2, 3),
+                 style='pytorch',
+                 deep_stem=False,
+                 avg_down=False,
+                 frozen_stages=-1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 norm_eval=True,
+                 dcn=None,
+                 stage_with_dcn=(False, False, False, False),
+                 plugins=None,
+                 with_cp=False,
+                 zero_init_residual=True,
+                 pretrained=None,
+                 init_cfg=None):
+        super(ResNet, self).__init__(init_cfg)
+        self.zero_init_residual = zero_init_residual
+        if depth not in self.arch_settings:
+            raise KeyError(f'invalid depth {depth} for resnet')
+
+        block_init_cfg = None  # stays None unless the default init is used
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+                block = self.arch_settings[depth][0]
+                if self.zero_init_residual:
+                    if block is BasicBlock:
+                        block_init_cfg = dict(
+                            type='Constant',
+                            val=0,
+                            override=dict(name='norm2'))
+                    elif block is Bottleneck:
+                        block_init_cfg = dict(
+                            type='Constant',
+                            val=0,
+                            override=dict(name='norm3'))
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        self.depth = depth
+        if stem_channels is None:
+            stem_channels = base_channels
+        self.stem_channels = stem_channels
+        self.base_channels = base_channels
+        self.num_stages = num_stages
+        assert num_stages >= 1 and num_stages <= 4
+        self.strides = strides
+        self.dilations = dilations
+        assert len(strides) == len(dilations) == num_stages
+        self.out_indices = out_indices
+        assert max(out_indices) < num_stages
+        self.style = style
+        self.deep_stem = deep_stem
+        self.avg_down = avg_down
+        self.frozen_stages = frozen_stages
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.with_cp = with_cp
+        self.norm_eval = norm_eval
+        self.dcn = dcn
+        self.stage_with_dcn = stage_with_dcn
+        if dcn is not None:
+            assert len(stage_with_dcn) == num_stages
+        self.plugins = plugins
+        self.block, stage_blocks = self.arch_settings[depth]
+        self.stage_blocks = stage_blocks[:num_stages]
+        self.inplanes = stem_channels
+
+        self._make_stem_layer(in_channels, stem_channels)
+
+        self.res_layers = []  # names of the stage modules, in order
+        for i, num_blocks in enumerate(self.stage_blocks):
+            stride = strides[i]
+            dilation = dilations[i]
+            dcn = self.dcn if self.stage_with_dcn[i] else None
+            if plugins is not None:
+                stage_plugins = self.make_stage_plugins(plugins, i)
+            else:
+                stage_plugins = None
+            planes = base_channels * 2**i
+            res_layer = self.make_res_layer(
+                block=self.block,
+                inplanes=self.inplanes,
+                planes=planes,
+                num_blocks=num_blocks,
+                stride=stride,
+                dilation=dilation,
+                style=self.style,
+                avg_down=self.avg_down,
+                with_cp=with_cp,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                dcn=dcn,
+                plugins=stage_plugins,
+                init_cfg=block_init_cfg)
+            self.inplanes = planes * self.block.expansion
+            layer_name = f'layer{i + 1}'
+            self.add_module(layer_name, res_layer)
+            self.res_layers.append(layer_name)
+
+        self._freeze_stages()
+
+        self.feat_dim = self.block.expansion * base_channels * 2**(
+            len(self.stage_blocks) - 1)
+
+    def make_stage_plugins(self, plugins, stage_idx):
+        """Make plugins for ResNet ``stage_idx`` th stage.
+
+        Currently we support inserting ``context_block``,
+        ``empirical_attention_block``, ``nonlocal_block`` into the backbone
+        like ResNet/ResNeXt. They could be inserted after conv1/conv2/conv3 of
+        Bottleneck.
+
+        An example of plugins format could be:
+
+        Examples:
+            >>> plugins=[
+            ...     dict(cfg=dict(type='xxx', arg1='xxx'),
+            ...          stages=(False, True, True, True),
+            ...          position='after_conv2'),
+            ...     dict(cfg=dict(type='yyy'),
+            ...          stages=(True, True, True, True),
+            ...          position='after_conv3'),
+            ...     dict(cfg=dict(type='zzz', postfix='1'),
+            ...          stages=(True, True, True, True),
+            ...          position='after_conv3'),
+            ...     dict(cfg=dict(type='zzz', postfix='2'),
+            ...          stages=(True, True, True, True),
+            ...          position='after_conv3')
+            ... ]
+            >>> self = ResNet(depth=18)
+            >>> stage_plugins = self.make_stage_plugins(plugins, 0)
+            >>> assert len(stage_plugins) == 3
+
+        Suppose ``stage_idx=0``, the structure of blocks in the stage would be:
+
+        .. code-block:: none
+
+            conv1-> conv2->conv3->yyy->zzz1->zzz2
+
+        Suppose 'stage_idx=1', the structure of blocks in the stage would be:
+
+        .. code-block:: none
+
+            conv1-> conv2->xxx->conv3->yyy->zzz1->zzz2
+
+        If stages is missing, the plugin would be applied to all stages.
+
+        Args:
+            plugins (list[dict]): List of plugins cfg to build. The postfix is
+                required if multiple same type plugins are inserted.
+            stage_idx (int): Index of stage to build
+
+        Returns:
+            list[dict]: Plugins for current stage
+        """
+        stage_plugins = []
+        for plugin in plugins:
+            plugin = plugin.copy()  # 'stages' is popped from the copy only
+            stages = plugin.pop('stages', None)
+            assert stages is None or len(stages) == self.num_stages
+            # whether to insert plugin into current stage
+            if stages is None or stages[stage_idx]:
+                stage_plugins.append(plugin)
+
+        return stage_plugins
+
+    def make_res_layer(self, **kwargs):
+        """Pack all blocks in a stage into a ``ResLayer``."""
+        return ResLayer(**kwargs)
+
+    @property
+    def norm1(self):
+        """nn.Module: stem norm layer (only built when not ``deep_stem``)."""
+        return getattr(self, self.norm1_name)
+
+    def _make_stem_layer(self, in_channels, stem_channels):
+        if self.deep_stem:  # three 3x3 convs instead of one 7x7 conv
+            self.stem = nn.Sequential(
+                build_conv_layer(
+                    self.conv_cfg,
+                    in_channels,
+                    stem_channels // 2,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, stem_channels // 2)[1],
+                nn.ReLU(inplace=True),
+                build_conv_layer(
+                    self.conv_cfg,
+                    stem_channels // 2,
+                    stem_channels // 2,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, stem_channels // 2)[1],
+                nn.ReLU(inplace=True),
+                build_conv_layer(
+                    self.conv_cfg,
+                    stem_channels // 2,
+                    stem_channels,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, stem_channels)[1],
+                nn.ReLU(inplace=True))
+        else:
+            self.conv1 = build_conv_layer(
+                self.conv_cfg,
+                in_channels,
+                stem_channels,
+                kernel_size=7,
+                stride=2,
+                padding=3,
+                bias=False)
+            self.norm1_name, norm1 = build_norm_layer(
+                self.norm_cfg, stem_channels, postfix=1)
+            self.add_module(self.norm1_name, norm1)
+            self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:  # any value >= 0 also freezes the stem
+            if self.deep_stem:
+                self.stem.eval()
+                for param in self.stem.parameters():
+                    param.requires_grad = False
+            else:
+                self.norm1.eval()
+                for m in [self.conv1, self.norm1]:
+                    for param in m.parameters():
+                        param.requires_grad = False
+
+        for i in range(1, self.frozen_stages + 1):
+            m = getattr(self, f'layer{i}')
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def forward(self, x):
+        """Return a tuple with the outputs of the ``out_indices`` stages."""
+        if self.deep_stem:
+            x = self.stem(x)
+        else:
+            x = self.conv1(x)
+            x = self.norm1(x)
+            x = self.relu(x)
+        x = self.maxpool(x)
+        outs = []
+        for i, layer_name in enumerate(self.res_layers):
+            res_layer = getattr(self, layer_name)
+            x = res_layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
+
+    def train(self, mode=True):
+        """Set the training mode, then re-freeze the frozen stages and, if
+        ``norm_eval`` is set, put BatchNorm layers back into eval mode."""
+        super(ResNet, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # only _BatchNorm subclasses are switched back to eval here
+                if isinstance(m, _BatchNorm):
+                    m.eval()
+
+@BACKBONES.register_module()
+class ResNetV1d(ResNet):
+    r"""ResNetV1d variant described in `Bag of Tricks
+    <https://arxiv.org/pdf/1812.01187.pdf>`_.
+
+    Compared with default ResNet(ResNetV1b), ResNetV1d replaces the 7x7 conv in
+    the input stem with three 3x3 convs. In the downsampling block, a 2x2
+    avg_pool with stride 2 is added before conv, whose stride is changed to 1.
+    """
+
+    def __init__(self, **kwargs):  # deep_stem/avg_down are fixed to True
+        super(ResNetV1d, self).__init__(
+            deep_stem=True, avg_down=True, **kwargs)
diff --git a/mmdet/models/backbones/resnetclip.py b/mmdet/models/backbones/resnetclip.py
new file mode 100755
index 0000000..fbb4186
--- /dev/null
+++ b/mmdet/models/backbones/resnetclip.py
@@ -0,0 +1,763 @@
+# --------------------------------------------------------
+# PODA: Prompt-driven Zero-shot Domain Adaptation
+# Copyright (c) 2024 valeo.ai, astra-vision 
+#
+# Written by Tuan-Hung Vu
+# --------------------------------------------------------
+
+import warnings
+
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_conv_layer, build_norm_layer, build_plugin_layer
+from mmcv.runner import BaseModule
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from ..builder import BACKBONES
+from ..utils import ResLayer
+
+from collections import OrderedDict
+import random
+import torch
+
+def calc_mean_std(feat, eps=1e-5):
+    """Return per-(sample, channel) mean and std of a 4-D tensor as (N, C, 1, 1)."""
+    assert (feat.dim() == 4)
+    batch, channels = feat.shape[:2]
+    flat = feat.reshape(batch, channels, -1)
+    # eps is added to the variance (not to the std) so the sqrt never sees zero.
+    std = (flat.var(dim=2) + eps).sqrt().view(batch, channels, 1, 1)
+    mean = flat.mean(dim=2).view(batch, channels, 1, 1)
+    return mean, std
+
+class BasicBlock(BaseModule):
+    """Residual block made of two 3x3 convs (``expansion = 1``).
+
+    ``style`` is accepted but unused here; ``dcn`` and ``plugins`` must be
+    ``None`` (both are asserted in ``__init__``).
+    """
+    expansion = 1
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 dilation=1,
+                 downsample=None,
+                 style='pytorch',
+                 with_cp=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 dcn=None,
+                 plugins=None,
+                 init_cfg=None):
+        super(BasicBlock, self).__init__(init_cfg)
+        assert dcn is None, 'Not implemented yet.'
+        assert plugins is None, 'Not implemented yet.'
+
+        # Norm layers are built first and registered right after their conv.
+        self.norm1_name, first_norm = build_norm_layer(
+            norm_cfg, planes, postfix=1)
+        self.norm2_name, second_norm = build_norm_layer(
+            norm_cfg, planes, postfix=2)
+
+        # Only the first conv carries the stride and dilation of the block.
+        conv1_kwargs = dict(
+            stride=stride, padding=dilation, dilation=dilation, bias=False)
+        self.conv1 = build_conv_layer(
+            conv_cfg, inplanes, planes, 3, **conv1_kwargs)
+        self.add_module(self.norm1_name, first_norm)
+        self.conv2 = build_conv_layer(
+            conv_cfg, planes, planes, 3, padding=1, bias=False)
+        self.add_module(self.norm2_name, second_norm)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+        self.dilation = dilation
+        self.with_cp = with_cp
+
+    @property
+    def norm1(self):
+        """nn.Module: norm layer applied to the output of ``conv1``."""
+        return getattr(self, self.norm1_name)
+
+    @property
+    def norm2(self):
+        """nn.Module: norm layer applied to the output of ``conv2``."""
+        return getattr(self, self.norm2_name)
+
+    def forward(self, x):
+        """Apply the residual block to ``x``.
+
+        The final ReLU is always run outside the (optionally checkpointed)
+        residual branch.
+        """
+
+        def _residual(inp):
+            # conv1 -> norm1 -> relu -> conv2 -> norm2
+            branch = self.relu(self.norm1(self.conv1(inp)))
+            branch = self.norm2(self.conv2(branch))
+
+            # Shortcut is the raw input unless a downsample module was given.
+            shortcut = inp if self.downsample is None else self.downsample(inp)
+            branch += shortcut
+            return branch
+
+        # Checkpointing is used only when enabled and the input requires grad.
+        use_checkpoint = self.with_cp and x.requires_grad
+        if use_checkpoint:
+            summed = cp.checkpoint(_residual, x)
+        else:
+            summed = _residual(x)
+        return self.relu(summed)
+
+
+class Bottleneck(BaseModule):
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 dilation=1,
+                 downsample=None,
+                 style='pytorch',
+                 with_cp=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 dcn=None,
+                 plugins=None,
+                 init_cfg=None):
+        """Bottleneck block for ResNet (shortcut: AvgPool2d -> conv1x1 -> BN).
+
+        style "pytorch" strides the 3x3 conv, "caffe" the first 1x1 conv.
+        ``downsample`` is ignored; ``fallback_on_stride`` is popped from ``dcn``.
+        """
+        super(Bottleneck, self).__init__(init_cfg)
+        assert style in ['pytorch', 'caffe']
+        assert dcn is None or isinstance(dcn, dict)
+        assert plugins is None or isinstance(plugins, list)
+        if plugins is not None:
+            allowed_position = ['after_conv1', 'after_conv2', 'after_conv3']
+            assert all(p['position'] in allowed_position for p in plugins)
+
+        self.inplanes = inplanes
+        self.planes = planes
+        self.stride = stride
+        self.dilation = dilation
+        self.style = style
+        self.with_cp = with_cp
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.dcn = dcn
+        self.with_dcn = dcn is not None
+        self.plugins = plugins
+        self.with_plugins = plugins is not None
+
+        if self.with_plugins:
+            # split the plugin cfgs by the position they are inserted at
+            self.after_conv1_plugins = [
+                plugin['cfg'] for plugin in plugins
+                if plugin['position'] == 'after_conv1'
+            ]
+            self.after_conv2_plugins = [
+                plugin['cfg'] for plugin in plugins
+                if plugin['position'] == 'after_conv2'
+            ]
+            self.after_conv3_plugins = [
+                plugin['cfg'] for plugin in plugins
+                if plugin['position'] == 'after_conv3'
+            ]
+
+        if self.style == 'pytorch':
+            self.conv1_stride = 1
+            self.conv2_stride = stride
+        else:
+            self.conv1_stride = stride
+            self.conv2_stride = 1
+
+        self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2)
+        self.norm3_name, norm3 = build_norm_layer(
+            norm_cfg, planes * self.expansion, postfix=3)
+
+        self.conv1 = build_conv_layer(
+            conv_cfg,
+            inplanes,
+            planes,
+            kernel_size=1,
+            stride=self.conv1_stride,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+        fallback_on_stride = False
+        if self.with_dcn:
+            fallback_on_stride = dcn.pop('fallback_on_stride', False)
+        if not self.with_dcn or fallback_on_stride:
+            self.conv2 = build_conv_layer(
+                conv_cfg,
+                planes,
+                planes,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=dilation,
+                dilation=dilation,
+                bias=False)
+        else:
+            assert self.conv_cfg is None, 'conv_cfg must be None for DCN'
+            self.conv2 = build_conv_layer(
+                dcn,
+                planes,
+                planes,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=dilation,
+                dilation=dilation,
+                bias=False)
+
+        self.add_module(self.norm2_name, norm2)
+        self.conv3 = build_conv_layer(
+            conv_cfg,
+            planes,
+            planes * self.expansion,
+            kernel_size=1,
+            bias=False)
+        self.add_module(self.norm3_name, norm3)
+
+        self.relu = nn.ReLU(inplace=True)
+        # NOTE: the ``downsample`` argument is ignored; the shortcut is built here.
+        self.downsample = None
+        if stride > 1 or inplanes != planes * Bottleneck.expansion:
+            # AvgPool2d(stride), then a stride-1 1x1 conv + BN; plain nn layers, conv_cfg/norm_cfg are not used
+            self.downsample = nn.Sequential(OrderedDict([
+                ("-1", nn.AvgPool2d(stride)),
+                ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
+                ("1", nn.BatchNorm2d(planes * self.expansion))
+            ]))
+
+        if self.with_plugins:
+            self.after_conv1_plugin_names = self.make_block_plugins(
+                planes, self.after_conv1_plugins)
+            self.after_conv2_plugin_names = self.make_block_plugins(
+                planes, self.after_conv2_plugins)
+            self.after_conv3_plugin_names = self.make_block_plugins(
+                planes * self.expansion, self.after_conv3_plugins)
+
+    def make_block_plugins(self, in_channels, plugins):
+        """Build the plugin layers and register them as submodules.
+
+        Args:
+            in_channels (int): Input channels of plugin.
+            plugins (list[dict]): List of plugins cfg to build.
+
+        Returns:
+            list[str]: Names of the registered plugins, in build order.
+        """
+        assert isinstance(plugins, list)
+        plugin_names = []
+        for plugin in plugins:
+            plugin = plugin.copy()
+            name, layer = build_plugin_layer(
+                plugin,
+                in_channels=in_channels,
+                postfix=plugin.pop('postfix', ''))
+            assert not hasattr(self, name), f'duplicate plugin {name}'
+            self.add_module(name, layer)
+            plugin_names.append(name)
+        return plugin_names
+
+    def forward_plugin(self, x, plugin_names):
+        out = x
+        for name in plugin_names:
+            out = getattr(self, name)(out)
+        return out
+
+    @property
+    def norm1(self):
+        """nn.Module: normalization layer after the first convolution layer"""
+        return getattr(self, self.norm1_name)
+
+    @property
+    def norm2(self):
+        """nn.Module: normalization layer after the second convolution layer"""
+        return getattr(self, self.norm2_name)
+
+    @property
+    def norm3(self):
+        """nn.Module: normalization layer after the third convolution layer"""
+        return getattr(self, self.norm3_name)
+
+    def forward(self, x):
+        """Run the bottleneck on ``x``; the final ReLU follows the residual sum."""
+
+        def _inner_forward(x):
+            identity = x
+            out = self.conv1(x)
+            out = self.norm1(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv1_plugin_names)
+
+            out = self.conv2(out)
+            out = self.norm2(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv2_plugin_names)
+
+            out = self.conv3(out)
+            out = self.norm3(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv3_plugin_names)
+            # downsample is None when stride == 1 and the channel counts match
+            if self.downsample is not None:
+                identity = self.downsample(x)
+
+            out += identity
+
+            return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        out = self.relu(out)
+
+        return out
+
+
+@BACKBONES.register_module()
+class ModifiedResNet(BaseModule):
+    """Modified ResNet backbone with optional PODA feature-style augmentation.
+
+    Args:
+        depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
+        stem_channels (int | None): Number of stem channels. If not specified,
+            it will be the same as `base_channels`. Default: None.
+        base_channels (int): Number of base channels of res layer. Default: 64.
+        in_channels (int): Number of input image channels. Default: 3.
+        num_stages (int): Resnet stages. Default: 4.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        dilations (Sequence[int]): Dilation of each stage.
+        out_indices (Sequence[int]): Output from which stages.
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv
+        avg_down (bool): Use AvgPool instead of stride conv when
+            downsampling in the bottleneck.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        plugins (list[dict]): List of plugins for stages, each dict contains:
+
+            - cfg (dict, required): Cfg dict to build plugin.
+            - position (str, required): Position inside block to insert
+              plugin, options are 'after_conv1', 'after_conv2', 'after_conv3'.
+            - stages (tuple[bool], optional): Stages to apply plugin, length
+              should be same as 'num_stages'.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+        zero_init_residual (bool): Whether to use zero init for last norm layer
+            in resblocks to let them behave as identity.
+        pretrained (str, optional): model pretrained path. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+        target_domain (str, optional): Entry of ``all_domains`` or
+            ``'all'``. Selects which directories of pickled style
+            statistics (``mu_f1``/``std_f1``) are loaded at construction.
+            ``None`` disables the augmentation. Default: None
+        augmented_layer (int, optional): 1-based index of the stage whose
+            output is re-styled in training mode. Required when
+            ``target_domain`` is set. Default: None
+        mixing_style (bool): If True, the source and loaded statistics are
+            blended with a random per-sample, per-channel weight; if False
+            the loaded statistics replace the source ones. Default: True
+
+    Note:
+        Statistics are read with ``pickle.load`` from the hard-coded
+        ``./augmented_feats/`` directories; only use trusted files there.
+    """
+
+    arch_settings = {
+        18: (BasicBlock, (2, 2, 2, 2)),
+        34: (BasicBlock, (3, 4, 6, 3)),
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3))
+    }
+
+    def __init__(self,
+                 depth,
+                 in_channels=3,
+                 stem_channels=None,
+                 base_channels=64,
+                 num_stages=4,
+                 strides=(1, 2, 2, 2),
+                 dilations=(1, 1, 1, 1),
+                 out_indices=(0, 1, 2, 3),
+                 style='pytorch',
+                 deep_stem=True,
+                 avg_down=False,
+                 frozen_stages=-1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 norm_eval=True,
+                 dcn=None,
+                 stage_with_dcn=(False, False, False, False),
+                 plugins=None,
+                 with_cp=False,
+                 zero_init_residual=True,
+                 pretrained=None,
+                 init_cfg=None,
+                 target_domain=None,
+                 augmented_layer=None,
+                 mixing_style=True):
+        super(ModifiedResNet, self).__init__(init_cfg)
+        self.all_domains = ['fog', 'diverse_dayfog_101', 'diverse_night_101', 'diverse_nightrain_101', 'diverse_duskrain_101']
+        self.all_domains_featpaths = [
+            './augmented_feats/fog_f1_templates_100it/',
+            './augmented_feats/OD_aug_iccv/an_image_taken_on_a_fog_day_RN101/',
+            './augmented_feats/OD_aug_iccv/an_image_taken_on_night_RN101/',
+            './augmented_feats/OD_aug_iccv/an_image_taken_on_a_rain_night_RN101/',
+            './augmented_feats/OD_aug_iccv/an_image_taken_on_a_rain_evening_RN101/']
+        assert target_domain is None or target_domain in self.all_domains or target_domain == 'all', 'unknown target domain' # PODA
+        self.zero_init_residual = zero_init_residual
+        if depth not in self.arch_settings:
+            raise KeyError(f'invalid depth {depth} for resnet')
+
+        block_init_cfg = None
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+                block = self.arch_settings[depth][0]
+                if self.zero_init_residual:
+                    if block is BasicBlock:
+                        block_init_cfg = dict(
+                            type='Constant',
+                            val=0,
+                            override=dict(name='norm2'))
+                    elif block is Bottleneck:
+                        block_init_cfg = dict(
+                            type='Constant',
+                            val=0,
+                            override=dict(name='norm3'))
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        self.depth = depth
+        if stem_channels is None:
+            stem_channels = base_channels
+        self.stem_channels = stem_channels
+        self.base_channels = base_channels
+        self.num_stages = num_stages
+        assert num_stages >= 1 and num_stages <= 4
+        self.strides = strides
+        self.dilations = dilations
+        assert len(strides) == len(dilations) == num_stages
+        self.out_indices = out_indices
+        assert max(out_indices) < num_stages
+        self.style = style
+        self.deep_stem = deep_stem
+        self.avg_down = avg_down
+        self.frozen_stages = frozen_stages
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.with_cp = with_cp
+        self.norm_eval = norm_eval
+        self.dcn = dcn
+        self.stage_with_dcn = stage_with_dcn
+        if dcn is not None:
+            assert len(stage_with_dcn) == num_stages
+        self.plugins = plugins
+        self.block, stage_blocks = self.arch_settings[depth]
+        self.stage_blocks = stage_blocks[:num_stages]
+        self.inplanes = stem_channels
+        self.is_training = True
+
+        # PODA - loading optimized styles
+        self.target_domain = target_domain
+        self.augmented_mus, self.augmented_stds = [], []
+        self.mixing_style = mixing_style
+        self.augmented_layer = augmented_layer if target_domain else None
+        if self.target_domain:
+            if self.target_domain == 'all':
+                target_domain_idx = range(len(self.all_domains))
+            else:
+                target_domain_idx = [self.all_domains.index(self.target_domain)]
+            assert augmented_layer is not None, 'not specifying augmented layer'
+            import glob
+            import pickle
+            for tidx in target_domain_idx:
+                PODA_optim_styles_root_dir = self.all_domains_featpaths[tidx]
+                for optim_style_file in sorted(glob.glob(PODA_optim_styles_root_dir + "*.pkl")):
+                    with open(optim_style_file, 'rb') as _file:
+                        stats = pickle.load(_file)  # trusted files only
+                        self.augmented_mus.append(stats['mu_f1'])
+                        self.augmented_stds.append(stats['std_f1'])
+            print(f'Loaded {len(self.augmented_mus)} statistics')
+            if not self.augmented_mus:
+                warnings.warn('PODA: no style statistics (*.pkl) were loaded')
+
+        self._make_stem_layer(in_channels, stem_channels)
+
+        self.res_layers = []
+        for i, num_blocks in enumerate(self.stage_blocks):
+            stride = strides[i]
+            dilation = dilations[i]
+            dcn = self.dcn if self.stage_with_dcn[i] else None
+            if plugins is not None:
+                stage_plugins = self.make_stage_plugins(plugins, i)
+            else:
+                stage_plugins = None
+            planes = base_channels * 2**i
+            res_layer = self.make_res_layer(
+                block=self.block,
+                inplanes=self.inplanes,
+                planes=planes,
+                num_blocks=num_blocks,
+                stride=stride,
+                dilation=dilation,
+                style=self.style,
+                avg_down=self.avg_down,
+                with_cp=with_cp,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                dcn=dcn,
+                plugins=stage_plugins,
+                init_cfg=block_init_cfg)
+            self.inplanes = planes * self.block.expansion
+            layer_name = f'layer{i + 1}'
+            self.add_module(layer_name, res_layer)
+            self.res_layers.append(layer_name)
+
+        self._freeze_stages()
+
+        self.feat_dim = self.block.expansion * base_channels * 2**(
+            len(self.stage_blocks) - 1)
+
+    def make_stage_plugins(self, plugins, stage_idx):
+        """Make plugins for ResNet ``stage_idx`` th stage.
+
+        Currently we support to insert ``context_block``,
+        ``empirical_attention_block``, ``nonlocal_block`` into the backbone
+        like ResNet/ResNeXt. They could be inserted after conv1/conv2/conv3 of
+        Bottleneck.
+
+        An example of plugins format could be:
+
+        Examples:
+            >>> plugins=[
+            ...     dict(cfg=dict(type='xxx', arg1='xxx'),
+            ...          stages=(False, True, True, True),
+            ...          position='after_conv2'),
+            ...     dict(cfg=dict(type='yyy'),
+            ...          stages=(True, True, True, True),
+            ...          position='after_conv3'),
+            ...     dict(cfg=dict(type='zzz', postfix='1'),
+            ...          stages=(True, True, True, True),
+            ...          position='after_conv3'),
+            ...     dict(cfg=dict(type='zzz', postfix='2'),
+            ...          stages=(True, True, True, True),
+            ...          position='after_conv3')
+            ... ]
+            >>> self = ResNet(depth=18)
+            >>> stage_plugins = self.make_stage_plugins(plugins, 0)
+            >>> assert len(stage_plugins) == 3
+
+        Suppose ``stage_idx=0``, the structure of blocks in the stage would be:
+
+        .. code-block:: none
+
+            conv1-> conv2->conv3->yyy->zzz1->zzz2
+
+        Suppose 'stage_idx=1', the structure of blocks in the stage would be:
+
+        .. code-block:: none
+
+            conv1-> conv2->xxx->conv3->yyy->zzz1->zzz2
+
+        If stages is missing, the plugin would be applied to all stages.
+
+        Args:
+            plugins (list[dict]): List of plugins cfg to build. The postfix is
+                required if multiple same type plugins are inserted.
+            stage_idx (int): Index of stage to build
+
+        Returns:
+            list[dict]: Plugins for current stage
+        """
+        stage_plugins = []
+        for plugin in plugins:
+            plugin = plugin.copy()
+            stages = plugin.pop('stages', None)
+            assert stages is None or len(stages) == self.num_stages
+            # whether to insert plugin into current stage
+            if stages is None or stages[stage_idx]:
+                stage_plugins.append(plugin)
+
+        return stage_plugins
+
+    def make_res_layer(self, **kwargs):
+        """Pack all blocks in a stage into a ``ResLayer``."""
+        return ResLayer(**kwargs)
+
+    @property
+    def norm1(self):
+        """nn.Module: the normalization layer named "norm1" """
+        return getattr(self, self.norm1_name)
+
+    def _make_stem_layer(self, in_channels, stem_channels):
+        if self.deep_stem:
+            self.conv1 = build_conv_layer(
+                    self.conv_cfg,
+                    in_channels,
+                    stem_channels // 2,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                    bias=False)
+            self.bn1 = build_norm_layer(self.norm_cfg, stem_channels // 2)[1]
+            self.relu1 = nn.ReLU(inplace=True)
+            self.conv2 = build_conv_layer(
+                    self.conv_cfg,
+                    stem_channels // 2,
+                    stem_channels // 2,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=False)
+            self.bn2 = build_norm_layer(self.norm_cfg, stem_channels // 2)[1]
+            self.relu2 = nn.ReLU(inplace=True)
+            self.conv3 = build_conv_layer(
+                    self.conv_cfg,
+                    stem_channels // 2,
+                    stem_channels,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=False)
+            self.bn3 = build_norm_layer(self.norm_cfg, stem_channels)[1]
+            self.relu3 = nn.ReLU(inplace=True)
+        else:
+            self.conv1 = build_conv_layer(
+                self.conv_cfg,
+                in_channels,
+                stem_channels,
+                kernel_size=7,
+                stride=2,
+                padding=3,
+                bias=False)
+            self.norm1_name, norm1 = build_norm_layer(
+                self.norm_cfg, stem_channels, postfix=1)
+            self.add_module(self.norm1_name, norm1)
+            self.relu = nn.ReLU(inplace=True)
+        self.avgpool = nn.AvgPool2d(2)
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            if self.deep_stem:
+                # the deep stem is six separate modules, each frozen in turn
+                self.conv1.eval()
+                self.bn1.eval()
+                self.conv2.eval()
+                self.bn2.eval()
+                self.conv3.eval()
+                self.bn3.eval()
+                for param in self.conv1.parameters():
+                    param.requires_grad = False
+                for param in self.bn1.parameters():
+                    param.requires_grad = False
+                for param in self.conv2.parameters():
+                    param.requires_grad = False
+                for param in self.bn2.parameters():
+                    param.requires_grad = False
+                for param in self.conv3.parameters():
+                    param.requires_grad = False
+                for param in self.bn3.parameters():
+                    param.requires_grad = False
+            else:
+                self.norm1.eval()
+                for m in [self.conv1, self.norm1]:
+                    for param in m.parameters():
+                        param.requires_grad = False
+
+        for i in range(1, self.frozen_stages + 1):
+            m = getattr(self, f'layer{i}')
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def forward(self, x):
+        """Return a tuple of the ``out_indices`` stage outputs for input ``x``."""
+        if self.deep_stem:
+            # stem: three conv -> norm -> relu groups applied back to back
+            x = self.conv1(x)
+            x = self.bn1(x)
+            x = self.relu1(x)
+            x = self.conv2(x)
+            x = self.bn2(x)
+            x = self.relu2(x)
+            x = self.conv3(x)
+            x = self.bn3(x)
+            x = self.relu3(x)
+        else:
+            x = self.conv1(x)
+            x = self.norm1(x)
+            x = self.relu(x)
+        # both stem variants end with 2x2 average pooling (no max pooling)
+        x = self.avgpool(x)
+        outs = []
+
+        for i, layer_name in enumerate(self.res_layers):
+            res_layer = getattr(self, layer_name)
+            x = res_layer(x)
+            if self.training and self.augmented_layer and self.augmented_layer == i+1:
+                feat_size = x.size()
+                mean, std = calc_mean_std(x)
+                x_norm = (x - mean.expand(feat_size)) / std.expand(feat_size)
+                rand_idx = random.randrange(len(self.augmented_mus))
+                mu_t_f1 = self.augmented_mus[rand_idx]
+                std_t_f1 = self.augmented_stds[rand_idx]
+                if self.mixing_style:
+                    mixing_alpha = torch.rand((mean.shape[0],mean.shape[1])).to(x.device).unsqueeze(-1).unsqueeze(-1)
+                else:
+                    mixing_alpha = torch.zeros((mean.shape[0],mean.shape[1])).to(x.device).unsqueeze(-1).unsqueeze(-1)
+                # follow the feature map's device instead of assuming 'cuda'
+                mu_mix = mixing_alpha * mean + (1-mixing_alpha) * mu_t_f1.to(x.device)
+                std_mix = mixing_alpha * std + (1-mixing_alpha) * std_t_f1.to(x.device)
+                x = (std_mix.expand(feat_size) * x_norm + mu_mix.expand(feat_size))
+            if i in self.out_indices:
+                outs.append(x)
+
+        return tuple(outs)
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keeping the frozen
+        stages (and, with ``norm_eval``, the BatchNorm layers) in eval mode."""
+        super(ModifiedResNet, self).train(mode)
+        self._freeze_stages()
+        self.is_training = mode
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval have effect on BatchNorm only
+                if isinstance(m, _BatchNorm):
+                    m.eval()
\ No newline at end of file
diff --git a/mmdet/models/backbones/resnext.py b/mmdet/models/backbones/resnext.py
new file mode 100755
index 0000000..8675d7c
--- /dev/null
+++ b/mmdet/models/backbones/resnext.py
@@ -0,0 +1,154 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+from mmcv.cnn import build_conv_layer, build_norm_layer
+
+from ..builder import BACKBONES
+from ..utils import ResLayer
+from .resnet import Bottleneck as _Bottleneck
+from .resnet import ResNet
+
+
+class Bottleneck(_Bottleneck):
+    expansion = 4  # block output channels are planes * expansion
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 groups=1,
+                 base_width=4,
+                 base_channels=64,
+                 **kwargs):
+        """Bottleneck block for ResNeXt.
+
+        If style is "pytorch", the stride-two layer is the 3x3 conv layer; if
+        it is "caffe", the stride-two layer is the first 1x1 conv layer.
+        """
+        super(Bottleneck, self).__init__(inplanes, planes, **kwargs)
+
+        if groups == 1:
+            width = self.planes  # single group: width equals planes
+        else:
+            width = math.floor(self.planes *
+                               (base_width / base_channels)) * groups
+
+        self.norm1_name, norm1 = build_norm_layer(
+            self.norm_cfg, width, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(
+            self.norm_cfg, width, postfix=2)
+        self.norm3_name, norm3 = build_norm_layer(
+            self.norm_cfg, self.planes * self.expansion, postfix=3)
+
+        self.conv1 = build_conv_layer(
+            self.conv_cfg,
+            self.inplanes,
+            width,
+            kernel_size=1,
+            stride=self.conv1_stride,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+        fallback_on_stride = False
+        self.with_modulated_dcn = False
+        if self.with_dcn:
+            fallback_on_stride = self.dcn.pop('fallback_on_stride', False)
+        if not self.with_dcn or fallback_on_stride:
+            self.conv2 = build_conv_layer(
+                self.conv_cfg,
+                width,
+                width,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=self.dilation,
+                dilation=self.dilation,
+                groups=groups,
+                bias=False)
+        else:
+            assert self.conv_cfg is None, 'conv_cfg must be None for DCN'
+            self.conv2 = build_conv_layer(
+                self.dcn,
+                width,
+                width,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=self.dilation,
+                dilation=self.dilation,
+                groups=groups,
+                bias=False)
+
+        self.add_module(self.norm2_name, norm2)
+        self.conv3 = build_conv_layer(
+            self.conv_cfg,
+            width,
+            self.planes * self.expansion,
+            kernel_size=1,
+            bias=False)
+        self.add_module(self.norm3_name, norm3)
+
+        if self.with_plugins:  # drop existing plugins, rebuild for ``width``
+            self._del_block_plugins(self.after_conv1_plugin_names +
+                                    self.after_conv2_plugin_names +
+                                    self.after_conv3_plugin_names)
+            self.after_conv1_plugin_names = self.make_block_plugins(
+                width, self.after_conv1_plugins)
+            self.after_conv2_plugin_names = self.make_block_plugins(
+                width, self.after_conv2_plugins)
+            self.after_conv3_plugin_names = self.make_block_plugins(
+                self.planes * self.expansion, self.after_conv3_plugins)
+
+    def _del_block_plugins(self, plugin_names):
+        """Delete the given plugin submodules from this block.
+
+        Args:
+            plugin_names (list[str]): Names of the plugins to delete.
+        """
+        assert isinstance(plugin_names, list)
+        for plugin_name in plugin_names:
+            del self._modules[plugin_name]
+
+
+@BACKBONES.register_module()
+class ResNeXt(ResNet):
+    """ResNeXt backbone.
+
+    Args:
+        depth (int): Depth of resnext, from {50, 101, 152}.
+        in_channels (int): Number of input image channels. Default: 3.
+        num_stages (int): Resnet stages. Default: 4.
+        groups (int): Groups value passed to each ``ResLayer``. Default: 1.
+        base_width (int): Base width passed to each ``ResLayer``. Default: 4.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        dilations (Sequence[int]): Dilation of each stage.
+        out_indices (Sequence[int]): Output from which stages.
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
+            not freezing any parameters.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+        zero_init_residual (bool): Whether to use zero init for last norm layer
+            in resblocks to let them behave as identity.
+    """
+
+    arch_settings = {
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3))
+    }
+
+    def __init__(self, groups=1, base_width=4, **kwargs):
+        self.groups = groups  # presumably read during super().__init__()
+        self.base_width = base_width
+        super(ResNeXt, self).__init__(**kwargs)
+
+    def make_res_layer(self, **kwargs):
+        """Pack all blocks in a stage into a ``ResLayer``."""
+        return ResLayer(
+            groups=self.groups,
+            base_width=self.base_width,
+            base_channels=self.base_channels,
+            **kwargs)
diff --git a/mmdet/models/backbones/ssd_vgg.py b/mmdet/models/backbones/ssd_vgg.py
new file mode 100755
index 0000000..c15aeac
--- /dev/null
+++ b/mmdet/models/backbones/ssd_vgg.py
@@ -0,0 +1,128 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch.nn as nn
+from mmcv.cnn import VGG
+from mmcv.runner import BaseModule
+
+from ..builder import BACKBONES
+from ..necks import ssd_neck
+
+
+@BACKBONES.register_module()
+class SSDVGG(VGG, BaseModule):
+    """VGG Backbone network for single-shot-detection.
+
+    Args:
+        depth (int): Depth of vgg, from {11, 13, 16, 19}.
+        with_last_pool (bool): Whether to add a pooling layer at the end
+            of the model.
+        ceil_mode (bool): When True, will use `ceil` instead of `floor`
+            to compute the output shape.
+        out_indices (Sequence[int]): Output from which stages.
+        out_feature_indices (Sequence[int]): Output from which feature map.
+        pretrained (str, optional): Path to a pretrained model. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+        input_size (int, optional): Deprecated argument.
+            Width and height of input, from {300, 512}.
+        l2_norm_scale (float, optional) : Deprecated argument.
+            L2 normalization layer init scale.
+
+    Example:
+        >>> self = SSDVGG(input_size=300, depth=11)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 300, 300)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 1024, 19, 19)
+        (1, 512, 10, 10)
+        (1, 256, 5, 5)
+        (1, 256, 3, 3)
+        (1, 256, 1, 1)
+    """
+    extra_setting = {  # not read by any method of this class
+        300: (256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256),
+        512: (256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256, 128),
+    }
+
+    def __init__(self,
+                 depth,
+                 with_last_pool=False,
+                 ceil_mode=True,
+                 out_indices=(3, 4),
+                 out_feature_indices=(22, 34),
+                 pretrained=None,
+                 init_cfg=None,
+                 input_size=None,
+                 l2_norm_scale=None):
+        # TODO: in_channels for mmcv.VGG
+        super(SSDVGG, self).__init__(
+            depth,
+            with_last_pool=with_last_pool,
+            ceil_mode=ceil_mode,
+            out_indices=out_indices)
+
+        self.features.add_module(
+            str(len(self.features)),
+            nn.MaxPool2d(kernel_size=3, stride=1, padding=1))
+        self.features.add_module(
+            str(len(self.features)),
+            nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6))
+        self.features.add_module(
+            str(len(self.features)), nn.ReLU(inplace=True))
+        self.features.add_module(
+            str(len(self.features)), nn.Conv2d(1024, 1024, kernel_size=1))
+        self.features.add_module(
+            str(len(self.features)), nn.ReLU(inplace=True))
+        self.out_feature_indices = out_feature_indices
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+
+        if init_cfg is not None:
+            self.init_cfg = init_cfg
+        elif isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            self.init_cfg = [
+                dict(type='Kaiming', layer='Conv2d'),
+                dict(type='Constant', val=1, layer='BatchNorm2d'),
+                dict(type='Normal', std=0.01, layer='Linear'),
+            ]
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        if input_size is not None:
+            warnings.warn('DeprecationWarning: input_size is deprecated')
+        if l2_norm_scale is not None:
+            warnings.warn('DeprecationWarning: l2_norm_scale in VGG is '
+                          'deprecated, it has been moved to SSDNeck.')
+
+    def init_weights(self, pretrained=None):  # ``pretrained`` is unused
+        super(VGG, self).init_weights()  # skips VGG.init_weights in the MRO
+
+    def forward(self, x):
+        """Forward function."""
+        outs = []
+        for i, layer in enumerate(self.features):
+            x = layer(x)
+            if i in self.out_feature_indices:
+                outs.append(x)
+
+        if len(outs) == 1:
+            return outs[0]
+        else:
+            return tuple(outs)
+
+
+class L2Norm(ssd_neck.L2Norm):
+    """Deprecated shim around ``ssd_neck.L2Norm``; warns on construction."""
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        msg = ('DeprecationWarning: L2Norm in ssd_vgg.py is deprecated, '
+               'please use L2Norm in mmdet/models/necks/ssd_neck.py instead')
+        warnings.warn(msg)
diff --git a/mmdet/models/backbones/swin.py b/mmdet/models/backbones/swin.py
new file mode 100755
index 0000000..b8eccfc
--- /dev/null
+++ b/mmdet/models/backbones/swin.py
@@ -0,0 +1,772 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from collections import OrderedDict
+from copy import deepcopy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_norm_layer, constant_init, trunc_normal_init
+from mmcv.cnn.bricks.transformer import FFN, build_dropout
+from mmcv.cnn.utils.weight_init import trunc_normal_
+from mmcv.runner import BaseModule, ModuleList, _load_checkpoint
+from mmcv.utils import to_2tuple
+
+from ...utils import get_root_logger
+from ..builder import BACKBONES
+from ..utils.ckpt_convert import swin_converter
+from ..utils.transformer import PatchEmbed, PatchMerging
+
+
+class WindowMSA(BaseModule):
+    """Window based multi-head self-attention (W-MSA) module with relative
+    position bias.
+
+    Args:
+        embed_dims (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (tuple[int]): The height and width of the window.
+        qkv_bias (bool, optional):  If True, add a learnable bias to q, k, v.
+            Default: True.
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Default: None.
+        attn_drop_rate (float, optional): Dropout ratio of attention weight.
+            Default: 0.0
+        proj_drop_rate (float, optional): Dropout ratio of output. Default: 0.
+        init_cfg (dict | None, optional): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 window_size,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 attn_drop_rate=0.,
+                 proj_drop_rate=0.,
+                 init_cfg=None):
+
+        super().__init__()
+        self.embed_dims = embed_dims
+        self.window_size = window_size  # Wh, Ww
+        self.num_heads = num_heads
+        head_embed_dims = embed_dims // num_heads
+        self.scale = qk_scale or head_embed_dims**-0.5
+        self.init_cfg = init_cfg
+
+        # define a parameter table of relative position bias
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
+                        num_heads))  # (2*Wh-1) * (2*Ww-1), nH
+
+        # About 2x faster than original impl
+        Wh, Ww = self.window_size
+        rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww)
+        rel_position_index = rel_index_coords + rel_index_coords.T
+        rel_position_index = rel_position_index.flip(1).contiguous()
+        self.register_buffer('relative_position_index', rel_position_index)
+
+        self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop_rate)
+        self.proj = nn.Linear(embed_dims, embed_dims)
+        self.proj_drop = nn.Dropout(proj_drop_rate)
+
+        self.softmax = nn.Softmax(dim=-1)
+
+    def init_weights(self):
+        trunc_normal_(self.relative_position_bias_table, std=0.02)
+
+    def forward(self, x, mask=None):
+        """Compute window self-attention with relative position bias.
+
+        Args:
+            x (tensor): input features with shape of (num_windows*B, N, C)
+            mask (tensor | None, Optional): mask with shape of (num_windows,
+                Wh*Ww, Wh*Ww), value should be between (-inf, 0].
+        """
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
+                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
+        # make torchscript happy (cannot use tensor as tuple)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))
+
+        relative_position_bias = self.relative_position_bias_table[
+            self.relative_position_index.view(-1)].view(
+                self.window_size[0] * self.window_size[1],
+                self.window_size[0] * self.window_size[1],
+                -1)  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.permute(
+            2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+        attn = attn + relative_position_bias.unsqueeze(0)
+
+        if mask is not None:
+            nW = mask.shape[0]
+            attn = attn.view(B // nW, nW, self.num_heads, N,
+                             N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+        attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+    @staticmethod
+    def double_step_seq(step1, len1, step2, len2):
+        seq1 = torch.arange(0, step1 * len1, step1)  # len1 multiples of step1
+        seq2 = torch.arange(0, step2 * len2, step2)  # len2 multiples of step2
+        return (seq1[:, None] + seq2[None, :]).reshape(1, -1)  # all pair sums
+
+
+class ShiftWindowMSA(BaseModule):
+    """Shifted Window Multihead Self-Attention Module.
+
+    Args:
+        embed_dims (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (int): The height and width of the window.
+        shift_size (int, optional): The shift step of each window towards
+            right-bottom. If zero, act as regular window-msa. Defaults to 0.
+        qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
+            Default: True
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Defaults: None.
+        attn_drop_rate (float, optional): Dropout ratio of attention weight.
+            Defaults: 0.
+        proj_drop_rate (float, optional): Dropout ratio of output.
+            Defaults: 0.
+        dropout_layer (dict, optional): The dropout_layer used before output.
+            Defaults: dict(type='DropPath', drop_prob=0.).
+        init_cfg (dict, optional): The extra config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 window_size,
+                 shift_size=0,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 attn_drop_rate=0,
+                 proj_drop_rate=0,
+                 dropout_layer=dict(type='DropPath', drop_prob=0.),
+                 init_cfg=None):
+        super().__init__(init_cfg)
+
+        self.window_size = window_size
+        self.shift_size = shift_size
+        assert 0 <= self.shift_size < self.window_size
+
+        self.w_msa = WindowMSA(
+            embed_dims=embed_dims,
+            num_heads=num_heads,
+            window_size=to_2tuple(window_size),
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop_rate=attn_drop_rate,
+            proj_drop_rate=proj_drop_rate,
+            init_cfg=None)
+
+        self.drop = build_dropout(dropout_layer)
+
+    def forward(self, query, hw_shape):
+        """Run (shifted) window attention on ``query`` of shape (B, H*W, C).
+
+        ``hw_shape`` is (H, W); the output has the same shape as ``query``."""
+        B, L, C = query.shape
+        H, W = hw_shape
+        assert L == H * W, 'input feature has wrong size'
+        query = query.view(B, H, W, C)
+
+        # pad feature maps to multiples of window size
+        pad_r = (self.window_size - W % self.window_size) % self.window_size
+        pad_b = (self.window_size - H % self.window_size) % self.window_size
+        query = F.pad(query, (0, 0, 0, pad_r, 0, pad_b))
+        H_pad, W_pad = query.shape[1], query.shape[2]
+
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_query = torch.roll(
+                query,
+                shifts=(-self.shift_size, -self.shift_size),
+                dims=(1, 2))
+
+            # calculate attention mask for SW-MSA
+            img_mask = torch.zeros((1, H_pad, W_pad, 1), device=query.device)
+            band_slices = (slice(0, -self.window_size),
+                           slice(-self.window_size, -self.shift_size),
+                           slice(-self.shift_size, None))
+            cnt = 0
+            for h in band_slices:
+                for w in band_slices:
+                    img_mask[:, h, w, :] = cnt
+                    cnt += 1
+
+            # nW, window_size, window_size, 1
+            mask_windows = self.window_partition(img_mask)
+            mask_windows = mask_windows.view(
+                -1, self.window_size * self.window_size)
+            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+            attn_mask = attn_mask.masked_fill(attn_mask != 0,
+                                              float(-100.0)).masked_fill(
+                                                  attn_mask == 0, float(0.0))
+        else:
+            shifted_query = query
+            attn_mask = None
+
+        # nW*B, window_size, window_size, C
+        query_windows = self.window_partition(shifted_query)
+        # nW*B, window_size*window_size, C
+        query_windows = query_windows.view(-1, self.window_size**2, C)
+
+        # W-MSA/SW-MSA (nW*B, window_size*window_size, C)
+        attn_windows = self.w_msa(query_windows, mask=attn_mask)
+
+        # merge windows
+        attn_windows = attn_windows.view(-1, self.window_size,
+                                         self.window_size, C)
+
+        # B H' W' C
+        shifted_x = self.window_reverse(attn_windows, H_pad, W_pad)
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = torch.roll(
+                shifted_x,
+                shifts=(self.shift_size, self.shift_size),
+                dims=(1, 2))
+        else:
+            x = shifted_x
+
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :H, :W, :].contiguous()
+
+        x = x.view(B, H * W, C)
+
+        x = self.drop(x)
+        return x
+
+    def window_reverse(self, windows, H, W):
+        """Merge windows back into a single feature map.
+
+        Args:
+            windows: (num_windows*B, window_size, window_size, C)
+            H, W (int): Height and width of the (padded) feature map.
+        Returns:
+            x: (B, H, W, C)
+        """
+        window_size = self.window_size
+        B = int(windows.shape[0] / (H * W / window_size / window_size))
+        x = windows.view(B, H // window_size, W // window_size, window_size,
+                         window_size, -1)
+        x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+        return x
+
+    def window_partition(self, x):
+        """
+        Args:
+            x: (B, H, W, C)
+        Returns:
+            windows: (num_windows*B, window_size, window_size, C)
+        """
+        B, H, W, C = x.shape
+        window_size = self.window_size
+        x = x.view(B, H // window_size, window_size, W // window_size,
+                   window_size, C)
+        windows = x.permute(0, 1, 3, 2, 4, 5).contiguous()
+        windows = windows.view(-1, window_size, window_size, C)
+        return windows
+
+
+class SwinBlock(BaseModule):
+    """Swin Transformer block: (shifted) window attention, then an FFN.
+
+    Args:
+        embed_dims (int): The feature dimension.
+        num_heads (int): Parallel attention heads.
+        feedforward_channels (int): The hidden dimension for FFNs.
+        window_size (int, optional): The local window scale. Default: 7.
+        shift (bool, optional): whether to shift window or not. Default False.
+        qkv_bias (bool, optional): enable bias for qkv if True. Default: True.
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Default: None.
+        drop_rate (float, optional): Dropout rate. Default: 0.
+        attn_drop_rate (float, optional): Attention dropout rate. Default: 0.
+        drop_path_rate (float, optional): Stochastic depth rate. Default: 0.
+        act_cfg (dict, optional): The config dict of activation function.
+            Default: dict(type='GELU').
+        norm_cfg (dict, optional): The config dict of normalization.
+            Default: dict(type='LN').
+        with_cp (bool, optional): Use checkpoint or not. Using checkpoint
+            will save some memory while slowing down the training speed.
+            Default: False.
+        init_cfg (dict | list | None, optional): Init config. Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 feedforward_channels,
+                 window_size=7,
+                 shift=False,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='LN'),
+                 with_cp=False,
+                 init_cfg=None):
+
+        super(SwinBlock, self).__init__()
+
+        self.init_cfg = init_cfg
+        self.with_cp = with_cp
+
+        self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
+        self.attn = ShiftWindowMSA(
+            embed_dims=embed_dims,
+            num_heads=num_heads,
+            window_size=window_size,
+            shift_size=window_size // 2 if shift else 0,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop_rate=attn_drop_rate,
+            proj_drop_rate=drop_rate,
+            dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
+            init_cfg=None)
+
+        self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
+        self.ffn = FFN(
+            embed_dims=embed_dims,
+            feedforward_channels=feedforward_channels,
+            num_fcs=2,
+            ffn_drop=drop_rate,
+            dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
+            act_cfg=act_cfg,
+            add_identity=True,
+            init_cfg=None)
+
+    def forward(self, x, hw_shape):
+        """Apply norm1 -> attention -> residual add, then norm2 -> FFN."""
+        def _inner_forward(x):
+            identity = x
+            x = self.norm1(x)
+            x = self.attn(x, hw_shape)
+
+            x = x + identity
+
+            identity = x
+            x = self.norm2(x)
+            x = self.ffn(x, identity=identity)
+
+            return x
+
+        if self.with_cp and x.requires_grad:
+            x = cp.checkpoint(_inner_forward, x)
+        else:
+            x = _inner_forward(x)
+
+        return x
+
+
+class SwinBlockSequence(BaseModule):
+    """Implements one stage in Swin Transformer.
+
+    Args:
+        embed_dims (int): The feature dimension.
+        num_heads (int): Parallel attention heads.
+        feedforward_channels (int): The hidden dimension for FFNs.
+        depth (int): The number of blocks in this stage.
+        window_size (int, optional): The local window scale. Default: 7.
+        qkv_bias (bool, optional): enable bias for qkv if True. Default: True.
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Default: None.
+        drop_rate (float, optional): Dropout rate. Default: 0.
+        attn_drop_rate (float, optional): Attention dropout rate. Default: 0.
+        drop_path_rate (float | list[float], optional): Stochastic depth
+            rate. Default: 0.
+        downsample (BaseModule | None, optional): The downsample operation
+            module. Default: None.
+        act_cfg (dict, optional): The config dict of activation function.
+            Default: dict(type='GELU').
+        norm_cfg (dict, optional): The config dict of normalization.
+            Default: dict(type='LN').
+        with_cp (bool, optional): Use checkpoint or not. Using checkpoint
+            will save some memory while slowing down the training speed.
+            Default: False.
+        init_cfg (dict | list | None, optional): The init config.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 feedforward_channels,
+                 depth,
+                 window_size=7,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 downsample=None,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='LN'),
+                 with_cp=False,
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+
+        if isinstance(drop_path_rate, list):
+            drop_path_rates = drop_path_rate
+            assert len(drop_path_rates) == depth
+        else:
+            drop_path_rates = [deepcopy(drop_path_rate) for _ in range(depth)]
+
+        self.blocks = ModuleList()
+        for i in range(depth):
+            block = SwinBlock(
+                embed_dims=embed_dims,
+                num_heads=num_heads,
+                feedforward_channels=feedforward_channels,
+                window_size=window_size,
+                shift=(i % 2 == 1),  # odd-indexed blocks use shifted windows
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop_rate=drop_rate,
+                attn_drop_rate=attn_drop_rate,
+                drop_path_rate=drop_path_rates[i],
+                act_cfg=act_cfg,
+                norm_cfg=norm_cfg,
+                with_cp=with_cp,
+                init_cfg=None)
+            self.blocks.append(block)
+
+        self.downsample = downsample
+
+    def forward(self, x, hw_shape):
+        """Return (next_x, next_hw_shape, stage_x, stage_hw_shape)."""
+        for block in self.blocks:
+            x = block(x, hw_shape)
+
+        if self.downsample is None:
+            return x, hw_shape, x, hw_shape
+        x_down, down_hw_shape = self.downsample(x, hw_shape)
+        return x_down, down_hw_shape, x, hw_shape
+
+
+@BACKBONES.register_module()
+class SwinTransformer(BaseModule):
+    """ Swin Transformer
+    A PyTorch implement of : `Swin Transformer:
+    Hierarchical Vision Transformer using Shifted Windows`  -
+        https://arxiv.org/abs/2103.14030
+
+    Inspiration from
+    https://github.com/microsoft/Swin-Transformer
+
+    Args:
+        pretrain_img_size (int | tuple[int]): The size of input image when
+            pretrain. Defaults: 224.
+        in_channels (int): The num of input channels.
+            Defaults: 3.
+        embed_dims (int): The feature dimension. Default: 96.
+        patch_size (int | tuple[int]): Patch size. Default: 4.
+        window_size (int): Window size. Default: 7.
+        mlp_ratio (int | float): Ratio of mlp hidden dim to embedding dim.
+            Default: 4.
+        depths (tuple[int]): Depths of each Swin Transformer stage.
+            Default: (2, 2, 6, 2).
+        num_heads (tuple[int]): Parallel attention heads of each Swin
+            Transformer stage. Default: (3, 6, 12, 24).
+        strides (tuple[int]): The patch merging or patch embedding stride of
+            each Swin Transformer stage. (In swin, we set kernel size equal to
+            stride.) Default: (4, 2, 2, 2).
+        out_indices (tuple[int]): Output from which stages.
+            Default: (0, 1, 2, 3).
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key,
+            value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Default: None.
+        patch_norm (bool): If add a norm layer for patch embed and patch
+            merging. Default: True.
+        drop_rate (float): Dropout rate. Defaults: 0.
+        attn_drop_rate (float): Attention dropout rate. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Defaults: 0.1.
+        use_abs_pos_embed (bool): If True, add absolute position embedding to
+            the patch embedding. Defaults: False.
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='GELU').
+        norm_cfg (dict): Config dict for normalization layer at
+            output of backone. Defaults: dict(type='LN').
+        with_cp (bool, optional): Use checkpoint or not. Using checkpoint
+            will save some memory while slowing down the training speed.
+            Default: False.
+        pretrained (str, optional): model pretrained path. Default: None.
+        convert_weights (bool): The flag indicates whether the
+            pre-trained model is from the original repo. We may need
+            to convert some keys to make it compatible.
+            Default: False.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            Default: -1 (-1 means not freezing any parameters).
+        init_cfg (dict, optional): The Config for initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 pretrain_img_size=224,
+                 in_channels=3,
+                 embed_dims=96,
+                 patch_size=4,
+                 window_size=7,
+                 mlp_ratio=4,
+                 depths=(2, 2, 6, 2),
+                 num_heads=(3, 6, 12, 24),
+                 strides=(4, 2, 2, 2),
+                 out_indices=(0, 1, 2, 3),
+                 qkv_bias=True,
+                 qk_scale=None,
+                 patch_norm=True,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.1,
+                 use_abs_pos_embed=False,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='LN'),
+                 with_cp=False,
+                 pretrained=None,
+                 convert_weights=False,
+                 frozen_stages=-1,
+                 init_cfg=None):
+        """Build the patch embedding, the Swin stages and the output norms."""
+        self.convert_weights = convert_weights
+        self.frozen_stages = frozen_stages
+        if isinstance(pretrain_img_size, int):
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+        elif isinstance(pretrain_img_size, tuple):
+            if len(pretrain_img_size) == 1:
+                pretrain_img_size = to_2tuple(pretrain_img_size[0])
+            assert len(pretrain_img_size) == 2, \
+                f'The size of image should have length 1 or 2, ' \
+                f'but got {len(pretrain_img_size)}'
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if isinstance(pretrained, str):
+            from mmcv.utils import ConfigDict
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            init_cfg = ConfigDict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is not None:
+            raise TypeError('pretrained must be a str or None')
+        # Pass the resolved config on: BaseModule.__init__ sets self.init_cfg.
+        super(SwinTransformer, self).__init__(init_cfg=init_cfg)
+
+        num_layers = len(depths)
+        self.out_indices = out_indices
+        self.use_abs_pos_embed = use_abs_pos_embed
+
+        assert strides[0] == patch_size, 'Use non-overlapping patch embed.'
+
+        self.patch_embed = PatchEmbed(
+            in_channels=in_channels,
+            embed_dims=embed_dims,
+            conv_type='Conv2d',
+            kernel_size=patch_size,
+            stride=strides[0],
+            norm_cfg=norm_cfg if patch_norm else None,
+            init_cfg=None)
+
+        if self.use_abs_pos_embed:
+            patch_row = pretrain_img_size[0] // patch_size
+            patch_col = pretrain_img_size[1] // patch_size
+            self.absolute_pos_embed = nn.Parameter(
+                torch.zeros((1, embed_dims, patch_row, patch_col)))
+
+        self.drop_after_pos = nn.Dropout(p=drop_rate)
+
+        # set stochastic depth decay rule
+        total_depth = sum(depths)
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, total_depth)
+        ]
+
+        self.stages = ModuleList()
+        in_channels = embed_dims
+        for i in range(num_layers):
+            if i < num_layers - 1:
+                downsample = PatchMerging(
+                    in_channels=in_channels,
+                    out_channels=2 * in_channels,
+                    stride=strides[i + 1],
+                    norm_cfg=norm_cfg if patch_norm else None,
+                    init_cfg=None)
+            else:
+                downsample = None
+
+            stage = SwinBlockSequence(
+                embed_dims=in_channels,
+                num_heads=num_heads[i],
+                feedforward_channels=int(mlp_ratio * in_channels),
+                depth=depths[i],
+                window_size=window_size,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop_rate=drop_rate,
+                attn_drop_rate=attn_drop_rate,
+                drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])],
+                downsample=downsample,
+                act_cfg=act_cfg,
+                norm_cfg=norm_cfg,
+                with_cp=with_cp,
+                init_cfg=None)
+            self.stages.append(stage)
+            if downsample:
+                in_channels = downsample.out_channels
+
+        self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)]
+        # Add a norm layer for each output
+        for i in out_indices:
+            layer = build_norm_layer(norm_cfg, self.num_features[i])[1]
+            layer_name = f'norm{i}'
+            self.add_module(layer_name, layer)
+
+    def train(self, mode=True):
+        """Set train/eval mode, keep frozen layers frozen and return self."""
+        super(SwinTransformer, self).train(mode)
+        self._freeze_stages()
+        return self
+    def _freeze_stages(self):
+        """Put the frozen sub-modules in eval mode and stop their gradients.
+
+        ``frozen_stages >= 0`` freezes the patch embedding; stages
+        ``0 .. frozen_stages - 1`` and their output norms are frozen too.
+        """
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for weight in self.patch_embed.parameters():
+                weight.requires_grad = False
+            if self.use_abs_pos_embed:
+                self.absolute_pos_embed.requires_grad = False
+            self.drop_after_pos.eval()
+        for idx in range(self.frozen_stages):
+            frozen = [self.stages[idx]]
+            if idx in self.out_indices:
+                frozen.insert(0, getattr(self, f'norm{idx}'))
+            for module in frozen:
+                module.eval()
+                for weight in module.parameters():
+                    weight.requires_grad = False
+
+    def init_weights(self):
+        """Initialize from scratch or load the checkpoint in ``init_cfg``."""
+        logger = get_root_logger()
+        if self.init_cfg is None:
+            logger.warning(f'No pre-trained weights for '
+                           f'{self.__class__.__name__}, '
+                           f'training start from scratch')
+            if self.use_abs_pos_embed:
+                trunc_normal_(self.absolute_pos_embed, std=0.02)
+            for m in self.modules():
+                if isinstance(m, nn.Linear):
+                    trunc_normal_init(m, std=.02, bias=0.)
+                elif isinstance(m, nn.LayerNorm):
+                    constant_init(m, 1.0)
+        else:
+            assert 'checkpoint' in self.init_cfg, \
+                f'Only support specify `Pretrained` in `init_cfg` in ' \
+                f'{self.__class__.__name__} '
+            # Key access works for plain dicts as well as ConfigDict objects.
+            ckpt = _load_checkpoint(
+                self.init_cfg['checkpoint'], logger=logger, map_location='cpu')
+            if 'state_dict' in ckpt:
+                _state_dict = ckpt['state_dict']
+            elif 'model' in ckpt:
+                _state_dict = ckpt['model']
+            else:
+                _state_dict = ckpt
+            if self.convert_weights:
+                # supported loading weight from original repo,
+                _state_dict = swin_converter(_state_dict)
+            # keep only the backbone weights and drop their prefix
+            state_dict = OrderedDict(
+                (k[9:], v) for k, v in _state_dict.items()
+                if k.startswith('backbone.'))
+            # strip prefix of state_dict
+            if not state_dict:
+                logger.warning('No "backbone." keys found in the checkpoint')
+            elif next(iter(state_dict)).startswith('module.'):
+                state_dict = {k[7:]: v for k, v in state_dict.items()}
+
+            # reshape absolute position embedding (only if the model has one)
+            absolute_pos_embed = state_dict.get('absolute_pos_embed')
+            if self.use_abs_pos_embed and absolute_pos_embed is not None:
+                N1, L, C1 = absolute_pos_embed.size()
+                N2, C2, H, W = self.absolute_pos_embed.size()
+                if N1 != N2 or C1 != C2 or L != H * W:
+                    logger.warning('Error in loading absolute_pos_embed, pass')
+                else:
+                    state_dict['absolute_pos_embed'] = absolute_pos_embed.view(
+                        N2, H, W, C2).permute(0, 3, 1, 2).contiguous()
+
+            # interpolate position bias table if needed
+            current_state = self.state_dict()
+            relative_position_bias_table_keys = [
+                k for k in state_dict if 'relative_position_bias_table' in k
+            ]
+            for table_key in relative_position_bias_table_keys:
+                table_pretrained = state_dict[table_key]
+                table_current = current_state[table_key]
+                L1, nH1 = table_pretrained.size()
+                L2, nH2 = table_current.size()
+                if nH1 != nH2:
+                    logger.warning(f'Error in loading {table_key}, pass')
+                elif L1 != L2:
+                    S1 = int(L1**0.5)
+                    S2 = int(L2**0.5)
+                    table_pretrained_resized = F.interpolate(
+                        table_pretrained.permute(1, 0).reshape(1, nH1, S1, S1),
+                        size=(S2, S2),
+                        mode='bicubic')
+                    state_dict[table_key] = table_pretrained_resized.view(
+                        nH2, L2).permute(1, 0).contiguous()
+            # load state_dict
+            self.load_state_dict(state_dict, False)
+
+    def forward(self, x):
+        """Return one normalized (N, C, H, W) map per stage in out_indices."""
+        x, hw_shape = self.patch_embed(x)
+
+        if self.use_abs_pos_embed:
+            # The embedding is (1, C, H, W): the grid size is dims 2 and 3.
+            h, w = self.absolute_pos_embed.shape[2:4]
+            if hw_shape[0] != h or hw_shape[1] != w:
+                absolute_pos_embed = F.interpolate(
+                    self.absolute_pos_embed,
+                    size=hw_shape,
+                    mode='bicubic',
+                    align_corners=False).flatten(2).transpose(1, 2)
+            else:
+                absolute_pos_embed = self.absolute_pos_embed.flatten(
+                    2).transpose(1, 2)
+            x = x + absolute_pos_embed
+        x = self.drop_after_pos(x)
+
+        outs = []
+        for i, stage in enumerate(self.stages):
+            x, hw_shape, out, out_hw_shape = stage(x, hw_shape)
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                out = norm_layer(out)
+                out = out.view(-1, *out_hw_shape, self.num_features[i])
+                outs.append(out.permute(0, 3, 1, 2).contiguous())
+
+        return outs
diff --git a/mmdet/models/backbones/trident_resnet.py b/mmdet/models/backbones/trident_resnet.py
new file mode 100755
index 0000000..013ba64
--- /dev/null
+++ b/mmdet/models/backbones/trident_resnet.py
@@ -0,0 +1,298 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmcv.runner import BaseModule
+from torch.nn.modules.utils import _pair
+
+from mmdet.models.backbones.resnet import Bottleneck, ResNet
+from mmdet.models.builder import BACKBONES
+
+
+class TridentConv(BaseModule):
+    """Weight-shared convolution evaluated once per trident branch.
+
+    All branches use the same ``weight``/``bias``; they differ only in the
+    dilation (and matching padding) passed to ``F.conv2d``.
+
+    Args:
+        in_channels (int): Number of channels in input.
+        out_channels (int): Number of channels in output.
+        kernel_size (int): Size of convolution kernel.
+        stride (int, optional): Convolution stride. Default: 1.
+        trident_dilations (tuple[int, int, int], optional): Dilation of
+            each branch, also used as its padding. Default: (1, 2, 3).
+        test_branch_idx (int, optional): Outside training, all branches
+            are run if `test_branch_idx==-1`, otherwise only the branch
+            with index `test_branch_idx` is run. Default: 1.
+        bias (bool, optional): Whether to use bias in convolution or not.
+            Default: False.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 trident_dilations=(1, 2, 3),
+                 test_branch_idx=1,
+                 bias=False,
+                 init_cfg=None):
+        super(TridentConv, self).__init__(init_cfg)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride)
+        self.dilations = trident_dilations
+        self.paddings = _pair(trident_dilations)
+        self.num_branch = len(trident_dilations)
+        self.test_branch_idx = test_branch_idx
+        self.with_bias = bias
+
+        # torch.Tensor(*sizes) only allocates; no values are assigned here.
+        weight_shape = (out_channels, in_channels, *self.kernel_size)
+        self.weight = nn.Parameter(torch.Tensor(*weight_shape))
+        self.bias = nn.Parameter(torch.Tensor(out_channels)) if bias else None
+
+    def extra_repr(self):
+        """Return the layer settings as comma-separated ``name=value``."""
+        shown = ('in_channels', 'out_channels', 'kernel_size', 'num_branch',
+                 'test_branch_idx', 'stride', 'paddings', 'dilations', 'bias')
+        return ', '.join(f'{name}={getattr(self, name)}' for name in shown)
+
+    def forward(self, inputs):
+        """Convolve the branch inputs with the shared weight.
+
+        Args:
+            inputs (list[Tensor]): One tensor per branch, or exactly one
+                tensor when only branch `test_branch_idx` is evaluated.
+
+        Returns:
+            list[Tensor]: One convolution output per processed input.
+        """
+        if not self.training and self.test_branch_idx != -1:
+            assert len(inputs) == 1
+            idx = self.test_branch_idx
+            return [
+                F.conv2d(inputs[0], self.weight, self.bias, self.stride,
+                         self.paddings[idx], self.dilations[idx])
+            ]
+        # zip() pairs each input with the dilation/padding of its branch.
+        return [
+            F.conv2d(feat, self.weight, self.bias, self.stride, pad, dil)
+            for feat, dil, pad in zip(inputs, self.dilations, self.paddings)
+        ]
+
+
+# Since TridentNet is defined over ResNet50 and ResNet101, here we
+# only support TridentBottleneckBlock.
+class TridentBottleneck(Bottleneck):
+    """BottleBlock for TridentResNet.
+
+    Args:
+        trident_dilations (tuple[int, int, int]): Dilations of different
+            trident branch.
+        test_branch_idx (int): In inference, all 3 branches will be used
+            if `test_branch_idx==-1`, otherwise only branch with index
+            `test_branch_idx` will be used.
+        concat_output (bool): Whether to concat the output list to a Tensor.
+            `True` only in the last Block.
+    """
+
+    def __init__(self, trident_dilations, test_branch_idx, concat_output,
+                 **kwargs):
+        # All other keyword arguments are forwarded to Bottleneck.
+        super(TridentBottleneck, self).__init__(**kwargs)
+        self.trident_dilations = trident_dilations
+        self.num_branch = len(trident_dilations)
+        self.concat_output = concat_output
+        self.test_branch_idx = test_branch_idx
+        self.conv2 = TridentConv(
+            self.planes,
+            self.planes,
+            kernel_size=3,
+            stride=self.conv2_stride,
+            bias=False,
+            trident_dilations=self.trident_dilations,
+            test_branch_idx=test_branch_idx,
+            init_cfg=dict(
+                type='Kaiming',
+                distribution='uniform',
+                mode='fan_in',
+                override=dict(name='conv2')))
+
+    def forward(self, x):
+        """Apply the block per branch; concat on dim 0 if concat_output."""
+        def _inner_forward(x):
+            num_branch = (
+                self.num_branch
+                if self.training or self.test_branch_idx == -1 else 1)
+            identity = x
+            if not isinstance(x, list):
+                x = (x, ) * num_branch
+                identity = x
+                if self.downsample is not None:
+                    identity = [self.downsample(b) for b in x]
+
+            out = [self.conv1(b) for b in x]
+            out = [self.norm1(b) for b in out]
+            out = [self.relu(b) for b in out]
+
+            if self.with_plugins:
+                for k in range(len(out)):
+                    out[k] = self.forward_plugin(out[k],
+                                                 self.after_conv1_plugin_names)
+
+            out = self.conv2(out)
+            out = [self.norm2(b) for b in out]
+            out = [self.relu(b) for b in out]
+            if self.with_plugins:
+                for k in range(len(out)):
+                    out[k] = self.forward_plugin(out[k],
+                                                 self.after_conv2_plugin_names)
+
+            out = [self.conv3(b) for b in out]
+            out = [self.norm3(b) for b in out]
+
+            if self.with_plugins:
+                for k in range(len(out)):
+                    out[k] = self.forward_plugin(out[k],
+                                                 self.after_conv3_plugin_names)
+
+            out = [
+                out_b + identity_b for out_b, identity_b in zip(out, identity)
+            ]
+            return out
+        # A list input (from a previous trident block) has no
+        # `requires_grad`, so only a Tensor input is checkpointed.
+        if self.with_cp and not isinstance(x, list) and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+        out = [self.relu(b) for b in out]
+        if self.concat_output:
+            out = torch.cat(out, dim=0)
+        return out
+
+
+def make_trident_res_layer(block,
+                           inplanes,
+                           planes,
+                           num_blocks,
+                           stride=1,
+                           trident_dilations=(1, 2, 3),
+                           style='pytorch',
+                           with_cp=False,
+                           conv_cfg=None,
+                           norm_cfg=dict(type='BN'),
+                           dcn=None,
+                           plugins=None,
+                           test_branch_idx=-1):
+    """Stack ``num_blocks`` trident blocks into an ``nn.Sequential``.
+
+    Only the first block gets ``stride`` and the ``downsample`` module (if
+    one is needed); only the last block is built with ``concat_output=True``.
+    """
+    out_planes = planes * block.expansion
+    downsample = None
+    if stride != 1 or inplanes != out_planes:
+        # 1x1 conv + norm so the shortcut matches the block output
+        downsample = nn.Sequential(
+            build_conv_layer(
+                conv_cfg, inplanes, out_planes, kernel_size=1,
+                stride=stride, bias=False),
+            build_norm_layer(norm_cfg, out_planes)[1])
+
+    shared_kwargs = dict(
+        planes=planes,
+        trident_dilations=trident_dilations,
+        style=style,
+        with_cp=with_cp,
+        conv_cfg=conv_cfg,
+        norm_cfg=norm_cfg,
+        dcn=dcn,
+        plugins=plugins,
+        test_branch_idx=test_branch_idx)
+
+    layers = []
+    for idx in range(num_blocks):
+        is_first = idx == 0
+        layers.append(
+            block(
+                inplanes=inplanes if is_first else out_planes,
+                stride=stride if is_first else 1,
+                downsample=downsample if is_first else None,
+                concat_output=idx == num_blocks - 1,
+                **shared_kwargs))
+    return nn.Sequential(*layers)
+
+
+@BACKBONES.register_module()
+class TridentResNet(ResNet):
+    r"""The stem layer, stage 1 and stage 2 in Trident ResNet are identical to
+    ResNet, while in stage 3, Trident BottleBlock is utilized to replace the
+    normal BottleBlock to yield trident output. Different branch shares the
+    convolution weight but uses different dilations to achieve multi-scale
+    output.
+
+                               / stage3(b0) \
+    x - stem - stage1 - stage2 - stage3(b1) - output
+                               \ stage3(b2) /
+
+    Args:
+        depth (int): Depth of resnet, from {50, 101, 152}.
+        num_branch (int): Number of branches in TridentNet.
+        test_branch_idx (int): In inference, all 3 branches will be used
+            if `test_branch_idx==-1`, otherwise only branch with index
+            `test_branch_idx` will be used.
+        trident_dilations (tuple[int]): Dilations of different trident branch.
+            len(trident_dilations) should be equal to num_branch.
+    """  # noqa
+
+    def __init__(self, depth, num_branch, test_branch_idx, trident_dilations,
+                 **kwargs):
+
+        assert num_branch == len(trident_dilations)
+        assert depth in (50, 101, 152)
+        super(TridentResNet, self).__init__(depth, **kwargs)
+        assert self.num_stages == 3
+        self.test_branch_idx = test_branch_idx
+        self.num_branch = num_branch
+        # Rebuild the last stage with trident blocks and swap it in.
+        last_stage_idx = self.num_stages - 1
+        stride = self.strides[last_stage_idx]
+        dilation = trident_dilations
+        dcn = self.dcn if self.stage_with_dcn[last_stage_idx] else None
+        if self.plugins is not None:
+            stage_plugins = self.make_stage_plugins(self.plugins,
+                                                    last_stage_idx)
+        else:
+            stage_plugins = None
+        planes = self.base_channels * 2**last_stage_idx
+        res_layer = make_trident_res_layer(
+            TridentBottleneck,
+            inplanes=(self.block.expansion * self.base_channels *
+                      2**(last_stage_idx - 1)),
+            planes=planes,
+            num_blocks=self.stage_blocks[last_stage_idx],
+            stride=stride,
+            trident_dilations=dilation,
+            style=self.style,
+            with_cp=self.with_cp,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            dcn=dcn,
+            plugins=stage_plugins,
+            test_branch_idx=self.test_branch_idx)
+
+        layer_name = f'layer{last_stage_idx + 1}'
+
+        setattr(self, layer_name, res_layer)
+        self.res_layers.pop(last_stage_idx)
+        self.res_layers.insert(last_stage_idx, layer_name)
+
+        self._freeze_stages()
diff --git a/mmdet/models/builder.py b/mmdet/models/builder.py
new file mode 100755
index 0000000..ace6209
--- /dev/null
+++ b/mmdet/models/builder.py
@@ -0,0 +1,59 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+from mmcv.cnn import MODELS as MMCV_MODELS
+from mmcv.utils import Registry
+
+MODELS = Registry('models', parent=MMCV_MODELS)
+# All component kinds below are aliases of the single MODELS registry.
+BACKBONES = MODELS
+NECKS = MODELS
+ROI_EXTRACTORS = MODELS
+SHARED_HEADS = MODELS
+HEADS = MODELS
+LOSSES = MODELS
+DETECTORS = MODELS
+
+
+def build_backbone(cfg):
+    """Build a backbone from ``cfg`` via the ``BACKBONES`` registry."""
+    return BACKBONES.build(cfg)
+
+
+def build_neck(cfg):
+    """Build a neck from ``cfg`` via the ``NECKS`` registry."""
+    return NECKS.build(cfg)
+
+
+def build_roi_extractor(cfg):
+    """Build a RoI extractor from ``cfg`` via ``ROI_EXTRACTORS``."""
+    return ROI_EXTRACTORS.build(cfg)
+
+
+def build_shared_head(cfg):
+    """Build a shared head from ``cfg`` via ``SHARED_HEADS``."""
+    return SHARED_HEADS.build(cfg)
+
+
+def build_head(cfg):
+    """Build a head from ``cfg`` via the ``HEADS`` registry."""
+    return HEADS.build(cfg)
+
+
+def build_loss(cfg):
+    """Build a loss from ``cfg`` via the ``LOSSES`` registry."""
+    return LOSSES.build(cfg)
+
+
+def build_detector(cfg, train_cfg=None, test_cfg=None):
+    """Build a detector; outer ``train_cfg``/``test_cfg`` are deprecated."""
+    outer_cfgs = dict(train_cfg=train_cfg, test_cfg=test_cfg)
+    if any(value is not None for value in outer_cfgs.values()):
+        warnings.warn(
+            'train_cfg and test_cfg is deprecated, '
+            'please specify them in model', UserWarning)
+    assert cfg.get('train_cfg') is None or train_cfg is None, \
+        'train_cfg specified in both outer field and model field '
+    assert cfg.get('test_cfg') is None or test_cfg is None, \
+        'test_cfg specified in both outer field and model field '
+    return DETECTORS.build(cfg, default_args=outer_cfgs)
diff --git a/mmdet/models/dense_heads/__init__.py b/mmdet/models/dense_heads/__init__.py
new file mode 100755
index 0000000..9c60ae1
--- /dev/null
+++ b/mmdet/models/dense_heads/__init__.py
@@ -0,0 +1,62 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .anchor_free_head import AnchorFreeHead
+from .anchor_head import AnchorHead
+from .ascend_anchor_head import AscendAnchorHead
+from .ascend_retina_head import AscendRetinaHead
+from .ascend_ssd_head import AscendSSDHead
+from .atss_head import ATSSHead
+from .autoassign_head import AutoAssignHead
+from .cascade_rpn_head import CascadeRPNHead, StageCascadeRPNHead
+from .centernet_head import CenterNetHead
+from .centripetal_head import CentripetalHead
+from .corner_head import CornerHead
+from .ddod_head import DDODHead
+from .deformable_detr_head import DeformableDETRHead
+from .detr_head import DETRHead
+from .embedding_rpn_head import EmbeddingRPNHead
+from .fcos_head import FCOSHead
+from .fovea_head import FoveaHead
+from .free_anchor_retina_head import FreeAnchorRetinaHead
+from .fsaf_head import FSAFHead
+from .ga_retina_head import GARetinaHead
+from .ga_rpn_head import GARPNHead
+from .gfl_head import GFLHead
+from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead
+from .lad_head import LADHead
+from .ld_head import LDHead
+from .mask2former_head import Mask2FormerHead
+from .maskformer_head import MaskFormerHead
+from .nasfcos_head import NASFCOSHead
+from .paa_head import PAAHead
+from .pisa_retinanet_head import PISARetinaHead
+from .pisa_ssd_head import PISASSDHead
+from .reppoints_head import RepPointsHead
+from .retina_head import RetinaHead
+from .retina_sepbn_head import RetinaSepBNHead
+from .rpn_head import RPNHead
+from .sabl_retina_head import SABLRetinaHead
+from .solo_head import DecoupledSOLOHead, DecoupledSOLOLightHead, SOLOHead
+from .solov2_head import SOLOV2Head
+from .ssd_head import SSDHead
+from .tood_head import TOODHead
+from .vfnet_head import VFNetHead
+from .yolact_head import YOLACTHead, YOLACTProtonet, YOLACTSegmHead
+from .yolo_head import YOLOV3Head
+from .yolof_head import YOLOFHead
+from .yolox_head import YOLOXHead
+
+__all__ = [  # names exported by ``from ... import *`` on this package
+    'AnchorFreeHead', 'AnchorHead', 'GuidedAnchorHead', 'FeatureAdaption',
+    'RPNHead', 'GARPNHead', 'RetinaHead', 'RetinaSepBNHead', 'GARetinaHead',
+    'SSDHead', 'FCOSHead', 'RepPointsHead', 'FoveaHead',
+    'FreeAnchorRetinaHead', 'ATSSHead', 'FSAFHead', 'NASFCOSHead',
+    'PISARetinaHead', 'PISASSDHead', 'GFLHead', 'CornerHead', 'YOLACTHead',
+    'YOLACTSegmHead', 'YOLACTProtonet', 'YOLOV3Head', 'PAAHead',
+    'SABLRetinaHead', 'CentripetalHead', 'VFNetHead', 'StageCascadeRPNHead',
+    'CascadeRPNHead', 'EmbeddingRPNHead', 'LDHead', 'AutoAssignHead',
+    'DETRHead', 'YOLOFHead', 'DeformableDETRHead', 'SOLOHead',
+    'DecoupledSOLOHead', 'CenterNetHead', 'YOLOXHead',
+    'DecoupledSOLOLightHead', 'LADHead', 'TOODHead', 'MaskFormerHead',
+    'Mask2FormerHead', 'SOLOV2Head', 'DDODHead', 'AscendAnchorHead',
+    'AscendRetinaHead', 'AscendSSDHead'
+]
diff --git a/mmdet/models/dense_heads/anchor_free_head.py b/mmdet/models/dense_heads/anchor_free_head.py
new file mode 100755
index 0000000..b0460b9
--- /dev/null
+++ b/mmdet/models/dense_heads/anchor_free_head.py
@@ -0,0 +1,350 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from abc import abstractmethod
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.runner import force_fp32
+
+from mmdet.core import build_bbox_coder, multi_apply
+from mmdet.core.anchor.point_generator import MlvlPointGenerator
+from ..builder import HEADS, build_loss
+from .base_dense_head import BaseDenseHead
+from .dense_test_mixins import BBoxTestMixin
+
+
+@HEADS.register_module()
+class AnchorFreeHead(BaseDenseHead, BBoxTestMixin):
+    """Anchor-free head (FCOS, Fovea, RepPoints, etc.).
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels. Used in child classes.
+        stacked_convs (int): Number of stacking convs of the head.
+        strides (tuple): Downsample factor of each feature map.
+        dcn_on_last_conv (bool): If true, use dcn in the last layer of
+            towers. Default: False.
+        conv_bias (bool | str): If specified as `auto`, it will be decided by
+            the norm_cfg. Bias of conv will be set as True if `norm_cfg` is
+            None, otherwise False. Default: "auto".
+        loss_cls (dict): Config of classification loss.
+        loss_bbox (dict): Config of localization loss.
+        bbox_coder (dict): Config of bbox coder. Defaults
+            'DistancePointBBoxCoder'.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        train_cfg (dict): Training config of anchor head.
+        test_cfg (dict): Testing config of anchor head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """  # noqa: W605
+
+    _version = 1
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 feat_channels=256,
+                 stacked_convs=4,
+                 strides=(4, 8, 16, 32, 64),
+                 dcn_on_last_conv=False,
+                 conv_bias='auto',
+                 loss_cls=dict(
+                     type='FocalLoss',
+                     use_sigmoid=True,
+                     gamma=2.0,
+                     alpha=0.25,
+                     loss_weight=1.0),
+                 loss_bbox=dict(type='IoULoss', loss_weight=1.0),
+                 bbox_coder=dict(type='DistancePointBBoxCoder'),
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='conv_cls',
+                         std=0.01,
+                         bias_prob=0.01))):
+        super(AnchorFreeHead, self).__init__(init_cfg)
+        self.num_classes = num_classes
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        if self.use_sigmoid_cls:
+            self.cls_out_channels = num_classes
+        else:
+            self.cls_out_channels = num_classes + 1  # +1 for background
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.stacked_convs = stacked_convs
+        self.strides = strides
+        self.dcn_on_last_conv = dcn_on_last_conv
+        assert conv_bias == 'auto' or isinstance(conv_bias, bool)
+        self.conv_bias = conv_bias
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_bbox = build_loss(loss_bbox)
+        self.bbox_coder = build_bbox_coder(bbox_coder)
+
+        self.prior_generator = MlvlPointGenerator(strides)
+
+        # Expose `num_base_priors` like the anchor-based heads do, so both
+        # share one interface; a point is treated like a single anchor.
+        self.num_base_priors = self.prior_generator.num_base_priors[0]
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.fp16_enabled = False
+        # Layers come last: the builders read the attributes set above.
+        self._init_layers()
+
+    def _init_layers(self):
+        """Build the cls tower, the reg tower, then the predictor convs."""
+        for build in (self._init_cls_convs, self._init_reg_convs,
+                      self._init_predictor):
+            build()
+
+    def _init_cls_convs(self):
+        """Stack ``stacked_convs`` 3x3 ConvModules as the cls tower."""
+        tower = self.cls_convs = nn.ModuleList()
+        last_idx = self.stacked_convs - 1
+        in_chn = self.in_channels
+        for idx in range(self.stacked_convs):
+            # Only the final conv may be swapped for DCNv2.
+            use_dcn = self.dcn_on_last_conv and idx == last_idx
+            tower.append(
+                ConvModule(
+                    in_chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=dict(type='DCNv2') if use_dcn else self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.conv_bias))
+            in_chn = self.feat_channels
+
+    def _init_reg_convs(self):
+        """Stack ``stacked_convs`` 3x3 ConvModules as the bbox reg tower."""
+        tower = self.reg_convs = nn.ModuleList()
+        last_idx = self.stacked_convs - 1
+        in_chn = self.in_channels
+        for idx in range(self.stacked_convs):
+            # Only the final conv may be swapped for DCNv2.
+            use_dcn = self.dcn_on_last_conv and idx == last_idx
+            tower.append(
+                ConvModule(
+                    in_chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=dict(type='DCNv2') if use_dcn else self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.conv_bias))
+            in_chn = self.feat_channels
+
+    def _init_predictor(self):
+        """Create the 3x3 convs emitting class scores and 4 box values."""
+        hidden = self.feat_channels
+        self.conv_cls = nn.Conv2d(hidden, self.cls_out_channels, 3, padding=1)
+        self.conv_reg = nn.Conv2d(hidden, 4, 3, padding=1)
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        """Rename legacy predictor keys in place, then defer to the parent.
+
+        Checkpoints whose metadata carries no ``version`` used head-specific
+        predictor names (e.g. ``fcos_cls``); those are mapped to ``conv_cls``,
+        ``conv_reg`` or ``conv_centerness``. Other keys are left untouched.
+
+        Args:
+            state_dict (dict): Checkpoint entries; renamed keys are popped
+                and re-inserted under their new name.
+            prefix (str): Key prefix of this module. Renaming inspects the
+                second dot-separated key component, i.e. it assumes a
+                single-component prefix such as ``bbox_head.``.
+            local_metadata (dict): Metadata of this module; only ``version``
+                is read here.
+            strict, missing_keys, unexpected_keys, error_msgs: Passed on
+                unchanged to the parent implementation.
+        """
+        if local_metadata.get('version', None) is None:
+            suffix_to_name = (('cls', 'conv_cls'), ('reg', 'conv_reg'),
+                              ('centerness', 'conv_centerness'))
+            renames = []
+            # Snapshot the matching keys: ``state_dict`` is mutated below.
+            for old_key in [k for k in state_dict.keys()
+                            if k.startswith(prefix)]:
+                parts = old_key.split('.')
+                for suffix, conv_name in suffix_to_name:
+                    if parts[1].endswith(suffix):
+                        parts[1] = conv_name
+                        renames.append((old_key, '.'.join(parts)))
+                        break
+            for old_key, new_key in renames:
+                state_dict[new_key] = state_dict.pop(old_key)
+        super()._load_from_state_dict(state_dict, prefix, local_metadata,
+                                      strict, missing_keys, unexpected_keys,
+                                      error_msgs)
+
+    def forward(self, feats):
+        """Run ``forward_single`` on every feature level.
+
+        Args:
+            feats (tuple[Tensor]): Multi-level features from the upstream
+                network, one 4D-tensor per level.
+
+        Returns:
+            tuple: The first two per-level outputs of ``forward_single``.
+                cls_scores (list[Tensor]): Per-level box scores, 4D-tensors
+                    with num_points * num_classes channels.
+                bbox_preds (list[Tensor]): Per-level box energies / deltas,
+                    4D-tensors with num_points * 4 channels.
+        """
+        # Extra per-level outputs (e.g. tower features) are dropped here.
+        per_level_outputs = multi_apply(self.forward_single, feats)
+        return per_level_outputs[:2]
+
+    def forward_single(self, x):
+        """Run both conv towers and predictors on one feature level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+
+        Returns:
+            tuple: Class scores, bbox predictions and the tower outputs
+                that fed them (classification features, then regression
+                features); some heads such as FCOS reuse those features.
+        """
+        # Classification branch: tower convs followed by the predictor.
+        cls_feat = x
+        for conv in self.cls_convs:
+            cls_feat = conv(cls_feat)
+        cls_score = self.conv_cls(cls_feat)
+
+        # Regression branch, fed with the same input features.
+        reg_feat = x
+        for conv in self.reg_convs:
+            reg_feat = conv(reg_feat)
+        return cls_score, self.conv_reg(reg_feat), cls_feat, reg_feat
+
+    @abstractmethod
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute loss of the head; declared abstract for subclasses.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level, each
+                a 4D-tensor with num_points * num_classes channels.
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level, each a 4D-tensor with num_points * 4 channels.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (None | list[Tensor]): Bounding boxes that can
+                be ignored when computing the loss.
+
+        Raises:
+            NotImplementedError: Always, in this base implementation.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_targets(self, points, gt_bboxes_list, gt_labels_list):
+        """Compute regression, classification and centerness targets for points
+        in multiple images. Declared abstract: this base version only raises.
+
+        Args:
+            points (list[Tensor]): Points of each fpn level, each has shape
+                (num_points, 2).
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
+                each has shape (num_gt, 4).
+            gt_labels_list (list[Tensor]): Ground truth labels of each box,
+                each has shape (num_gt,).
+        """
+        raise NotImplementedError
+
+    def _get_points_single(self,
+                           featmap_size,
+                           stride,
+                           dtype,
+                           device,
+                           flatten=False):
+        """Return the (y, x) index grids of one level; to be deprecated soon.
+
+        Args:
+            featmap_size (tuple): ``(h, w)`` of the feature map.
+            stride: Unused by this method.
+            dtype (torch.dtype): Dtype the grids are converted to.
+            device (torch.device): Device the grids are created on.
+            flatten (bool): Whether to flatten both grids to 1D.
+        """
+        warnings.warn(
+            '`_get_points_single` in `AnchorFreeHead` will be '
+            'deprecated soon, we support a multi level point generator now, '
+            'you can get points of a single level feature map '
+            'with `self.prior_generator.single_level_grid_priors` ')
+        h, w = featmap_size
+        # Create the ranges with the default dtype first, then convert to
+        # the target `dtype`, for onnx exporting.
+        x_range = torch.arange(w, device=device).to(dtype)
+        y_range = torch.arange(h, device=device).to(dtype)
+        y, x = torch.meshgrid(y_range, x_range)
+        return (y.flatten(), x.flatten()) if flatten else (y, x)
+
+    def get_points(self, featmap_sizes, dtype, device, flatten=False):
+        """Get the point grids of every level; to be deprecated soon.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            dtype (torch.dtype): Type of points.
+            device (torch.device): Device of points.
+            flatten (bool): Whether to flatten the grids of each level.
+
+        Returns:
+            list[tuple]: One ``_get_points_single`` result per level.
+        """
+        warnings.warn(
+            '`get_points` in `AnchorFreeHead` will be '
+            'deprecated soon, we support a multi level point generator now, '
+            'you can get points of all levels '
+            'with `self.prior_generator.grid_priors` ')
+        # Index ``self.strides`` explicitly so that a missing stride raises.
+        return [
+            self._get_points_single(size, self.strides[lvl], dtype, device,
+                                    flatten)
+            for lvl, size in enumerate(featmap_sizes)
+        ]
+
+    def aug_test(self, feats, img_metas, rescale=False):
+        """Run test-time-augmented inference via ``aug_test_bboxes``.
+
+        Args:
+            feats (list[Tensor]): One NxCxHxW tensor per test-time
+                augmentation, with features of all images in the batch.
+            img_metas (list[list[dict]]): Image information dicts; the outer
+                list runs over test-time augs (multiscale, flip, etc.) and
+                the inner list over the images in a batch.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[ndarray]: bbox results of each class
+        """
+        det_results = self.aug_test_bboxes(feats, img_metas, rescale=rescale)
+        return det_results
diff --git a/mmdet/models/dense_heads/anchor_head.py b/mmdet/models/dense_heads/anchor_head.py
new file mode 100755
index 0000000..d1bfab6
--- /dev/null
+++ b/mmdet/models/dense_heads/anchor_head.py
@@ -0,0 +1,542 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+import torch.nn as nn
+from mmcv.runner import force_fp32
+
+from mmdet.core import (anchor_inside_flags, build_assigner, build_bbox_coder,
+                        build_prior_generator, build_sampler, images_to_levels,
+                        multi_apply, unmap)
+from ..builder import HEADS, build_loss
+from .base_dense_head import BaseDenseHead
+from .dense_test_mixins import BBoxTestMixin
+
+
+@HEADS.register_module()
+class AnchorHead(BaseDenseHead, BBoxTestMixin):
+    """Anchor-based head (RPN, RetinaNet, SSD, etc.).
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels. Used in child classes.
+        anchor_generator (dict): Config dict for anchor generator
+        bbox_coder (dict): Config of bounding box coder.
+        reg_decoded_bbox (bool): If true, the regression loss would be
+            applied directly on decoded bounding boxes, converting both
+            the predicted boxes and regression targets to absolute
+            coordinates format. Default False. It should be `True` when
+            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
+        loss_cls (dict): Config of classification loss.
+        loss_bbox (dict): Config of localization loss.
+        train_cfg (dict): Training config of anchor head.
+        test_cfg (dict): Testing config of anchor head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """  # noqa: W605
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 feat_channels=256,
+                 anchor_generator=dict(
+                     type='AnchorGenerator',
+                     scales=[8, 16, 32],
+                     ratios=[0.5, 1.0, 2.0],
+                     strides=[4, 8, 16, 32, 64]),
+                 bbox_coder=dict(
+                     type='DeltaXYWHBBoxCoder',
+                     clip_border=True,
+                     target_means=(.0, .0, .0, .0),
+                     target_stds=(1.0, 1.0, 1.0, 1.0)),
+                 reg_decoded_bbox=False,
+                 loss_cls=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 loss_bbox=dict(
+                     type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=dict(type='Normal', layer='Conv2d', std=0.01)):
+        super(AnchorHead, self).__init__(init_cfg)
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self.feat_channels = feat_channels
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        if self.use_sigmoid_cls:
+            self.cls_out_channels = num_classes
+        else:
+            self.cls_out_channels = num_classes + 1  # +1 for background
+
+        if self.cls_out_channels <= 0:
+            raise ValueError(f'num_classes={num_classes} is too small')
+        self.reg_decoded_bbox = reg_decoded_bbox
+
+        self.bbox_coder = build_bbox_coder(bbox_coder)
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_bbox = build_loss(loss_bbox)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        if self.train_cfg:
+            self.assigner = build_assigner(self.train_cfg.assigner)
+            if hasattr(self.train_cfg,
+                       'sampler') and self.train_cfg.sampler.type.split(
+                           '.')[-1] != 'PseudoSampler':
+                self.sampling = True
+                sampler_cfg = self.train_cfg.sampler
+                # avoid BC-breaking: focal-style losses never sample
+                if loss_cls['type'] in [
+                        'FocalLoss', 'GHMC', 'QualityFocalLoss'
+                ]:
+                    warnings.warn(
+                        'DeprecationWarning: Determining whether to sampling '
+                        'by loss type is deprecated, please delete sampler '
+                        'in your config when using `FocalLoss`, `GHMC`, '
+                        '`QualityFocalLoss` or other FocalLoss variant.')
+                    self.sampling = False
+                    sampler_cfg = dict(type='PseudoSampler')
+            else:
+                self.sampling = False
+                sampler_cfg = dict(type='PseudoSampler')
+            self.sampler = build_sampler(sampler_cfg, context=self)
+        self.fp16_enabled = False
+
+        self.prior_generator = build_prior_generator(anchor_generator)
+
+        # Usually every level has the same number of anchors, SSD detectors
+        # being the exception. So this is an int in most dense heads but a
+        # list of int in SSDHead.
+        self.num_base_priors = self.prior_generator.num_base_priors[0]
+        self._init_layers()
+
+    @property
+    def num_anchors(self):
+        """Deprecated accessor for the first level's base-prior count."""
+        warnings.warn('DeprecationWarning: `num_anchors` is deprecated, '
+                      'for consistency or also use `num_base_priors` instead')
+        return self.prior_generator.num_base_priors[0]
+
+    @property
+    def anchor_generator(self):  # deprecated alias of `prior_generator`
+        warnings.warn('DeprecationWarning: anchor_generator is deprecated, '
+                      'please use "prior_generator" instead')
+        return self.prior_generator
+
+    def _init_layers(self):
+        """Create the 1x1 convs predicting class scores and box deltas."""
+        # Every base prior gets its own class scores and four box values.
+        priors = self.num_base_priors
+        cls_channels = priors * self.cls_out_channels
+        self.conv_cls = nn.Conv2d(self.in_channels, cls_channels, 1)
+        self.conv_reg = nn.Conv2d(self.in_channels, priors * 4, 1)
+
+    def forward_single(self, x):
+        """Apply both prediction convs to one feature level.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+
+        Returns:
+            tuple:
+                cls_score (Tensor): Cls scores for a single scale level,
+                    the channels number is num_base_priors * num_classes.
+                bbox_pred (Tensor): Box energies / deltas for a single
+                    scale level, the channels number is
+                    num_base_priors * 4.
+        """
+        scores, deltas = self.conv_cls(x), self.conv_reg(x)
+        return scores, deltas
+
+    def forward(self, feats):
+        """Apply ``forward_single`` to every feature level.
+
+        Args:
+            feats (tuple[Tensor]): Upstream features, one 4D-tensor per level.
+
+        Returns:
+            tuple: A tuple of classification scores and bbox prediction.
+
+                - cls_scores (list[Tensor]): Classification scores for all
+                  scale levels, each is a 4D-tensor, the channels number
+                  is num_base_priors * num_classes.
+                - bbox_preds (list[Tensor]): Box energies / deltas for all
+                  scale levels, each is a 4D-tensor, the channels number
+                  is num_base_priors * 4.
+        """
+        per_level_outputs = multi_apply(self.forward_single, feats)
+        return per_level_outputs
+
+    def get_anchors(self, featmap_sizes, img_metas, device='cuda'):
+        """Build the anchors and their valid flags for a batch of images.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            img_metas (list[dict]): Image meta info.
+            device (torch.device | str): Device for returned tensors
+
+        Returns:
+            tuple:
+                anchor_list (list[list[Tensor]]): Multi-level anchors of
+                    each image.
+                valid_flag_list (list[list[Tensor]]): Multi-level valid
+                    flags of each image.
+        """
+        generator, num_imgs = self.prior_generator, len(img_metas)
+        # The feature map sizes are shared by all images, so the anchors
+        # are generated once and the same list is reused for every image.
+        shared_anchors = generator.grid_priors(featmap_sizes, device=device)
+        anchor_list = [shared_anchors] * num_imgs
+
+        # Valid flags depend on each image's padded shape, so they are
+        # computed per image.
+        valid_flag_list = [
+            generator.valid_flags(featmap_sizes, meta['pad_shape'], device)
+            for meta in img_metas
+        ]
+
+        return anchor_list, valid_flag_list
+
+    def _get_targets_single(self,
+                            flat_anchors,
+                            valid_flags,
+                            gt_bboxes,
+                            gt_bboxes_ignore,
+                            gt_labels,
+                            img_meta,
+                            label_channels=1,
+                            unmap_outputs=True):
+        """Compute regression and classification targets for anchors in a
+        single image.
+
+        Args:
+            flat_anchors (Tensor): Multi-level anchors of the image,
+                concatenated into one tensor of shape (num_anchors, 4).
+            valid_flags (Tensor): Multi-level valid flags of the image,
+                concatenated into one tensor of shape (num_anchors,).
+            gt_bboxes (Tensor): Ground truth bboxes of the image,
+                shape (num_gts, 4).
+            gt_bboxes_ignore (Tensor): Ground truth bboxes to be
+                ignored, shape (num_ignored_gts, 4).
+            gt_labels (Tensor | None): Ground truth labels of each box,
+                shape (num_gts,).
+            img_meta (dict): Meta info of the image.
+            label_channels (int): Channel of label; not read by this method.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple: ``(None, ) * 7`` if ``anchor_inside_flags`` keeps no
+            anchor, otherwise:
+                labels (Tensor): Label per anchor; ``num_classes`` marks
+                    anchors without a positive assignment.
+                label_weights (Tensor): Label weight per anchor.
+                bbox_targets, bbox_weights (Tensor): Box regression targets
+                    and weights, filled in for positive anchors only.
+                pos_inds, neg_inds, sampling_result: Sampler outputs.
+        """
+        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                           img_meta['img_shape'][:2],
+                                           self.train_cfg.allowed_border)
+        if not inside_flags.any():
+            return (None, ) * 7
+        # keep only the flagged anchors, then assign gt and sample them
+        anchors = flat_anchors[inside_flags, :]
+
+        assign_result = self.assigner.assign(
+            anchors, gt_bboxes, gt_bboxes_ignore,
+            None if self.sampling else gt_labels)
+        sampling_result = self.sampler.sample(assign_result, anchors,
+                                              gt_bboxes)
+        # Defaults for every kept anchor: bg label, zero weights/targets.
+        num_valid_anchors = anchors.shape[0]
+        bbox_targets = torch.zeros_like(anchors)
+        bbox_weights = torch.zeros_like(anchors)
+        labels = anchors.new_full((num_valid_anchors, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            if not self.reg_decoded_bbox:  # regress on encoded targets
+                pos_bbox_targets = self.bbox_coder.encode(
+                    sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
+            else:
+                pos_bbox_targets = sampling_result.pos_gt_bboxes
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1.0
+            if gt_labels is None:
+                # Only rpn gives gt_labels as None
+                # Foreground is the first class since v2.5.0
+                labels[pos_inds] = 0
+            else:
+                labels[pos_inds] = gt_labels[
+                    sampling_result.pos_assigned_gt_inds]
+            if self.train_cfg.pos_weight <= 0:  # non-positive means 1.0
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg.pos_weight
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_anchors.size(0)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags,
+                fill=self.num_classes)  # fill bg label
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+            bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
+            bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
+
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds, sampling_result)
+
+    def get_targets(self,
+                    anchor_list,
+                    valid_flag_list,
+                    gt_bboxes_list,
+                    img_metas,
+                    gt_bboxes_ignore_list=None,
+                    gt_labels_list=None,
+                    label_channels=1,
+                    unmap_outputs=True,
+                    return_sampling_results=False):
+        """Compute regression and classification targets for anchors in
+        multiple images.
+
+        Args:
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image. The outer list indicates images, the inner list the
+                feature levels; each element has shape (num_anchors, 4).
+            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
+                each image, nested like ``anchor_list``. Each element of
+                the inner list is a tensor of shape (num_anchors, )
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+            img_metas (list[dict]): Meta info of each image.
+            gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be
+                ignored.
+            gt_labels_list (list[Tensor]): Ground truth labels of each box.
+            label_channels (int): Channel of label.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+            return_sampling_results (bool): Whether to also return the
+                per-image sampling results, right after ``num_total_neg``.
+
+        Returns:
+            tuple | None: ``None`` if any image has no valid anchors,
+            otherwise a tuple containing learning targets.
+
+                - labels_list (list[Tensor]): Labels of each level.
+                - label_weights_list (list[Tensor]): Label weights of each
+                  level.
+                - bbox_targets_list (list[Tensor]): BBox targets of each level.
+                - bbox_weights_list (list[Tensor]): BBox weights of each level.
+                - num_total_pos (int): Number of positive samples in all
+                  images, counting at least one per image.
+                - num_total_neg (int): Number of negative samples in all
+                  images, counting at least one per image.
+
+            additional_returns: Any values ``_get_targets_single`` returns
+                beyond its first seven are split per level with
+                ``images_to_levels`` and appended to the tuple.
+        """
+        num_imgs = len(img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        # concat all level anchors to a single tensor
+        concat_anchor_list = []
+        concat_valid_flag_list = []
+        for anchors, flags in zip(anchor_list, valid_flag_list):
+            assert len(anchors) == len(flags)
+            concat_anchor_list.append(torch.cat(anchors))
+            concat_valid_flag_list.append(torch.cat(flags))
+
+        # compute targets for each image
+        if gt_bboxes_ignore_list is None:
+            gt_bboxes_ignore_list = [None] * num_imgs
+        if gt_labels_list is None:
+            gt_labels_list = [None] * num_imgs
+        results = multi_apply(
+            self._get_targets_single,
+            concat_anchor_list,
+            concat_valid_flag_list,
+            gt_bboxes_list,
+            gt_bboxes_ignore_list,
+            gt_labels_list,
+            img_metas,
+            label_channels=label_channels,
+            unmap_outputs=unmap_outputs)
+        (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights,
+         pos_inds_list, neg_inds_list, sampling_results_list) = results[:7]
+        # no valid anchors
+        if any(labels is None for labels in all_labels):
+            return None
+        # sampled anchors of all images
+        num_total_pos = sum(max(inds.numel(), 1) for inds in pos_inds_list)
+        num_total_neg = sum(max(inds.numel(), 1) for inds in neg_inds_list)
+        # split targets to a list w.r.t. multiple levels
+        labels_list = images_to_levels(all_labels, num_level_anchors)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_anchors)
+        bbox_targets_list = images_to_levels(all_bbox_targets,
+                                             num_level_anchors)
+        bbox_weights_list = images_to_levels(all_bbox_weights,
+                                             num_level_anchors)
+        res = (labels_list, label_weights_list, bbox_targets_list,
+               bbox_weights_list, num_total_pos, num_total_neg)
+        if return_sampling_results:
+            res = res + (sampling_results_list, )
+        # user-added return values are regrouped per level as well
+        rest_results = tuple(
+            images_to_levels(r, num_level_anchors) for r in results[7:])
+
+        return res + rest_results
+
+    def loss_single(self, cls_score, bbox_pred, anchors, labels, label_weights,
+                    bbox_targets, bbox_weights, num_total_samples):
+        """Compute the classification and regression loss of one scale level.
+
+        Args:
+            cls_score (Tensor): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W).
+            bbox_pred (Tensor): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            anchors (Tensor): Box reference for each scale level with shape
+                (N, num_total_anchors, 4).
+            labels (Tensor): Labels of each anchor with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors).
+            bbox_targets (Tensor): BBox regression targets of each anchor
+                with shape (N, num_total_anchors, 4).
+            bbox_weights (Tensor): BBox regression loss weights of each anchor
+                with shape (N, num_total_anchors, 4).
+            num_total_samples (int): ``avg_factor`` of both losses. If
+                sampling, it is the number of positive plus negative
+                anchors; otherwise, it is the number of positive anchors.
+
+        Returns:
+            tuple[Tensor, Tensor]: ``loss_cls`` and ``loss_bbox`` of the level.
+        """
+        # classification loss (everything is flattened to one row per anchor)
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+        cls_score = cls_score.permute(0, 2, 3,
+                                      1).reshape(-1, self.cls_out_channels)
+        loss_cls = self.loss_cls(
+            cls_score, labels, label_weights, avg_factor=num_total_samples)
+        # regression loss (targets, weights and preds become (-1, 4) rows)
+        bbox_targets = bbox_targets.reshape(-1, 4)
+        bbox_weights = bbox_weights.reshape(-1, 4)
+        bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
+        if self.reg_decoded_bbox:
+            # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
+            # is applied directly on the decoded bounding boxes, it
+            # decodes the already encoded coordinates to absolute format.
+            anchors = anchors.reshape(-1, 4)
+            bbox_pred = self.bbox_coder.decode(anchors, bbox_pred)
+        loss_bbox = self.loss_bbox(
+            bbox_pred,
+            bbox_targets,
+            bbox_weights,
+            avg_factor=num_total_samples)
+        return loss_cls, loss_bbox
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss. Default: None
+
+        Returns:
+            dict | None: ``loss_cls``, ``loss_bbox``; None if targets are None.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels)
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        num_total_samples = (
+            num_total_pos + num_total_neg if self.sampling else num_total_pos)
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        # concat all level anchors of each image, then regroup them per level
+        concat_anchor_list = []
+        for i in range(len(anchor_list)):
+            concat_anchor_list.append(torch.cat(anchor_list[i]))
+        all_anchor_list = images_to_levels(concat_anchor_list,
+                                           num_level_anchors)
+
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_single,
+            cls_scores,
+            bbox_preds,
+            all_anchor_list,
+            labels_list,
+            label_weights_list,
+            bbox_targets_list,
+            bbox_weights_list,
+            num_total_samples=num_total_samples)
+        return dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
+
+    def aug_test(self, feats, img_metas, rescale=False):
+        """Test function with test time augmentation.
+
+        Args:
+            feats (list[Tensor]): the outer list indicates test-time
+                augmentations and inner Tensor should have a shape NxCxHxW,
+                which contains features for all images in the batch.
+            img_metas (list[list[dict]]): the outer list indicates test-time
+                augs (multiscale, flip, etc.) and the inner list indicates
+                images in a batch. each dict has image information.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[tuple[Tensor, Tensor]]: Each item in result_list is a
+                2-tuple. The first item is ``bboxes`` with shape (n, 5),
+                where 5 represents (tl_x, tl_y, br_x, br_y, score). The
+                second item is ``labels`` with shape (n,). The length of
+                the list should always be 1.
+        """
+        return self.aug_test_bboxes(feats, img_metas, rescale=rescale)
diff --git a/mmdet/models/dense_heads/ascend_anchor_head.py b/mmdet/models/dense_heads/ascend_anchor_head.py
new file mode 100755
index 0000000..7d100ba
--- /dev/null
+++ b/mmdet/models/dense_heads/ascend_anchor_head.py
@@ -0,0 +1,389 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from ...core.bbox.assigners import AscendMaxIoUAssigner
+from ...core.bbox.samplers import PseudoSampler
+from ...utils import (batch_images_to_levels, get_max_num_gt_division_factor,
+                      masked_fill)
+from ..builder import HEADS
+from .anchor_head import AnchorHead
+
+
+@HEADS.register_module()
+class AscendAnchorHead(AnchorHead):
+    """Ascend Anchor-based head (RetinaNet, SSD, etc.).
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels. Used in child classes.
+        anchor_generator (dict): Config dict for anchor generator
+        bbox_coder (dict): Config of bounding box coder.
+        reg_decoded_bbox (bool): If true, the regression loss would be
+            applied directly on decoded bounding boxes, converting both
+            the predicted boxes and regression targets to absolute
+            coordinates format. Default False. It should be `True` when
+            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
+        loss_cls (dict): Config of classification loss.
+        loss_bbox (dict): Config of localization loss.
+        train_cfg (dict): Training config of anchor head.
+        test_cfg (dict): Testing config of anchor head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """  # noqa: W605
+
+    def __init__(self,  # NOTE: dict defaults are shared; never mutate them
+                 num_classes,
+                 in_channels,
+                 feat_channels=256,
+                 anchor_generator=dict(
+                     type='AnchorGenerator',
+                     scales=[8, 16, 32],
+                     ratios=[0.5, 1.0, 2.0],
+                     strides=[4, 8, 16, 32, 64]),
+                 bbox_coder=dict(
+                     type='DeltaXYWHBBoxCoder',
+                     clip_border=True,
+                     target_means=(.0, .0, .0, .0),
+                     target_stds=(1.0, 1.0, 1.0, 1.0)),
+                 reg_decoded_bbox=False,
+                 loss_cls=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 loss_bbox=dict(
+                     type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=dict(type='Normal', layer='Conv2d', std=0.01)):
+        super(AscendAnchorHead, self).__init__(  # forwards every arg as-is
+            num_classes=num_classes,
+            in_channels=in_channels,
+            feat_channels=feat_channels,
+            anchor_generator=anchor_generator,
+            bbox_coder=bbox_coder,
+            reg_decoded_bbox=reg_decoded_bbox,
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+
+    def get_batch_gt_bboxes(self, gt_bboxes_list, num_images, gt_nums, device,
+                            max_gt_labels):
+        """Pad the ground truth bboxes of all images into one batch tensor.
+
+        Args:
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+            num_images (int): The number of images.
+            gt_nums (list[int]): The number of gt bboxes of each image.
+            device (torch.device | str): Device for returned tensors.
+            max_gt_labels (int): Size of the padded gt dimension.
+        Returns:
+            Tensor | None: (num_images, max_gt_labels, 4) padded gt bboxes.
+        """
+        # Padded templates are cached per ``max_gt_labels`` and only cloned
+        # afterwards. Related to Ascend; meant to help performance.
+        if not hasattr(self, 'batch_gt_bboxes'):
+            self.batch_gt_bboxes = {}
+        # placeholder coordinates written into the padded (unused) gt slots
+        if not hasattr(self, 'min_anchor'):
+            self.min_anchor = (-1354, -1344)
+        if gt_bboxes_list is None:
+            batch_gt_bboxes = None
+        else:
+            template = self.batch_gt_bboxes.get(max_gt_labels)
+            # rebuild if missing or cached for a different batch size
+            if template is None or template.size(0) != num_images:
+                template = torch.zeros((num_images, max_gt_labels, 4),
+                                       dtype=gt_bboxes_list[0].dtype,
+                                       device=device)
+                template[:, :, :2] = self.min_anchor[0]
+                template[:, :, 2:] = self.min_anchor[1]
+                self.batch_gt_bboxes[max_gt_labels] = template
+            batch_gt_bboxes = template.clone()
+            for index_imgs, gt_bboxes in enumerate(gt_bboxes_list):
+                batch_gt_bboxes[index_imgs, :gt_nums[index_imgs]] = gt_bboxes
+        return batch_gt_bboxes
+
+    def get_batch_gt_bboxes_ignore(self, gt_bboxes_ignore_list, num_images,
+                                   gt_nums, device):
+        """Get the batched ignored ground truth bboxes (not supported yet).
+
+        Args:
+            gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be
+                ignored. Must be None for now.
+            num_images (int): The num of images. Unused here.
+            gt_nums (list[int]): The gt bboxes num of each image. Unused here.
+            device (torch.device | str): Device for tensors. Unused here.
+        Returns:
+            None: Always None; a non-None ``gt_bboxes_ignore_list`` raises
+                RuntimeError because ignored bboxes are not supported yet.
+        """
+        # TODO: support gt_bboxes_ignore_list
+        if gt_bboxes_ignore_list is None:
+            batch_gt_bboxes_ignore = None
+        else:
+            raise RuntimeError('gt_bboxes_ignore not support yet')
+        return batch_gt_bboxes_ignore
+
+    def get_batch_gt_labels(self, gt_labels_list, num_images, gt_nums, device,
+                            max_gt_labels):
+        """Pad the ground truth labels of all images into one batch tensor.
+
+        Args:
+            gt_labels_list (list[Tensor]): Ground truth labels of each image.
+            num_images (int): The number of images.
+            gt_nums (list[int]): The number of gt labels kept for each image.
+            device (torch.device | str): Device for returned tensors.
+        Returns:
+            Tensor | None: (num_images, max_gt_labels) zero-padded gt labels.
+        """
+        if gt_labels_list is None:
+            batch_gt_labels = None
+        else:
+            batch_gt_labels = torch.zeros((num_images, max_gt_labels),
+                                          dtype=gt_labels_list[0].dtype,
+                                          device=device)
+            for index_imgs, gt_labels in enumerate(gt_labels_list):
+                batch_gt_labels[index_imgs, :gt_nums[index_imgs]] = gt_labels
+
+        return batch_gt_labels
+
+    def _get_targets_concat(self,
+                            batch_anchors,
+                            batch_valid_flags,
+                            batch_gt_bboxes,
+                            batch_gt_bboxes_ignore,
+                            batch_gt_labels,
+                            img_metas,
+                            label_channels=1,
+                            unmap_outputs=True):
+        """Compute regression and classification targets for anchors in all
+        images.
+
+        Args:
+            batch_anchors (Tensor): anchors of all image, which are
+                concatenated into a single tensor of
+                shape (num_imgs, num_anchors, 4).
+            batch_valid_flags (Tensor): valid flags of all image,
+                which are concatenated into a single tensor of
+                shape (num_imgs, num_anchors,).
+            batch_gt_bboxes (Tensor): Ground truth bboxes of all image,
+                shape (num_imgs, max_gt_nums, 4).
+            batch_gt_bboxes_ignore (Tensor): Ground truth bboxes to be
+                ignored, shape (num_imgs, num_ignored_gts, 4).
+            batch_gt_labels (Tensor): Ground truth labels of each box,
+                shape (num_imgs, max_gt_nums,).
+            img_metas (list[dict]): Meta info of each image. Unused here.
+            label_channels (int): Channel of label. Unused here.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors. Unused here.
+
+        Returns:
+            tuple:
+                batch_labels (Tensor): Labels of all level
+                batch_label_weights (Tensor): Label weights of all level
+                batch_bbox_targets (Tensor): BBox targets of all level
+                batch_bbox_weights (Tensor): BBox weights of all level
+                batch_pos_mask (Tensor): Positive samples mask in all images
+                batch_neg_mask (Tensor): Negative samples mask in all images
+                sampling_result (None): Always None for now, because
+                    sampling results are not supported yet.
+        """
+        num_imgs, num_anchors, _ = batch_anchors.size()
+        # assign gt to batch_anchors (sampling is not supported yet)
+        assign_result = self.assigner.assign(
+            batch_anchors,
+            batch_gt_bboxes,
+            batch_gt_bboxes_ignore,
+            None if self.sampling else batch_gt_labels,
+            batch_bboxes_ignore_mask=batch_valid_flags)
+        # TODO: support sampling_result
+        sampling_result = None
+        batch_pos_mask = assign_result.batch_pos_mask
+        batch_neg_mask = assign_result.batch_neg_mask
+        batch_anchor_gt_indes = assign_result.batch_anchor_gt_indes
+        batch_anchor_gt_labels = assign_result.batch_anchor_gt_labels
+
+        batch_anchor_gt_bboxes = torch.zeros(
+            batch_anchors.size(),
+            dtype=batch_anchors.dtype,
+            device=batch_anchors.device)
+        for index_imgs in range(num_imgs):
+            batch_anchor_gt_bboxes[index_imgs] = torch.index_select(
+                batch_gt_bboxes[index_imgs], 0,
+                batch_anchor_gt_indes[index_imgs])
+
+        batch_bbox_targets = torch.zeros_like(batch_anchors)
+        batch_bbox_weights = torch.zeros_like(batch_anchors)
+        batch_labels = batch_anchors.new_full((num_imgs, num_anchors),
+                                              self.num_classes,
+                                              dtype=torch.int)
+        batch_label_weights = batch_anchors.new_zeros((num_imgs, num_anchors),
+                                                      dtype=torch.float)
+
+        if not self.reg_decoded_bbox:
+            batch_pos_bbox_targets = self.bbox_coder.encode(
+                batch_anchors, batch_anchor_gt_bboxes)
+        else:
+            batch_pos_bbox_targets = batch_anchor_gt_bboxes
+
+        batch_bbox_targets = masked_fill(batch_bbox_targets,
+                                         batch_pos_mask.unsqueeze(2),
+                                         batch_pos_bbox_targets)
+        batch_bbox_weights = masked_fill(batch_bbox_weights,
+                                         batch_pos_mask.unsqueeze(2), 1.0)
+        if batch_gt_labels is None:
+            batch_labels = masked_fill(batch_labels, batch_pos_mask, 0.0)
+        else:
+            batch_labels = masked_fill(batch_labels, batch_pos_mask,
+                                       batch_anchor_gt_labels)
+        if self.train_cfg.pos_weight <= 0:
+            batch_label_weights = masked_fill(batch_label_weights,
+                                              batch_pos_mask, 1.0)
+        else:
+            batch_label_weights = masked_fill(batch_label_weights,
+                                              batch_pos_mask,
+                                              self.train_cfg.pos_weight)
+        batch_label_weights = masked_fill(batch_label_weights, batch_neg_mask,
+                                          1.0)
+        return (batch_labels, batch_label_weights, batch_bbox_targets,
+                batch_bbox_weights, batch_pos_mask, batch_neg_mask,
+                sampling_result)
+
+    def get_targets(self,
+                    anchor_list,
+                    valid_flag_list,
+                    gt_bboxes_list,
+                    img_metas,
+                    gt_bboxes_ignore_list=None,
+                    gt_labels_list=None,
+                    label_channels=1,
+                    unmap_outputs=True,
+                    return_sampling_results=False,
+                    return_level=True):
+        """Compute regression and classification targets for anchors in
+        multiple images.
+
+        Args:
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, 4).
+            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
+                each image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, )
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+            img_metas (list[dict]): Meta info of each image.
+            gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be
+                ignored. Must be None (asserted).
+            gt_labels_list (list[Tensor]): Ground truth labels of each box.
+            label_channels (int): Channel of label.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors. Must be True (asserted).
+            return_sampling_results (bool): Whether to return the result of
+                sample. Must be False (asserted).
+            return_level (bool): Whether to map outputs back to the levels
+                of feature map sizes.
+        Returns:
+            tuple: When ``return_level`` is True, the learning targets below.
+
+                - labels_list (list[Tensor]): Labels of each level.
+                - label_weights_list (list[Tensor]): Label weights of each
+                  level.
+                - bbox_targets_list (list[Tensor]): BBox targets of each level.
+                - bbox_weights_list (list[Tensor]): BBox weights of each level.
+                - num_total_pos (Tensor): Number of positive samples in all
+                  images.
+                - num_total_neg (Tensor): Number of negative samples in all
+                  images.
+
+            additional_returns: Extra values returned by
+                `self._get_targets_concat` are split into levels and appended.
+                When ``return_level`` is False, the batched (unsplit) targets,
+                masks, sample counts and ``batch_anchors`` are returned.
+        """
+        assert gt_bboxes_ignore_list is None
+        assert unmap_outputs is True
+        assert return_sampling_results is False
+        assert self.train_cfg.allowed_border < 0
+        assert isinstance(self.assigner, AscendMaxIoUAssigner)
+        assert isinstance(self.sampler, PseudoSampler)
+        num_imgs = len(img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        device = anchor_list[0][0].device
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+
+        batch_anchor_list = []
+        batch_valid_flag_list = []
+        for i in range(num_imgs):
+            assert len(anchor_list[i]) == len(valid_flag_list[i])
+            batch_anchor_list.append(torch.cat(anchor_list[i]))
+            batch_valid_flag_list.append(torch.cat(valid_flag_list[i]))
+        batch_anchors = torch.cat(
+            [torch.unsqueeze(anchor, 0) for anchor in batch_anchor_list], 0)
+        batch_valid_flags = torch.cat([
+            torch.unsqueeze(batch_valid_flag, 0)
+            for batch_valid_flag in batch_valid_flag_list
+        ], 0)
+
+        gt_nums = [len(gt_bbox) for gt_bbox in gt_bboxes_list]
+        max_gt_nums = get_max_num_gt_division_factor(gt_nums)
+        batch_gt_bboxes = self.get_batch_gt_bboxes(gt_bboxes_list, num_imgs,
+                                                   gt_nums, device,
+                                                   max_gt_nums)
+        batch_gt_bboxes_ignore = self.get_batch_gt_bboxes_ignore(
+            gt_bboxes_ignore_list, num_imgs, gt_nums, device)
+        batch_gt_labels = self.get_batch_gt_labels(gt_labels_list, num_imgs,
+                                                   gt_nums, device,
+                                                   max_gt_nums)
+
+        results = self._get_targets_concat(
+            batch_anchors,
+            batch_valid_flags,
+            batch_gt_bboxes,
+            batch_gt_bboxes_ignore,
+            batch_gt_labels,
+            img_metas,
+            label_channels=label_channels,
+            unmap_outputs=unmap_outputs)
+
+        (batch_labels, batch_label_weights, batch_bbox_targets,
+         batch_bbox_weights, batch_pos_mask, batch_neg_mask,
+         sampling_result) = results[:7]
+        rest_results = list(results[7:])  # user-added return values
+
+        # sampled anchors of all images (each image counts as at least 1)
+        min_num = torch.ones((num_imgs, ),
+                             dtype=torch.long,
+                             device=batch_pos_mask.device)
+        num_total_pos = torch.sum(
+            torch.max(torch.sum(batch_pos_mask, dim=1), min_num))
+        num_total_neg = torch.sum(
+            torch.max(torch.sum(batch_neg_mask, dim=1), min_num))
+        if return_level is True:
+            labels_list = batch_images_to_levels(batch_labels,
+                                                 num_level_anchors)
+            label_weights_list = batch_images_to_levels(
+                batch_label_weights, num_level_anchors)
+            bbox_targets_list = batch_images_to_levels(batch_bbox_targets,
+                                                       num_level_anchors)
+            bbox_weights_list = batch_images_to_levels(batch_bbox_weights,
+                                                       num_level_anchors)
+            res = (labels_list, label_weights_list, bbox_targets_list,
+                   bbox_weights_list, num_total_pos, num_total_neg)
+            if return_sampling_results:
+                res = res + (sampling_result, )
+            for i, r in enumerate(rest_results):  # user-added return values
+                rest_results[i] = batch_images_to_levels(r, num_level_anchors)
+
+            return res + tuple(rest_results)
+        else:
+            res = (batch_labels, batch_label_weights, batch_bbox_targets,
+                   batch_bbox_weights, batch_pos_mask, batch_neg_mask,
+                   sampling_result, num_total_pos, num_total_neg,
+                   batch_anchors)
+            return res
diff --git a/mmdet/models/dense_heads/ascend_retina_head.py b/mmdet/models/dense_heads/ascend_retina_head.py
new file mode 100755
index 0000000..159fe75
--- /dev/null
+++ b/mmdet/models/dense_heads/ascend_retina_head.py
@@ -0,0 +1,115 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import HEADS
+from .ascend_anchor_head import AscendAnchorHead
+from .retina_head import RetinaHead
+
+
+@HEADS.register_module()
+class AscendRetinaHead(RetinaHead, AscendAnchorHead):
+    r"""Ascend variant of the anchor-based head used in `RetinaNet
+    <https://arxiv.org/pdf/1708.02002.pdf>`_.
+
+    The head contains two subnetworks. The first classifies anchor boxes and
+    the second regresses deltas for the anchors.
+
+    Example:
+        >>> import torch
+        >>> self = RetinaHead(11, 7)
+        >>> x = torch.rand(1, 7, 32, 32)
+        >>> cls_score, bbox_pred = self.forward_single(x)
+        >>> # Each anchor predicts a score for each class except background
+        >>> cls_per_anchor = cls_score.shape[1] / self.num_anchors
+        >>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors
+        >>> assert cls_per_anchor == (self.num_classes)
+        >>> assert box_per_anchor == 4
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 stacked_convs=4,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 anchor_generator=dict(
+                     type='AnchorGenerator',
+                     octave_base_scale=4,
+                     scales_per_octave=3,
+                     ratios=[0.5, 1.0, 2.0],
+                     strides=[8, 16, 32, 64, 128]),
+                 init_cfg=dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='retina_cls',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs):
+        super(AscendRetinaHead, self).__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            stacked_convs=stacked_convs,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            anchor_generator=anchor_generator,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def get_targets(self,
+                    anchor_list,
+                    valid_flag_list,
+                    gt_bboxes_list,
+                    img_metas,
+                    gt_bboxes_ignore_list=None,
+                    gt_labels_list=None,
+                    label_channels=1,
+                    unmap_outputs=True,
+                    return_sampling_results=False,
+                    return_level=True):
+        """Compute regression and classification targets for anchors in
+        multiple images by delegating to ``AscendAnchorHead.get_targets``.
+
+        Args:
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, 4).
+            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
+                each image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, )
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+            img_metas (list[dict]): Meta info of each image.
+            gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be
+                ignored.
+            gt_labels_list (list[Tensor]): Ground truth labels of each box.
+            label_channels (int): Channel of label.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+            return_sampling_results (bool): Whether to return the result of
+                sample.
+            return_level (bool): Whether to map outputs back to the levels
+                of feature map sizes.
+        Returns:
+            tuple: Whatever ``AscendAnchorHead.get_targets`` returns, usually:
+
+                - labels_list (list[Tensor]): Labels of each level.
+                - label_weights_list (list[Tensor]): Label weights of each
+                  level.
+                - bbox_targets_list (list[Tensor]): BBox targets of each level.
+                - bbox_weights_list (list[Tensor]): BBox weights of each level.
+                - num_total_pos (Tensor): Number of positive samples in all
+                  images.
+                - num_total_neg (Tensor): Number of negative samples in all
+                  images.
+
+            additional_returns: Extra values returned by
+                `self._get_targets_concat` are split into levels and appended.
+                When ``return_level`` is False, the batched (unsplit) targets,
+                masks, sample counts and ``batch_anchors`` are returned.
+        """
+        return AscendAnchorHead.get_targets(
+            self, anchor_list, valid_flag_list, gt_bboxes_list, img_metas,
+            gt_bboxes_ignore_list, gt_labels_list, label_channels,
+            unmap_outputs, return_sampling_results, return_level)
diff --git a/mmdet/models/dense_heads/ascend_ssd_head.py b/mmdet/models/dense_heads/ascend_ssd_head.py
new file mode 100755
index 0000000..9e326b4
--- /dev/null
+++ b/mmdet/models/dense_heads/ascend_ssd_head.py
@@ -0,0 +1,328 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn.functional as F
+from mmcv.runner import force_fp32
+
+from ..builder import HEADS
+from ..losses import smooth_l1_loss
+from .ascend_anchor_head import AscendAnchorHead
+from .ssd_head import SSDHead
+
+
+@HEADS.register_module()
+class AscendSSDHead(SSDHead, AscendAnchorHead):
+    """Ascend SSD head used in https://arxiv.org/abs/1512.02325.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        stacked_convs (int): Number of conv layers in cls and reg tower.
+            Default: 0.
+        feat_channels (int): Number of hidden channels when stacked_convs
+            > 0. Default: 256.
+        use_depthwise (bool): Whether to use DepthwiseSeparableConv.
+            Default: False.
+        conv_cfg (dict): Dictionary to construct and config conv layer.
+            Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: None.
+        act_cfg (dict): Dictionary to construct and config activation layer.
+            Default: None.
+        anchor_generator (dict): Config dict for anchor generator
+        bbox_coder (dict): Config of bounding box coder.
+        reg_decoded_bbox (bool): If true, the regression loss would be
+            applied directly on decoded bounding boxes, converting both
+            the predicted boxes and regression targets to absolute
+            coordinates format. Default False. It should be `True` when
+            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
+        train_cfg (dict): Training config of anchor head.
+        test_cfg (dict): Testing config of anchor head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """  # noqa: W605
+
+    def __init__(self,
+                 num_classes=80,
+                 in_channels=(512, 1024, 512, 256, 256, 256),
+                 stacked_convs=0,
+                 feat_channels=256,
+                 use_depthwise=False,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 act_cfg=None,
+                 anchor_generator=dict(
+                     type='SSDAnchorGenerator',
+                     scale_major=False,
+                     input_size=300,
+                     strides=[8, 16, 32, 64, 100, 300],
+                     ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]),
+                     basesize_ratio_range=(0.1, 0.9)),
+                 bbox_coder=dict(
+                     type='DeltaXYWHBBoxCoder',
+                     clip_border=True,
+                     target_means=[.0, .0, .0, .0],
+                     target_stds=[1.0, 1.0, 1.0, 1.0]),
+                 reg_decoded_bbox=False,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=dict(
+                     type='Xavier',
+                     layer='Conv2d',
+                     distribution='uniform',
+                     bias=0)):
+        super().__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            stacked_convs=stacked_convs,
+            feat_channels=feat_channels,
+            use_depthwise=use_depthwise,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg,
+            anchor_generator=anchor_generator,
+            bbox_coder=bbox_coder,
+            reg_decoded_bbox=reg_decoded_bbox,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+        # batch_loss() has no regression path for decoded boxes yet.
+        assert self.reg_decoded_bbox is False, \
+            'reg_decoded_bbox only support False now.'
+
+    def get_static_anchors(self, featmap_sizes, img_metas, device='cuda'):
+        """Return anchors and valid flags, computing them only once.
+
+        Args:
+            featmap_sizes (list[tuple]): Size of each feature map level.
+            img_metas (list[dict]): Meta information of the images.
+            device (torch.device | str): Device to create the tensors on.
+
+        Returns:
+            tuple:
+                anchor_list (list[Tensor]): Anchors of each image.
+                valid_flag_list (list[Tensor]): Valid flags of each image.
+        """
+        is_cached = hasattr(self, 'static_anchors') and \
+            hasattr(self, 'static_valid_flags')
+        if not is_cached:
+            # First call: later calls reuse this result whatever they pass.
+            self.static_anchors, self.static_valid_flags = \
+                self.get_anchors(featmap_sizes, img_metas, device)
+        return self.static_anchors, self.static_valid_flags
+
+    def get_targets(self,
+                    anchor_list,
+                    valid_flag_list,
+                    gt_bboxes_list,
+                    img_metas,
+                    gt_bboxes_ignore_list=None,
+                    gt_labels_list=None,
+                    label_channels=1,
+                    unmap_outputs=True,
+                    return_sampling_results=False,
+                    return_level=True):
+        """Compute per-anchor learning targets for a batch of images by
+        delegating to ``AscendAnchorHead.get_targets``.
+
+        Args:
+            anchor_list (list[list[Tensor]]): Anchors per image (outer list)
+                and per feature level (inner list); every tensor has shape
+                (num_anchors, 4).
+            valid_flag_list (list[list[Tensor]]): Valid flags laid out like
+                ``anchor_list``; every tensor has shape (num_anchors, ).
+            gt_bboxes_list (list[Tensor]): Ground truth boxes, one tensor per
+                image.
+            img_metas (list[dict]): Meta information, one dict per image.
+            gt_bboxes_ignore_list (list[Tensor]): Ground truth boxes that
+                should be ignored.
+            gt_labels_list (list[Tensor]): Ground truth labels matching
+                ``gt_bboxes_list``.
+            label_channels (int): Number of label channels.
+            unmap_outputs (bool): Whether outputs are mapped back to the
+                original set of anchors.
+            return_sampling_results (bool): Whether the sampling result is
+                returned as well.
+            return_level (bool): Whether outputs are mapped back to the
+                levels of feature map sizes.
+
+        Returns:
+            tuple: Usually the learning targets listed below.
+
+                - labels_list (list[Tensor]): Labels of each level.
+                - label_weights_list (list[Tensor]): Label weights of each
+                  level.
+                - bbox_targets_list (list[Tensor]): BBox targets of each level.
+                - bbox_weights_list (list[Tensor]): BBox weights of each level.
+                - num_total_pos (int): Number of positive samples in all
+                  images.
+                - num_total_neg (int): Number of negative samples in all
+                  images.
+
+            additional_returns: User-defined extra returns produced by
+                `self._get_targets_single`, currently limited to
+                per-feature-map properties (i.e. having an HxW dimension);
+                they are concatenated at the end.
+        """
+        forwarded = (
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes_list,
+            img_metas,
+            gt_bboxes_ignore_list,
+            gt_labels_list,
+            label_channels,
+            unmap_outputs,
+            return_sampling_results,
+            return_level,
+        )
+        return AscendAnchorHead.get_targets(self, *forwarded)
+
+    def batch_loss(self, batch_cls_score, batch_bbox_pred, batch_anchor,
+                   batch_labels, batch_label_weights, batch_bbox_targets,
+                   batch_bbox_weights, batch_pos_mask, batch_neg_mask,
+                   num_total_samples):
+        """Compute the per-image SSD losses of a whole batch at once.
+
+        Args:
+            batch_cls_score (Tensor): Box scores of all images with shape
+                (num_imgs, num_total_anchors, cls_out_channels).
+            batch_bbox_pred (Tensor): Box energies / deltas of all images
+                with shape (num_imgs, num_total_anchors, 4).
+            batch_anchor (Tensor): Box reference of all images with shape
+                (num_imgs, num_total_anchors, 4).
+            batch_labels (Tensor): Labels of all anchors with shape
+                (num_imgs, num_total_anchors,).
+            batch_label_weights (Tensor): Label weights of all anchors with
+                shape (num_imgs, num_total_anchors,).
+            batch_bbox_targets (Tensor): BBox regression targets of all
+                anchors with shape (num_imgs, num_total_anchors, 4).
+            batch_bbox_weights (Tensor): BBox regression loss weights of all
+                anchors with shape (num_imgs, num_total_anchors, 4).
+            batch_pos_mask (Tensor): Positive samples mask in all images.
+            batch_neg_mask (Tensor): Negative samples mask in all images.
+            num_total_samples (int): Normalisation factor of both losses;
+                ``loss`` passes the number of positive anchors.
+
+        Returns:
+            tuple[Tensor, Tensor]: ``loss_cls`` with shape (1, num_imgs) and
+                ``loss_bbox`` with shape (num_imgs,).
+
+        Raises:
+            NotImplementedError: If ``self.reg_decoded_bbox`` is True.
+        """
+        if self.reg_decoded_bbox:
+            # TODO: support self.reg_decoded_bbox is True
+            raise NotImplementedError(
+                'batch_loss does not support reg_decoded_bbox=True yet.')
+
+        _, num_anchors, _ = batch_anchor.size()
+        batch_loss_cls_all = F.cross_entropy(
+            batch_cls_score.view((-1, self.cls_out_channels)),
+            batch_labels.view(-1),
+            reduction='none').view(
+                batch_label_weights.size()) * batch_label_weights
+
+        # Keep at most neg_pos_ratio negatives per positive, and never more
+        # negatives than the image actually has.
+        batch_num_pos_samples = torch.sum(batch_pos_mask, dim=1)
+        batch_num_neg_samples = torch.min(
+            self.train_cfg.neg_pos_ratio * batch_num_pos_samples,
+            torch.sum(batch_neg_mask, dim=1))
+
+        # Sort the negative losses in descending order, then mask out
+        # everything after the first `batch_num_neg_samples` entries.
+        batch_topk_loss_cls_neg, _ = torch.topk(
+            batch_loss_cls_all * batch_neg_mask, k=num_anchors, dim=1)
+        anchor_index = torch.arange(
+            end=num_anchors, dtype=torch.float,
+            device=batch_anchor.device).view((1, -1))
+        topk_loss_neg_mask = (anchor_index < batch_num_neg_samples.view(
+            -1, 1)).float()
+        batch_loss_cls_pos = torch.sum(
+            batch_loss_cls_all * batch_pos_mask, dim=1)
+        batch_loss_cls_neg = torch.sum(
+            batch_topk_loss_cls_neg * topk_loss_neg_mask, dim=1)
+        loss_cls = \
+            (batch_loss_cls_pos + batch_loss_cls_neg) / num_total_samples
+
+        loss_bbox_all = smooth_l1_loss(
+            batch_bbox_pred,
+            batch_bbox_targets,
+            batch_bbox_weights,
+            reduction='none',
+            beta=self.train_cfg.smoothl1_beta,
+            avg_factor=num_total_samples)
+        eps = torch.finfo(torch.float32).eps
+        sum_dim = tuple(range(1, loss_bbox_all.dim()))
+        loss_bbox = loss_bbox_all.sum(sum_dim) / (num_total_samples + eps)
+        return loss_cls[None], loss_bbox
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute the classification and box regression losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores of each scale level with
+                shape (N, num_anchors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas of each scale
+                level with shape (N, num_anchors * 4, H, W).
+            gt_bboxes (list[Tensor]): Ground truth boxes of each image in
+                [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (None | list[Tensor]): Bounding boxes that can
+                be ignored when computing the loss.
+
+        Returns:
+            dict[str, list[Tensor]] | None: ``loss_cls`` and ``loss_bbox``,
+                each a list with one entry per image; None when
+                ``get_targets`` returns None.
+        """
+        featmap_sizes = [score.shape[-2:] for score in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=1,
+            unmap_outputs=True,
+            return_level=False)
+        if targets is None:
+            return None
+
+        (batch_labels, batch_label_weights, batch_bbox_targets,
+         batch_bbox_weights, batch_pos_mask, batch_neg_mask, _sampling_result,
+         num_total_pos, _num_total_neg, batch_anchors) = targets
+
+        num_imgs = len(img_metas)
+
+        def flatten_levels(level_maps, last_dim):
+            # (N, C, H, W) per level -> (N, -1, last_dim), joined on dim 1
+            return torch.cat([
+                m.permute(0, 2, 3, 1).reshape(num_imgs, -1, last_dim)
+                for m in level_maps], 1)
+
+        batch_cls_score = flatten_levels(cls_scores, self.cls_out_channels)
+        batch_bbox_pred = flatten_levels(bbox_preds, 4)
+
+        batch_losses_cls, batch_losses_bbox = self.batch_loss(
+            batch_cls_score, batch_bbox_pred, batch_anchors, batch_labels,
+            batch_label_weights, batch_bbox_targets, batch_bbox_weights,
+            batch_pos_mask, batch_neg_mask, num_total_pos)
+        losses_cls = [batch_losses_cls[:, i] for i in range(num_imgs)]
+        losses_bbox = list(batch_losses_bbox)
+        return dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
diff --git a/mmdet/models/dense_heads/atss_head.py b/mmdet/models/dense_heads/atss_head.py
new file mode 100755
index 0000000..e8f401c
--- /dev/null
+++ b/mmdet/models/dense_heads/atss_head.py
@@ -0,0 +1,501 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, Scale
+from mmcv.runner import force_fp32
+
+from mmdet.core import (anchor_inside_flags, build_assigner, build_sampler,
+                        images_to_levels, multi_apply, reduce_mean, unmap)
+from ..builder import HEADS, build_loss
+from .anchor_head import AnchorHead
+
+
+@HEADS.register_module()
+class ATSSHead(AnchorHead):
+    """Bridging the Gap Between Anchor-based and Anchor-free Detection via
+    Adaptive Training Sample Selection.
+
+    ATSS head structure is similar with FCOS, however ATSS use anchor boxes
+    and assign label by Adaptive Training Sample Selection instead max-iou.
+
+    https://arxiv.org/abs/1912.02424
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 pred_kernel_size=3,
+                 stacked_convs=4,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+                 reg_decoded_bbox=True,
+                 loss_centerness=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 init_cfg=dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='atss_cls',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs):
+        # Assigned first: _init_layers reads these (presumably run by super).
+        self.pred_kernel_size = pred_kernel_size
+        self.stacked_convs = stacked_convs
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        super().__init__(
+            num_classes,
+            in_channels,
+            reg_decoded_bbox=reg_decoded_bbox,
+            init_cfg=init_cfg,
+            **kwargs)
+        self.sampling = False
+        if self.train_cfg:
+            self.assigner = build_assigner(self.train_cfg.assigner)
+            # sampling is disabled, so a PseudoSampler is used
+            self.sampler = build_sampler(
+                dict(type='PseudoSampler'), context=self)
+        self.loss_centerness = build_loss(loss_centerness)
+
+    def _init_layers(self):
+        """Initialize layers of the head.
+
+        Builds ``stacked_convs`` conv modules for each of the classification
+        and regression towers, the three prediction convs (class scores, box
+        deltas, centerness) and one learnable ``Scale`` per prior stride.
+        """
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            # Both towers get an identically configured conv at depth i.
+            for tower in (self.cls_convs, self.reg_convs):
+                tower.append(
+                    ConvModule(
+                        chn,
+                        self.feat_channels,
+                        3,
+                        stride=1,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg))
+        # Padding that keeps the spatial size for an odd kernel at stride 1.
+        pred_pad_size = self.pred_kernel_size // 2
+        # Size every prediction conv with the same prior count.
+        self.atss_cls = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * self.cls_out_channels,
+            self.pred_kernel_size,
+            padding=pred_pad_size)
+        self.atss_reg = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * 4,
+            self.pred_kernel_size,
+            padding=pred_pad_size)
+        self.atss_centerness = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * 1,
+            self.pred_kernel_size,
+            padding=pred_pad_size)
+        self.scales = nn.ModuleList(
+            [Scale(1.0) for _ in self.prior_generator.strides])
+
+    def forward(self, feats):
+        """Forward features from the upstream network through every level.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each
+                a 4D-tensor; paired one-to-one with ``self.scales``.
+
+        Returns:
+            tuple: Outputs of ``forward_single`` gathered by ``multi_apply``.
+                cls_scores (list[Tensor]): Classification scores of all levels.
+                bbox_preds (list[Tensor]): Box energies / deltas of all levels.
+                centernesses (list[Tensor]): Centerness predictions of all
+                    levels.
+        """
+        per_level_outputs = multi_apply(
+            self.forward_single, feats, self.scales)
+        return per_level_outputs
+
+    def forward_single(self, x, scale):
+        """Run the head on the features of one scale level.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+            scale (:obj: `mmcv.cnn.Scale`): Learnable scale module applied to
+                the bbox prediction.
+
+        Returns:
+            tuple:
+                cls_score (Tensor): Cls scores for a single scale level,
+                    the channels number is num_anchors * num_classes.
+                bbox_pred (Tensor): Box energies / deltas for a single scale
+                    level, the channels number is num_anchors * 4.
+                centerness (Tensor): Centerness for a single scale level,
+                    the channels number is num_anchors * 1.
+        """
+        cls_feat, reg_feat = x, x
+        for conv in self.cls_convs:
+            cls_feat = conv(cls_feat)
+        for conv in self.reg_convs:
+            reg_feat = conv(reg_feat)
+        cls_score = self.atss_cls(cls_feat)
+        # Following ATSS, exp is not applied to the box prediction.
+        box_deltas = self.atss_reg(reg_feat)
+        bbox_pred = scale(box_deltas).float()
+        centerness = self.atss_centerness(reg_feat)
+        return cls_score, bbox_pred, centerness
+
+    def loss_single(self, anchors, cls_score, bbox_pred, centerness, labels,
+                    label_weights, bbox_targets, num_total_samples):
+        """Compute the losses of a single scale level.
+
+        Args:
+            anchors (Tensor): Box reference for each scale level with shape
+                (N, num_total_anchors, 4).
+            cls_score (Tensor): Box scores for each scale level with shape
+                (N, num_anchors * num_classes, H, W).
+            bbox_pred (Tensor): Box energies / deltas for each scale level
+                with shape (N, num_anchors * 4, H, W).
+            centerness (Tensor): Centerness for each scale level with shape
+                (N, num_anchors * 1, H, W).
+            labels (Tensor): Labels of each anchor with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors).
+            bbox_targets (Tensor): BBox regression targets of each anchor
+                with shape (N, num_total_anchors, 4).
+            num_total_samples (int): Number of positive samples that is
+                reduced over all GPUs.
+
+        Returns:
+            tuple[Tensor]: ``loss_cls``, ``loss_bbox``, ``loss_centerness``
+                and the sum of the centerness targets of the positive
+                anchors, which is a zero tensor when this level has no
+                positive anchor.
+        """
+
+        anchors = anchors.reshape(-1, 4)
+        cls_score = cls_score.permute(0, 2, 3, 1).reshape(
+            -1, self.cls_out_channels).contiguous()
+        bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
+        centerness = centerness.permute(0, 2, 3, 1).reshape(-1)
+        bbox_targets = bbox_targets.reshape(-1, 4)
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+
+        loss_cls = self.loss_cls(
+            cls_score, labels, label_weights, avg_factor=num_total_samples)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        is_foreground = (labels >= 0) & (labels < self.num_classes)
+        pos_inds = is_foreground.nonzero().squeeze(1)
+
+        if len(pos_inds) == 0:
+            # No positives: use zeros derived from the predictions.
+            loss_bbox = bbox_pred.sum() * 0
+            loss_centerness = centerness.sum() * 0
+            centerness_targets = bbox_targets.new_tensor(0.)
+            return (loss_cls, loss_bbox, loss_centerness,
+                    centerness_targets.sum())
+
+        pos_anchors = anchors[pos_inds]
+        pos_bbox_targets = bbox_targets[pos_inds]
+        centerness_targets = self.centerness_target(
+            pos_anchors, pos_bbox_targets)
+        pos_decode_bbox_pred = self.bbox_coder.decode(
+            pos_anchors, bbox_pred[pos_inds])
+
+        # Centerness-weighted regression loss; avg_factor=1.0 here, the
+        # caller `loss` divides by the summed centerness targets later.
+        loss_bbox = self.loss_bbox(
+            pos_decode_bbox_pred,
+            pos_bbox_targets,
+            weight=centerness_targets,
+            avg_factor=1.0)
+        loss_centerness = self.loss_centerness(
+            centerness[pos_inds],
+            centerness_targets,
+            avg_factor=num_total_samples)
+        return loss_cls, loss_bbox, loss_centerness, centerness_targets.sum()
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'centernesses'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             centernesses,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute the classification, box and centerness losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level with
+                shape (N, num_anchors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            centernesses (list[Tensor]): Centerness for each scale level
+                with shape (N, num_anchors * 1, H, W).
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box.
+            img_metas (list[dict]): Meta information of each image.
+            gt_bboxes_ignore (list[Tensor] | None): Bounding boxes that can
+                be ignored when computing the loss.
+
+        Returns:
+            dict[str, Tensor]: Loss components, or None without targets.
+        """
+        featmap_sizes = [score.shape[-2:] for score in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels)
+        if cls_reg_targets is None:
+            return None
+
+        (anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         _bbox_weights_list, num_total_pos, _num_total_neg) = cls_reg_targets
+
+        # Average the positive count over all GPUs and keep it >= 1.
+        avg_num_pos = reduce_mean(
+            torch.tensor(num_total_pos, dtype=torch.float, device=device))
+        num_total_samples = max(avg_num_pos.item(), 1.0)
+
+        losses_cls, losses_bbox, loss_centerness, centerness_sums = \
+            multi_apply(
+                self.loss_single,
+                anchor_list,
+                cls_scores,
+                bbox_preds,
+                centernesses,
+                labels_list,
+                label_weights_list,
+                bbox_targets_list,
+                num_total_samples=num_total_samples)
+
+        # Normalise the box loss by the summed centerness targets.
+        bbox_avg_factor = reduce_mean(
+            sum(centerness_sums)).clamp_(min=1).item()
+        losses_bbox = [loss / bbox_avg_factor for loss in losses_bbox]
+        return dict(
+            loss_cls=losses_cls,
+            loss_bbox=losses_bbox,
+            loss_centerness=loss_centerness)
+
+    def centerness_target(self, anchors, gts):
+        """Centerness of each anchor centre w.r.t. its target box.
+
+        Only call this for positive anchors, otherwise there may be nan.
+        """
+        ctr_x = (anchors[:, 2] + anchors[:, 0]) / 2
+        ctr_y = (anchors[:, 3] + anchors[:, 1]) / 2
+
+        # distances from the anchor centre to the left/right/top/bottom
+        horiz = torch.stack([ctr_x - gts[:, 0], gts[:, 2] - ctr_x], dim=1)
+        vert = torch.stack([ctr_y - gts[:, 1], gts[:, 3] - ctr_y], dim=1)
+        ratio_h = horiz.min(dim=-1)[0] / horiz.max(dim=-1)[0]
+        ratio_v = vert.min(dim=-1)[0] / vert.max(dim=-1)[0]
+        centerness = torch.sqrt(ratio_h * ratio_v)
+        assert not torch.isnan(centerness).any()
+        return centerness
+
+    def get_targets(self,
+                    anchor_list,
+                    valid_flag_list,
+                    gt_bboxes_list,
+                    img_metas,
+                    gt_bboxes_ignore_list=None,
+                    gt_labels_list=None,
+                    label_channels=1,
+                    unmap_outputs=True):
+        """Get targets for ATSS head.
+
+        This method is almost the same as `AnchorHead.get_targets()`. Besides
+        returning the targets as the parent method does, it also returns the
+        anchors as the first element of the returned tuple. Returns None if
+        any image yields no labels.
+        """
+        num_imgs = len(img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        # number of anchors on each level (taken from the first image)
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        num_level_anchors_list = [num_level_anchors] * num_imgs
+
+        # Flatten the levels of every image; this overwrites the entries of
+        # the lists passed in by the caller.
+        for i in range(num_imgs):
+            assert len(anchor_list[i]) == len(valid_flag_list[i])
+            anchor_list[i] = torch.cat(anchor_list[i])
+            valid_flag_list[i] = torch.cat(valid_flag_list[i])
+
+        # compute targets for each image
+        if gt_bboxes_ignore_list is None:
+            gt_bboxes_ignore_list = [None] * num_imgs
+        if gt_labels_list is None:
+            gt_labels_list = [None] * num_imgs
+        per_image_targets = multi_apply(
+            self._get_target_single,
+            anchor_list,
+            valid_flag_list,
+            num_level_anchors_list,
+            gt_bboxes_list,
+            gt_bboxes_ignore_list,
+            gt_labels_list,
+            img_metas,
+            label_channels=label_channels,
+            unmap_outputs=unmap_outputs)
+        (all_anchors, all_labels, all_label_weights, all_bbox_targets,
+         all_bbox_weights, pos_inds_list, neg_inds_list) = per_image_targets
+        # no valid anchors
+        if any(labels is None for labels in all_labels):
+            return None
+        # sampled anchors of all images; every image counts at least once
+        num_total_pos = sum(max(inds.numel(), 1) for inds in pos_inds_list)
+        num_total_neg = sum(max(inds.numel(), 1) for inds in neg_inds_list)
+        # split targets to a list w.r.t. multiple levels
+        (anchors_list, labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list) = (
+             images_to_levels(per_image, num_level_anchors)
+             for per_image in (all_anchors, all_labels, all_label_weights,
+                               all_bbox_targets, all_bbox_weights))
+        return (anchors_list, labels_list, label_weights_list,
+                bbox_targets_list, bbox_weights_list, num_total_pos,
+                num_total_neg)
+
+    def _get_target_single(self,
+                           flat_anchors,
+                           valid_flags,
+                           num_level_anchors,
+                           gt_bboxes,
+                           gt_bboxes_ignore,
+                           gt_labels,
+                           img_meta,
+                           label_channels=1,
+                           unmap_outputs=True):
+        """Compute regression, classification targets for anchors in a single
+        image.
+
+        Args:
+            flat_anchors (Tensor): Multi-level anchors of the image, which are
+                concatenated into a single tensor of shape (num_anchors, 4).
+            valid_flags (Tensor): Multi level valid flags of the image,
+                which are concatenated into a single tensor of
+                shape (num_anchors,).
+            num_level_anchors (list[int]): Number of anchors of each level.
+            gt_bboxes (Tensor): Ground truth bboxes of the image,
+                shape (num_gts, 4).
+            gt_bboxes_ignore (Tensor): Ground truth bboxes to be
+                ignored, shape (num_ignored_gts, 4).
+            gt_labels (Tensor): Ground truth labels of each box,
+                shape (num_gts,). May be None, see the rpn note below.
+            img_meta (dict): Meta info of the image; 'img_shape' is read.
+            label_channels (int): Channel of label. Unused in this method.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple: Seven ``None`` if no anchor is inside the image. Otherwise,
+                with N the number of returned anchors:
+                anchors (Tensor): Anchors inside the image, mapped back to
+                    the full anchor set if unmap_outputs, shape (N, 4).
+                labels (Tensor): Labels of all anchors, shape (N,).
+                label_weights (Tensor): Label weights of all anchors,
+                    shape (N,).
+                bbox_targets (Tensor): BBox targets of all anchors,
+                    shape (N, 4).
+                bbox_weights (Tensor): BBox weights of all anchors,
+                    shape (N, 4).
+                pos_inds (Tensor): Indices of positive anchors, (num_pos,).
+                neg_inds (Tensor): Indices of negative anchors, (num_neg,).
+        """
+        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                           img_meta['img_shape'][:2],
+                                           self.train_cfg.allowed_border)
+        if not inside_flags.any():
+            return (None, ) * 7
+        # assign gt and sample anchors
+        anchors = flat_anchors[inside_flags, :]
+
+        num_level_anchors_inside = self.get_num_level_anchors_inside(
+            num_level_anchors, inside_flags)
+        assign_result = self.assigner.assign(anchors, num_level_anchors_inside,
+                                             gt_bboxes, gt_bboxes_ignore,
+                                             gt_labels)
+
+        sampling_result = self.sampler.sample(assign_result, anchors,
+                                              gt_bboxes)
+        # defaults: label self.num_classes, zero targets and zero weights
+        num_valid_anchors = anchors.shape[0]
+        bbox_targets = torch.zeros_like(anchors)
+        bbox_weights = torch.zeros_like(anchors)
+        labels = anchors.new_full((num_valid_anchors, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
+        # pos_inds / neg_inds index the inside-image anchors selected above
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            if self.reg_decoded_bbox:
+                pos_bbox_targets = sampling_result.pos_gt_bboxes
+            else:
+                pos_bbox_targets = self.bbox_coder.encode(
+                    sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
+
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1.0
+            if gt_labels is None:
+                # Only rpn gives gt_labels as None
+                # Foreground is the first class since v2.5.0
+                labels[pos_inds] = 0
+            else:
+                labels[pos_inds] = gt_labels[
+                    sampling_result.pos_assigned_gt_inds]
+            if self.train_cfg.pos_weight <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg.pos_weight
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_anchors.size(0)
+            anchors = unmap(anchors, num_total_anchors, inside_flags)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags, fill=self.num_classes)
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+            bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
+            bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
+
+        return (anchors, labels, label_weights, bbox_targets, bbox_weights,
+                pos_inds, neg_inds)
+
+    def get_num_level_anchors_inside(self, num_level_anchors, inside_flags):
+        """Count, for each level, the anchors whose inside flag is set."""
+        # cut the flat flags into one chunk per level
+        per_level_flags = torch.split(inside_flags, num_level_anchors)
+        # the sum of a chunk is the number of set flags at that level
+        return [int(level_flags.sum()) for level_flags in per_level_flags]
diff --git a/mmdet/models/dense_heads/autoassign_head.py b/mmdet/models/dense_heads/autoassign_head.py
new file mode 100755
index 0000000..446da24
--- /dev/null
+++ b/mmdet/models/dense_heads/autoassign_head.py
@@ -0,0 +1,527 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import bias_init_with_prob, normal_init
+from mmcv.runner import force_fp32
+
+from mmdet.core import multi_apply
+from mmdet.core.anchor.point_generator import MlvlPointGenerator
+from mmdet.core.bbox import bbox_overlaps
+from mmdet.models import HEADS
+from mmdet.models.dense_heads.atss_head import reduce_mean
+from mmdet.models.dense_heads.fcos_head import FCOSHead
+from mmdet.models.dense_heads.paa_head import levels_to_images
+
+EPS = 1e-12
+
+
+class CenterPrior(nn.Module):
+    """Center Weighting module to adjust the category-specific prior
+    distributions.
+
+    Args:
+        force_topk (bool): When no point falls into gt_bbox, forcibly
+            select the k points closest to the center to calculate
+            the center prior. Defaults to False.
+        topk (int): Number of points (capped at the number available) used
+            for the center prior when no point falls in gt_bbox. Only works
+            when force_topk is True. Defaults to 9.
+        num_classes (int): The class number of dataset. Defaults to 80.
+        strides (tuple[int]): The stride of each input feature map. Defaults
+            to (8, 16, 32, 64, 128).
+    """
+
+    def __init__(self,
+                 force_topk=False,
+                 topk=9,
+                 num_classes=80,
+                 strides=(8, 16, 32, 64, 128)):
+        super(CenterPrior, self).__init__()
+        self.mean = nn.Parameter(torch.zeros(num_classes, 2))
+        self.sigma = nn.Parameter(torch.ones(num_classes, 2))
+        self.strides = strides
+        self.force_topk = force_topk
+        self.topk = topk
+
+    def forward(self, anchor_points_list, gt_bboxes, labels,
+                inside_gt_bbox_mask):
+        """Get the center prior of each point on the feature map for each
+        instance.
+
+        Args:
+            anchor_points_list (list[Tensor]): list of coordinate
+                of points on feature map. Each with shape
+                (num_points, 2).
+            gt_bboxes (Tensor): The gt_bboxes with shape of
+                (num_gt, 4).
+            labels (Tensor): The gt_labels with shape of (num_gt).
+            inside_gt_bbox_mask (Tensor): Tensor of bool type,
+                with shape of (num_points, num_gt), each
+                value is used to mark whether this point falls
+                within a certain gt. It is cloned, not modified in place.
+
+        Returns:
+            tuple(Tensor):
+
+                - center_prior_weights(Tensor): Float tensor with shape \
+                    of (num_points, num_gt). Each value represents \
+                    the center weighting coefficient.
+                - inside_gt_bbox_mask (Tensor): Tensor of bool type, \
+                    with shape of (num_points, num_gt), each \
+                    value is used to mark whether this point falls \
+                    within a certain gt or is the topk nearest points for \
+                    a specific gt_bbox.
+        """
+        inside_gt_bbox_mask = inside_gt_bbox_mask.clone()
+        num_gts = len(labels)
+        num_points = sum([len(item) for item in anchor_points_list])
+        if num_gts == 0:
+            return gt_bboxes.new_zeros(num_points,
+                                       num_gts), inside_gt_bbox_mask
+        # level-independent terms, computed once instead of once per level
+        gt_center_x = ((gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2)
+        gt_center_y = ((gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2)
+        # gt_center has shape (1, num_gt, 2)
+        gt_center = torch.stack((gt_center_x, gt_center_y), dim=1)[None]
+        # instance_center has shape (1, num_gt, 2)
+        instance_center = self.mean[labels][None]
+        # instance_sigma has shape (1, num_gt, 2)
+        instance_sigma = self.sigma[labels][None]
+        center_prior_list = []
+        for slvl_points, stride in zip(anchor_points_list, self.strides):
+            # slvl_points: points from single level in FPN, has shape (h*w, 2)
+            # single_level_points has shape (h*w, num_gt, 2)
+            single_level_points = slvl_points[:, None, :].expand(
+                (slvl_points.size(0), len(gt_bboxes), 2))
+            # distance has shape (num_points, num_gt, 2)
+            distance = (((single_level_points - gt_center) / float(stride) -
+                         instance_center)**2)
+            center_prior = torch.exp(-distance /
+                                     (2 * instance_sigma**2)).prod(dim=-1)
+            center_prior_list.append(center_prior)
+        center_prior_weights = torch.cat(center_prior_list, dim=0)
+
+        if self.force_topk:
+            gt_inds_no_points_inside = torch.nonzero(
+                inside_gt_bbox_mask.sum(0) == 0).reshape(-1)
+            if gt_inds_no_points_inside.numel():
+                # topk raises if k exceeds the number of points, so cap it
+                k = min(self.topk, center_prior_weights.size(0))
+                topk_center_index = center_prior_weights[
+                    :, gt_inds_no_points_inside].topk(k, dim=0)[1]
+                temp_mask = inside_gt_bbox_mask[:, gt_inds_no_points_inside]
+                inside_gt_bbox_mask[:, gt_inds_no_points_inside] = \
+                    torch.scatter(temp_mask,
+                                  dim=0,
+                                  index=topk_center_index,
+                                  src=torch.ones_like(
+                                      topk_center_index, dtype=torch.bool))
+
+        center_prior_weights[~inside_gt_bbox_mask] = 0
+        return center_prior_weights, inside_gt_bbox_mask
+
+
+@HEADS.register_module()
+class AutoAssignHead(FCOSHead):
+    """AutoAssignHead head used in AutoAssign.
+
+    More details can be found in the `paper
+    <https://arxiv.org/abs/2007.03496>`_ .
+
+    Args:
+        force_topk (bool): Used in center prior initialization to
+            handle extremely small gt. Default is False.
+        topk (int): The number of points used to calculate the
+            center prior when no point falls in gt_bbox. Only works when
+            force_topk is True. Defaults to 9.
+        pos_loss_weight (float): The loss weight of positive loss
+            and with default value 0.25.
+        neg_loss_weight (float): The loss weight of negative loss
+            and with default value 0.75.
+        center_loss_weight (float): The loss weight of center prior
+            loss and with default value 0.75.
+    """
+
+    def __init__(self,
+                 *args,
+                 force_topk=False,
+                 topk=9,
+                 pos_loss_weight=0.25,
+                 neg_loss_weight=0.75,
+                 center_loss_weight=0.75,
+                 **kwargs):
+        super().__init__(*args, conv_bias=True, **kwargs)
+        # scalar weights of the three loss terms
+        self.pos_loss_weight = pos_loss_weight
+        self.neg_loss_weight = neg_loss_weight
+        self.center_loss_weight = center_loss_weight
+        # offset=0: the point generator is built without an offset
+        self.prior_generator = MlvlPointGenerator(self.strides, offset=0)
+        self.center_prior = CenterPrior(
+            force_topk=force_topk, topk=topk,
+            num_classes=self.num_classes, strides=self.strides)
+
+    def init_weights(self):
+        """Initialize weights of the head.
+
+        After the parent initialization, the classification and regression
+        convs are re-initialized with dedicated bias values.
+        """
+        super().init_weights()
+        # classification bias produced by bias_init_with_prob(0.02)
+        cls_bias = bias_init_with_prob(0.02)
+        normal_init(self.conv_cls, std=0.01, bias=cls_bias)
+        normal_init(self.conv_reg, std=0.01, bias=4.0)
+
+    def forward_single(self, x, scale, stride):
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+            scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+            stride (int): The corresponding stride for feature maps. The
+                clamped bbox prediction is always multiplied by it;
+                ``self.norm_on_bbox`` is not consulted here.
+
+        Returns:
+            tuple: scores for each class, bbox predictions and centerness \
+                predictions of input feature maps.
+        """
+        # skip FCOSHead.forward_single and call its parent's implementation
+        head_outs = super(FCOSHead, self).forward_single(x)
+        cls_score, bbox_pred, cls_feat, reg_feat = head_outs
+        centerness = self.conv_centerness(reg_feat)
+        # scale the bbox_pred of different level
+        # float to avoid overflow when enabling FP16
+        scaled_pred = scale(bbox_pred).float()
+        # clamp(min=0) instead of F.relu: with PyTorch 1.10, F.relu modified
+        # a bbox_pred that is needed for gradient computation
+        bbox_pred = scaled_pred.clamp(min=0)
+        bbox_pred *= stride
+        return cls_score, bbox_pred, centerness
+
+    def get_pos_loss_single(self, cls_score, objectness, reg_loss, gt_labels,
+                            center_prior_weights):
+        """Calculate the positive loss of all points in gt_bboxes.
+
+        Args:
+            cls_score (Tensor): All category scores for each point on
+                the feature map. The shape is (num_points, num_class).
+            objectness (Tensor): Foreground probability of all points,
+                has shape (num_points, 1).
+            reg_loss (Tensor): The regression loss of each gt_bbox and each
+                prediction box, has shape of (num_points, num_gt).
+            gt_labels (Tensor): The zero-based gt_labels of all gt
+                with shape of (num_gt,).
+            center_prior_weights (Tensor): Float tensor with shape
+                of (num_points, num_gt). Each value represents
+                the center weighting coefficient.
+
+        Returns:
+            tuple[Tensor]:
+
+                - pos_loss (Tensor): The positive loss of all points
+                  in the gt_bboxes, summed and scaled by pos_loss_weight.
+        """
+        # localization confidence of every (point, gt) pair
+        loc_conf = torch.exp(-reg_loss)
+        # classification confidence, picked at the class of each gt
+        joint_score = cls_score * objectness
+        cls_conf = joint_score[:, gt_labels]
+        # joint confidence indicator
+        joint_conf = cls_conf * loc_conf
+
+        # 3 is a hyper-parameter to control the contributions of high and
+        # low confidence locations towards positive losses.
+        conf_weight = torch.exp(joint_conf * 3)
+        # normalize the weights over the points of each gt
+        numerator = conf_weight * center_prior_weights
+        denominator = (conf_weight * center_prior_weights).sum(
+            0, keepdim=True).clamp(min=EPS)
+        weighted_conf = (joint_conf * (numerator / denominator)).sum(0)
+        # binary cross entropy against an all-ones target
+        target = torch.ones_like(weighted_conf)
+        bce = F.binary_cross_entropy(weighted_conf, target, reduction='none')
+        return (bce.sum() * self.pos_loss_weight, )
+
+    def get_neg_loss_single(self, cls_score, objectness, gt_labels, ious,
+                            inside_gt_bbox_mask):
+        """Calculate the negative loss of all points in feature map.
+
+        Args:
+            cls_score (Tensor): All category scores for each point on
+                the feature map. The shape is (num_points, num_class).
+            objectness (Tensor): Foreground probability of all points
+                and is shape of (num_points, 1).
+            gt_labels (Tensor): The zero-based label of all gt with shape of
+                (num_gt).
+            ious (Tensor): Float tensor with shape of (num_points, num_gt).
+                Each value represents the iou of pred_bbox and gt_bboxes.
+            inside_gt_bbox_mask (Tensor): Tensor of bool type,
+                with shape of (num_points, num_gt), each
+                value is used to mark whether this point falls
+                within a certain gt.
+
+        Returns:
+            tuple[Tensor]:
+
+                - neg_loss (Tensor): The negative loss of all points
+                  in the feature map, summed and scaled by neg_loss_weight.
+        """
+        num_gts = len(gt_labels)
+        joint_conf = (cls_score * objectness)
+        p_neg_weight = torch.ones_like(joint_conf)
+        if num_gts > 0:
+            # the order of dimension would affect the value of
+            # p_neg_weight, we strictly follow the original
+            # implementation.
+            inside_gt_bbox_mask = inside_gt_bbox_mask.permute(1, 0)
+            ious = ious.permute(1, 0)
+            # after the permute: idxs[0] are gt indices, idxs[1] point indices
+            foreground_idxs = torch.nonzero(inside_gt_bbox_mask, as_tuple=True)
+            temp_weight = (1 / (1 - ious[foreground_idxs]).clamp_(EPS))
+
+            def normalize(x):
+                return (x - x.min() + EPS) / (x.max() - x.min() + EPS)
+            # rescale the weights of each gt instance separately
+            for instance_idx in range(num_gts):
+                idxs = foreground_idxs[0] == instance_idx
+                if idxs.any():
+                    temp_weight[idxs] = normalize(temp_weight[idxs])
+
+            p_neg_weight[foreground_idxs[1],
+                         gt_labels[foreground_idxs[0]]] = 1 - temp_weight
+
+        logits = (joint_conf * p_neg_weight)
+        neg_loss = (
+            logits**2 * F.binary_cross_entropy(
+                logits, torch.zeros_like(logits), reduction='none'))
+        neg_loss = neg_loss.sum() * self.neg_loss_weight
+        return neg_loss,
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'objectnesses'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             objectnesses,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute loss of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level,
+                each is a 4D-tensor, the channel number is
+                num_points * num_classes.
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_points * 4.
+            objectnesses (list[Tensor]): objectness for each scale level, each
+                is a 4D-tensor, the channel number is num_points * 1.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc. Only its length is used.
+            gt_bboxes_ignore (None | list[Tensor]): Boxes that could be
+                ignored; not referenced in this implementation.
+
+        Returns:
+            dict[str, Tensor]: Losses keyed loss_pos, loss_neg, loss_center.
+        """
+
+        assert len(cls_scores) == len(bbox_preds) == len(objectnesses)
+        all_num_gt = sum([len(item) for item in gt_bboxes])
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        all_level_points = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=bbox_preds[0].dtype,
+            device=bbox_preds[0].device)
+        inside_gt_bbox_mask_list, bbox_targets_list = self.get_targets(
+            all_level_points, gt_bboxes)
+        # per image: center prior weights and the mask returned with them
+        center_prior_weight_list = []
+        temp_inside_gt_bbox_mask_list = []
+        for gt_bboxe, gt_label, inside_gt_bbox_mask in zip(
+                gt_bboxes, gt_labels, inside_gt_bbox_mask_list):
+            center_prior_weight, inside_gt_bbox_mask = \
+                self.center_prior(all_level_points, gt_bboxe, gt_label,
+                                  inside_gt_bbox_mask)
+            center_prior_weight_list.append(center_prior_weight)
+            temp_inside_gt_bbox_mask_list.append(inside_gt_bbox_mask)
+        inside_gt_bbox_mask_list = temp_inside_gt_bbox_mask_list
+        mlvl_points = torch.cat(all_level_points, dim=0)
+        bbox_preds = levels_to_images(bbox_preds)
+        cls_scores = levels_to_images(cls_scores)
+        objectnesses = levels_to_images(objectnesses)
+
+        reg_loss_list = []
+        ious_list = []
+        num_points = len(mlvl_points)
+        # per image: regression loss and iou of every (point, gt) pair
+        for bbox_pred, encoded_targets, inside_gt_bbox_mask in zip(
+                bbox_preds, bbox_targets_list, inside_gt_bbox_mask_list):
+            temp_num_gt = encoded_targets.size(1)
+            expand_mlvl_points = mlvl_points[:, None, :].expand(
+                num_points, temp_num_gt, 2).reshape(-1, 2)
+            encoded_targets = encoded_targets.reshape(-1, 4)
+            expand_bbox_pred = bbox_pred[:, None, :].expand(
+                num_points, temp_num_gt, 4).reshape(-1, 4)
+            decoded_bbox_preds = self.bbox_coder.decode(
+                expand_mlvl_points, expand_bbox_pred)
+            decoded_target_preds = self.bbox_coder.decode(
+                expand_mlvl_points, encoded_targets)
+            with torch.no_grad():
+                ious = bbox_overlaps(
+                    decoded_bbox_preds, decoded_target_preds, is_aligned=True)
+                ious = ious.reshape(num_points, temp_num_gt)
+                if temp_num_gt:
+                    ious = ious.max(
+                        dim=-1, keepdim=True).values.repeat(1, temp_num_gt)
+                else:
+                    ious = ious.new_zeros(num_points, temp_num_gt)
+                ious[~inside_gt_bbox_mask] = 0
+                ious_list.append(ious)
+            loss_bbox = self.loss_bbox(
+                decoded_bbox_preds,
+                decoded_target_preds,
+                weight=None,
+                reduction_override='none')
+            reg_loss_list.append(loss_bbox.reshape(num_points, temp_num_gt))
+        # the per-image losses below take sigmoid outputs, not raw scores
+        cls_scores = [item.sigmoid() for item in cls_scores]
+        objectnesses = [item.sigmoid() for item in objectnesses]
+        pos_loss_list, = multi_apply(self.get_pos_loss_single, cls_scores,
+                                     objectnesses, reg_loss_list, gt_labels,
+                                     center_prior_weight_list)
+        pos_avg_factor = reduce_mean(
+            bbox_pred.new_tensor(all_num_gt)).clamp_(min=1)
+        pos_loss = sum(pos_loss_list) / pos_avg_factor
+        # NOTE: bbox_pred above is the variable left over from the loop
+        neg_loss_list, = multi_apply(self.get_neg_loss_single, cls_scores,
+                                     objectnesses, gt_labels, ious_list,
+                                     inside_gt_bbox_mask_list)
+        neg_avg_factor = sum(item.data.sum()
+                             for item in center_prior_weight_list)
+        neg_avg_factor = reduce_mean(neg_avg_factor).clamp_(min=1)
+        neg_loss = sum(neg_loss_list) / neg_avg_factor
+        # center loss per image: num_gt / sum of its center prior weights
+        center_loss = []
+        for i in range(len(img_metas)):
+
+            if inside_gt_bbox_mask_list[i].any():
+                center_loss.append(
+                    len(gt_bboxes[i]) /
+                    center_prior_weight_list[i].sum().clamp_(min=EPS))
+            # when width or height of gt_bbox is smaller than stride of p3
+            else:
+                center_loss.append(center_prior_weight_list[i].sum() * 0)
+
+        center_loss = torch.stack(center_loss).mean() * self.center_loss_weight
+
+        # avoid dead lock in DDP
+        if all_num_gt == 0:
+            pos_loss = bbox_preds[0].sum() * 0
+            dummy_center_prior_loss = self.center_prior.mean.sum(
+            ) * 0 + self.center_prior.sigma.sum() * 0
+            center_loss = objectnesses[0].sum() * 0 + dummy_center_prior_loss
+
+        loss = dict(
+            loss_pos=pos_loss, loss_neg=neg_loss, loss_center=center_loss)
+
+        return loss
+
+    def get_targets(self, points, gt_bboxes_list):
+        """Compute regression targets and each point inside or outside gt_bbox
+        in multiple images.
+
+        Args:
+            points (list[Tensor]): Points of all fpn level, each has shape
+                (num_points, 2).
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
+                each has shape (num_gt, 4).
+
+        Returns:
+            tuple(list[Tensor]):
+
+                - inside_gt_bbox_mask_list (list[Tensor]): One tensor
+                  per image, with bool type and shape of
+                  (num_points, num_gt), each value
+                  is used to mark whether this point falls
+                  within a certain gt.
+                - bbox_targets_list (list[Tensor]): BBox
+                  targets of each image. Each tensor has shape
+                  (num_points, num_gt, 4).
+        """
+
+        # every image is matched against the same concatenated point set
+        all_points = torch.cat(points, dim=0)
+        mask_per_img, targets_per_img = multi_apply(
+            self._get_target_single, gt_bboxes_list, points=all_points)
+        return mask_per_img, targets_per_img
+
+    def _get_target_single(self, gt_bboxes, points):
+        """Compute regression targets and each point inside or outside gt_bbox
+        for a single image.
+
+        Args:
+            gt_bboxes (Tensor): gt_bbox of single image, has shape
+                (num_gt, 4).
+            points (Tensor): Points of all fpn level, has shape
+                (num_points, 2).
+
+        Returns:
+            tuple[Tensor]: Containing the following Tensors:
+
+                - inside_gt_bbox_mask (Tensor): Bool tensor with shape
+                  (num_points, num_gt), each value is used to mark
+                  whether this point falls within a certain gt.
+                - bbox_targets (Tensor): (left, top, right, bottom) of each
+                  point to each gt_bbox, shape (num_points, num_gt, 4).
+        """
+        num_points = points.size(0)
+        num_gts = gt_bboxes.size(0)
+        # broadcast the boxes against every point: (num_points, num_gts, 4)
+        boxes = gt_bboxes[None].expand(num_points, num_gts, 4)
+        # point coordinates as columns of shape (num_points, 1)
+        xs = points[:, 0][:, None]
+        ys = points[:, 1][:, None]
+        # signed distances from each point to the four box sides
+        sides = (xs - boxes[..., 0], ys - boxes[..., 1],
+                 boxes[..., 2] - xs, boxes[..., 3] - ys)
+        bbox_targets = torch.stack(sides, -1)
+        if not num_gts:
+            # no gt: return an empty bool mask of shape (num_points, 0)
+            return bbox_targets.new_zeros(
+                (num_points, num_gts), dtype=torch.bool), bbox_targets
+        # a point is inside a gt only if all four distances are positive
+        inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0
+        return inside_gt_bbox_mask, bbox_targets
+
+    def _get_points_single(self,
+                           featmap_size,
+                           stride,
+                           dtype,
+                           device,
+                           flatten=False):
+        """Almost the same as the implementation in fcos, we remove half stride
+        offset to align with the original implementation.
+
+        This function will be deprecated soon. ``flatten`` is not used here.
+        """
+        warnings.warn(
+            '`_get_points_single` in `AutoAssignHead` will be '
+            'deprecated soon, we support a multi level point generator now. '
+            'You can get points of a single level feature map '
+            'with `self.prior_generator.single_level_grid_priors` ')
+        y, x = super(FCOSHead,
+                     self)._get_points_single(featmap_size, stride, dtype,
+                                              device)
+        points = torch.stack((x.reshape(-1) * stride, y.reshape(-1) * stride),
+                             dim=-1)
+        return points
diff --git a/mmdet/models/dense_heads/base_dense_head.py b/mmdet/models/dense_heads/base_dense_head.py
new file mode 100755
index 0000000..0c7abb7
--- /dev/null
+++ b/mmdet/models/dense_heads/base_dense_head.py
@@ -0,0 +1,526 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+import torch
+from mmcv.cnn.utils.weight_init import constant_init
+from mmcv.ops import batched_nms
+from mmcv.runner import BaseModule, force_fp32
+
+from mmdet.core.utils import filter_scores_and_topk, select_single_mlvl
+
+
+class BaseDenseHead(BaseModule, metaclass=ABCMeta):
+    """Base class for DenseHeads."""
+
+    def __init__(self, init_cfg=None):
+        super(BaseDenseHead, self).__init__(init_cfg)
+
+    def init_weights(self):
+        super(BaseDenseHead, self).init_weights()
+        # keep init_cfg from overwriting the initialization of `conv_offset`
+        for m in self.modules():
+            # DeformConv2dPack, ModulatedDeformConv2dPack
+            if hasattr(m, 'conv_offset'):
+                constant_init(m.conv_offset, 0)
+
+    @abstractmethod
+    def loss(self, **kwargs):
+        """Compute losses of the head."""
+        pass
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def get_bboxes(self,
+                   cls_scores,
+                   bbox_preds,
+                   score_factors=None,
+                   img_metas=None,
+                   cfg=None,
+                   rescale=False,
+                   with_nms=True,
+                   **kwargs):
+        """Transform network outputs of a batch into bbox results.
+
+        Note: When score_factors is not None, the cls_scores are
+        usually multiplied by it then obtain the real score used in NMS,
+        such as CenterNess in FCOS, IoU branch in ATSS.
+
+        Args:
+            cls_scores (list[Tensor]): Classification scores for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 4, H, W).
+            score_factors (list[Tensor], Optional): Score factor for
+                all scale level, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 1, H, W). Default None.
+            img_metas (list[dict], Optional): Image meta info. Default None.
+            cfg (mmcv.Config, Optional): Test / postprocessing configuration,
+                if None, test_cfg would be used.  Default None.
+            rescale (bool): If True, return boxes in original image space.
+                Default False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default True.
+
+        Returns:
+            list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple.
+                The first item is an (n, 5) tensor, where the first 4 columns
+                are bounding box positions (tl_x, tl_y, br_x, br_y) and the
+                5-th column is a score between 0 and 1. The second item is a
+                (n,) tensor where each item is the predicted class label of
+                the corresponding box.
+        """
+        assert len(cls_scores) == len(bbox_preds)
+
+        if score_factors is None:
+            # e.g. Retina, FreeAnchor, Foveabox, etc.
+            with_score_factors = False
+        else:
+            # e.g. FCOS, PAA, ATSS, AutoAssign, etc.
+            with_score_factors = True
+            assert len(cls_scores) == len(score_factors)
+
+        num_levels = len(cls_scores)
+
+        featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
+        mlvl_priors = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=cls_scores[0].dtype,
+            device=cls_scores[0].device)
+
+        result_list = []
+
+        for img_id in range(len(img_metas)):
+            img_meta = img_metas[img_id]
+            cls_score_list = select_single_mlvl(cls_scores, img_id)
+            bbox_pred_list = select_single_mlvl(bbox_preds, img_id)
+            if with_score_factors:
+                score_factor_list = select_single_mlvl(score_factors, img_id)
+            else:
+                score_factor_list = [None for _ in range(num_levels)]
+
+            results = self._get_bboxes_single(cls_score_list, bbox_pred_list,
+                                              score_factor_list, mlvl_priors,
+                                              img_meta, cfg, rescale, with_nms,
+                                              **kwargs)
+            result_list.append(results)
+        return result_list
+
+    def _get_bboxes_single(self,
+                           cls_score_list,
+                           bbox_pred_list,
+                           score_factor_list,
+                           mlvl_priors,
+                           img_meta,
+                           cfg,
+                           rescale=False,
+                           with_nms=True,
+                           **kwargs):
+        """Transform outputs of a single image into bbox predictions.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            score_factor_list (list[Tensor]): Score factor from all scale
+                levels of a single image, each item has shape
+                (num_priors * 1, H, W).
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid. In all
+                anchor-based methods, it has shape (num_priors, 4). In
+                all anchor-free methods, it has shape (num_priors, 2)
+                when `with_stride=True`, otherwise it still has shape
+                (num_priors, 4).
+            img_meta (dict): Image meta info.
+            cfg (mmcv.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+
+        Returns:
+            tuple[Tensor]: Results of detected bboxes and labels. If with_nms
+                is False, return mlvl_bboxes, mlvl_scores and mlvl_labels,
+                with mlvl_scores already multiplied by the score factors
+                when they are given. Usually `with_nms=False` is used for aug
+                test. If with_nms is True, then return the following format
+
+                - det_bboxes (Tensor): Predicted bboxes with shape \
+                    [num_bboxes, 5], where the first 4 columns are bounding \
+                    box positions (tl_x, tl_y, br_x, br_y) and the 5-th \
+                    column are scores between 0 and 1.
+                - det_labels (Tensor): Predicted labels of the corresponding \
+                    box with shape [num_bboxes].
+        """
+        if score_factor_list[0] is None:
+            # e.g. Retina, FreeAnchor, etc.
+            with_score_factors = False
+        else:
+            # e.g. FCOS, PAA, ATSS, etc.
+            with_score_factors = True
+
+        cfg = self.test_cfg if cfg is None else cfg
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_labels = []
+        if with_score_factors:
+            mlvl_score_factors = []
+        else:
+            mlvl_score_factors = None
+        for level_idx, (cls_score, bbox_pred, score_factor, priors) in \
+                enumerate(zip(cls_score_list, bbox_pred_list,
+                              score_factor_list, mlvl_priors)):
+
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+            if with_score_factors:
+                score_factor = score_factor.permute(1, 2,
+                                                    0).reshape(-1).sigmoid()
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                # remind that we set FG labels to [0, num_class-1]
+                # since mmdet v2.0
+                # BG cat_id: num_class
+                scores = cls_score.softmax(-1)[:, :-1]
+
+            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
+            # this operation keeps fewer bboxes under the same `nms_pre`.
+            # There is no difference in performance for most models. If you
+            # find a slight drop in performance, you can set a larger
+            # `nms_pre` than before.
+            results = filter_scores_and_topk(
+                scores, cfg.score_thr, nms_pre,
+                dict(bbox_pred=bbox_pred, priors=priors))
+            scores, labels, keep_idxs, filtered_results = results
+
+            bbox_pred = filtered_results['bbox_pred']
+            priors = filtered_results['priors']
+
+            if with_score_factors:
+                score_factor = score_factor[keep_idxs]
+
+            bboxes = self.bbox_coder.decode(
+                priors, bbox_pred, max_shape=img_shape)
+
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_labels.append(labels)
+            if with_score_factors:
+                mlvl_score_factors.append(score_factor)
+
+        return self._bbox_post_process(mlvl_scores, mlvl_labels, mlvl_bboxes,
+                                       img_meta['scale_factor'], cfg, rescale,
+                                       with_nms, mlvl_score_factors, **kwargs)
+
+    def _bbox_post_process(self,
+                           mlvl_scores,
+                           mlvl_labels,
+                           mlvl_bboxes,
+                           scale_factor,
+                           cfg,
+                           rescale=False,
+                           with_nms=True,
+                           mlvl_score_factors=None,
+                           **kwargs):
+        """bbox post-processing method.
+
+        The boxes would be rescaled to the original image scale and do
+        the nms operation. Usually `with_nms=False` is used for aug test.
+
+        Args:
+            mlvl_scores (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_bboxes, ).
+            mlvl_labels (list[Tensor]): Box class labels from all scale
+                levels of a single image, each item has shape
+                (num_bboxes, ).
+            mlvl_bboxes (list[Tensor]): Decoded bboxes from all scale
+                levels of a single image, each item has shape (num_bboxes, 4).
+            scale_factor (ndarray, optional): Scale factor of the image,
+                arranged as (w_scale, h_scale, w_scale, h_scale).
+            cfg (mmcv.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+            mlvl_score_factors (list[Tensor], optional): Score factor from
+                all scale levels of a single image, each item has shape
+                (num_bboxes, ). Default: None.
+
+        Returns:
+            tuple[Tensor]: Results of detected bboxes and labels. If with_nms
+                is False, return mlvl_bboxes, mlvl_scores and mlvl_labels,
+                with mlvl_scores already multiplied by mlvl_score_factors
+                when they are given. Usually `with_nms=False` is used for aug
+                test. If with_nms is True, then return the following format
+
+                - det_bboxes (Tensor): Predicted bboxes with shape \
+                    [num_bboxes, 5], where the first 4 columns are bounding \
+                    box positions (tl_x, tl_y, br_x, br_y) and the 5-th \
+                    column are scores between 0 and 1.
+                - det_labels (Tensor): Predicted labels of the corresponding \
+                    box with shape [num_bboxes].
+        """
+        assert len(mlvl_scores) == len(mlvl_bboxes) == len(mlvl_labels)
+
+        mlvl_bboxes = torch.cat(mlvl_bboxes)
+        if rescale:
+            mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
+        mlvl_scores = torch.cat(mlvl_scores)
+        mlvl_labels = torch.cat(mlvl_labels)
+
+        if mlvl_score_factors is not None:
+            # TODO: Add sqrt operation in order to be consistent with
+            #  the paper.
+            mlvl_score_factors = torch.cat(mlvl_score_factors)
+            mlvl_scores = mlvl_scores * mlvl_score_factors
+
+        if with_nms:
+            if mlvl_bboxes.numel() == 0:
+                det_bboxes = torch.cat([mlvl_bboxes, mlvl_scores[:, None]], -1)
+                return det_bboxes, mlvl_labels
+
+            det_bboxes, keep_idxs = batched_nms(mlvl_bboxes, mlvl_scores,
+                                                mlvl_labels, cfg.nms)
+            det_bboxes = det_bboxes[:cfg.max_per_img]
+            det_labels = mlvl_labels[keep_idxs][:cfg.max_per_img]
+            return det_bboxes, det_labels
+        else:
+            return mlvl_bboxes, mlvl_scores, mlvl_labels
+
+    def forward_train(self,
+                      x,
+                      img_metas,
+                      gt_bboxes,
+                      gt_labels=None,
+                      gt_bboxes_ignore=None,
+                      proposal_cfg=None,
+                      **kwargs):
+        """
+        Args:
+            x (list[Tensor]): Features from FPN.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes (Tensor): Ground truth bboxes of the image,
+                shape (num_gts, 4).
+            gt_labels (Tensor): Ground truth labels of each box,
+                shape (num_gts,).
+            gt_bboxes_ignore (Tensor): Ground truth bboxes to be
+                ignored, shape (num_ignored_gts, 4).
+            proposal_cfg (mmcv.Config): Test / postprocessing configuration
+                for generating proposals. If None, only losses are returned.
+
+        Returns:
+            tuple:
+                losses: (dict[str, Tensor]): A dictionary of loss components.
+                proposal_list (list[Tensor]): Proposals of each image.
+        """
+        outs = self(x)
+        if gt_labels is None:
+            loss_inputs = outs + (gt_bboxes, img_metas)
+        else:
+            loss_inputs = outs + (gt_bboxes, gt_labels, img_metas)
+        losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
+        if proposal_cfg is None:
+            return losses
+        else:
+            proposal_list = self.get_bboxes(
+                *outs, img_metas=img_metas, cfg=proposal_cfg)
+            return losses, proposal_list
+
+    def simple_test(self, feats, img_metas, rescale=False):
+        """Test function without test-time augmentation.
+
+        Args:
+            feats (tuple[torch.Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            img_metas (list[dict]): List of image information.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
+                The first item is ``bboxes`` with shape (n, 5),
+                where 5 represent (tl_x, tl_y, br_x, br_y, score).
+                The shape of the second tensor in the tuple is ``labels``
+                with shape (n, ).
+        """
+        return self.simple_test_bboxes(feats, img_metas, rescale=rescale)
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def onnx_export(self,
+                    cls_scores,
+                    bbox_preds,
+                    score_factors=None,
+                    img_metas=None,
+                    with_nms=True):
+        """Transform network output for a batch into bbox predictions.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                with shape (N, num_points * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_points * 4, H, W).
+            score_factors (list[Tensor]): score_factors for each
+                scale level with shape (N, num_points * 1, H, W).
+                Default: None.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc. Default: None.
+            with_nms (bool): Whether apply nms to the bboxes. Default: True.
+
+        Returns:
+            tuple[Tensor, Tensor] | list[tuple]: When `with_nms` is True,
+            it is tuple[Tensor, Tensor], first tensor bboxes with shape
+            [N, num_det, 5], 5 arranged as (x1, y1, x2, y2, score)
+            and second element is class labels of shape [N, num_det].
+            When `with_nms` is False, first tensor is bboxes with
+            shape [N, num_det, 4], second tensor is the raw scores with
+            shape [N, num_det, num_classes].
+        """
+        assert len(cls_scores) == len(bbox_preds)
+
+        num_levels = len(cls_scores)
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        mlvl_priors = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=bbox_preds[0].dtype,
+            device=bbox_preds[0].device)
+
+        mlvl_cls_scores = [cls_scores[i].detach() for i in range(num_levels)]
+        mlvl_bbox_preds = [bbox_preds[i].detach() for i in range(num_levels)]
+
+        assert len(
+            img_metas
+        ) == 1, 'Only support one input image while in exporting to ONNX'
+        img_shape = img_metas[0]['img_shape_for_onnx']
+
+        cfg = self.test_cfg
+        assert len(cls_scores) == len(bbox_preds) == len(mlvl_priors)
+        device = cls_scores[0].device
+        batch_size = cls_scores[0].shape[0]
+        # convert to tensor to keep tracing
+        nms_pre_tensor = torch.tensor(
+            cfg.get('nms_pre', -1), device=device, dtype=torch.long)
+
+        # e.g. Retina, FreeAnchor, etc.
+        if score_factors is None:
+            with_score_factors = False
+            mlvl_score_factor = [None for _ in range(num_levels)]
+        else:
+            # e.g. FCOS, PAA, ATSS, etc.
+            with_score_factors = True
+            mlvl_score_factor = [
+                score_factors[i].detach() for i in range(num_levels)
+            ]
+            mlvl_score_factors = []
+
+        mlvl_batch_bboxes = []
+        mlvl_scores = []
+
+        for cls_score, bbox_pred, score_factors, priors in zip(
+                mlvl_cls_scores, mlvl_bbox_preds, mlvl_score_factor,
+                mlvl_priors):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+
+            scores = cls_score.permute(0, 2, 3,
+                                       1).reshape(batch_size, -1,
+                                                  self.cls_out_channels)
+            if self.use_sigmoid_cls:
+                scores = scores.sigmoid()
+                nms_pre_score = scores
+            else:
+                scores = scores.softmax(-1)
+                nms_pre_score = scores
+
+            if with_score_factors:
+                score_factors = score_factors.permute(0, 2, 3, 1).reshape(
+                    batch_size, -1).sigmoid()
+            bbox_pred = bbox_pred.permute(0, 2, 3,
+                                          1).reshape(batch_size, -1, 4)
+            priors = priors.expand(batch_size, -1, priors.size(-1))
+            # Get top-k predictions
+            from mmdet.core.export import get_k_for_topk
+            nms_pre = get_k_for_topk(nms_pre_tensor, bbox_pred.shape[1])
+            if nms_pre > 0:
+
+                if with_score_factors:
+                    nms_pre_score = (nms_pre_score * score_factors[..., None])
+                else:
+                    nms_pre_score = nms_pre_score
+
+                # Get maximum scores for foreground classes.
+                if self.use_sigmoid_cls:
+                    max_scores, _ = nms_pre_score.max(-1)
+                else:
+                    # remind that we set FG labels to [0, num_class-1]
+                    # since mmdet v2.0
+                    # BG cat_id: num_class
+                    max_scores, _ = nms_pre_score[..., :-1].max(-1)
+                _, topk_inds = max_scores.topk(nms_pre)
+
+                batch_inds = torch.arange(
+                    batch_size, device=bbox_pred.device).view(
+                        -1, 1).expand_as(topk_inds).long()
+                # Avoid onnx2tensorrt issue in https://github.com/NVIDIA/TensorRT/issues/1134 # noqa: E501
+                transformed_inds = bbox_pred.shape[1] * batch_inds + topk_inds
+                priors = priors.reshape(
+                    -1, priors.size(-1))[transformed_inds, :].reshape(
+                        batch_size, -1, priors.size(-1))
+                bbox_pred = bbox_pred.reshape(-1,
+                                              4)[transformed_inds, :].reshape(
+                                                  batch_size, -1, 4)
+                scores = scores.reshape(
+                    -1, self.cls_out_channels)[transformed_inds, :].reshape(
+                        batch_size, -1, self.cls_out_channels)
+                if with_score_factors:
+                    score_factors = score_factors.reshape(
+                        -1, 1)[transformed_inds].reshape(batch_size, -1)
+
+            bboxes = self.bbox_coder.decode(
+                priors, bbox_pred, max_shape=img_shape)
+
+            mlvl_batch_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            if with_score_factors:
+                mlvl_score_factors.append(score_factors)
+
+        batch_bboxes = torch.cat(mlvl_batch_bboxes, dim=1)
+        batch_scores = torch.cat(mlvl_scores, dim=1)
+        if with_score_factors:
+            batch_score_factors = torch.cat(mlvl_score_factors, dim=1)
+
+        # Replace multiclass_nms with ONNX::NonMaxSuppression in deployment
+
+        from mmdet.core.export import add_dummy_nms_for_onnx
+
+        if not self.use_sigmoid_cls:
+            batch_scores = batch_scores[..., :self.num_classes]
+
+        if with_score_factors:
+            batch_scores = batch_scores * (batch_score_factors.unsqueeze(2))
+
+        if with_nms:
+            max_output_boxes_per_class = cfg.nms.get(
+                'max_output_boxes_per_class', 200)
+            iou_threshold = cfg.nms.get('iou_threshold', 0.5)
+            score_threshold = cfg.score_thr
+            nms_pre = cfg.get('deploy_nms_pre', -1)
+            return add_dummy_nms_for_onnx(batch_bboxes, batch_scores,
+                                          max_output_boxes_per_class,
+                                          iou_threshold, score_threshold,
+                                          nms_pre, cfg.max_per_img)
+        else:
+            return batch_bboxes, batch_scores
diff --git a/mmdet/models/dense_heads/base_mask_head.py b/mmdet/models/dense_heads/base_mask_head.py
new file mode 100755
index 0000000..5eb94fb
--- /dev/null
+++ b/mmdet/models/dense_heads/base_mask_head.py
@@ -0,0 +1,116 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+from mmcv.runner import BaseModule
+
+
+class BaseMaskHead(BaseModule, metaclass=ABCMeta):
+    """Base class for mask heads used in One-Stage Instance Segmentation."""
+
+    def __init__(self, init_cfg):
+        super(BaseMaskHead, self).__init__(init_cfg)
+
+    @abstractmethod
+    def loss(self, **kwargs):
+        pass
+
+    @abstractmethod
+    def get_results(self, **kwargs):
+        """Get processed :obj:`InstanceData` of multiple images."""
+        pass
+
+    def forward_train(self,
+                      x,
+                      gt_labels,
+                      gt_masks,
+                      img_metas,
+                      gt_bboxes=None,
+                      gt_bboxes_ignore=None,
+                      positive_infos=None,
+                      **kwargs):
+        """
+        Args:
+            x (list[Tensor] | tuple[Tensor]): Features from FPN.
+                Each has a shape (B, C, H, W).
+            gt_labels (list[Tensor]): Ground truth labels of all images.
+                each has a shape (num_gts,).
+            gt_masks (list[Tensor]) : Masks for each bbox, has a shape
+                (num_gts, h , w).
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes (list[Tensor]): Ground truth bboxes of the image,
+                each item has a shape (num_gts, 4).
+            gt_bboxes_ignore (list[Tensor], None): Ground truth bboxes to be
+                ignored, each item has a shape (num_ignored_gts, 4).
+            positive_infos (list[:obj:`InstanceData`], optional): Information
+                of positive samples. Used when the label assignment is
+                done outside the MaskHead, e.g., in BboxHead in
+                YOLACT or CondInst, etc. When the label assignment is done in
+                MaskHead, it would be None, like SOLO. All values
+                in it should have shape (num_positive_samples, *).
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        if positive_infos is None:
+            outs = self(x)
+        else:
+            outs = self(x, positive_infos)
+
+        assert isinstance(outs, tuple), 'Forward results should be a tuple, ' \
+                                        'even if only one item is returned'
+        loss = self.loss(
+            *outs,
+            gt_labels=gt_labels,
+            gt_masks=gt_masks,
+            img_metas=img_metas,
+            gt_bboxes=gt_bboxes,
+            gt_bboxes_ignore=gt_bboxes_ignore,
+            positive_infos=positive_infos,
+            **kwargs)
+        return loss
+
+    def simple_test(self,
+                    feats,
+                    img_metas,
+                    rescale=False,
+                    instances_list=None,
+                    **kwargs):
+        """Test function without test-time augmentation.
+
+        Args:
+            feats (tuple[torch.Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            img_metas (list[dict]): List of image information.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+            instances_list (list[obj:`InstanceData`], optional): Detection
+                results of each image after the post process. Only exist
+                if there is a `bbox_head`, like `YOLACT`, `CondInst`, etc.
+
+        Returns:
+            list[obj:`InstanceData`]: Instance segmentation \
+                results of each image after the post process. \
+                Each item usually contains following keys. \
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instances,)
+                - labels (Tensor): Has a shape (num_instances,).
+                - masks (Tensor): Processed mask results, has a
+                  shape (num_instances, h, w).
+        """
+        if instances_list is None:
+            outs = self(feats)
+        else:
+            outs = self(feats, instances_list=instances_list)
+        mask_inputs = outs + (img_metas, )
+        results_list = self.get_results(
+            *mask_inputs,
+            rescale=rescale,
+            instances_list=instances_list,
+            **kwargs)
+        return results_list
+
+    def onnx_export(self, img, img_metas):
+        raise NotImplementedError(f'{self.__class__.__name__} does '
+                                  f'not support ONNX EXPORT')
diff --git a/mmdet/models/dense_heads/cascade_rpn_head.py b/mmdet/models/dense_heads/cascade_rpn_head.py
new file mode 100755
index 0000000..69347e0
--- /dev/null
+++ b/mmdet/models/dense_heads/cascade_rpn_head.py
@@ -0,0 +1,801 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from __future__ import division
+import copy
+import warnings
+
+import torch
+import torch.nn as nn
+from mmcv import ConfigDict
+from mmcv.ops import DeformConv2d, batched_nms
+from mmcv.runner import BaseModule, ModuleList
+
+from mmdet.core import (RegionAssigner, build_assigner, build_sampler,
+                        images_to_levels, multi_apply)
+from mmdet.core.utils import select_single_mlvl
+from ..builder import HEADS, build_head
+from .base_dense_head import BaseDenseHead
+from .rpn_head import RPNHead
+
+
+class AdaptiveConv(BaseModule):
+    """AdaptiveConv used to adapt the sampling location with the anchors.
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the conv kernel, must be 3.
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): Zero-padding added to both sides of
+            the input. Default: 1
+        dilation (int or tuple, optional): Spacing between kernel elements.
+            Only used in 'dilation' mode (also as padding). Default: 3
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        bias (bool, optional): If set True, adds a learnable bias to the
+            output. Only forwarded in 'offset' mode. Default: False.
+        type (str, optional): Type of adaptive conv, can be either 'offset'
+            (arbitrary anchors) or 'dilation' (uniform anchor).
+            Default: 'dilation'.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 stride=1,
+                 padding=1,
+                 dilation=3,
+                 groups=1,
+                 bias=False,
+                 type='dilation',
+                 init_cfg=dict(
+                     type='Normal', std=0.01, override=dict(name='conv'))):
+        super(AdaptiveConv, self).__init__(init_cfg)
+        assert type in ['offset', 'dilation']
+        self.adapt_type = type
+
+        assert kernel_size == 3, 'Adaptive conv only supports kernels 3'
+        if self.adapt_type == 'offset':
+            assert stride == 1 and padding == 1 and groups == 1, \
+                'Adaptive conv offset mode only supports padding: 1, ' \
+                'stride: 1, groups: 1'
+            self.conv = DeformConv2d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                padding=padding,
+                stride=stride,
+                groups=groups,
+                bias=bias)
+        else:
+            self.conv = nn.Conv2d(  # stride/padding/groups/bias not forwarded
+                in_channels,
+                out_channels,
+                kernel_size,
+                padding=dilation,
+                dilation=dilation)
+
+    def forward(self, x, offset):
+        """Run the conv; ``offset`` is required only in 'offset' mode."""
+        if self.adapt_type == 'offset':
+            N, _, H, W = x.shape
+            assert offset is not None
+            assert H * W == offset.shape[1]
+            # reshape [N, NA, 18] to (N, 18, H, W)
+            offset = offset.permute(0, 2, 1).reshape(N, -1, H, W)
+            offset = offset.contiguous()
+            x = self.conv(x, offset)
+        else:
+            assert offset is None  # dilation mode takes no offsets
+            x = self.conv(x)
+        return x
+
+
+@HEADS.register_module()
+class StageCascadeRPNHead(RPNHead):
+    """Stage of CascadeRPNHead.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        anchor_generator (dict): anchor generator config.
+        adapt_cfg (dict): adaptation config.
+        bridged_feature (bool, optional): whether update rpn feature.
+            Default: False.
+        with_cls (bool, optional): whether use classification branch.
+            Default: True.
+        sampling (bool, optional): whether use sampling. Default: True.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 anchor_generator=dict(
+                     type='AnchorGenerator',
+                     scales=[8],
+                     ratios=[1.0],
+                     strides=[4, 8, 16, 32, 64]),
+                 adapt_cfg=dict(type='dilation', dilation=3),
+                 bridged_feature=False,
+                 with_cls=True,
+                 sampling=True,
+                 init_cfg=None,
+                 **kwargs):
+        self.with_cls = with_cls  # read by _init_layers and forward_single
+        self.anchor_strides = anchor_generator['strides']
+        self.anchor_scales = anchor_generator['scales']
+        self.bridged_feature = bridged_feature
+        self.adapt_cfg = adapt_cfg  # kwargs for AdaptiveConv in _init_layers
+        super(StageCascadeRPNHead, self).__init__(
+            in_channels,
+            anchor_generator=anchor_generator,
+            init_cfg=init_cfg,
+            **kwargs)
+
+        # override sampling and sampler
+        self.sampling = sampling
+        if self.train_cfg:
+            self.assigner = build_assigner(self.train_cfg.assigner)
+            # PseudoSampler if sampling is False or train_cfg has no sampler
+            if self.sampling and hasattr(self.train_cfg, 'sampler'):
+                sampler_cfg = self.train_cfg.sampler
+            else:
+                sampler_cfg = dict(type='PseudoSampler')
+            self.sampler = build_sampler(sampler_cfg, context=self)
+        # default init: Normal(std=0.01) for rpn_reg (and rpn_cls if present)
+        if init_cfg is None:
+            self.init_cfg = dict(
+                type='Normal', std=0.01, override=[dict(name='rpn_reg')])
+            if self.with_cls:
+                self.init_cfg['override'].append(dict(name='rpn_cls'))
+
+    def _init_layers(self):
+        """Build the adaptive conv, optional cls conv, reg conv and ReLU."""
+        self.rpn_conv = AdaptiveConv(self.in_channels, self.feat_channels,
+                                     **self.adapt_cfg)
+        if self.with_cls:  # rpn_cls exists only when classifying
+            self.rpn_cls = nn.Conv2d(self.feat_channels,
+                                     self.num_anchors * self.cls_out_channels,
+                                     1)
+        self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_anchors * 4, 1)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward_single(self, x, offset):
+        """Run one level: adapted conv + ReLU, then cls/reg 1x1 convs."""
+        feat = self.relu(self.rpn_conv(x, offset))
+        # hand the adapted feature to the next stage only when bridging
+        bridged_x = feat if self.bridged_feature else x
+        cls_score = None
+        if self.with_cls:
+            cls_score = self.rpn_cls(feat)
+        return bridged_x, cls_score, self.rpn_reg(feat)
+
+    def forward(self, feats, offset_list=None):
+        """Apply ``forward_single`` to every level via ``multi_apply``."""
+        # without offsets, every level receives ``None``
+        offsets = [None] * len(feats) if offset_list is None else offset_list
+        return multi_apply(self.forward_single, feats, offsets)
+
+    def _region_targets_single(self,
+                               anchors,
+                               valid_flags,
+                               gt_bboxes,
+                               gt_bboxes_ignore,
+                               gt_labels,
+                               img_meta,
+                               featmap_sizes,
+                               label_channels=1):  # unused here
+        """Get region-based anchor targets for a single image."""
+        assign_result = self.assigner.assign(
+            anchors,
+            valid_flags,
+            gt_bboxes,
+            img_meta,
+            featmap_sizes,
+            self.anchor_scales[0],
+            self.anchor_strides,
+            gt_bboxes_ignore=gt_bboxes_ignore,
+            gt_labels=None,  # labels are never passed to the assigner
+            allowed_border=self.train_cfg.allowed_border)
+        flat_anchors = torch.cat(anchors)
+        sampling_result = self.sampler.sample(assign_result, flat_anchors,
+                                              gt_bboxes)
+        # targets start at zero; only sampled anchors are filled in below
+        num_anchors = flat_anchors.shape[0]
+        bbox_targets = torch.zeros_like(flat_anchors)
+        bbox_weights = torch.zeros_like(flat_anchors)
+        labels = flat_anchors.new_zeros(num_anchors, dtype=torch.long)
+        label_weights = flat_anchors.new_zeros(num_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            if not self.reg_decoded_bbox:
+                pos_bbox_targets = self.bbox_coder.encode(
+                    sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
+            else:
+                pos_bbox_targets = sampling_result.pos_gt_bboxes
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1.0
+            if gt_labels is None:
+                labels[pos_inds] = 1
+            else:
+                labels[pos_inds] = gt_labels[
+                    sampling_result.pos_assigned_gt_inds]
+            if self.train_cfg.pos_weight <= 0:  # non-positive => weight 1
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg.pos_weight
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds)
+
+    def region_targets(self,
+                       anchor_list,
+                       valid_flag_list,
+                       gt_bboxes_list,
+                       img_metas,
+                       featmap_sizes,
+                       gt_bboxes_ignore_list=None,
+                       gt_labels_list=None,
+                       label_channels=1,
+                       unmap_outputs=True):  # unused here
+        """See :func:`StageCascadeRPNHead.get_targets`."""
+        num_imgs = len(img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+
+        # compute targets for each image
+        if gt_bboxes_ignore_list is None:
+            gt_bboxes_ignore_list = [None for _ in range(num_imgs)]
+        if gt_labels_list is None:
+            gt_labels_list = [None for _ in range(num_imgs)]
+        (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights,
+         pos_inds_list, neg_inds_list) = multi_apply(
+             self._region_targets_single,
+             anchor_list,
+             valid_flag_list,
+             gt_bboxes_list,
+             gt_bboxes_ignore_list,
+             gt_labels_list,
+             img_metas,
+             featmap_sizes=featmap_sizes,
+             label_channels=label_channels)
+        # no valid anchors in some image: signal the caller with None
+        if any([labels is None for labels in all_labels]):
+            return None
+        # sampled anchors of all images (each image counts at least 1)
+        num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list])
+        num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list])
+        # split targets to a list w.r.t. multiple levels
+        labels_list = images_to_levels(all_labels, num_level_anchors)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_anchors)
+        bbox_targets_list = images_to_levels(all_bbox_targets,
+                                             num_level_anchors)
+        bbox_weights_list = images_to_levels(all_bbox_weights,
+                                             num_level_anchors)
+        return (labels_list, label_weights_list, bbox_targets_list,
+                bbox_weights_list, num_total_pos, num_total_neg)
+
+    def get_targets(self,
+                    anchor_list,
+                    valid_flag_list,
+                    gt_bboxes,
+                    img_metas,
+                    featmap_sizes,
+                    gt_bboxes_ignore=None,
+                    label_channels=1):
+        """Build classification and regression targets for the anchors.
+
+        Args:
+            anchor_list (list[list]): Per-image lists of multi-level anchors.
+            valid_flag_list (list[list]): Per-image lists of multi-level
+                valid flags.
+            gt_bboxes (list[Tensor]): Ground truth boxes, one entry per image.
+            img_metas (list[dict]): Meta information, one dict per image.
+            featmap_sizes (list[Tensor]): Feature map size of each level.
+            gt_bboxes_ignore (list[Tensor]): Ignored boxes of each image.
+            label_channels (int): Channel of label.
+
+        Returns:
+            tuple: Targets as produced by the selected target routine.
+        """
+        if isinstance(self.assigner, RegionAssigner):
+            # region-based assignment additionally needs the feature map sizes
+            return self.region_targets(
+                anchor_list,
+                valid_flag_list,
+                gt_bboxes,
+                img_metas,
+                featmap_sizes,
+                gt_bboxes_ignore_list=gt_bboxes_ignore,
+                label_channels=label_channels)
+        # any other assigner goes through the parent implementation
+        return super(StageCascadeRPNHead, self).get_targets(
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            label_channels=label_channels)
+
+    def anchor_offset(self, anchor_list, anchor_strides, featmap_sizes):
+        """Get offset for deformable conv based on anchor shape.
+        NOTE: currently supports deformable kernel_size=3 and dilation=1.
+
+        Args:
+            anchor_list (list[list[tensor]]): [NI, NLVL, NA, 4] anchors
+            anchor_strides (list[int]): anchor stride of each level
+            featmap_sizes (list): (feat_h, feat_w) of each level
+
+        Returns:
+            offset_list (list[tensor]): per-level DeformConv kernel offsets;
+                presumably [NI, NA, 2*ks**2] each - TODO confirm.
+        """
+
+        def _shape_offset(anchors, stride, ks=3, dilation=1):
+            # currently support kernel_size=3 and dilation=1
+            assert ks == 3 and dilation == 1
+            pad = (ks - 1) // 2
+            idx = torch.arange(-pad, pad + 1, dtype=dtype, device=device)
+            yy, xx = torch.meshgrid(idx, idx)  # return order matters
+            xx = xx.reshape(-1)
+            yy = yy.reshape(-1)
+            w = (anchors[:, 2] - anchors[:, 0]) / stride
+            h = (anchors[:, 3] - anchors[:, 1]) / stride
+            w = w / (ks - 1) - dilation
+            h = h / (ks - 1) - dilation
+            offset_x = w[:, None] * xx  # (NA, ks**2)
+            offset_y = h[:, None] * yy  # (NA, ks**2)
+            return offset_x, offset_y
+
+        def _ctr_offset(anchors, stride, featmap_size):
+            feat_h, feat_w = featmap_size
+            assert len(anchors) == feat_h * feat_w
+
+            x = (anchors[:, 0] + anchors[:, 2]) * 0.5
+            y = (anchors[:, 1] + anchors[:, 3]) * 0.5
+            # compute centers on feature map
+            x = x / stride
+            y = y / stride
+            # compute predefined centers (integer grid of the feature map)
+            xx = torch.arange(0, feat_w, device=anchors.device)
+            yy = torch.arange(0, feat_h, device=anchors.device)
+            yy, xx = torch.meshgrid(yy, xx)
+            xx = xx.reshape(-1).type_as(x)
+            yy = yy.reshape(-1).type_as(y)
+
+            offset_x = x - xx  # (NA, )
+            offset_y = y - yy  # (NA, )
+            return offset_x, offset_y
+
+        num_imgs = len(anchor_list)
+        num_lvls = len(anchor_list[0])
+        dtype = anchor_list[0][0].dtype  # captured by _shape_offset
+        device = anchor_list[0][0].device  # captured by _shape_offset
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+
+        offset_list = []
+        for i in range(num_imgs):
+            mlvl_offset = []
+            for lvl in range(num_lvls):
+                c_offset_x, c_offset_y = _ctr_offset(anchor_list[i][lvl],
+                                                     anchor_strides[lvl],
+                                                     featmap_sizes[lvl])
+                s_offset_x, s_offset_y = _shape_offset(anchor_list[i][lvl],
+                                                       anchor_strides[lvl])
+
+                # offset = ctr_offset + shape_offset
+                offset_x = s_offset_x + c_offset_x[:, None]
+                offset_y = s_offset_y + c_offset_y[:, None]
+
+                # offset order (y0, x0, y1, x1, ..., y8, x8) for ks**2 = 9
+                offset = torch.stack([offset_y, offset_x], dim=-1)
+                offset = offset.reshape(offset.size(0), -1)  # [NA, 2*ks**2]
+                mlvl_offset.append(offset)
+            offset_list.append(torch.cat(mlvl_offset))  # [totalNA, 2*ks**2]
+        offset_list = images_to_levels(offset_list, num_level_anchors)
+        return offset_list
+
+    def loss_single(self, cls_score, bbox_pred, anchors, labels, label_weights,
+                    bbox_targets, bbox_weights, num_total_samples):
+        """Compute cls (if enabled) and reg losses for one scale level."""
+        # classification loss; stays None when the stage has no cls branch
+        loss_cls = None
+        if self.with_cls:
+            flat_labels = labels.reshape(-1)
+            flat_label_weights = label_weights.reshape(-1)
+            # move channels last, then one row per anchor
+            flat_scores = cls_score.permute(0, 2, 3, 1)
+            flat_scores = flat_scores.reshape(-1, self.cls_out_channels)
+            loss_cls = self.loss_cls(
+                flat_scores, flat_labels, flat_label_weights,
+                avg_factor=num_total_samples)
+
+        # regression loss
+        flat_preds = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
+        if self.reg_decoded_bbox:
+            # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
+            # is applied directly on the decoded bounding boxes, it
+            # decodes the already encoded coordinates to absolute format.
+            flat_anchors = anchors.reshape(-1, 4)
+            flat_preds = self.bbox_coder.decode(flat_anchors, flat_preds)
+        loss_reg = self.loss_bbox(
+            flat_preds,
+            bbox_targets.reshape(-1, 4),
+            bbox_weights.reshape(-1, 4),
+            avg_factor=num_total_samples)
+        return loss_cls, loss_reg
+
+    def loss(self,
+             anchor_list,
+             valid_flag_list,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute losses of the head.
+
+        Args:
+            anchor_list (list[list]): Multi level anchors of each image.
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss. Default: None
+
+        Returns:
+            dict[str, Tensor] | None: Loss dict, or None without targets.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in bbox_preds]
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            featmap_sizes,
+            gt_bboxes_ignore=gt_bboxes_ignore,
+            label_channels=label_channels)
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        if self.sampling:
+            num_total_samples = num_total_pos + num_total_neg
+        else:
+            # 200 is hard-coded average factor,
+            # which follows guided anchoring.
+            num_total_samples = sum([label.numel()
+                                     for label in labels_list]) / 200.0
+
+        # change per image, per level anchor_list to per_level, per_image
+        mlvl_anchor_list = list(zip(*anchor_list))
+        # concat mlvl_anchor_list
+        mlvl_anchor_list = [
+            torch.cat(anchors, dim=0) for anchors in mlvl_anchor_list
+        ]
+
+        losses = multi_apply(
+            self.loss_single,
+            cls_scores,
+            bbox_preds,
+            mlvl_anchor_list,
+            labels_list,
+            label_weights_list,
+            bbox_targets_list,
+            bbox_weights_list,
+            num_total_samples=num_total_samples)
+        if self.with_cls:
+            return dict(loss_rpn_cls=losses[0], loss_rpn_reg=losses[1])
+        return dict(loss_rpn_reg=losses[1])  # no cls branch, no cls loss
+
+    def get_bboxes(self,
+                   anchor_list,
+                   cls_scores,
+                   bbox_preds,
+                   img_metas,
+                   cfg,
+                   rescale=False):
+        """Get the proposals of every image in the batch.
+
+        Args:
+            anchor_list (list[list]): Multi level anchors of each image.
+            cls_scores (list[Tensor]): Classification scores for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 4, H, W).
+            img_metas (list[dict]): Image meta info; ``img_shape`` and
+                ``scale_factor`` are read for each image.
+            cfg (mmcv.Config | None): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): Forwarded unchanged to ``_get_bboxes_single``.
+                Default: False.
+
+        Returns:
+            list[Tensor]: One tensor of proposals per image, as returned by
+                ``_get_bboxes_single``.
+        """
+        assert len(cls_scores) == len(bbox_preds)
+
+        per_image_proposals = []
+        for img_id, img_meta in enumerate(img_metas):
+            # slice this image's predictions out of every level
+            single_cls = select_single_mlvl(cls_scores, img_id)
+            single_reg = select_single_mlvl(bbox_preds, img_id)
+            per_image_proposals.append(
+                self._get_bboxes_single(
+                    single_cls, single_reg, anchor_list[img_id],
+                    img_meta['img_shape'], img_meta['scale_factor'], cfg,
+                    rescale))
+        return per_image_proposals
+
+    def _get_bboxes_single(self,
+                           cls_scores,
+                           bbox_preds,
+                           mlvl_anchors,
+                           img_shape,
+                           scale_factor,
+                           cfg,
+                           rescale=False):
+        """Transform outputs of a single image into bbox predictions.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_anchors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has
+                shape (num_anchors * 4, H, W).
+            mlvl_anchors (list[Tensor]): Box reference from all scale
+                levels of a single image, each item has shape
+                (num_total_anchors, 4).
+            img_shape (tuple[int]): Shape of the input image,
+                (height, width, 3).
+            scale_factor (ndarray): Scale factor of the image arranged as
+                (w_scale, h_scale, w_scale, h_scale). Not used here.
+            cfg (mmcv.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): Not used by this implementation.
+                Default False.
+
+        Returns:
+            Tensor: Labeled boxes in shape (n, 5), where the first 4 columns
+                are bounding box positions (tl_x, tl_y, br_x, br_y) and the
+                5-th column is a score between 0 and 1.
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        cfg = copy.deepcopy(cfg)  # legacy-key handling below mutates cfg
+        # bboxes from different level should be independent during NMS,
+        # level_ids are used as labels for batched NMS to separate them
+        level_ids = []
+        mlvl_scores = []
+        mlvl_bbox_preds = []
+        mlvl_valid_anchors = []
+        nms_pre = cfg.get('nms_pre', -1)
+        for idx in range(len(cls_scores)):
+            rpn_cls_score = cls_scores[idx]
+            rpn_bbox_pred = bbox_preds[idx]
+            assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
+            rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
+            if self.use_sigmoid_cls:
+                rpn_cls_score = rpn_cls_score.reshape(-1)
+                scores = rpn_cls_score.sigmoid()
+            else:
+                rpn_cls_score = rpn_cls_score.reshape(-1, 2)
+                # We set FG labels to [0, num_class-1] and BG label to
+                # num_class in RPN head since mmdet v2.5, which is unified to
+                # be consistent with other head since mmdet v2.0. In mmdet v2.0
+                # to v2.4 we keep BG label as 0 and FG label as 1 in rpn head.
+                scores = rpn_cls_score.softmax(dim=1)[:, 0]
+            rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+            anchors = mlvl_anchors[idx]
+
+            if 0 < nms_pre < scores.shape[0]:
+                # sort is faster than topk
+                # _, topk_inds = scores.topk(cfg.nms_pre)
+                ranked_scores, rank_inds = scores.sort(descending=True)
+                topk_inds = rank_inds[:nms_pre]
+                scores = ranked_scores[:nms_pre]
+                rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
+                anchors = anchors[topk_inds, :]
+            mlvl_scores.append(scores)
+            mlvl_bbox_preds.append(rpn_bbox_pred)
+            mlvl_valid_anchors.append(anchors)
+            level_ids.append(
+                scores.new_full((scores.size(0), ), idx, dtype=torch.long))
+
+        scores = torch.cat(mlvl_scores)
+        anchors = torch.cat(mlvl_valid_anchors)
+        rpn_bbox_pred = torch.cat(mlvl_bbox_preds)
+        proposals = self.bbox_coder.decode(
+            anchors, rpn_bbox_pred, max_shape=img_shape)
+        ids = torch.cat(level_ids)
+
+        if cfg.min_bbox_size >= 0:
+            w = proposals[:, 2] - proposals[:, 0]
+            h = proposals[:, 3] - proposals[:, 1]
+            valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size)
+            if not valid_mask.all():
+                proposals = proposals[valid_mask]
+                scores = scores[valid_mask]
+                ids = ids[valid_mask]
+
+        # deprecated arguments warning
+        if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg:
+            warnings.warn(
+                'In rpn_proposal or test_cfg, '
+                'nms_thr has been moved to a dict named nms as '
+                'iou_threshold, max_num has been renamed as max_per_img, '
+                'name of original arguments and the way to specify '
+                'iou_threshold of NMS will be deprecated.')
+        if 'nms' not in cfg:
+            cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr))
+        if 'max_num' in cfg:
+            if 'max_per_img' in cfg:
+                assert cfg.max_num == cfg.max_per_img, f'You ' \
+                    f'set max_num and ' \
+                    f'max_per_img at the same time, but get {cfg.max_num} ' \
+                    f'and {cfg.max_per_img} respectively. ' \
+                    'Please delete max_num which will be deprecated.'
+            else:
+                cfg.max_per_img = cfg.max_num
+        if 'nms_thr' in cfg:
+            assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set' \
+                f' iou_threshold in nms and ' \
+                f'nms_thr at the same time, but get' \
+                f' {cfg.nms.iou_threshold} and {cfg.nms_thr}' \
+                f' respectively. Please delete the nms_thr ' \
+                f'which will be deprecated.'
+
+        if proposals.numel() > 0:
+            dets, _ = batched_nms(proposals, scores, ids, cfg.nms)
+        else:
+            return proposals.new_zeros(0, 5)
+
+        return dets[:cfg.max_per_img]
+
+    def refine_bboxes(self, anchor_list, bbox_preds, img_metas):
+        """Decode bbox_preds on anchor_list to get the refined anchors."""
+        num_levels = len(bbox_preds)
+        new_anchor_list = []
+        for img_id, img_meta in enumerate(img_metas):
+            img_shape = img_meta['img_shape']  # same for every level
+            mlvl_anchors = []
+            for i in range(num_levels):
+                bbox_pred = bbox_preds[i][img_id].detach()  # cut gradient
+                bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+                bboxes = self.bbox_coder.decode(anchor_list[img_id][i],
+                                                bbox_pred, img_shape)
+                mlvl_anchors.append(bboxes)
+            new_anchor_list.append(mlvl_anchors)
+        return new_anchor_list
+
+
+@HEADS.register_module()
+class CascadeRPNHead(BaseDenseHead):
+    """The CascadeRPNHead will predict more accurate region proposals, which is
+    required for two-stage detectors (such as Fast/Faster R-CNN). CascadeRPN
+    consists of a sequence of RPNStage to progressively improve the accuracy of
+    the detected proposals.
+
+    More details can be found in ``https://arxiv.org/abs/1909.06720``.
+
+    Args:
+        num_stages (int): number of CascadeRPN stages.
+        stages (list[dict]): list of configs to build the stages.
+        train_cfg (list[dict]): list of configs at training time each stage.
+        test_cfg (dict): config at testing time.
+    """
+
+    def __init__(self, num_stages, stages, train_cfg, test_cfg, init_cfg=None):
+        super(CascadeRPNHead, self).__init__(init_cfg)
+        assert num_stages == len(stages)
+        self.num_stages = num_stages
+        # NOTE: mmcv's ModuleList is used on purpose; with nn.ModuleList
+        # pretrained weights cannot be loaded.
+        self.stages = ModuleList()
+        for idx, stage_cfg in enumerate(stages):
+            # every stage has its own train_cfg entry and a shared test_cfg
+            stage_train_cfg = None if train_cfg is None else train_cfg[idx]
+            stage_cfg.update(train_cfg=stage_train_cfg, test_cfg=test_cfg)
+            self.stages.append(build_head(stage_cfg))
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    def loss(self):
+        """No-op placeholder: each StageCascadeRPNHead computes its loss."""
+        return None
+
+    def get_bboxes(self):
+        """No-op placeholder: proposals come from StageCascadeRPNHead."""
+        return None
+
+    def forward_train(self,
+                      x,
+                      img_metas,
+                      gt_bboxes,
+                      gt_labels=None,
+                      gt_bboxes_ignore=None,
+                      proposal_cfg=None):
+        """Run every stage, collect per-stage losses as ``s{i}.{name}``."""
+        assert gt_labels is None, 'RPN does not require gt_labels'
+        # NOTE(review): gt_bboxes_ignore is accepted but never used below.
+        featmap_sizes = [featmap.size()[-2:] for featmap in x]
+        device = x[0].device
+        anchor_list, valid_flag_list = self.stages[0].get_anchors(
+            featmap_sizes, img_metas, device=device)
+
+        losses = dict()
+
+        for i in range(self.num_stages):
+            stage = self.stages[i]
+
+            if stage.adapt_cfg['type'] == 'offset':
+                offset_list = stage.anchor_offset(anchor_list,
+                                                  stage.anchor_strides,
+                                                  featmap_sizes)
+            else:
+                offset_list = None
+            x, cls_score, bbox_pred = stage(x, offset_list)
+            rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score,
+                               bbox_pred, gt_bboxes, img_metas)
+            stage_loss = stage.loss(*rpn_loss_inputs)  # may return None
+            for name, value in stage_loss.items():  # prefix keys by stage
+                losses['s{}.{}'.format(i, name)] = value
+
+            # refine boxes: decoded preds become the next stage's anchors
+            if i < self.num_stages - 1:
+                anchor_list = stage.refine_bboxes(anchor_list, bbox_pred,
+                                                  img_metas)
+        if proposal_cfg is None:
+            return losses
+        else:  # NOTE(review): proposals use self.test_cfg, not proposal_cfg
+            proposal_list = self.stages[-1].get_bboxes(anchor_list, cls_score,
+                                                       bbox_pred, img_metas,
+                                                       self.test_cfg)
+            return losses, proposal_list
+
+    def simple_test_rpn(self, x, img_metas):
+        """Run all stages in turn and return the last stage's proposals."""
+        featmap_sizes = [level_feat.size()[-2:] for level_feat in x]
+        # initial anchors come from the first stage
+        anchor_list, _ = self.stages[0].get_anchors(
+            featmap_sizes, img_metas, device=x[0].device)
+
+        last_idx = self.num_stages - 1
+        for stage_idx in range(self.num_stages):
+            stage = self.stages[stage_idx]
+            offset_list = None
+            if stage.adapt_cfg['type'] == 'offset':
+                # offsets are derived from the current anchors
+                offset_list = stage.anchor_offset(
+                    anchor_list, stage.anchor_strides, featmap_sizes)
+            x, cls_score, bbox_pred = stage(x, offset_list)
+            if stage_idx < last_idx:
+                # every stage but the last refines anchors for its successor
+                anchor_list = stage.refine_bboxes(anchor_list, bbox_pred,
+                                                  img_metas)
+
+        # proposals are produced by the final stage under the test config
+        return self.stages[-1].get_bboxes(anchor_list, cls_score, bbox_pred,
+                                          img_metas, self.test_cfg)
+
+    def aug_test_rpn(self, x, img_metas):
+        """Test-time augmentation entry point; always raises."""
+        message = 'CascadeRPNHead does not support test-time augmentation'
+        raise NotImplementedError(message)
diff --git a/mmdet/models/dense_heads/centernet_head.py b/mmdet/models/dense_heads/centernet_head.py
new file mode 100755
index 0000000..b9d5d2f
--- /dev/null
+++ b/mmdet/models/dense_heads/centernet_head.py
@@ -0,0 +1,412 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import bias_init_with_prob, normal_init
+from mmcv.ops import batched_nms
+from mmcv.runner import force_fp32
+
+from mmdet.core import multi_apply
+from mmdet.models import HEADS, build_loss
+from mmdet.models.utils import gaussian_radius, gen_gaussian_target
+from ..utils.gaussian_target import (get_local_maximum, get_topk_from_heatmap,
+                                     transpose_and_gather_feat)
+from .base_dense_head import BaseDenseHead
+from .dense_test_mixins import BBoxTestMixin
+
+
+@HEADS.register_module()
+class CenterNetHead(BaseDenseHead, BBoxTestMixin):
+    """Objects as Points Head. CenterNetHead uses center points to indicate
+    object positions. Paper link <https://arxiv.org/abs/1904.07850>
+
+    Args:
+        in_channel (int): Number of channel in the input feature map.
+        feat_channel (int): Number of channel in the intermediate feature map.
+        num_classes (int): Number of categories excluding the background
+            category.
+        loss_center_heatmap (dict | None): Config of center heatmap loss.
+            Default: GaussianFocalLoss.
+        loss_wh (dict | None): Config of wh loss. Default: L1Loss.
+        loss_offset (dict | None): Config of offset loss. Default: L1Loss.
+        train_cfg (dict | None): Training config. Useless in CenterNet,
+            but we keep this variable for SingleStageDetector. Default: None.
+        test_cfg (dict | None): Testing config of CenterNet. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channel,
+                 feat_channel,
+                 num_classes,
+                 loss_center_heatmap=dict(
+                     type='GaussianFocalLoss', loss_weight=1.0),
+                 loss_wh=dict(type='L1Loss', loss_weight=0.1),
+                 loss_offset=dict(type='L1Loss', loss_weight=1.0),
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None):
+        """Build the three prediction branches and their loss modules."""
+        super().__init__(init_cfg)
+        self.num_classes = num_classes
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.fp16_enabled = False
+
+        # One branch per output: class heatmap, box size (wh) and center
+        # offset. The creation order is kept as heatmap -> wh -> offset so
+        # that submodule registration order does not change.
+        branch_channels = (in_channel, feat_channel)
+        self.heatmap_head = self._build_head(*branch_channels, num_classes)
+        self.wh_head = self._build_head(*branch_channels, 2)
+        self.offset_head = self._build_head(*branch_channels, 2)
+
+        self.loss_center_heatmap = build_loss(loss_center_heatmap)
+        self.loss_wh = build_loss(loss_wh)
+        self.loss_offset = build_loss(loss_offset)
+
+    def _build_head(self, in_channel, feat_channel, out_channel):
+        """Create one prediction branch.
+
+        The branch is a 3x3 convolution (padding 1), an in-place ReLU and a
+        1x1 convolution, applied in that order.
+
+        Args:
+            in_channel (int): Channels of the input feature map.
+            feat_channel (int): Channels produced by the 3x3 convolution.
+            out_channel (int): Channels produced by the 1x1 convolution.
+
+        Returns:
+            nn.Sequential: The assembled branch.
+        """
+        hidden_conv = nn.Conv2d(
+            in_channel, feat_channel, kernel_size=3, padding=1)
+        activation = nn.ReLU(inplace=True)
+        output_conv = nn.Conv2d(feat_channel, out_channel, kernel_size=1)
+        return nn.Sequential(hidden_conv, activation, output_conv)
+
+    def init_weights(self):
+        """Initialize the heatmap output bias and the wh/offset convs."""
+        # The last conv of the heatmap branch gets a constant bias: the
+        # value returned by ``bias_init_with_prob(0.1)``.
+        self.heatmap_head[-1].bias.data.fill_(bias_init_with_prob(0.1))
+        # Every conv of the two regression branches is re-initialized with
+        # ``normal_init`` using std=0.001.
+        for branch in (self.wh_head, self.offset_head):
+            for module in branch.modules():
+                if not isinstance(module, nn.Conv2d):
+                    continue
+                normal_init(module, std=0.001)
+
+    def forward(self, feats):
+        """Apply :meth:`forward_single` to every feature level.
+
+        Note that the CenterNet head does not use FPN.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each
+                is a 4D-tensor.
+
+        Returns:
+            center_heatmap_preds (List[Tensor]): Center heatmap predictions
+                for all levels, with num_classes channels.
+            wh_preds (List[Tensor]): Width/height predictions for all
+                levels, with 2 channels.
+            offset_preds (List[Tensor]): Offset predictions for all levels,
+                with 2 channels.
+        """
+        per_level_outputs = multi_apply(self.forward_single, feats)
+        return per_level_outputs
+
+    def forward_single(self, feat):
+        """Run the three branches on the feature map of one level.
+
+        Args:
+            feat (Tensor): Feature of a single level.
+
+        Returns:
+            center_heatmap_pred (Tensor): Heatmap branch output after a
+                sigmoid, with num_classes channels.
+            wh_pred (Tensor): Raw output of the wh branch, with 2 channels.
+            offset_pred (Tensor): Raw output of the offset branch, with 2
+                channels.
+        """
+        heatmap_logits = self.heatmap_head(feat)
+        # Only the heatmap is squashed; wh and offset stay unbounded.
+        center_heatmap_pred = heatmap_logits.sigmoid()
+        wh_pred = self.wh_head(feat)
+        offset_pred = self.offset_head(feat)
+        return center_heatmap_pred, wh_pred, offset_pred
+
+    @force_fp32(apply_to=('center_heatmap_preds', 'wh_preds', 'offset_preds'))
+    def loss(self,
+             center_heatmap_preds,
+             wh_preds,
+             offset_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute the heatmap, wh and offset losses of the head.
+
+        Exactly one feature level is supported; every prediction list must
+        have length 1.
+
+        Args:
+            center_heatmap_preds (list[Tensor]): Center heatmap predictions
+                for all levels with shape (B, num_classes, H, W).
+            wh_preds (list[Tensor]): Width/height predictions for all levels
+                with shape (B, 2, H, W).
+            offset_preds (list[Tensor]): Offset predictions for all levels
+                with shape (B, 2, H, W).
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each
+                box.
+            img_metas (list[dict]): Meta information of each image. Only
+                ``img_metas[0]['pad_shape']`` is read here.
+            gt_bboxes_ignore (None | list[Tensor]): Accepted for interface
+                compatibility; not read by this method. Default: None
+
+        Returns:
+            dict[str, Tensor]: which has components below:
+                - loss_center_heatmap (Tensor): loss of center heatmap.
+                - loss_wh (Tensor): loss of wh heatmap.
+                - loss_offset (Tensor): loss of offset heatmap.
+        """
+        assert len(center_heatmap_preds) == len(wh_preds) == len(
+            offset_preds) == 1
+        heatmap_pred = center_heatmap_preds[0]
+        wh_pred = wh_preds[0]
+        offset_pred = offset_preds[0]
+
+        targets, avg_factor = self.get_targets(
+            gt_bboxes, gt_labels, heatmap_pred.shape,
+            img_metas[0]['pad_shape'])
+        # The same weight map masks both regression losses.
+        reg_weight = targets['wh_offset_target_weight']
+        # wh and offset targets have two channels, so their normalizer is
+        # twice the one used for the heatmap loss.
+        reg_avg_factor = avg_factor * 2
+
+        losses = dict()
+        losses['loss_center_heatmap'] = self.loss_center_heatmap(
+            heatmap_pred,
+            targets['center_heatmap_target'],
+            avg_factor=avg_factor)
+        losses['loss_wh'] = self.loss_wh(
+            wh_pred,
+            targets['wh_target'],
+            reg_weight,
+            avg_factor=reg_avg_factor)
+        losses['loss_offset'] = self.loss_offset(
+            offset_pred,
+            targets['offset_target'],
+            reg_weight,
+            avg_factor=reg_avg_factor)
+        return losses
+
+    def get_targets(self, gt_bboxes, gt_labels, feat_shape, img_shape):
+        """Build heatmap, wh and offset targets for a batch of images.
+
+        Args:
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each
+                box.
+            feat_shape (list[int]): Feature map shape with value
+                [B, _, H, W].
+            img_shape (list[int]): Image shape; its first two entries are
+                read as (h, w).
+
+        Returns:
+            tuple[dict,float]: The float value is mean avg_factor, the dict
+               has components below:
+               - center_heatmap_target (Tensor): targets of center heatmap, \
+                   shape (B, num_classes, H, W).
+               - wh_target (Tensor): targets of wh predict, shape \
+                   (B, 2, H, W).
+               - offset_target (Tensor): targets of offset predict, shape \
+                   (B, 2, H, W).
+               - wh_offset_target_weight (Tensor): weights of wh and offset \
+                   predict, shape (B, 2, H, W).
+        """
+        img_h, img_w = img_shape[:2]
+        bs, _, feat_h, feat_w = feat_shape
+
+        # Ratios that map image coordinates onto the feature map.
+        width_ratio = float(feat_w / img_w)
+        height_ratio = float(feat_h / img_h)
+
+        # All targets are allocated from the last image's gt boxes, so they
+        # share that tensor's dtype and device.
+        reference = gt_bboxes[-1]
+        reg_shape = [bs, 2, feat_h, feat_w]
+        center_heatmap_target = reference.new_zeros(
+            [bs, self.num_classes, feat_h, feat_w])
+        wh_target = reference.new_zeros(reg_shape)
+        offset_target = reference.new_zeros(reg_shape)
+        wh_offset_target_weight = reference.new_zeros(reg_shape)
+
+        for batch_id in range(bs):
+            boxes = gt_bboxes[batch_id]
+            classes = gt_labels[batch_id]
+            center_x = (boxes[:, [0]] + boxes[:, [2]]) * width_ratio / 2
+            center_y = (boxes[:, [1]] + boxes[:, [3]]) * height_ratio / 2
+            centers = torch.cat((center_x, center_y), dim=1)
+
+            for j, center in enumerate(centers):
+                ctx, cty = center
+                # Integer cell that holds the (fractional) box center.
+                ctx_int, cty_int = center.int()
+                box = boxes[j]
+                scale_box_h = (box[3] - box[1]) * height_ratio
+                scale_box_w = (box[2] - box[0]) * width_ratio
+                radius = gaussian_radius([scale_box_h, scale_box_w],
+                                         min_overlap=0.3)
+                radius = max(0, int(radius))
+                cls_idx = classes[j]
+                gen_gaussian_target(center_heatmap_target[batch_id, cls_idx],
+                                    [ctx_int, cty_int], radius)
+
+                # Regression targets are written only at the center cell;
+                # a later box landing on the same cell overwrites them.
+                wh_target[batch_id, 0, cty_int, ctx_int] = scale_box_w
+                wh_target[batch_id, 1, cty_int, ctx_int] = scale_box_h
+
+                offset_target[batch_id, 0, cty_int, ctx_int] = ctx - ctx_int
+                offset_target[batch_id, 1, cty_int, ctx_int] = cty - cty_int
+
+                wh_offset_target_weight[batch_id, :, cty_int, ctx_int] = 1
+
+        # Number of heatmap cells equal to 1, clamped to at least 1.
+        avg_factor = max(1, center_heatmap_target.eq(1).sum())
+        target_result = dict(
+            center_heatmap_target=center_heatmap_target,
+            wh_target=wh_target,
+            offset_target=offset_target,
+            wh_offset_target_weight=wh_offset_target_weight)
+        return target_result, avg_factor
+
+    @force_fp32(apply_to=('center_heatmap_preds', 'wh_preds', 'offset_preds'))
+    def get_bboxes(self,
+                   center_heatmap_preds,
+                   wh_preds,
+                   offset_preds,
+                   img_metas,
+                   rescale=True,
+                   with_nms=False):
+        """Decode the batched network output image by image.
+
+        Exactly one feature level is supported; every prediction list must
+        have length 1.
+
+        Args:
+            center_heatmap_preds (list[Tensor]): Center heatmap predictions
+                for all levels with shape (B, num_classes, H, W).
+            wh_preds (list[Tensor]): Width/height predictions for all levels
+                with shape (B, 2, H, W).
+            offset_preds (list[Tensor]): Offset predictions for all levels
+                with shape (B, 2, H, W).
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool): If True, return boxes in original image space.
+                Default: True.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: False.
+
+        Returns:
+            list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
+                The first item is an (n, 5) tensor, where 5 represent
+                (tl_x, tl_y, br_x, br_y, score) and the score between 0 and 1.
+                The shape of the second tensor in the tuple is (n,), and
+                each element represents the class label of the corresponding
+                box.
+        """
+        assert len(center_heatmap_preds) == len(wh_preds) == len(
+            offset_preds) == 1
+        # Slicing with ``img_id:img_id + 1`` keeps the leading batch
+        # dimension (size 1) for the per-image decoder.
+        return [
+            self._get_bboxes_single(
+                center_heatmap_preds[0][img_id:img_id + 1, ...],
+                wh_preds[0][img_id:img_id + 1, ...],
+                offset_preds[0][img_id:img_id + 1, ...],
+                img_meta,
+                rescale=rescale,
+                with_nms=with_nms)
+            for img_id, img_meta in enumerate(img_metas)
+        ]
+
+    def _get_bboxes_single(self,
+                           center_heatmap_pred,
+                           wh_pred,
+                           offset_pred,
+                           img_meta,
+                           rescale=False,
+                           with_nms=True):
+        """Transform outputs of a single image into bbox results.
+
+        Args:
+            center_heatmap_pred (Tensor): Center heatmap for current level
+                with shape (1, num_classes, H, W).
+            wh_pred (Tensor): WH prediction for current level with shape
+                (1, 2, H, W).
+            offset_pred (Tensor): Offset prediction for current level with
+                shape (1, 2, H, W).
+            img_meta (dict): Meta information of current image. The keys
+                'batch_input_shape' and 'border' are always read, and
+                'scale_factor' is read when ``rescale`` is True.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+
+        Returns:
+            tuple[Tensor, Tensor]: The first item is an (n, 5) tensor, where
+                5 represent (tl_x, tl_y, br_x, br_y, score) and the score
+                between 0 and 1. The shape of the second tensor in the tuple
+                is (n,), and each element represents the class label of the
+                corresponding box.
+        """
+        cfg = self.test_cfg
+        batch_det_bboxes, batch_labels = self.decode_heatmap(
+            center_heatmap_pred,
+            wh_pred,
+            offset_pred,
+            img_meta['batch_input_shape'],
+            k=cfg.topk,
+            kernel=cfg.local_maximum_kernel)
+
+        # Drop the batch dimension: one row per detection.
+        det_bboxes = batch_det_bboxes.view([-1, 5])
+        det_labels = batch_labels.view(-1)
+
+        # Entries 2 and 0 of 'border' are subtracted from the x and y
+        # coordinates respectively.
+        # NOTE(review): presumably the left/top padding -- confirm against
+        # the pipeline that writes 'border'.
+        border = det_bboxes.new_tensor(img_meta['border'])[..., [2, 0, 2, 0]]
+        det_bboxes[..., :4] -= border
+
+        if rescale:
+            scale = det_bboxes.new_tensor(img_meta['scale_factor'])
+            det_bboxes[..., :4] /= scale
+
+        if not with_nms:
+            return det_bboxes, det_labels
+        det_bboxes, det_labels = self._bboxes_nms(det_bboxes, det_labels, cfg)
+        return det_bboxes, det_labels
+
+    def decode_heatmap(self,
+                       center_heatmap_pred,
+                       wh_pred,
+                       offset_pred,
+                       img_shape,
+                       k=100,
+                       kernel=3):
+        """Decode raw head outputs into scored boxes in input-image scale.
+
+        Args:
+            center_heatmap_pred (Tensor): Center heatmap prediction,
+               shape (B, num_classes, H, W).
+            wh_pred (Tensor): Width/height prediction, shape (B, 2, H, W).
+            offset_pred (Tensor): Offset prediction, shape (B, 2, H, W).
+            img_shape (list[int]): Image shape in [h, w] format.
+            k (int): Get top k center keypoints from heatmap. Default 100.
+            kernel (int): Max pooling kernel for extract local maximum pixels.
+               Default 3.
+
+        Returns:
+            tuple[torch.Tensor]: Decoded output of CenterNetHead, containing
+               the following Tensors:
+
+              - batch_bboxes (Tensor): Coords of each box with shape (B, k, 5)
+              - batch_topk_labels (Tensor): Categories of each box with \
+                  shape (B, k)
+        """
+        feat_h, feat_w = center_heatmap_pred.shape[2:]
+        inp_h, inp_w = img_shape
+
+        peaks = get_local_maximum(center_heatmap_pred, kernel=kernel)
+        (batch_scores, batch_index, batch_topk_labels, topk_ys,
+         topk_xs) = get_topk_from_heatmap(peaks, k=k)
+
+        # Pick the wh / offset predictions at the selected peak locations.
+        wh = transpose_and_gather_feat(wh_pred, batch_index)
+        offset = transpose_and_gather_feat(offset_pred, batch_index)
+
+        # Refine the integer peak positions with the predicted offsets.
+        center_xs = topk_xs + offset[..., 0]
+        center_ys = topk_ys + offset[..., 1]
+        half_w = wh[..., 0] / 2
+        half_h = wh[..., 1] / 2
+
+        # Feature-map coordinates -> input-image coordinates.
+        scale_x = inp_w / feat_w
+        scale_y = inp_h / feat_h
+        corners = [
+            (center_xs - half_w) * scale_x,
+            (center_ys - half_h) * scale_y,
+            (center_xs + half_w) * scale_x,
+            (center_ys + half_h) * scale_y,
+        ]
+
+        batch_bboxes = torch.stack(corners, dim=2)
+        # Append the score as the fifth column.
+        batch_bboxes = torch.cat((batch_bboxes, batch_scores[..., None]),
+                                 dim=-1)
+        return batch_bboxes, batch_topk_labels
+
+    def _bboxes_nms(self, bboxes, labels, cfg):
+        """Run class-aware NMS and keep labels aligned with the kept boxes.
+
+        Args:
+            bboxes (Tensor): Detections with shape (n, 5). Columns 0-3 are
+                passed to NMS as boxes and the last column as scores.
+            labels (Tensor): Class label of each detection, shape (n,).
+            cfg: Test config providing ``max_per_img`` and ``nms``.
+
+        Returns:
+            tuple[Tensor, Tensor]: The boxes returned by ``batched_nms`` and
+                their labels, truncated to ``cfg.max_per_img`` entries when
+                that value is positive. The inputs are returned untouched
+                when ``labels`` is empty.
+        """
+        if labels.numel() == 0:
+            return bboxes, labels
+
+        max_num = cfg.max_per_img
+        bboxes, keep = batched_nms(bboxes[:, :4], bboxes[:, -1].contiguous(),
+                                   labels, cfg.nms)
+        # Always select the surviving labels. Doing this only when
+        # ``max_per_img > 0`` left ``labels`` unfiltered for non-positive
+        # limits, so boxes and labels no longer lined up.
+        labels = labels[keep]
+        if max_num > 0:
+            bboxes = bboxes[:max_num]
+            labels = labels[:max_num]
+        return bboxes, labels
diff --git a/mmdet/models/dense_heads/centripetal_head.py b/mmdet/models/dense_heads/centripetal_head.py
new file mode 100755
index 0000000..ebc721b
--- /dev/null
+++ b/mmdet/models/dense_heads/centripetal_head.py
@@ -0,0 +1,430 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import ConvModule, normal_init
+from mmcv.ops import DeformConv2d
+from mmcv.runner import force_fp32
+
+from mmdet.core import multi_apply
+from ..builder import HEADS, build_loss
+from .corner_head import CornerHead
+
+
+@HEADS.register_module()
+class CentripetalHead(CornerHead):
+    """Head of CentripetalNet: Pursuing High-quality Keypoint Pairs for Object
+    Detection.
+
+    CentripetalHead inherits from :class:`CornerHead`. It removes the
+    embedding branch and adds guiding shift and centripetal shift branches.
+    More details can be found in the `paper
+    <https://arxiv.org/abs/2003.09119>`_ .
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        num_feat_levels (int): Levels of feature from the previous module. 2
+            for HourglassNet-104 and 1 for HourglassNet-52. HourglassNet-104
+            outputs the final feature and intermediate supervision feature and
+            HourglassNet-52 only outputs the final feature. Default: 2.
+        corner_emb_channels (int): Channel of embedding vector. Default: 1.
+        train_cfg (dict | None): Training config. Useless in CornerHead,
+            but we keep this variable for SingleStageDetector. Default: None.
+        test_cfg (dict | None): Testing config of CornerHead. Default: None.
+        loss_heatmap (dict | None): Config of corner heatmap loss. Default:
+            GaussianFocalLoss.
+        loss_embedding (dict | None): Config of corner embedding loss. Default:
+            AssociativeEmbeddingLoss.
+        loss_offset (dict | None): Config of corner offset loss. Default:
+            SmoothL1Loss.
+        loss_guiding_shift (dict): Config of guiding shift loss. Default:
+            SmoothL1Loss.
+        loss_centripetal_shift (dict): Config of centripetal shift loss.
+            Default: SmoothL1Loss.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 *args,
+                 centripetal_shift_channels=2,
+                 guiding_shift_channels=2,
+                 feat_adaption_conv_kernel=3,
+                 loss_guiding_shift=dict(
+                     type='SmoothL1Loss', beta=1.0, loss_weight=0.05),
+                 loss_centripetal_shift=dict(
+                     type='SmoothL1Loss', beta=1.0, loss_weight=1),
+                 init_cfg=None,
+                 **kwargs):
+        """Store the centripetal-specific options, then build the head.
+
+        Args:
+            *args: Positional arguments forwarded to the parent constructor.
+            centripetal_shift_channels (int): Channels of the centripetal
+                shift outputs. Only 2 is accepted.
+            guiding_shift_channels (int): Channels of the guiding shift
+                outputs. Only 2 is accepted.
+            feat_adaption_conv_kernel (int): Kernel size of the feature
+                adaption deformable convs.
+            loss_guiding_shift (dict): Config passed to ``build_loss`` for
+                the guiding shift loss.
+            loss_centripetal_shift (dict): Config passed to ``build_loss``
+                for the centripetal shift loss.
+            init_cfg (None): Must be left as None.
+            **kwargs: Keyword arguments forwarded to the parent constructor.
+        """
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        assert centripetal_shift_channels == 2, (
+            'CentripetalHead only support centripetal_shift_channels == 2')
+        self.centripetal_shift_channels = centripetal_shift_channels
+        assert guiding_shift_channels == 2, (
+            'CentripetalHead only support guiding_shift_channels == 2')
+        self.guiding_shift_channels = guiding_shift_channels
+        self.feat_adaption_conv_kernel = feat_adaption_conv_kernel
+        # NOTE(review): the attributes above are assigned before the parent
+        # constructor runs, presumably because it calls ``_init_layers``,
+        # which reads them -- confirm against CornerHead.__init__.
+        super(CentripetalHead, self).__init__(
+            *args, init_cfg=init_cfg, **kwargs)
+        self.loss_guiding_shift = build_loss(loss_guiding_shift)
+        self.loss_centripetal_shift = build_loss(loss_centripetal_shift)
+
+    def _init_centripetal_layers(self):
+        """Create the centripetal-specific layers for every feature level.
+
+        Four kinds of branches are built: feature adaption deform convs
+        (feat_adaption), deform offset prediction convs (dcn_offset),
+        guiding shift (guiding_shift) and centripetal shift
+        (centripetal_shift). Each kind exists twice: prefix `tl_` for
+        top-left and `br_` for bottom-right.
+        """
+        self.tl_feat_adaption = nn.ModuleList()
+        self.br_feat_adaption = nn.ModuleList()
+        self.tl_dcn_offset = nn.ModuleList()
+        self.br_dcn_offset = nn.ModuleList()
+        self.tl_guiding_shift = nn.ModuleList()
+        self.br_guiding_shift = nn.ModuleList()
+        self.tl_centripetal_shift = nn.ModuleList()
+        self.br_centripetal_shift = nn.ModuleList()
+
+        in_channels = self.in_channels
+        shift_channels = self.guiding_shift_channels
+        kernel = self.feat_adaption_conv_kernel
+        # The offset conv outputs kernel**2 * guiding_shift_channels
+        # channels, which are fed to the deformable conv as its offsets.
+        dcn_offset_channels = kernel**2 * shift_channels
+
+        # Within each level, modules are created top-left first, then
+        # bottom-right, one branch kind after another.
+        for _ in range(self.num_feat_levels):
+            for adaption in (self.tl_feat_adaption, self.br_feat_adaption):
+                adaption.append(
+                    DeformConv2d(in_channels, in_channels, kernel, 1, 1))
+
+            for guiding in (self.tl_guiding_shift, self.br_guiding_shift):
+                guiding.append(
+                    self._make_layers(
+                        out_channels=shift_channels,
+                        in_channels=in_channels))
+
+            for dcn_offset in (self.tl_dcn_offset, self.br_dcn_offset):
+                dcn_offset.append(
+                    ConvModule(
+                        shift_channels,
+                        dcn_offset_channels,
+                        1,
+                        bias=False,
+                        act_cfg=None))
+
+            for centripetal in (self.tl_centripetal_shift,
+                                self.br_centripetal_shift):
+                centripetal.append(
+                    self._make_layers(
+                        out_channels=self.centripetal_shift_channels,
+                        in_channels=in_channels))
+
+    def _init_layers(self):
+        """Initialize layers for CentripetalHead.
+
+        The layers of the parent class are created first, followed by the
+        centripetal-specific layers.
+        """
+        super(CentripetalHead, self)._init_layers()
+        self._init_centripetal_layers()
+
+    def init_weights(self):
+        """Initialize parent-class weights and the centripetal branches.
+
+        For every feature level, the feature adaption convs are initialized
+        with ``normal_init`` (std=0.01), the deform offset convs with
+        ``normal_init`` (std=0.1), and every conv inside the guiding shift
+        and centripetal shift branches is reset with its own
+        ``reset_parameters``.
+        """
+        super(CentripetalHead, self).init_weights()
+        for i in range(self.num_feat_levels):
+            normal_init(self.tl_feat_adaption[i], std=0.01)
+            normal_init(self.br_feat_adaption[i], std=0.01)
+            normal_init(self.tl_dcn_offset[i].conv, std=0.1)
+            normal_init(self.br_dcn_offset[i].conv, std=0.1)
+            # Plain loops instead of list comprehensions used only for their
+            # side effects; the reset order is unchanged.
+            shift_branches = (self.tl_guiding_shift[i],
+                              self.br_guiding_shift[i],
+                              self.tl_centripetal_shift[i],
+                              self.br_centripetal_shift[i])
+            for branch in shift_branches:
+                for layer in branch:
+                    layer.conv.reset_parameters()
+
+    def forward_single(self, x, lvl_ind):
+        """Forward feature of a single level.
+
+        Args:
+            x (Tensor): Feature of a single level.
+            lvl_ind (int): Level index of current feature.
+
+        Returns:
+            list[Tensor]: CentripetalHead's output for the current feature
+            level. Containing the following Tensors, in this order:
+
+                - tl_heat (Tensor): Predicted top-left corner heatmap.
+                - br_heat (Tensor): Predicted bottom-right corner heatmap.
+                - tl_off (Tensor): Predicted top-left offset heatmap.
+                - br_off (Tensor): Predicted bottom-right offset heatmap.
+                - tl_guiding_shift (Tensor): Predicted top-left guiding shift
+                  heatmap.
+                - br_guiding_shift (Tensor): Predicted bottom-right guiding
+                  shift heatmap.
+                - tl_centripetal_shift (Tensor): Predicted top-left centripetal
+                  shift heatmap.
+                - br_centripetal_shift (Tensor): Predicted bottom-right
+                  centripetal shift heatmap.
+        """
+        parent_outs = super().forward_single(x, lvl_ind, return_pool=True)
+        # The third and fourth parent outputs are discarded here.
+        tl_heat, br_heat, _, _, tl_off, br_off, tl_pool, br_pool = parent_outs
+
+        tl_guiding = self.tl_guiding_shift[lvl_ind](tl_pool)
+        br_guiding = self.br_guiding_shift[lvl_ind](br_pool)
+
+        # The guiding shifts are detached, so no gradient flows back into
+        # them through the deformable-conv offsets.
+        tl_offsets = self.tl_dcn_offset[lvl_ind](tl_guiding.detach())
+        br_offsets = self.br_dcn_offset[lvl_ind](br_guiding.detach())
+
+        tl_adapted = self.tl_feat_adaption[lvl_ind](tl_pool, tl_offsets)
+        br_adapted = self.br_feat_adaption[lvl_ind](br_pool, br_offsets)
+
+        tl_centripetal = self.tl_centripetal_shift[lvl_ind](tl_adapted)
+        br_centripetal = self.br_centripetal_shift[lvl_ind](br_adapted)
+
+        return [
+            tl_heat, br_heat, tl_off, br_off, tl_guiding, br_guiding,
+            tl_centripetal, br_centripetal
+        ]
+
+    @force_fp32()
+    def loss(self,
+             tl_heats,
+             br_heats,
+             tl_offs,
+             br_offs,
+             tl_guiding_shifts,
+             br_guiding_shifts,
+             tl_centripetal_shifts,
+             br_centripetal_shifts,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute losses of the head.
+
+        Targets are generated once, from the shape of the last level's
+        top-left heatmap, and shared by all feature levels.
+
+        Args:
+            tl_heats (list[Tensor]): Top-left corner heatmaps for each level
+                with shape (N, num_classes, H, W).
+            br_heats (list[Tensor]): Bottom-right corner heatmaps for each
+                level with shape (N, num_classes, H, W).
+            tl_offs (list[Tensor]): Top-left corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            br_offs (list[Tensor]): Bottom-right corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            tl_guiding_shifts (list[Tensor]): Top-left guiding shifts for each
+                level with shape (N, guiding_shift_channels, H, W).
+            br_guiding_shifts (list[Tensor]): Bottom-right guiding shifts for
+                each level with shape (N, guiding_shift_channels, H, W).
+            tl_centripetal_shifts (list[Tensor]): Top-left centripetal shifts
+                for each level with shape (N, centripetal_shift_channels, H,
+                W).
+            br_centripetal_shifts (list[Tensor]): Bottom-right centripetal
+                shifts for each level with shape (N,
+                centripetal_shift_channels, H, W).
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [left, top, right, bottom] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box.
+            img_metas (list[dict]): Meta information of each image. Only
+                ``img_metas[0]['pad_shape']`` is read here.
+            gt_bboxes_ignore (list[Tensor] | None): Accepted for interface
+                compatibility; not read by this method.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components. Containing the
+            following losses:
+
+                - det_loss (list[Tensor]): Corner keypoint losses of all
+                  feature levels.
+                - off_loss (list[Tensor]): Corner offset losses of all feature
+                  levels.
+                - guiding_loss (list[Tensor]): Guiding shift losses of all
+                  feature levels.
+                - centripetal_loss (list[Tensor]): Centripetal shift losses of
+                  all feature levels.
+        """
+        targets = self.get_targets(
+            gt_bboxes,
+            gt_labels,
+            tl_heats[-1].shape,
+            img_metas[0]['pad_shape'],
+            with_corner_emb=self.with_corner_emb,
+            with_guiding_shift=True,
+            with_centripetal_shift=True)
+        # Every level receives a reference to the same targets dict.
+        mlvl_targets = [targets] * self.num_feat_levels
+        per_level_losses = multi_apply(
+            self.loss_single, tl_heats, br_heats, tl_offs, br_offs,
+            tl_guiding_shifts, br_guiding_shifts, tl_centripetal_shifts,
+            br_centripetal_shifts, mlvl_targets)
+        (det_losses, off_losses, guiding_losses,
+         centripetal_losses) = per_level_losses
+        return dict(
+            det_loss=det_losses,
+            off_loss=off_losses,
+            guiding_loss=guiding_losses,
+            centripetal_loss=centripetal_losses)
+
+    def loss_single(self, tl_hmp, br_hmp, tl_off, br_off, tl_guiding_shift,
+                    br_guiding_shift, tl_centripetal_shift,
+                    br_centripetal_shift, targets):
+        """Compute the losses of a single feature level.
+
+        The corner keypoint and offset losses are taken from the parent
+        class; the guiding shift and centripetal shift losses are added here.
+
+        Args:
+            tl_hmp (Tensor): Top-left corner heatmap for current level with
+                shape (N, num_classes, H, W).
+            br_hmp (Tensor): Bottom-right corner heatmap for current level with
+                shape (N, num_classes, H, W).
+            tl_off (Tensor): Top-left corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            br_off (Tensor): Bottom-right corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            tl_guiding_shift (Tensor): Top-left guiding shift for current level
+                with shape (N, guiding_shift_channels, H, W).
+            br_guiding_shift (Tensor): Bottom-right guiding shift for current
+                level with shape (N, guiding_shift_channels, H, W).
+            tl_centripetal_shift (Tensor): Top-left centripetal shift for
+                current level with shape (N, centripetal_shift_channels, H, W).
+            br_centripetal_shift (Tensor): Bottom-right centripetal shift for
+                current level with shape (N, centripetal_shift_channels, H, W).
+            targets (dict): Corner targets generated by `get_targets`. Its
+                `corner_embedding` entry is set to None in place.
+
+        Returns:
+            tuple[torch.Tensor]: Losses of the head's different branches
+            containing the following losses:
+
+                - det_loss (Tensor): Corner keypoint loss.
+                - off_loss (Tensor): Corner offset loss.
+                - guiding_loss (Tensor): Guiding shift loss.
+                - centripetal_loss (Tensor): Centripetal shift loss.
+        """
+        # The embedding losses are unused here, so the embedding target is
+        # cleared and None embeddings are handed to the parent class.
+        targets['corner_embedding'] = None
+        det_loss, _, _, off_loss = super().loss_single(
+            tl_hmp, br_hmp, None, None, tl_off, br_off, targets)
+
+        # Ground truth shifts, ordered as (top-left, bottom-right).
+        gt_guiding_shifts = (targets['topleft_guiding_shift'],
+                             targets['bottomright_guiding_shift'])
+        gt_centripetal_shifts = (targets['topleft_centripetal_shift'],
+                                 targets['bottomright_centripetal_shift'])
+
+        # The shift losses are only computed at the real corner positions,
+        # whose value is 1 in the ground truth heatmap. Each mask is class
+        # agnostic and has shape batch * 1 * height * width.
+        corner_masks = []
+        for gt_heatmap in (targets['topleft_heatmap'],
+                           targets['bottomright_heatmap']):
+            is_corner = gt_heatmap.eq(1).sum(1).gt(0)
+            corner_masks.append(is_corner.unsqueeze(1).type_as(gt_heatmap))
+
+        # Guiding shift loss: mean of the top-left and bottom-right terms.
+        # avg_factor is the number of corner positions marked in the mask.
+        guiding_terms = []
+        for pred_shift, gt_shift, mask in zip(
+                (tl_guiding_shift, br_guiding_shift), gt_guiding_shifts,
+                corner_masks):
+            guiding_terms.append(
+                self.loss_guiding_shift(
+                    pred_shift, gt_shift, mask, avg_factor=mask.sum()))
+        guiding_loss = (guiding_terms[0] + guiding_terms[1]) / 2.0
+
+        # Centripetal shift loss: mean of the top-left and bottom-right terms.
+        centripetal_terms = []
+        for pred_shift, gt_shift, mask in zip(
+                (tl_centripetal_shift, br_centripetal_shift),
+                gt_centripetal_shifts, corner_masks):
+            centripetal_terms.append(
+                self.loss_centripetal_shift(
+                    pred_shift, gt_shift, mask, avg_factor=mask.sum()))
+        centripetal_loss = (centripetal_terms[0] +
+                            centripetal_terms[1]) / 2.0
+
+        return det_loss, off_loss, guiding_loss, centripetal_loss
+
+    @force_fp32()
+    def get_bboxes(self,
+                   tl_heats,
+                   br_heats,
+                   tl_offs,
+                   br_offs,
+                   tl_guiding_shifts,
+                   br_guiding_shifts,
+                   tl_centripetal_shifts,
+                   br_centripetal_shifts,
+                   img_metas,
+                   rescale=False,
+                   with_nms=True):
+        """Decode the last-level network outputs of a batch into bboxes.
+
+        Args:
+            tl_heats (list[Tensor]): Top-left corner heatmaps for each level
+                with shape (N, num_classes, H, W).
+            br_heats (list[Tensor]): Bottom-right corner heatmaps for each
+                level with shape (N, num_classes, H, W).
+            tl_offs (list[Tensor]): Top-left corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            br_offs (list[Tensor]): Bottom-right corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            tl_guiding_shifts (list[Tensor]): Top-left guiding shifts for each
+                level with shape (N, guiding_shift_channels, H, W). Unused
+                here; kept because it is part of the raw head output.
+            br_guiding_shifts (list[Tensor]): Bottom-right guiding shifts for
+                each level with shape (N, guiding_shift_channels, H, W).
+                Unused here; kept because it is part of the raw head output.
+            tl_centripetal_shifts (list[Tensor]): Top-left centripetal shifts
+                for each level, shape (N, centripetal_shift_channels, H, W).
+            br_centripetal_shifts (list[Tensor]): Bottom-right centripetal
+                shifts for each level with shape (N,
+                centripetal_shift_channels, H, W).
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+
+        Returns:
+            list: `_get_bboxes_single` results, one per image in `img_metas`.
+        """
+        assert tl_heats[-1].shape[0] == br_heats[-1].shape[0] == len(img_metas)
+
+        def last_level(mlvl_feats, img_id):
+            # Slice instead of index so that the batch dimension is kept.
+            return mlvl_feats[-1][img_id:img_id + 1, :]
+
+        return [
+            self._get_bboxes_single(
+                last_level(tl_heats, img_id),
+                last_level(br_heats, img_id),
+                last_level(tl_offs, img_id),
+                last_level(br_offs, img_id),
+                img_meta,
+                tl_emb=None,
+                br_emb=None,
+                tl_centripetal_shift=last_level(tl_centripetal_shifts, img_id),
+                br_centripetal_shift=last_level(br_centripetal_shifts, img_id),
+                rescale=rescale,
+                with_nms=with_nms) for img_id, img_meta in enumerate(img_metas)
+        ]
diff --git a/mmdet/models/dense_heads/corner_head.py b/mmdet/models/dense_heads/corner_head.py
new file mode 100755
index 0000000..c6a2866
--- /dev/null
+++ b/mmdet/models/dense_heads/corner_head.py
@@ -0,0 +1,1086 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from logging import warning
+from math import ceil, log
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, bias_init_with_prob
+from mmcv.ops import CornerPool, batched_nms
+from mmcv.runner import BaseModule, force_fp32
+
+from mmdet.core import multi_apply
+from ..builder import HEADS, build_loss
+from ..utils import gaussian_radius, gen_gaussian_target
+from ..utils.gaussian_target import (gather_feat, get_local_maximum,
+                                     get_topk_from_heatmap,
+                                     transpose_and_gather_feat)
+from .base_dense_head import BaseDenseHead
+from .dense_test_mixins import BBoxTestMixin
+
+
+class BiCornerPool(BaseModule):
+    """Bidirectional Corner Pooling Module (TopLeft, BottomRight, etc.)
+
+    Args:
+        in_channels (int): Input channels of module.
+        directions (list[str]): Directions of two CornerPools.
+        feat_channels (int): Feature channels of module. Default: 128.
+        out_channels (int): Output channels of module. Default: 128.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 directions,
+                 feat_channels=128,
+                 out_channels=128,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 init_cfg=None):
+        super(BiCornerPool, self).__init__(init_cfg)
+        # One 3x3 conv in front of each directional corner pool.
+        self.direction1_conv = ConvModule(
+            in_channels, feat_channels, 3, padding=1, norm_cfg=norm_cfg)
+        self.direction2_conv = ConvModule(
+            in_channels, feat_channels, 3, padding=1, norm_cfg=norm_cfg)
+
+        # Applied to the sum of both pooled features, without activation.
+        self.aftpool_conv = ConvModule(
+            feat_channels, out_channels, 3, padding=1, norm_cfg=norm_cfg,
+            act_cfg=None)
+
+        # 1x1 shortcut branch on the module input, without activation.
+        self.conv1 = ConvModule(
+            in_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=None)
+        # conv2 consumes relu(aftpool_conv + conv1), which has out_channels
+        # channels, so its input width must be out_channels (not in_channels).
+        self.conv2 = ConvModule(
+            out_channels, out_channels, 3, padding=1, norm_cfg=norm_cfg)
+
+        self.direction1_pool = CornerPool(directions[0])
+        self.direction2_pool = CornerPool(directions[1])
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        """Forward features from the upstream network.
+
+        Args:
+            x (tensor): Input feature of BiCornerPool.
+
+        Returns:
+            tensor: Output feature of BiCornerPool.
+        """
+        direction1_conv = self.direction1_conv(x)
+        direction2_conv = self.direction2_conv(x)
+        direction1_feat = self.direction1_pool(direction1_conv)
+        direction2_feat = self.direction2_pool(direction2_conv)
+        aftpool_conv = self.aftpool_conv(direction1_feat + direction2_feat)
+        conv1 = self.conv1(x)
+        relu = self.relu(aftpool_conv + conv1)
+        return self.conv2(relu)
+
+
+@HEADS.register_module()
+class CornerHead(BaseDenseHead, BBoxTestMixin):
+    """Head of CornerNet: Detecting Objects as Paired Keypoints.
+
+    Code is modified from the `official github repo
+    <https://github.com/princeton-vl/CornerNet/blob/master/models/py_utils/
+    kp.py#L73>`_ .
+
+    More details can be found in the `paper
+    <https://arxiv.org/abs/1808.01244>`_ .
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        num_feat_levels (int): Levels of feature from the previous module. 2
+            for HourglassNet-104 and 1 for HourglassNet-52. Because
+            HourglassNet-104 outputs the final feature and intermediate
+            supervision feature and HourglassNet-52 only outputs the final
+            feature. Default: 2.
+        corner_emb_channels (int): Channel of embedding vector. Default: 1.
+        train_cfg (dict | None): Training config. Useless in CornerHead,
+            but we keep this variable for SingleStageDetector. Default: None.
+        test_cfg (dict | None): Testing config of CornerHead. Default: None.
+        loss_heatmap (dict | None): Config of corner heatmap loss. Default:
+            GaussianFocalLoss.
+        loss_embedding (dict | None): Config of corner embedding loss. Default:
+            AssociativeEmbeddingLoss.
+        loss_offset (dict | None): Config of corner offset loss. Default:
+            SmoothL1Loss.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 num_feat_levels=2,
+                 corner_emb_channels=1,
+                 train_cfg=None,
+                 test_cfg=None,
+                 loss_heatmap=dict(
+                     type='GaussianFocalLoss',
+                     alpha=2.0,
+                     gamma=4.0,
+                     loss_weight=1),
+                 loss_embedding=dict(
+                     type='AssociativeEmbeddingLoss',
+                     pull_weight=0.25,
+                     push_weight=0.25),
+                 loss_offset=dict(
+                     type='SmoothL1Loss', beta=1.0, loss_weight=1),
+                 init_cfg=None):
+        assert init_cfg is None, (
+            'To prevent abnormal initialization behavior, init_cfg is not '
+            'allowed to be set')
+        super(CornerHead, self).__init__(init_cfg)
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.num_feat_levels = num_feat_levels
+        self.corner_offset_channels = 2
+        self.corner_emb_channels = corner_emb_channels
+        # The embedding branch exists only for a positive channel number.
+        self.with_corner_emb = corner_emb_channels > 0
+        # Each loss is built from its config; a None config gives None.
+        for loss_name, loss_cfg in (('loss_heatmap', loss_heatmap),
+                                    ('loss_embedding', loss_embedding),
+                                    ('loss_offset', loss_offset)):
+            loss_module = None if loss_cfg is None else build_loss(loss_cfg)
+            setattr(self, loss_name, loss_module)
+        self.train_cfg, self.test_cfg = train_cfg, test_cfg
+        self.fp16_enabled = False
+        self._init_layers()
+
+    def _make_layers(self, out_channels, in_channels=256, feat_channels=256):
+        """Build a 3x3 ConvModule followed by a plain 1x1 output conv."""
+        hidden_conv = ConvModule(in_channels, feat_channels, 3, padding=1)
+        output_conv = ConvModule(
+            feat_channels, out_channels, 1, norm_cfg=None, act_cfg=None)
+        return nn.Sequential(hidden_conv, output_conv)
+
+    def _init_corner_kpt_layers(self):
+        """Initialize corner keypoint layers.
+
+        For every feature level this builds the corner pooling modules, the
+        corner heatmap branches and the corner offset branches. Prefix `tl_`
+        stands for top-left and `br_` for bottom-right.
+        """
+        channels = self.in_channels
+        offset_channels = self.corner_offset_channels
+        self.tl_pool = nn.ModuleList()
+        self.br_pool = nn.ModuleList()
+        self.tl_heat = nn.ModuleList()
+        self.br_heat = nn.ModuleList()
+        self.tl_off = nn.ModuleList()
+        self.br_off = nn.ModuleList()
+
+        for _ in range(self.num_feat_levels):
+            # Per level: the two corner pools first, then the heatmap
+            # branches and finally the offset branches.
+            self.tl_pool.append(
+                BiCornerPool(channels, ['top', 'left'], out_channels=channels))
+            self.br_pool.append(
+                BiCornerPool(
+                    channels, ['bottom', 'right'], out_channels=channels))
+
+            self.tl_heat.append(
+                self._make_layers(
+                    out_channels=self.num_classes, in_channels=channels))
+            self.br_heat.append(
+                self._make_layers(
+                    out_channels=self.num_classes, in_channels=channels))
+            self.tl_off.append(
+                self._make_layers(
+                    out_channels=offset_channels, in_channels=channels))
+            self.br_off.append(
+                self._make_layers(
+                    out_channels=offset_channels, in_channels=channels))
+
+    def _init_corner_emb_layers(self):
+        """Initialize corner embedding layers.
+
+        Only the corner embedding branch is built here, once per feature
+        level and corner type: prefix `tl_` for top-left and `br_` for
+        bottom-right.
+        """
+        self.tl_emb = nn.ModuleList()
+        self.br_emb = nn.ModuleList()
+        # Both corner types share the same branch configuration.
+        emb_kwargs = dict(
+            out_channels=self.corner_emb_channels,
+            in_channels=self.in_channels)
+
+        for _ in range(self.num_feat_levels):
+            self.tl_emb.append(self._make_layers(**emb_kwargs))
+            self.br_emb.append(self._make_layers(**emb_kwargs))
+
+    def _init_layers(self):
+        """Build the keypoint layers and, if enabled, the embedding layers."""
+        self._init_corner_kpt_layers()
+        if not self.with_corner_emb:
+            # The embedding branch is disabled, nothing more to build.
+            return
+
+        self._init_corner_emb_layers()
+
+    def init_weights(self):
+        """Initialize weights; reset the output convs of every branch."""
+        super(CornerHead, self).init_weights()
+        heat_bias = bias_init_with_prob(0.1)
+        for lvl in range(self.num_feat_levels):
+            # nn.Conv2d and ConvModule initialize parameters differently.
+            # Our experiments show that using the original initialization of
+            # nn.Conv2d increases the final mAP by about 0.2%
+            for heat in (self.tl_heat[lvl], self.br_heat[lvl]):
+                heat[-1].conv.reset_parameters()
+                heat[-1].conv.bias.data.fill_(heat_bias)
+            for off in (self.tl_off[lvl], self.br_off[lvl]):
+                off[-1].conv.reset_parameters()
+            if not self.with_corner_emb:
+                continue
+            for emb in (self.tl_emb[lvl], self.br_emb[lvl]):
+                emb[-1].conv.reset_parameters()
+
+    def forward(self, feats):
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Usually a tuple of corner heatmaps, offset heatmaps and
+            embedding heatmaps.
+
+                - tl_heats (list[Tensor]): Top-left corner heatmaps for all
+                  levels, each is a 4D-tensor with num_classes channels.
+                - br_heats (list[Tensor]): Bottom-right corner heatmaps for all
+                  levels, each is a 4D-tensor with num_classes channels.
+                - tl_embs (list[Tensor] | list[None]): Top-left embedding
+                  heatmaps for all levels, each is None or a 4D-tensor with
+                  corner_emb_channels channels.
+                - br_embs (list[Tensor] | list[None]): Bottom-right embedding
+                  heatmaps for all levels, each is None or a 4D-tensor with
+                  corner_emb_channels channels.
+                - tl_offs (list[Tensor]): Top-left offset heatmaps for all
+                  levels, each is a 4D-tensor with corner_offset_channels
+                  channels.
+                - br_offs (list[Tensor]): Bottom-right offset heatmaps for all
+                  levels, each is a 4D-tensor with corner_offset_channels
+                  channels.
+        """
+        # Level indices let `forward_single` pick the per-level layers.
+        level_indices = list(range(self.num_feat_levels))
+        return multi_apply(self.forward_single, feats, level_indices)
+
+    def forward_single(self, x, lvl_ind, return_pool=False):
+        """Forward feature of a single level.
+
+        Args:
+            x (Tensor): Feature of a single level.
+            lvl_ind (int): Level index of current feature.
+            return_pool (bool): Return corner pool feature or not.
+                Default: False.
+
+        Returns:
+            list[Tensor]: CornerHead's output for current feature level.
+            Containing the following Tensors:
+
+                - tl_heat (Tensor): Predicted top-left corner heatmap.
+                - br_heat (Tensor): Predicted bottom-right corner heatmap.
+                - tl_emb (Tensor | None): Predicted top-left embedding heatmap.
+                  None for `self.with_corner_emb == False`.
+                - br_emb (Tensor | None): Predicted bottom-right embedding
+                  heatmap. None for `self.with_corner_emb == False`.
+                - tl_off (Tensor): Predicted top-left offset heatmap.
+                - br_off (Tensor): Predicted bottom-right offset heatmap.
+                - tl_pool (Tensor): Top-left corner pool feature. Only
+                  appended when `return_pool` is True.
+                - br_pool (Tensor): Bottom-right corner pool feature. Only
+                  appended when `return_pool` is True.
+        """
+        tl_feat = self.tl_pool[lvl_ind](x)
+        tl_heat = self.tl_heat[lvl_ind](tl_feat)
+        br_feat = self.br_pool[lvl_ind](x)
+        br_heat = self.br_heat[lvl_ind](br_feat)
+
+        if self.with_corner_emb:
+            tl_emb = self.tl_emb[lvl_ind](tl_feat)
+            br_emb = self.br_emb[lvl_ind](br_feat)
+        else:
+            tl_emb = br_emb = None
+
+        tl_off = self.tl_off[lvl_ind](tl_feat)
+        br_off = self.br_off[lvl_ind](br_feat)
+
+        outputs = [tl_heat, br_heat, tl_emb, br_emb, tl_off, br_off]
+        if return_pool:
+            outputs += [tl_feat, br_feat]
+        return outputs
+
+    def get_targets(self,
+                    gt_bboxes,
+                    gt_labels,
+                    feat_shape,
+                    img_shape,
+                    with_corner_emb=False,
+                    with_guiding_shift=False,
+                    with_centripetal_shift=False):
+        """Generate corner targets.
+
+        Including corner heatmap, corner offset.
+
+        Optional: corner embedding, corner guiding shift, centripetal shift.
+
+        For CornerNet, we generate corner heatmap, corner offset and corner
+        embedding from this function.
+
+        For CentripetalNet, we generate corner heatmap, corner offset, guiding
+        shift and centripetal shift from this function.
+
+        Args:
+            gt_bboxes (list[Tensor]): Ground truth bboxes of each image, each
+                has shape (num_gt, 4).
+            gt_labels (list[Tensor]): Ground truth labels of each box, each has
+                shape (num_gt,).
+            feat_shape (list[int]): Shape of output feature,
+                [batch, channel, height, width].
+            img_shape (list[int]): Shape of input image,
+                [height, width, channel].
+            with_corner_emb (bool): Generate corner embedding target or not.
+                Default: False.
+            with_guiding_shift (bool): Generate guiding shift target or not.
+                Default: False.
+            with_centripetal_shift (bool): Generate centripetal shift target or
+                not. Default: False.
+
+        Returns:
+            dict: Ground truth of corner heatmap, corner offset, corner
+            embedding, guiding shift and centripetal shift. Containing the
+            following keys:
+
+                - topleft_heatmap (Tensor): Ground truth top-left corner
+                  heatmap.
+                - bottomright_heatmap (Tensor): Ground truth bottom-right
+                  corner heatmap.
+                - topleft_offset (Tensor): Ground truth top-left corner offset.
+                - bottomright_offset (Tensor): Ground truth bottom-right corner
+                  offset.
+                - corner_embedding (list[list[list[list[int]]]]): Ground truth
+                  corner embedding. Optional.
+                - topleft_guiding_shift (Tensor): Ground truth top-left corner
+                  guiding shift. Optional.
+                - bottomright_guiding_shift (Tensor): Ground truth bottom-right
+                  corner guiding shift. Optional.
+                - topleft_centripetal_shift (Tensor): Ground truth top-left
+                  corner centripetal shift. Optional.
+                - bottomright_centripetal_shift (Tensor): Ground truth
+                  bottom-right corner centripetal shift. Optional.
+        """
+        batch_size, _, height, width = feat_shape
+        img_h, img_w = img_shape[:2]
+
+        width_ratio = float(width / img_w)
+        height_ratio = float(height / img_h)
+
+        gt_tl_heatmap = gt_bboxes[-1].new_zeros(
+            [batch_size, self.num_classes, height, width])
+        gt_br_heatmap = gt_bboxes[-1].new_zeros(
+            [batch_size, self.num_classes, height, width])
+        gt_tl_offset = gt_bboxes[-1].new_zeros([batch_size, 2, height, width])
+        gt_br_offset = gt_bboxes[-1].new_zeros([batch_size, 2, height, width])
+
+        if with_corner_emb:
+            match = []
+
+        # Guiding shift is the offset between a corner and the box center
+        if with_guiding_shift:
+            gt_tl_guiding_shift = gt_bboxes[-1].new_zeros(
+                [batch_size, 2, height, width])
+            gt_br_guiding_shift = gt_bboxes[-1].new_zeros(
+                [batch_size, 2, height, width])
+        # Centripetal shift is also a corner/center offset, computed from the
+        # unrounded corner coordinates and normalized by log.
+        if with_centripetal_shift:
+            gt_tl_centripetal_shift = gt_bboxes[-1].new_zeros(
+                [batch_size, 2, height, width])
+            gt_br_centripetal_shift = gt_bboxes[-1].new_zeros(
+                [batch_size, 2, height, width])
+
+        for batch_id in range(batch_size):
+            # Ground truth of corner embedding per image is a list of coord set
+            corner_match = []
+            for box_id in range(len(gt_labels[batch_id])):
+                left, top, right, bottom = gt_bboxes[batch_id][box_id]
+                center_x = (left + right) / 2.0
+                center_y = (top + bottom) / 2.0
+                label = gt_labels[batch_id][box_id]
+
+                # Use coords in the feature level to generate ground truth
+                scale_left = left * width_ratio
+                scale_right = right * width_ratio
+                scale_top = top * height_ratio
+                scale_bottom = bottom * height_ratio
+                scale_center_x = center_x * width_ratio
+                scale_center_y = center_y * height_ratio
+
+                # Int coords on the feature map, clamped to the last row/column
+                left_idx = int(min(scale_left, width - 1))
+                right_idx = int(min(scale_right, width - 1))
+                top_idx = int(min(scale_top, height - 1))
+                bottom_idx = int(min(scale_bottom, height - 1))
+
+                # Generate gaussian heatmap
+                scale_box_width = ceil(scale_right - scale_left)
+                scale_box_height = ceil(scale_bottom - scale_top)
+                radius = gaussian_radius((scale_box_height, scale_box_width),
+                                         min_overlap=0.3)
+                radius = max(0, int(radius))
+                gt_tl_heatmap[batch_id, label] = gen_gaussian_target(
+                    gt_tl_heatmap[batch_id, label], [left_idx, top_idx],
+                    radius)
+                gt_br_heatmap[batch_id, label] = gen_gaussian_target(
+                    gt_br_heatmap[batch_id, label], [right_idx, bottom_idx],
+                    radius)
+
+                # Generate corner offset
+                left_offset = scale_left - left_idx
+                top_offset = scale_top - top_idx
+                right_offset = scale_right - right_idx
+                bottom_offset = scale_bottom - bottom_idx
+                gt_tl_offset[batch_id, 0, top_idx, left_idx] = left_offset
+                gt_tl_offset[batch_id, 1, top_idx, left_idx] = top_offset
+                gt_br_offset[batch_id, 0, bottom_idx, right_idx] = right_offset
+                gt_br_offset[batch_id, 1, bottom_idx,
+                             right_idx] = bottom_offset
+
+                # Generate corner embedding
+                if with_corner_emb:
+                    corner_match.append([[top_idx, left_idx],
+                                         [bottom_idx, right_idx]])
+                # Generate guiding shift
+                if with_guiding_shift:
+                    gt_tl_guiding_shift[batch_id, 0, top_idx,
+                                        left_idx] = scale_center_x - left_idx
+                    gt_tl_guiding_shift[batch_id, 1, top_idx,
+                                        left_idx] = scale_center_y - top_idx
+                    gt_br_guiding_shift[batch_id, 0, bottom_idx,
+                                        right_idx] = right_idx - scale_center_x
+                    gt_br_guiding_shift[
+                        batch_id, 1, bottom_idx,
+                        right_idx] = bottom_idx - scale_center_y
+                # Generate centripetal shift (log needs a positive box extent)
+                if with_centripetal_shift:
+                    gt_tl_centripetal_shift[batch_id, 0, top_idx,
+                                            left_idx] = log(scale_center_x -
+                                                            scale_left)
+                    gt_tl_centripetal_shift[batch_id, 1, top_idx,
+                                            left_idx] = log(scale_center_y -
+                                                            scale_top)
+                    gt_br_centripetal_shift[batch_id, 0, bottom_idx,
+                                            right_idx] = log(scale_right -
+                                                             scale_center_x)
+                    gt_br_centripetal_shift[batch_id, 1, bottom_idx,
+                                            right_idx] = log(scale_bottom -
+                                                             scale_center_y)
+
+            if with_corner_emb:
+                match.append(corner_match)
+
+        target_result = dict(
+            topleft_heatmap=gt_tl_heatmap,
+            topleft_offset=gt_tl_offset,
+            bottomright_heatmap=gt_br_heatmap,
+            bottomright_offset=gt_br_offset)
+
+        if with_corner_emb:
+            target_result.update(corner_embedding=match)
+        if with_guiding_shift:
+            target_result.update(
+                topleft_guiding_shift=gt_tl_guiding_shift,
+                bottomright_guiding_shift=gt_br_guiding_shift)
+        if with_centripetal_shift:
+            target_result.update(
+                topleft_centripetal_shift=gt_tl_centripetal_shift,
+                bottomright_centripetal_shift=gt_br_centripetal_shift)
+
+        return target_result
+
+    @force_fp32()
+    def loss(self,
+             tl_heats,
+             br_heats,
+             tl_embs,
+             br_embs,
+             tl_offs,
+             br_offs,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute losses of the head.
+
+        Args:
+            tl_heats (list[Tensor]): Top-left corner heatmaps for each level
+                with shape (N, num_classes, H, W).
+            br_heats (list[Tensor]): Bottom-right corner heatmaps for each
+                level with shape (N, num_classes, H, W).
+            tl_embs (list[Tensor]): Top-left corner embeddings for each level
+                with shape (N, corner_emb_channels, H, W).
+            br_embs (list[Tensor]): Bottom-right corner embeddings for each
+                level with shape (N, corner_emb_channels, H, W).
+            tl_offs (list[Tensor]): Top-left corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            br_offs (list[Tensor]): Bottom-right corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [left, top, right, bottom] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (list[Tensor] | None): Boxes to be ignored when
+                computing the loss. Not used by this method.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components, each holding
+            the losses of all feature levels:
+
+                - det_loss (list[Tensor]): Corner keypoint losses.
+                - off_loss (list[Tensor]): Corner offset losses.
+                - pull_loss (list[Tensor]): Part one of AssociativeEmbedding
+                  losses. Only present if `self.with_corner_emb` is True.
+                - push_loss (list[Tensor]): Part two of AssociativeEmbedding
+                  losses. Only present if `self.with_corner_emb` is True.
+        """
+        # Targets are built once and shared by all feature levels.
+        feat_shape = tl_heats[-1].shape
+        pad_shape = img_metas[0]['pad_shape']
+        targets = self.get_targets(
+            gt_bboxes, gt_labels, feat_shape, pad_shape,
+            with_corner_emb=self.with_corner_emb)
+        mlvl_targets = [targets] * self.num_feat_levels
+        mlvl_losses = multi_apply(
+            self.loss_single, tl_heats, br_heats, tl_embs, br_embs, tl_offs,
+            br_offs, mlvl_targets)
+        det_losses, pull_losses, push_losses, off_losses = mlvl_losses
+        loss_dict = dict(det_loss=det_losses, off_loss=off_losses)
+        if self.with_corner_emb:
+            loss_dict['pull_loss'] = pull_losses
+            loss_dict['push_loss'] = push_losses
+        return loss_dict
+
+    def loss_single(self, tl_hmp, br_hmp, tl_emb, br_emb, tl_off, br_off,
+                    targets):
+        """Compute losses for single level.
+
+        Args:
+            tl_hmp (Tensor): Top-left corner heatmap for current level with
+                shape (N, num_classes, H, W).
+            br_hmp (Tensor): Bottom-right corner heatmap for current level with
+                shape (N, num_classes, H, W).
+            tl_emb (Tensor): Top-left corner embedding for current level with
+                shape (N, corner_emb_channels, H, W).
+            br_emb (Tensor): Bottom-right corner embedding for current level
+                with shape (N, corner_emb_channels, H, W).
+            tl_off (Tensor): Top-left corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            br_off (Tensor): Bottom-right corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            targets (dict): Corner target generated by `get_targets`.
+
+        Returns:
+            tuple[torch.Tensor]: Losses of the head's different branches
+            containing the following losses:
+
+                - det_loss (Tensor): Corner keypoint loss.
+                - pull_loss (Tensor): Part one of AssociativeEmbedding loss.
+                - push_loss (Tensor): Part two of AssociativeEmbedding loss.
+                - off_loss (Tensor): Corner offset loss.
+        """
+        gt_tl_hmp = targets['topleft_heatmap']
+        gt_br_hmp = targets['bottomright_heatmap']
+        gt_tl_off = targets['topleft_offset']
+        gt_br_off = targets['bottomright_offset']
+        gt_embedding = targets['corner_embedding']
+
+        # Detection loss
+        tl_det_loss = self.loss_heatmap(
+            tl_hmp.sigmoid(),
+            gt_tl_hmp,
+            avg_factor=max(1,
+                           gt_tl_hmp.eq(1).sum()))
+        br_det_loss = self.loss_heatmap(
+            br_hmp.sigmoid(),
+            gt_br_hmp,
+            avg_factor=max(1,
+                           gt_br_hmp.eq(1).sum()))
+        det_loss = (tl_det_loss + br_det_loss) / 2.0
+
+        # AssociativeEmbedding loss
+        if self.with_corner_emb and self.loss_embedding is not None:
+            pull_loss, push_loss = self.loss_embedding(tl_emb, br_emb,
+                                                       gt_embedding)
+        else:
+            pull_loss, push_loss = None, None
+
+        # Offset loss
+        # We only compute the offset loss at the real corner position.
+        # The value of real corner would be 1 in heatmap ground truth.
+        # The mask is computed in class agnostic mode and its shape is
+        # batch * 1 * width * height.
+        tl_off_mask = gt_tl_hmp.eq(1).sum(1).gt(0).unsqueeze(1).type_as(
+            gt_tl_hmp)
+        br_off_mask = gt_br_hmp.eq(1).sum(1).gt(0).unsqueeze(1).type_as(
+            gt_br_hmp)
+        tl_off_loss = self.loss_offset(
+            tl_off,
+            gt_tl_off,
+            tl_off_mask,
+            avg_factor=max(1, tl_off_mask.sum()))
+        br_off_loss = self.loss_offset(
+            br_off,
+            gt_br_off,
+            br_off_mask,
+            avg_factor=max(1, br_off_mask.sum()))
+
+        off_loss = (tl_off_loss + br_off_loss) / 2.0
+
+        return det_loss, pull_loss, push_loss, off_loss
+
+    @force_fp32()
+    def get_bboxes(self,
+                   tl_heats,
+                   br_heats,
+                   tl_embs,
+                   br_embs,
+                   tl_offs,
+                   br_offs,
+                   img_metas,
+                   rescale=False,
+                   with_nms=True):
+        """Transform network output for a batch into bbox predictions.
+
+        Args:
+            tl_heats (list[Tensor]): Top-left corner heatmaps for each level
+                with shape (N, num_classes, H, W).
+            br_heats (list[Tensor]): Bottom-right corner heatmaps for each
+                level with shape (N, num_classes, H, W).
+            tl_embs (list[Tensor]): Top-left corner embeddings for each level
+                with shape (N, corner_emb_channels, H, W).
+            br_embs (list[Tensor]): Bottom-right corner embeddings for each
+                level with shape (N, corner_emb_channels, H, W).
+            tl_offs (list[Tensor]): Top-left corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            br_offs (list[Tensor]): Bottom-right corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+        """
+        assert tl_heats[-1].shape[0] == br_heats[-1].shape[0] == len(img_metas)
+        result_list = []
+        for img_id in range(len(img_metas)):
+            result_list.append(
+                self._get_bboxes_single(
+                    tl_heats[-1][img_id:img_id + 1, :],
+                    br_heats[-1][img_id:img_id + 1, :],
+                    tl_offs[-1][img_id:img_id + 1, :],
+                    br_offs[-1][img_id:img_id + 1, :],
+                    img_metas[img_id],
+                    tl_emb=tl_embs[-1][img_id:img_id + 1, :],
+                    br_emb=br_embs[-1][img_id:img_id + 1, :],
+                    rescale=rescale,
+                    with_nms=with_nms))
+
+        return result_list
+
+    def _get_bboxes_single(self,
+                           tl_heat,
+                           br_heat,
+                           tl_off,
+                           br_off,
+                           img_meta,
+                           tl_emb=None,
+                           br_emb=None,
+                           tl_centripetal_shift=None,
+                           br_centripetal_shift=None,
+                           rescale=False,
+                           with_nms=True):
+        """Transform outputs for a single batch item into bbox predictions.
+
+        Args:
+            tl_heat (Tensor): Top-left corner heatmap for current level with
+                shape (N, num_classes, H, W).
+            br_heat (Tensor): Bottom-right corner heatmap for current level
+                with shape (N, num_classes, H, W).
+            tl_off (Tensor): Top-left corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            br_off (Tensor): Bottom-right corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            img_meta (dict): Meta information of current image, e.g.,
+                image size, scaling factor, etc.
+            tl_emb (Tensor): Top-left corner embedding for current level with
+                shape (N, corner_emb_channels, H, W).
+            br_emb (Tensor): Bottom-right corner embedding for current level
+                with shape (N, corner_emb_channels, H, W).
+            tl_centripetal_shift: Top-left corner's centripetal shift for
+                current level with shape (N, 2, H, W).
+            br_centripetal_shift: Bottom-right corner's centripetal shift for
+                current level with shape (N, 2, H, W).
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+        """
+        if isinstance(img_meta, (list, tuple)):
+            img_meta = img_meta[0]
+
+        batch_bboxes, batch_scores, batch_clses = self.decode_heatmap(
+            tl_heat=tl_heat.sigmoid(),
+            br_heat=br_heat.sigmoid(),
+            tl_off=tl_off,
+            br_off=br_off,
+            tl_emb=tl_emb,
+            br_emb=br_emb,
+            tl_centripetal_shift=tl_centripetal_shift,
+            br_centripetal_shift=br_centripetal_shift,
+            img_meta=img_meta,
+            k=self.test_cfg.corner_topk,
+            kernel=self.test_cfg.local_maximum_kernel,
+            distance_threshold=self.test_cfg.distance_threshold)
+
+        if rescale:
+            batch_bboxes /= batch_bboxes.new_tensor(img_meta['scale_factor'])
+
+        bboxes = batch_bboxes.view([-1, 4])
+        scores = batch_scores.view(-1)
+        clses = batch_clses.view(-1)
+
+        detections = torch.cat([bboxes, scores.unsqueeze(-1)], -1)
+        keepinds = (detections[:, -1] > -0.1)
+        detections = detections[keepinds]
+        labels = clses[keepinds]
+
+        if with_nms:
+            detections, labels = self._bboxes_nms(detections, labels,
+                                                  self.test_cfg)
+
+        return detections, labels
+
+    def _bboxes_nms(self, bboxes, labels, cfg):
+        if 'nms_cfg' in cfg:
+            warning.warn('nms_cfg in test_cfg will be deprecated. '
+                         'Please rename it as nms')
+        if 'nms' not in cfg:
+            cfg.nms = cfg.nms_cfg
+
+        if labels.numel() > 0:
+            max_num = cfg.max_per_img
+            bboxes, keep = batched_nms(bboxes[:, :4], bboxes[:,
+                                                             -1].contiguous(),
+                                       labels, cfg.nms)
+            if max_num > 0:
+                bboxes = bboxes[:max_num]
+                labels = labels[keep][:max_num]
+
+        return bboxes, labels
+
+    def decode_heatmap(self,
+                       tl_heat,
+                       br_heat,
+                       tl_off,
+                       br_off,
+                       tl_emb=None,
+                       br_emb=None,
+                       tl_centripetal_shift=None,
+                       br_centripetal_shift=None,
+                       img_meta=None,
+                       k=100,
+                       kernel=3,
+                       distance_threshold=0.5,
+                       num_dets=1000):
+        """Transform outputs for a single batch item into raw bbox predictions.
+
+        Args:
+            tl_heat (Tensor): Top-left corner heatmap for current level with
+                shape (N, num_classes, H, W).
+            br_heat (Tensor): Bottom-right corner heatmap for current level
+                with shape (N, num_classes, H, W).
+            tl_off (Tensor): Top-left corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            br_off (Tensor): Bottom-right corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            tl_emb (Tensor | None): Top-left corner embedding for current
+                level with shape (N, corner_emb_channels, H, W).
+            br_emb (Tensor | None): Bottom-right corner embedding for current
+                level with shape (N, corner_emb_channels, H, W).
+            tl_centripetal_shift (Tensor | None): Top-left centripetal shift
+                for current level with shape (N, 2, H, W).
+            br_centripetal_shift (Tensor | None): Bottom-right centripetal
+                shift for current level with shape (N, 2, H, W).
+            img_meta (dict): Meta information of current image, e.g.,
+                image size, scaling factor, etc.
+            k (int): Get top k corner keypoints from heatmap.
+            kernel (int): Max pooling kernel for extract local maximum pixels.
+            distance_threshold (float): Distance threshold. Top-left and
+                bottom-right corner keypoints with feature distance less than
+                the threshold will be regarded as keypoints from same object.
+            num_dets (int): Num of raw boxes before doing nms.
+
+        Returns:
+            tuple[torch.Tensor]: Decoded output of CornerHead, containing the
+            following Tensors:
+
+            - bboxes (Tensor): Coords of each box.
+            - scores (Tensor): Scores of each box.
+            - clses (Tensor): Categories of each box.
+        """
+        with_embedding = tl_emb is not None and br_emb is not None
+        with_centripetal_shift = (
+            tl_centripetal_shift is not None
+            and br_centripetal_shift is not None)
+        assert with_embedding + with_centripetal_shift == 1
+        batch, _, height, width = tl_heat.size()
+        if torch.onnx.is_in_onnx_export():
+            inp_h, inp_w = img_meta['pad_shape_for_onnx'][:2]
+        else:
+            inp_h, inp_w, _ = img_meta['pad_shape']
+
+        # perform nms on heatmaps
+        tl_heat = get_local_maximum(tl_heat, kernel=kernel)
+        br_heat = get_local_maximum(br_heat, kernel=kernel)
+
+        tl_scores, tl_inds, tl_clses, tl_ys, tl_xs = get_topk_from_heatmap(
+            tl_heat, k=k)
+        br_scores, br_inds, br_clses, br_ys, br_xs = get_topk_from_heatmap(
+            br_heat, k=k)
+
+        # We use repeat instead of expand here because expand is a
+        # shallow-copy function. Thus it could cause unexpected testing result
+        # sometimes. Using expand will decrease about 10% mAP during testing
+        # compared to repeat.
+        tl_ys = tl_ys.view(batch, k, 1).repeat(1, 1, k)
+        tl_xs = tl_xs.view(batch, k, 1).repeat(1, 1, k)
+        br_ys = br_ys.view(batch, 1, k).repeat(1, k, 1)
+        br_xs = br_xs.view(batch, 1, k).repeat(1, k, 1)
+
+        tl_off = transpose_and_gather_feat(tl_off, tl_inds)
+        tl_off = tl_off.view(batch, k, 1, 2)
+        br_off = transpose_and_gather_feat(br_off, br_inds)
+        br_off = br_off.view(batch, 1, k, 2)
+
+        tl_xs = tl_xs + tl_off[..., 0]
+        tl_ys = tl_ys + tl_off[..., 1]
+        br_xs = br_xs + br_off[..., 0]
+        br_ys = br_ys + br_off[..., 1]
+
+        if with_centripetal_shift:
+            tl_centripetal_shift = transpose_and_gather_feat(
+                tl_centripetal_shift, tl_inds).view(batch, k, 1, 2).exp()
+            br_centripetal_shift = transpose_and_gather_feat(
+                br_centripetal_shift, br_inds).view(batch, 1, k, 2).exp()
+
+            tl_ctxs = tl_xs + tl_centripetal_shift[..., 0]
+            tl_ctys = tl_ys + tl_centripetal_shift[..., 1]
+            br_ctxs = br_xs - br_centripetal_shift[..., 0]
+            br_ctys = br_ys - br_centripetal_shift[..., 1]
+
+        # all possible boxes based on top k corners (ignoring class)
+        tl_xs *= (inp_w / width)
+        tl_ys *= (inp_h / height)
+        br_xs *= (inp_w / width)
+        br_ys *= (inp_h / height)
+
+        if with_centripetal_shift:
+            tl_ctxs *= (inp_w / width)
+            tl_ctys *= (inp_h / height)
+            br_ctxs *= (inp_w / width)
+            br_ctys *= (inp_h / height)
+
+        x_off, y_off = 0, 0  # no crop
+        if not torch.onnx.is_in_onnx_export():
+            # since `RandomCenterCropPad` is done on CPU with numpy and it's
+            # not dynamic traceable when exporting to ONNX, thus 'border'
+            # does not appears as key in 'img_meta'. As a tmp solution,
+            # we move this 'border' handle part to the postprocess after
+            # finished exporting to ONNX, which is handle in
+            # `mmdet/core/export/model_wrappers.py`. Though difference between
+            # pytorch and exported onnx model, it might be ignored since
+            # comparable performance is achieved between them (e.g. 40.4 vs
+            # 40.6 on COCO val2017, for CornerNet without test-time flip)
+            if 'border' in img_meta:
+                x_off = img_meta['border'][2]
+                y_off = img_meta['border'][0]
+
+        tl_xs -= x_off
+        tl_ys -= y_off
+        br_xs -= x_off
+        br_ys -= y_off
+
+        zeros = tl_xs.new_zeros(*tl_xs.size())
+        tl_xs = torch.where(tl_xs > 0.0, tl_xs, zeros)
+        tl_ys = torch.where(tl_ys > 0.0, tl_ys, zeros)
+        br_xs = torch.where(br_xs > 0.0, br_xs, zeros)
+        br_ys = torch.where(br_ys > 0.0, br_ys, zeros)
+
+        bboxes = torch.stack((tl_xs, tl_ys, br_xs, br_ys), dim=3)
+        area_bboxes = ((br_xs - tl_xs) * (br_ys - tl_ys)).abs()
+
+        if with_centripetal_shift:
+            tl_ctxs -= x_off
+            tl_ctys -= y_off
+            br_ctxs -= x_off
+            br_ctys -= y_off
+
+            tl_ctxs *= tl_ctxs.gt(0.0).type_as(tl_ctxs)
+            tl_ctys *= tl_ctys.gt(0.0).type_as(tl_ctys)
+            br_ctxs *= br_ctxs.gt(0.0).type_as(br_ctxs)
+            br_ctys *= br_ctys.gt(0.0).type_as(br_ctys)
+
+            ct_bboxes = torch.stack((tl_ctxs, tl_ctys, br_ctxs, br_ctys),
+                                    dim=3)
+            area_ct_bboxes = ((br_ctxs - tl_ctxs) * (br_ctys - tl_ctys)).abs()
+
+            rcentral = torch.zeros_like(ct_bboxes)
+            # magic nums from paper section 4.1
+            mu = torch.ones_like(area_bboxes) / 2.4
+            mu[area_bboxes > 3500] = 1 / 2.1  # large bbox have smaller mu
+
+            bboxes_center_x = (bboxes[..., 0] + bboxes[..., 2]) / 2
+            bboxes_center_y = (bboxes[..., 1] + bboxes[..., 3]) / 2
+            rcentral[..., 0] = bboxes_center_x - mu * (bboxes[..., 2] -
+                                                       bboxes[..., 0]) / 2
+            rcentral[..., 1] = bboxes_center_y - mu * (bboxes[..., 3] -
+                                                       bboxes[..., 1]) / 2
+            rcentral[..., 2] = bboxes_center_x + mu * (bboxes[..., 2] -
+                                                       bboxes[..., 0]) / 2
+            rcentral[..., 3] = bboxes_center_y + mu * (bboxes[..., 3] -
+                                                       bboxes[..., 1]) / 2
+            area_rcentral = ((rcentral[..., 2] - rcentral[..., 0]) *
+                             (rcentral[..., 3] - rcentral[..., 1])).abs()
+            dists = area_ct_bboxes / area_rcentral
+
+            tl_ctx_inds = (ct_bboxes[..., 0] <= rcentral[..., 0]) | (
+                ct_bboxes[..., 0] >= rcentral[..., 2])
+            tl_cty_inds = (ct_bboxes[..., 1] <= rcentral[..., 1]) | (
+                ct_bboxes[..., 1] >= rcentral[..., 3])
+            br_ctx_inds = (ct_bboxes[..., 2] <= rcentral[..., 0]) | (
+                ct_bboxes[..., 2] >= rcentral[..., 2])
+            br_cty_inds = (ct_bboxes[..., 3] <= rcentral[..., 1]) | (
+                ct_bboxes[..., 3] >= rcentral[..., 3])
+
+        if with_embedding:
+            tl_emb = transpose_and_gather_feat(tl_emb, tl_inds)
+            tl_emb = tl_emb.view(batch, k, 1)
+            br_emb = transpose_and_gather_feat(br_emb, br_inds)
+            br_emb = br_emb.view(batch, 1, k)
+            dists = torch.abs(tl_emb - br_emb)
+
+        tl_scores = tl_scores.view(batch, k, 1).repeat(1, 1, k)
+        br_scores = br_scores.view(batch, 1, k).repeat(1, k, 1)
+
+        scores = (tl_scores + br_scores) / 2  # scores for all possible boxes
+
+        # tl and br should have same class
+        tl_clses = tl_clses.view(batch, k, 1).repeat(1, 1, k)
+        br_clses = br_clses.view(batch, 1, k).repeat(1, k, 1)
+        cls_inds = (tl_clses != br_clses)
+
+        # reject boxes based on distances
+        dist_inds = dists > distance_threshold
+
+        # reject boxes based on widths and heights
+        width_inds = (br_xs <= tl_xs)
+        height_inds = (br_ys <= tl_ys)
+
+        # No use `scores[cls_inds]`, instead we use `torch.where` here.
+        # Since only 1-D indices with type 'tensor(bool)' are supported
+        # when exporting to ONNX, any other bool indices with more dimensions
+        # (e.g. 2-D bool tensor) as input parameter in node is invalid
+        negative_scores = -1 * torch.ones_like(scores)
+        scores = torch.where(cls_inds, negative_scores, scores)
+        scores = torch.where(width_inds, negative_scores, scores)
+        scores = torch.where(height_inds, negative_scores, scores)
+        scores = torch.where(dist_inds, negative_scores, scores)
+
+        if with_centripetal_shift:
+            scores[tl_ctx_inds] = -1
+            scores[tl_cty_inds] = -1
+            scores[br_ctx_inds] = -1
+            scores[br_cty_inds] = -1
+
+        scores = scores.view(batch, -1)
+        scores, inds = torch.topk(scores, num_dets)
+        scores = scores.unsqueeze(2)
+
+        bboxes = bboxes.view(batch, -1, 4)
+        bboxes = gather_feat(bboxes, inds)
+
+        clses = tl_clses.contiguous().view(batch, -1, 1)
+        clses = gather_feat(clses, inds).float()
+
+        return bboxes, scores, clses
+
+    def onnx_export(self,
+                    tl_heats,
+                    br_heats,
+                    tl_embs,
+                    br_embs,
+                    tl_offs,
+                    br_offs,
+                    img_metas,
+                    rescale=False,
+                    with_nms=True):
+        """Transform network output for a batch into bbox predictions.
+
+        Args:
+            tl_heats (list[Tensor]): Top-left corner heatmaps for each level
+                with shape (N, num_classes, H, W).
+            br_heats (list[Tensor]): Bottom-right corner heatmaps for each
+                level with shape (N, num_classes, H, W).
+            tl_embs (list[Tensor]): Top-left corner embeddings for each level
+                with shape (N, corner_emb_channels, H, W).
+            br_embs (list[Tensor]): Bottom-right corner embeddings for each
+                level with shape (N, corner_emb_channels, H, W).
+            tl_offs (list[Tensor]): Top-left corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            br_offs (list[Tensor]): Bottom-right corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+
+        Returns:
+            tuple[Tensor, Tensor]: First tensor bboxes with shape
+            [N, num_det, 5], 5 arrange as (x1, y1, x2, y2, score)
+            and second element is class labels of shape [N, num_det].
+        """
+        assert tl_heats[-1].shape[0] == br_heats[-1].shape[0] == len(
+            img_metas) == 1
+        result_list = []
+        for img_id in range(len(img_metas)):
+            result_list.append(
+                self._get_bboxes_single(
+                    tl_heats[-1][img_id:img_id + 1, :],
+                    br_heats[-1][img_id:img_id + 1, :],
+                    tl_offs[-1][img_id:img_id + 1, :],
+                    br_offs[-1][img_id:img_id + 1, :],
+                    img_metas[img_id],
+                    tl_emb=tl_embs[-1][img_id:img_id + 1, :],
+                    br_emb=br_embs[-1][img_id:img_id + 1, :],
+                    rescale=rescale,
+                    with_nms=with_nms))
+
+        detections, labels = result_list[0]
+        # batch_size 1 here, [1, num_det, 5], [1, num_det]
+        return detections.unsqueeze(0), labels.unsqueeze(0)
diff --git a/mmdet/models/dense_heads/ddod_head.py b/mmdet/models/dense_heads/ddod_head.py
new file mode 100755
index 0000000..b2ff223
--- /dev/null
+++ b/mmdet/models/dense_heads/ddod_head.py
@@ -0,0 +1,778 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, Scale, bias_init_with_prob, normal_init
+from mmcv.runner import force_fp32
+
+from mmdet.core import (anchor_inside_flags, build_assigner, build_sampler,
+                        images_to_levels, multi_apply, reduce_mean, unmap)
+from mmdet.core.bbox import bbox_overlaps
+from ..builder import HEADS, build_loss
+from .anchor_head import AnchorHead
+
+EPS = 1e-12
+
+
+@HEADS.register_module()
+class DDODHead(AnchorHead):
+    """DDOD head decomposes conjunctions lying in most current one-stage
+    detectors via label assignment disentanglement, spatial feature
+    disentanglement, and pyramid supervision disentanglement.
+
+    https://arxiv.org/abs/2107.02963
+
+    Args:
+        num_classes (int): Number of categories excluding the
+            background category.
+        in_channels (int): Number of channels in the input feature map.
+        stacked_convs (int): The number of stacked Conv. Default: 4.
+        conv_cfg (dict): Conv config of ddod head. Default: None.
+        use_dcn (bool): Use dcn, Same as ATSS when False. Default: True.
+        norm_cfg (dict): Normal config of ddod head. Default:
+            dict(type='GN', num_groups=32, requires_grad=True).
+        loss_iou (dict): Config of IoU loss. Default:
+            dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0).
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 stacked_convs=4,
+                 conv_cfg=None,
+                 use_dcn=True,
+                 norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+                 loss_iou=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 **kwargs):
+        self.stacked_convs = stacked_convs  # convs per cls/reg tower
+        self.conv_cfg = conv_cfg  # used by every tower conv that is not DCN
+        self.norm_cfg = norm_cfg  # norm config of the tower convs
+        self.use_dcn = use_dcn  # first conv of each tower uses DCN if True
+        super(DDODHead, self).__init__(num_classes, in_channels, **kwargs)
+
+        self.sampling = False
+        if self.train_cfg:  # only built when a train_cfg is given
+            self.cls_assigner = build_assigner(self.train_cfg.assigner)
+            self.reg_assigner = build_assigner(self.train_cfg.reg_assigner)
+            sampler_cfg = dict(type='PseudoSampler')
+            self.sampler = build_sampler(sampler_cfg, context=self)
+        self.loss_iou = build_loss(loss_iou)  # built from `loss_iou` config
+
+    def _init_layers(self):
+        """Build the cls/reg conv towers, the prediction convs and scales."""
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):  # both towers share one layout
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=dict(type='DCN', deform_groups=1)
+                    if i == 0 and self.use_dcn else self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+            self.reg_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=dict(type='DCN', deform_groups=1)
+                    if i == 0 and self.use_dcn else self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        self.atss_cls = nn.Conv2d(  # cls_out_channels outputs per prior
+            self.feat_channels,
+            self.num_base_priors * self.cls_out_channels,
+            3,
+            padding=1)
+        self.atss_reg = nn.Conv2d(  # 4 box outputs per prior
+            self.feat_channels, self.num_base_priors * 4, 3, padding=1)
+        self.atss_iou = nn.Conv2d(  # 1 IoU output per prior
+            self.feat_channels, self.num_base_priors * 1, 3, padding=1)
+        self.scales = nn.ModuleList(
+            [Scale(1.0) for _ in self.prior_generator.strides])
+
+        # per-level positive sample counters (one per stride), used in loss
+        self.cls_num_pos_samples_per_level = [
+            0. for _ in range(len(self.prior_generator.strides))
+        ]
+        self.reg_num_pos_samples_per_level = [
+            0. for _ in range(len(self.prior_generator.strides))
+        ]
+
+    def init_weights(self):
+        """Initialize weights of the head."""
+        for m in self.cls_convs:
+            normal_init(m.conv, std=0.01)
+        for m in self.reg_convs:
+            normal_init(m.conv, std=0.01)
+        normal_init(self.atss_reg, std=0.01)
+        normal_init(self.atss_iou, std=0.01)
+        bias_cls = bias_init_with_prob(0.01)
+        normal_init(self.atss_cls, std=0.01, bias=bias_cls)
+
+    def forward(self, feats):
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Per-level outputs of ``forward_single``, grouped by kind:
+                cls_scores (list[Tensor]): Classification scores for all scale
+                    levels, each is a 4D-tensor, the channels number is
+                    num_base_priors * cls_out_channels.
+                bbox_preds (list[Tensor]): Box energies / deltas for all scale
+                    levels, each is a 4D-tensor, the channels number is
+                    num_base_priors * 4.
+                iou_preds (list[Tensor]): IoU scores for all scale levels,
+                    each is a 4D-tensor, the channels number is
+                    num_base_priors * 1.
+        """
+        return multi_apply(self.forward_single, feats, self.scales)
+
+    def forward_single(self, x, scale):
+        """Forward feature of a single scale level.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+            scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+
+        Returns:
+            tuple:
+                - cls_score (Tensor): Cls scores for a single scale level, \
+                    the channels number is num_base_priors * cls_out_channels.
+                - bbox_pred (Tensor): Box energies / deltas for a single \
+                    scale level, the channels number is num_base_priors * 4.
+                - iou_pred (Tensor): IoU scores for a single scale level, \
+                    the channels number is num_base_priors * 1.
+        """
+        cls_feat = x
+        reg_feat = x
+        for cls_conv in self.cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in self.reg_convs:
+            reg_feat = reg_conv(reg_feat)
+        cls_score = self.atss_cls(cls_feat)
+        # we just follow atss and do not apply exp to bbox_pred
+        bbox_pred = scale(self.atss_reg(reg_feat)).float()
+        iou_pred = self.atss_iou(reg_feat)  # shares the reg tower features
+        return cls_score, bbox_pred, iou_pred
+
+    def loss_cls_single(self, cls_score, labels, label_weights,
+                        reweight_factor, num_total_samples):
+        """Compute cls loss of a single scale level.
+
+        Args:
+            cls_score (Tensor): Classification scores of a single scale
+                level with shape (N, num_base_priors * num_classes, H, W).
+            labels (Tensor): Labels of each anchor with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors).
+            reweight_factor (float): Reweight factor of this level, the
+                cls loss is multiplied by it.
+            num_total_samples (float): Number of positive samples that is
+                reduced over all GPUs, used as ``avg_factor``.
+
+        Returns:
+            tuple[Tensor]: A 1-tuple holding the reweighted cls loss.
+        """
+        cls_score = cls_score.permute(0, 2, 3, 1).reshape(
+            -1, self.cls_out_channels).contiguous()
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+        loss_cls = self.loss_cls(
+            cls_score, labels, label_weights, avg_factor=num_total_samples)
+        return reweight_factor * loss_cls,
+
+    def loss_reg_single(self, anchors, bbox_pred, iou_pred, labels,
+                        label_weights, bbox_targets, bbox_weights,
+                        reweight_factor, num_total_samples):
+        """Compute reg and IoU losses of a single scale level.
+
+        Args:
+            anchors (Tensor): Box reference for each scale level with shape
+                (N, num_total_anchors, 4).
+            bbox_pred (Tensor): Box energies / deltas for each scale
+                level with shape (N, num_base_priors * 4, H, W).
+            iou_pred (Tensor): IoU prediction for a single scale level
+                with shape (N, num_base_priors * 1, H, W).
+            labels (Tensor): Labels of each anchor with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors).
+            bbox_targets (Tensor): BBox regression targets of each anchor
+                with shape (N, num_total_anchors, 4).
+            bbox_weights (Tensor): BBox weights of each anchor with shape
+                (N, num_total_anchors, 4).
+            reweight_factor (float): Reweight factor of this level.
+            num_total_samples (float): Number of positive samples that is
+                reduced over all GPUs.
+
+        Returns:
+            tuple[Tensor]: The reweighted bbox loss and IoU loss.
+        """
+        anchors = anchors.reshape(-1, 4)
+        bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
+        iou_pred = iou_pred.permute(0, 2, 3, 1).reshape(-1, )
+        bbox_targets = bbox_targets.reshape(-1, 4)
+        bbox_weights = bbox_weights.reshape(-1, 4)
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+        # the IoU loss only counts anchors whose bbox weights are non-zero
+        iou_targets = label_weights.new_zeros(labels.shape)
+        iou_weights = label_weights.new_zeros(labels.shape)
+        iou_weights[(bbox_weights.sum(axis=1) > 0).nonzero(
+            as_tuple=False)] = 1.
+
+        # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = ((labels >= 0)
+                    &
+                    (labels < bg_class_ind)).nonzero(as_tuple=False).squeeze(1)
+
+        if len(pos_inds) > 0:
+            pos_bbox_targets = bbox_targets[pos_inds]
+            pos_bbox_pred = bbox_pred[pos_inds]
+            pos_anchors = anchors[pos_inds]
+
+            pos_decode_bbox_pred = self.bbox_coder.decode(
+                pos_anchors, pos_bbox_pred)
+            pos_decode_bbox_targets = self.bbox_coder.decode(
+                pos_anchors, pos_bbox_targets)
+
+            # regression loss
+            loss_bbox = self.loss_bbox(
+                pos_decode_bbox_pred,
+                pos_decode_bbox_targets,
+                avg_factor=num_total_samples)
+            # IoU target: overlap between detached prediction and target
+            iou_targets[pos_inds] = bbox_overlaps(
+                pos_decode_bbox_pred.detach(),
+                pos_decode_bbox_targets,
+                is_aligned=True)
+            loss_iou = self.loss_iou(
+                iou_pred,
+                iou_targets,
+                iou_weights,
+                avg_factor=num_total_samples)
+        else:
+            loss_bbox = bbox_pred.sum() * 0
+            loss_iou = iou_pred.sum() * 0
+
+        return reweight_factor * loss_bbox, reweight_factor * loss_iou
+
+    def calc_reweight_factor(self, labels_list):
+        """Compute per-level reweight factors for the cls and reg losses."""
+        # accumulate the positive sample count of each level (kept on self)
+        bg_class_ind = self.num_classes
+        for ii, each_level_label in enumerate(labels_list):
+            pos_inds = ((each_level_label >= 0) &
+                        (each_level_label < bg_class_ind)).nonzero(
+                            as_tuple=False).squeeze(1)
+            self.cls_num_pos_samples_per_level[ii] += len(pos_inds)
+        # linearly map the counts to factors in [1, 2]: fewest positives -> 2
+        min_pos_samples = min(self.cls_num_pos_samples_per_level)
+        max_pos_samples = max(self.cls_num_pos_samples_per_level)
+        interval = 1. / (max_pos_samples - min_pos_samples + 1e-10)
+        reweight_factor_per_level = []
+        for pos_samples in self.cls_num_pos_samples_per_level:
+            factor = 2. - (pos_samples - min_pos_samples) * interval
+            reweight_factor_per_level.append(factor)
+        return reweight_factor_per_level
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'iou_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             iou_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Classification scores for each scale
+                level with shape (N, num_base_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_base_priors * 4, H, W).
+            iou_preds (list[Tensor]): IoU predictions for all scale levels,
+                each is a 4D-tensor, has shape (batch_size, 1, H, W).
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (list[Tensor] | None): specify which bounding
+                boxes can be ignored when computing the loss.
+
+        Returns:
+            dict[str, Tensor] | None: Loss dict, or None if targets are None.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+
+        # calculate common vars for cls and reg assigners at once
+        targets_com = self.process_predictions_and_anchors(
+            anchor_list, valid_flag_list, cls_scores, bbox_preds, img_metas,
+            gt_bboxes_ignore)
+        (anchor_list, valid_flag_list, num_level_anchors_list, cls_score_list,
+         bbox_pred_list, gt_bboxes_ignore_list) = targets_com
+
+        # classification branch assigner
+        cls_targets = self.get_cls_targets(
+            anchor_list,
+            valid_flag_list,
+            num_level_anchors_list,
+            cls_score_list,
+            bbox_pred_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore_list,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels)
+        if cls_targets is None:
+            return None
+
+        (cls_anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, num_total_pos, num_total_neg) = cls_targets
+
+        num_total_samples = reduce_mean(
+            torch.tensor(num_total_pos, dtype=torch.float,
+                         device=device)).item()
+        num_total_samples = max(num_total_samples, 1.0)
+
+        reweight_factor_per_level = self.calc_reweight_factor(labels_list)
+
+        cls_losses_cls, = multi_apply(
+            self.loss_cls_single,
+            cls_scores,
+            labels_list,
+            label_weights_list,
+            reweight_factor_per_level,
+            num_total_samples=num_total_samples)
+
+        # regression branch assigner
+        reg_targets = self.get_reg_targets(
+            anchor_list,
+            valid_flag_list,
+            num_level_anchors_list,
+            cls_score_list,
+            bbox_pred_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore_list,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels)
+        if reg_targets is None:
+            return None
+
+        (reg_anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, num_total_pos, num_total_neg) = reg_targets
+
+        num_total_samples = reduce_mean(
+            torch.tensor(num_total_pos, dtype=torch.float,
+                         device=device)).item()
+        num_total_samples = max(num_total_samples, 1.0)
+        # NOTE: called for both branches, so counts on self accumulate twice
+        reweight_factor_per_level = self.calc_reweight_factor(labels_list)
+
+        reg_losses_bbox, reg_losses_iou = multi_apply(
+            self.loss_reg_single,
+            reg_anchor_list,
+            bbox_preds,
+            iou_preds,
+            labels_list,
+            label_weights_list,
+            bbox_targets_list,
+            bbox_weights_list,
+            reweight_factor_per_level,
+            num_total_samples=num_total_samples)
+
+        return dict(
+            loss_cls=cls_losses_cls,
+            loss_bbox=reg_losses_bbox,
+            loss_iou=reg_losses_iou)
+
+    def process_predictions_and_anchors(self, anchor_list, valid_flag_list,
+                                        cls_scores, bbox_preds, img_metas,
+                                        gt_bboxes_ignore_list):
+        """Compute common vars for regression and classification targets.
+
+        Args:
+            anchor_list (list[list[Tensor]]): Per-level anchors of each image.
+            valid_flag_list (list[list[Tensor]]): Valid flags, same layout.
+            cls_scores (list[Tensor]): Classification scores for all scale
+                levels, each is a 4D-tensor, the channels number is
+                num_base_priors * num_classes.
+            bbox_preds (list[Tensor]): Box energies / deltas for all scale
+                levels, each is a 4D-tensor, the channels number is
+                num_base_priors * 4.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore_list (list[Tensor] | None): specify which bounding
+                boxes can be ignored when computing the loss.
+
+        Returns:
+            tuple[list]: Common target vars, each with one entry per image.
+        """
+        num_imgs = len(img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        num_level_anchors_list = [num_level_anchors] * num_imgs
+
+        anchor_list_ = []
+        valid_flag_list_ = []
+        # concat all level anchors and flags to a single tensor
+        for i in range(num_imgs):
+            assert len(anchor_list[i]) == len(valid_flag_list[i])
+            anchor_list_.append(torch.cat(anchor_list[i]))
+            valid_flag_list_.append(torch.cat(valid_flag_list[i]))
+
+        # default: no ignored boxes for any image
+        if gt_bboxes_ignore_list is None:
+            gt_bboxes_ignore_list = [None for _ in range(num_imgs)]
+
+        num_levels = len(cls_scores)
+        cls_score_list = []
+        bbox_pred_list = []
+
+        mlvl_cls_score_list = [
+            cls_score.permute(0, 2, 3, 1).reshape(
+                num_imgs, -1, self.num_base_priors * self.cls_out_channels)
+            for cls_score in cls_scores
+        ]
+        mlvl_bbox_pred_list = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
+                                                  self.num_base_priors * 4)
+            for bbox_pred in bbox_preds
+        ]
+
+        for i in range(num_imgs):
+            mlvl_cls_tensor_list = [
+                mlvl_cls_score_list[j][i] for j in range(num_levels)
+            ]
+            mlvl_bbox_tensor_list = [
+                mlvl_bbox_pred_list[j][i] for j in range(num_levels)
+            ]
+            cat_mlvl_cls_score = torch.cat(mlvl_cls_tensor_list, dim=0)
+            cat_mlvl_bbox_pred = torch.cat(mlvl_bbox_tensor_list, dim=0)
+            cls_score_list.append(cat_mlvl_cls_score)
+            bbox_pred_list.append(cat_mlvl_bbox_pred)
+        return (anchor_list_, valid_flag_list_, num_level_anchors_list,
+                cls_score_list, bbox_pred_list, gt_bboxes_ignore_list)
+
+    def get_cls_targets(self,
+                        anchor_list,
+                        valid_flag_list,
+                        num_level_anchors_list,
+                        cls_score_list,
+                        bbox_pred_list,
+                        gt_bboxes_list,
+                        img_metas,
+                        gt_bboxes_ignore_list=None,
+                        gt_labels_list=None,
+                        label_channels=1,
+                        unmap_outputs=True):
+        """Get cls targets for DDOD head.
+
+        This method is almost the same as `AnchorHead.get_targets()`.
+        Besides returning the targets as the parent method does,
+        it also returns the anchors as the first element of the
+        returned tuple.
+
+        Args:
+            anchor_list (list[Tensor]): anchors of each image.
+            valid_flag_list (list[Tensor]): Valid flags of each image.
+            num_level_anchors_list (list[list[int]]): Number of anchors of
+                each scale level of all images.
+            cls_score_list (list[Tensor]): Classification scores of each
+                image, concatenated over all scale levels.
+            bbox_pred_list (list[Tensor]): Box energies / deltas of each
+                image, concatenated over all scale levels.
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore_list (list[Tensor] | None): specify which bounding
+                boxes can be ignored when computing the loss.
+            gt_labels_list (list[Tensor]): class indices corresponding to
+                each box.
+            label_channels (int): Channel of label.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple | None: (anchors_list, labels_list, label_weights_list,
+                bbox_targets_list, bbox_weights_list, num_total_pos,
+                num_total_neg), or None if an image has no valid anchors.
+        """
+        (all_anchors, all_labels, all_label_weights, all_bbox_targets,
+         all_bbox_weights, pos_inds_list, neg_inds_list) = multi_apply(
+             self._get_target_single,
+             anchor_list,
+             valid_flag_list,
+             cls_score_list,
+             bbox_pred_list,
+             num_level_anchors_list,
+             gt_bboxes_list,
+             gt_bboxes_ignore_list,
+             gt_labels_list,
+             img_metas,
+             label_channels=label_channels,
+             unmap_outputs=unmap_outputs,
+             is_cls_assigner=True)
+        # no valid anchors
+        if any([labels is None for labels in all_labels]):
+            return None
+        # sampled anchors of all images (each image counts as at least 1)
+        num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list])
+        num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list])
+        # split targets to a list w.r.t. multiple levels
+        anchors_list = images_to_levels(all_anchors, num_level_anchors_list[0])
+        labels_list = images_to_levels(all_labels, num_level_anchors_list[0])
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_anchors_list[0])
+        bbox_targets_list = images_to_levels(all_bbox_targets,
+                                             num_level_anchors_list[0])
+        bbox_weights_list = images_to_levels(all_bbox_weights,
+                                             num_level_anchors_list[0])
+        return (anchors_list, labels_list, label_weights_list,
+                bbox_targets_list, bbox_weights_list, num_total_pos,
+                num_total_neg)
+
+    def get_reg_targets(self,
+                        anchor_list,
+                        valid_flag_list,
+                        num_level_anchors_list,
+                        cls_score_list,
+                        bbox_pred_list,
+                        gt_bboxes_list,
+                        img_metas,
+                        gt_bboxes_ignore_list=None,
+                        gt_labels_list=None,
+                        label_channels=1,
+                        unmap_outputs=True):
+        """Get reg targets for DDOD head.
+
+        Same as `get_cls_targets()` except that `_get_target_single` is
+        called with ``is_cls_assigner=False``. Besides returning the targets
+        as `AnchorHead.get_targets()` does, it also returns the anchors as
+        the first element of the returned tuple.
+
+        Args:
+            anchor_list (list[Tensor]): anchors of each image.
+            valid_flag_list (list[Tensor]): Valid flags of each image.
+            num_level_anchors_list (list[list[int]]): Number of anchors of
+                each scale level of all images.
+            cls_score_list (list[Tensor]): Per-image cls scores of all levels.
+            bbox_pred_list (list[Tensor]): Per-image bbox deltas of all levels.
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore_list (list[Tensor] | None): specify which bounding
+                boxes can be ignored when computing the loss.
+            gt_labels_list (list[Tensor]): class indices of each box.
+            label_channels (int): Channel of label.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple | None: Per-level reg targets, None if no valid anchors.
+        """
+        (all_anchors, all_labels, all_label_weights, all_bbox_targets,
+         all_bbox_weights, pos_inds_list, neg_inds_list) = multi_apply(
+             self._get_target_single,
+             anchor_list,
+             valid_flag_list,
+             cls_score_list,
+             bbox_pred_list,
+             num_level_anchors_list,
+             gt_bboxes_list,
+             gt_bboxes_ignore_list,
+             gt_labels_list,
+             img_metas,
+             label_channels=label_channels,
+             unmap_outputs=unmap_outputs,
+             is_cls_assigner=False)
+        # no valid anchors
+        if any([labels is None for labels in all_labels]):
+            return None
+        # sampled anchors of all images (each image counts as at least 1)
+        num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list])
+        num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list])
+        # split targets to a list w.r.t. multiple levels
+        anchors_list = images_to_levels(all_anchors, num_level_anchors_list[0])
+        labels_list = images_to_levels(all_labels, num_level_anchors_list[0])
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_anchors_list[0])
+        bbox_targets_list = images_to_levels(all_bbox_targets,
+                                             num_level_anchors_list[0])
+        bbox_weights_list = images_to_levels(all_bbox_weights,
+                                             num_level_anchors_list[0])
+        return (anchors_list, labels_list, label_weights_list,
+                bbox_targets_list, bbox_weights_list, num_total_pos,
+                num_total_neg)
+
+    def _get_target_single(self,
+                           flat_anchors,
+                           valid_flags,
+                           cls_scores,
+                           bbox_preds,
+                           num_level_anchors,
+                           gt_bboxes,
+                           gt_bboxes_ignore,
+                           gt_labels,
+                           img_meta,
+                           label_channels=1,
+                           unmap_outputs=True,
+                           is_cls_assigner=True):
+        """Compute regression, classification targets for anchors in a single
+        image.
+
+        Args:
+            flat_anchors (Tensor): Multi-level anchors of the image,
+                which are concatenated into a single tensor of shape
+                (num_total_anchors, 4).
+            valid_flags (Tensor): Multi level valid flags of the image,
+                which are concatenated into a single tensor of
+                shape (num_total_anchors,).
+            cls_scores (Tensor): Classification scores for all scale
+                levels of the image.
+            bbox_preds (Tensor): Box energies / deltas for all scale
+                levels of the image.
+            num_level_anchors (list[int]): Number of anchors of each
+                scale level.
+            gt_bboxes (Tensor): Ground truth bboxes of the image,
+                shape (num_gts, 4).
+            gt_bboxes_ignore (Tensor): Ground truth bboxes to be
+                ignored, shape (num_ignored_gts, ).
+            gt_labels (Tensor): Ground truth labels of each box,
+                shape (num_gts, ).
+            img_meta (dict): Meta info of the image.
+            label_channels (int): Channel of label. Default: 1.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors. Default: True.
+            is_cls_assigner (bool): Classification or regression.
+                Default: True.
+
+        Returns:
+            tuple: Seven ``None`` if no anchor passes `anchor_inside_flags`.
+                Otherwise, N is the number of total anchors in the image.
+                - anchors (Tensor): Anchors of the image with shape (N, 4).
+                - labels (Tensor): Labels of all anchors with shape (N, ).
+                - label_weights (Tensor): Label weights with shape (N, ).
+                - bbox_targets (Tensor): BBox targets of all anchors in the \
+                    image with shape (N, 4).
+                - bbox_weights (Tensor): BBox weights of all anchors in the \
+                    image with shape (N, 4)
+                - pos_inds (Tensor): Indices of positive anchor with shape \
+                    (num_pos, ).
+                - neg_inds (Tensor): Indices of negative anchor with shape \
+                    (num_neg, ).
+        """
+        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                           img_meta['img_shape'][:2],
+                                           self.train_cfg.allowed_border)
+        if not inside_flags.any():
+            return (None, ) * 7
+        # assign gt and sample anchors
+        anchors = flat_anchors[inside_flags, :]
+
+        num_level_anchors_inside = self.get_num_level_anchors_inside(
+            num_level_anchors, inside_flags)
+        bbox_preds_valid = bbox_preds[inside_flags, :]
+        cls_scores_valid = cls_scores[inside_flags, :]
+
+        assigner = self.cls_assigner if is_cls_assigner else self.reg_assigner
+
+        # decode the predictions to boxes before handing them to the assigner
+        bbox_preds_valid = self.bbox_coder.decode(anchors, bbox_preds_valid)
+        assign_result = assigner.assign(anchors, num_level_anchors_inside,
+                                        gt_bboxes, gt_bboxes_ignore, gt_labels,
+                                        cls_scores_valid, bbox_preds_valid)
+        sampling_result = self.sampler.sample(assign_result, anchors,
+                                              gt_bboxes)
+
+        num_valid_anchors = anchors.shape[0]
+        bbox_targets = torch.zeros_like(anchors)
+        bbox_weights = torch.zeros_like(anchors)
+        labels = anchors.new_full((num_valid_anchors, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            if hasattr(self, 'bbox_coder'):
+                pos_bbox_targets = self.bbox_coder.encode(
+                    sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
+            else:
+                # used in VFNetHead
+                pos_bbox_targets = sampling_result.pos_gt_bboxes
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1.0
+            if gt_labels is None:
+                # Only rpn gives gt_labels as None
+                # Foreground is the first class since v2.5.0
+                labels[pos_inds] = 0
+            else:
+                labels[pos_inds] = gt_labels[
+                    sampling_result.pos_assigned_gt_inds]
+            if self.train_cfg.pos_weight <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg.pos_weight
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_anchors.size(0)
+            anchors = unmap(anchors, num_total_anchors, inside_flags)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags, fill=self.num_classes)
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+            bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
+            bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
+
+        return (anchors, labels, label_weights, bbox_targets, bbox_weights,
+                pos_inds, neg_inds)
+
+    def get_num_level_anchors_inside(self, num_level_anchors, inside_flags):
+        """Count the inside anchors of each scale level.
+
+        Args:
+            num_level_anchors (list[int]): Number of anchors of each
+                scale level.
+            inside_flags (Tensor): Multi level inside flags of the image,
+                which are concatenated into a single tensor of
+                shape (sum(num_level_anchors), ).
+
+        Returns:
+            list[int]: Number of anchors of each scale level inside.
+        """
+        split_inside_flags = torch.split(inside_flags, num_level_anchors)
+        num_level_anchors_inside = [
+            int(flags.sum()) for flags in split_inside_flags
+        ]
+        return num_level_anchors_inside
diff --git a/mmdet/models/dense_heads/deformable_detr_head.py b/mmdet/models/dense_heads/deformable_detr_head.py
new file mode 100755
index 0000000..31290db
--- /dev/null
+++ b/mmdet/models/dense_heads/deformable_detr_head.py
@@ -0,0 +1,318 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Linear, bias_init_with_prob, constant_init
+from mmcv.runner import force_fp32
+
+from mmdet.core import multi_apply
+from mmdet.models.utils.transformer import inverse_sigmoid
+from ..builder import HEADS
+from .detr_head import DETRHead
+
+
+@HEADS.register_module()
+class DeformableDETRHead(DETRHead):
+    """Head of DeformDETR: Deformable DETR: Deformable Transformers for End-to-
+    End Object Detection.
+
+    Code is modified from the `official github repo
+    <https://github.com/fundamentalvision/Deformable-DETR>`_.
+
+    More details can be found in the `paper
+    <https://arxiv.org/abs/2010.04159>`_ .
+
+    Args:
+        with_box_refine (bool): Whether to refine the reference points
+            in the decoder. Defaults to False.
+        as_two_stage (bool) : Whether to generate the proposal from
+            the outputs of encoder.
+        transformer (obj:`ConfigDict`): ConfigDict is used for building
+            the Encoder and Decoder.
+    """
+
+    def __init__(self,
+                 *args,
+                 with_box_refine=False,
+                 as_two_stage=False,
+                 transformer=None,
+                 **kwargs):
+        # Both flags are read by `_init_layers`, which the parent constructor
+        # invokes, so they have to be stored before `super().__init__` runs.
+        self.with_box_refine, self.as_two_stage = with_box_refine, as_two_stage
+        if as_two_stage:
+            # Propagate the flag into the transformer config (in place).
+            transformer['as_two_stage'] = as_two_stage
+        super().__init__(*args, transformer=transformer, **kwargs)
+
+    def _init_layers(self):
+        """Build the per-layer classification and regression branches."""
+        dims = self.embed_dims
+        # Prototype modules; the classifier is still constructed first and
+        # the regression layers afterwards, in their original order.
+        cls_proto = Linear(dims, self.cls_out_channels)
+        reg_layers = []
+        for _ in range(self.num_reg_fcs):
+            reg_layers += [Linear(dims, dims), nn.ReLU()]
+        reg_layers.append(Linear(dims, 4))
+        reg_proto = nn.Sequential(*reg_layers)
+
+        # One branch per decoder layer. With as_two_stage one more is added;
+        # that last one generates proposals from the encoder feature map.
+        num_pred = self.transformer.decoder.num_layers
+        if self.as_two_stage:
+            num_pred += 1
+
+        if self.with_box_refine:
+            # Every branch is an independent deep copy of the prototype.
+            self.cls_branches = nn.ModuleList(
+                [copy.deepcopy(cls_proto) for _ in range(num_pred)])
+            self.reg_branches = nn.ModuleList(
+                [copy.deepcopy(reg_proto) for _ in range(num_pred)])
+        else:
+            # All entries reference the very same module object.
+            self.cls_branches = nn.ModuleList([cls_proto] * num_pred)
+            self.reg_branches = nn.ModuleList([reg_proto] * num_pred)
+
+        if not self.as_two_stage:
+            # Query embedding is twice as wide as embed_dims.
+            self.query_embedding = nn.Embedding(self.num_query, dims * 2)
+
+    def init_weights(self):
+        """Initialize the transformer and every branch's output layer."""
+        self.transformer.init_weights()
+        if self.loss_cls.use_sigmoid:
+            for cls_branch in self.cls_branches:
+                nn.init.constant_(cls_branch.bias, bias_init_with_prob(0.01))
+        for reg_branch in self.reg_branches:
+            constant_init(reg_branch[-1], 0, bias=0)
+        # Bias entries [2:] of branch 0 get -2.0; two-stage resets all to 0.0.
+        nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0)
+        if self.as_two_stage:
+            for reg_branch in self.reg_branches:
+                nn.init.constant_(reg_branch[-1].bias.data[2:], 0.0)
+
+    def forward(self, mlvl_feats, img_metas):
+        """Forward function.
+
+        Args:
+            mlvl_feats (tuple[Tensor]): Features from the upstream
+                network, each is a 4D-tensor with shape (N, C, H, W).
+            img_metas (list[dict]): List of image information; must provide
+                'batch_input_shape' (first entry) and 'img_shape' (all).
+
+        Returns:
+            all_cls_scores (Tensor): Outputs from the classification head, \
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+                cls_out_channels should include background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+                head with normalized coordinate format (cx, cy, w, h). \
+                Shape [nb_dec, bs, num_query, 4].
+            enc_outputs_class (Tensor): The score of each point on the \
+                encoder feature map, has shape (N, h*w, num_class). Only \
+                returned when as_two_stage is True, otherwise `None` \
+                is returned.
+            enc_outputs_coord (Tensor): The proposals generated from the \
+                encoder feature map (after sigmoid), has shape (N, h*w, 4). \
+                Only returned when as_two_stage is True, otherwise \
+                `None` is returned.
+        """
+        # Padding mask on the batch canvas: 1 = padding, 0 = real image area.
+        batch_size = mlvl_feats[0].size(0)
+        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
+        img_masks = mlvl_feats[0].new_ones(
+            (batch_size, input_img_h, input_img_w))
+        for img_id in range(batch_size):
+            img_h, img_w, _ = img_metas[img_id]['img_shape']
+            img_masks[img_id, :img_h, :img_w] = 0
+        # Resize the mask to every feature level and encode its positions.
+        mlvl_masks = []
+        mlvl_positional_encodings = []
+        for feat in mlvl_feats:
+            mlvl_masks.append(
+                F.interpolate(img_masks[None],
+                              size=feat.shape[-2:]).to(torch.bool).squeeze(0))
+            mlvl_positional_encodings.append(
+                self.positional_encoding(mlvl_masks[-1]))
+        # The learned query embedding is only passed when not two-stage.
+        query_embeds = None
+        if not self.as_two_stage:
+            query_embeds = self.query_embedding.weight
+        hs, init_reference, inter_references, \
+            enc_outputs_class, enc_outputs_coord = self.transformer(
+                    mlvl_feats,
+                    mlvl_masks,
+                    query_embeds,
+                    mlvl_positional_encodings,
+                    reg_branches=self.reg_branches if self.with_box_refine else None,  # noqa:E501
+                    cls_branches=self.cls_branches if self.as_two_stage else None  # noqa:E501
+            )
+        hs = hs.permute(0, 2, 1, 3)
+        outputs_classes = []
+        outputs_coords = []
+        # Per layer: add the branch output to the un-sigmoided reference.
+        for lvl in range(hs.shape[0]):
+            if lvl == 0:
+                reference = init_reference
+            else:
+                reference = inter_references[lvl - 1]
+            reference = inverse_sigmoid(reference)
+            outputs_class = self.cls_branches[lvl](hs[lvl])
+            tmp = self.reg_branches[lvl](hs[lvl])
+            if reference.shape[-1] == 4:
+                tmp += reference
+            else:
+                assert reference.shape[-1] == 2
+                tmp[..., :2] += reference
+            outputs_coord = tmp.sigmoid()
+            outputs_classes.append(outputs_class)
+            outputs_coords.append(outputs_coord)
+        # Stack the per-layer results along a new leading dimension.
+        outputs_classes = torch.stack(outputs_classes)
+        outputs_coords = torch.stack(outputs_coords)
+        if self.as_two_stage:
+            return outputs_classes, outputs_coords, \
+                enc_outputs_class, \
+                enc_outputs_coord.sigmoid()
+        else:
+            return outputs_classes, outputs_coords, \
+                None, None
+
+    @force_fp32(apply_to=('all_cls_scores', 'all_bbox_preds'))
+    def loss(self,
+             all_cls_scores,
+             all_bbox_preds,
+             enc_cls_scores,
+             enc_bbox_preds,
+             gt_bboxes_list,
+             gt_labels_list,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute the losses of all decoder layers and encoder proposals.
+
+        Args:
+            all_cls_scores (Tensor): Classification score of all
+                decoder layers, has shape
+                [nb_dec, bs, num_query, cls_out_channels].
+            all_bbox_preds (Tensor): Sigmoid regression
+                outputs of all decoder layers. Each is a 4D-tensor with
+                normalized coordinate format (cx, cy, w, h) and shape
+                [nb_dec, bs, num_query, 4].
+            enc_cls_scores (Tensor): Classification scores of
+                points on the encoder feature map, has shape
+                (N, h*w, num_classes). Only passed when as_two_stage is
+                True, otherwise None.
+            enc_bbox_preds (Tensor): Regression results of each point
+                on the encoder feature map, has shape (N, h*w, 4). Only
+                passed when as_two_stage is True, otherwise None.
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            img_metas (list[dict]): List of image meta information.
+            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
+                which can be ignored for each image. Must be None.
+
+        Returns:
+            dict[str, Tensor]: Loss components. The last decoder layer uses
+                the keys ``loss_cls``/``loss_bbox``/``loss_iou``, earlier
+                layers are prefixed with ``d{idx}.`` and encoder losses
+                with ``enc_``.
+
+        Raises:
+            AssertionError: If ``gt_bboxes_ignore`` is not None.
+        """
+        assert gt_bboxes_ignore is None, \
+            f'{self.__class__.__name__} only supports ' \
+            f'for gt_bboxes_ignore setting to None.'
+
+        # Every decoder layer is supervised with the same targets, so each
+        # shared input is repeated once per layer for `multi_apply`.
+        num_layers = len(all_cls_scores)
+        shared_inputs = (gt_bboxes_list, gt_labels_list, img_metas,
+                         gt_bboxes_ignore)
+        repeated_inputs = [[item] * num_layers for item in shared_inputs]
+        losses_cls, losses_bbox, losses_iou = multi_apply(
+            self.loss_single, all_cls_scores, all_bbox_preds,
+            *repeated_inputs)
+
+        loss_dict = dict()
+        # Loss of the proposals generated from the encoder feature map.
+        if enc_cls_scores is not None:
+            # Every gt label is replaced by 0 before the encoder outputs
+            # are scored with the same single-layer loss.
+            binary_labels_list = [
+                torch.zeros_like(gt_labels_list[img_id])
+                for img_id, _ in enumerate(img_metas)
+            ]
+            (loss_dict['enc_loss_cls'], loss_dict['enc_loss_bbox'],
+             loss_dict['enc_loss_iou']) = self.loss_single(
+                 enc_cls_scores, enc_bbox_preds, gt_bboxes_list,
+                 binary_labels_list, img_metas, gt_bboxes_ignore)
+
+        # The last decoder layer provides the un-prefixed main losses.
+        loss_dict['loss_cls'] = losses_cls[-1]
+        loss_dict['loss_bbox'] = losses_bbox[-1]
+        loss_dict['loss_iou'] = losses_iou[-1]
+
+        # All earlier layers are reported as `d{idx}.*` auxiliary losses.
+        aux_losses = zip(losses_cls[:-1], losses_bbox[:-1], losses_iou[:-1])
+        for idx, (aux_cls, aux_bbox, aux_iou) in enumerate(aux_losses):
+            loss_dict[f'd{idx}.loss_cls'] = aux_cls
+            loss_dict[f'd{idx}.loss_bbox'] = aux_bbox
+            loss_dict[f'd{idx}.loss_iou'] = aux_iou
+        return loss_dict
+
+    @force_fp32(apply_to=('all_cls_scores', 'all_bbox_preds'))
+    def get_bboxes(self,
+                   all_cls_scores,
+                   all_bbox_preds,
+                   enc_cls_scores,
+                   enc_bbox_preds,
+                   img_metas,
+                   rescale=False):
+        """Transform network outputs for a batch into bbox predictions.
+
+        Only the outputs of the last decoder layer are decoded; the encoder
+        outputs are not used here.
+
+        Args:
+            all_cls_scores (Tensor): Classification score of all
+                decoder layers, has shape
+                [nb_dec, bs, num_query, cls_out_channels].
+            all_bbox_preds (Tensor): Sigmoid regression
+                outputs of all decoder layers. Each is a 4D-tensor with
+                normalized coordinate format (cx, cy, w, h) and shape
+                [nb_dec, bs, num_query, 4].
+            enc_cls_scores (Tensor): Classification scores of
+                points on the encoder feature map, has shape
+                (N, h*w, num_classes). Only passed when as_two_stage is
+                True, otherwise None. Unused.
+            enc_bbox_preds (Tensor): Regression results of each point
+                on the encoder feature map, has shape (N, h*w, 4). Only
+                passed when as_two_stage is True, otherwise None. Unused.
+            img_metas (list[dict]): Meta information of each image; the
+                keys 'img_shape' and 'scale_factor' are read.
+            rescale (bool, optional): If True, return boxes in original
+                image space. Default False.
+
+        Returns:
+            list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \
+                The first item is an (n, 5) tensor, where the first 4 columns \
+                are bounding box positions (tl_x, tl_y, br_x, br_y) and the \
+                5-th column is a score between 0 and 1. The second item is a \
+                (n,) tensor where each item is the predicted class label of \
+                the corresponding box.
+        """
+        cls_scores, bbox_preds = all_cls_scores[-1], all_bbox_preds[-1]
+
+        result_list = []
+        # Decode image by image with that image's own shape and scale.
+        for img_id, img_meta in enumerate(img_metas):
+            proposals = self._get_bboxes_single(
+                cls_scores[img_id], bbox_preds[img_id],
+                img_meta['img_shape'], img_meta['scale_factor'], rescale)
+            result_list.append(proposals)
+        return result_list
diff --git a/mmdet/models/dense_heads/dense_test_mixins.py b/mmdet/models/dense_heads/dense_test_mixins.py
new file mode 100755
index 0000000..3421548
--- /dev/null
+++ b/mmdet/models/dense_heads/dense_test_mixins.py
@@ -0,0 +1,206 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import sys
+from inspect import signature
+
+import torch
+from mmcv.ops import batched_nms
+
+from mmdet.core import bbox_mapping_back, merge_aug_proposals
+
+if sys.version_info >= (3, 7):
+    from mmdet.utils.contextmanagers import completed
+
+
+class BBoxTestMixin(object):
+    """Mixin class for testing det bboxes via DenseHead."""
+
+    def simple_test_bboxes(self, feats, img_metas, rescale=False):
+        """Test det bboxes without test-time augmentation.
+
+        Can be applied in DenseHead except for ``RPNHead`` and its variants,
+        e.g., ``GARPNHead``, etc.
+
+        Args:
+            feats (tuple[torch.Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            img_metas (list[dict]): List of image information.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
+                The first item is ``bboxes`` with shape (n, 5),
+                where 5 represent (tl_x, tl_y, br_x, br_y, score).
+                The second item is ``labels`` with shape (n,).
+        """
+        # Forward outputs go in positionally, everything else by keyword.
+        head_outputs = self.forward(feats)
+        return self.get_bboxes(
+            *head_outputs, img_metas=img_metas, rescale=rescale)
+
+    def aug_test_bboxes(self, feats, img_metas, rescale=False):
+        """Test det bboxes with test-time augmentation (not for RPN heads).
+
+        Requires ``get_bboxes`` and ``_get_bboxes_single`` to accept a
+        ``with_nms`` argument; otherwise an AssertionError is raised.
+
+        Args:
+            feats (list[Tensor]): the outer list indicates test-time
+                augmentations and inner Tensor should have a shape NxCxHxW,
+                which contains features for all images in the batch.
+            img_metas (list[list[dict]]): the outer list indicates test-time
+                augs (multiscale, flip, etc.) and the inner list indicates
+                images in a batch. each dict has image information.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[tuple[Tensor, Tensor]]: A single-element list holding
+                ``(bboxes, labels)``. ``bboxes`` has shape (n, 5),
+                where 5 represent (tl_x, tl_y, br_x, br_y, score), and
+                ``labels`` has shape (n,).
+        """
+        # check with_nms argument
+        gb_sig = signature(self.get_bboxes)
+        gb_args = [p.name for p in gb_sig.parameters.values()]
+        gbs_sig = signature(self._get_bboxes_single)
+        gbs_args = [p.name for p in gbs_sig.parameters.values()]
+        assert ('with_nms' in gb_args) and ('with_nms' in gbs_args), \
+            f'{self.__class__.__name__}' \
+            ' does not support test-time augmentation'
+        # Run every augmented view separately and keep its raw outputs.
+        aug_bboxes = []
+        aug_scores = []
+        aug_labels = []
+        for x, img_meta in zip(feats, img_metas):
+            # only one image in the batch
+            outs = self.forward(x)
+            bbox_outputs = self.get_bboxes(
+                *outs,
+                img_metas=img_meta,
+                cfg=self.test_cfg,
+                rescale=False,
+                with_nms=False)[0]
+            aug_bboxes.append(bbox_outputs[0])
+            aug_scores.append(bbox_outputs[1])
+            if len(bbox_outputs) >= 3:
+                aug_labels.append(bbox_outputs[2])
+
+        # after merging, bboxes will be rescaled to the original image size
+        merged_bboxes, merged_scores = self.merge_aug_bboxes(
+            aug_bboxes, aug_scores, img_metas)
+        merged_labels = torch.cat(aug_labels, dim=0) if aug_labels else None
+        # NOTE(review): merged_labels may be None, yet the NMS path uses it.
+        if merged_bboxes.numel() == 0:
+            det_bboxes = torch.cat([merged_bboxes, merged_scores[:, None]], -1)
+            return [
+                (det_bboxes, merged_labels),
+            ]
+        # NMS on the merged boxes, then keep at most `max_per_img` results.
+        det_bboxes, keep_idxs = batched_nms(merged_bboxes, merged_scores,
+                                            merged_labels, self.test_cfg.nms)
+        det_bboxes = det_bboxes[:self.test_cfg.max_per_img]
+        det_labels = merged_labels[keep_idxs][:self.test_cfg.max_per_img]
+        # Without `rescale`, scale boxes by the first view's scale_factor.
+        if rescale:
+            _det_bboxes = det_bboxes
+        else:
+            _det_bboxes = det_bboxes.clone()
+            _det_bboxes[:, :4] *= det_bboxes.new_tensor(
+                img_metas[0][0]['scale_factor'])
+
+        return [
+            (_det_bboxes, det_labels),
+        ]
+
+    def simple_test_rpn(self, x, img_metas):
+        """Generate proposals without test-time augmentation.
+
+        Only for ``RPNHead`` and its variants, e.g., ``GARPNHead``, etc.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+            img_metas (list[dict]): Meta info of each image.
+
+        Returns:
+            list[Tensor]: Proposals of each image, each item has shape (n, 5),
+                where 5 represent (tl_x, tl_y, br_x, br_y, score).
+        """
+        # `self(x)` goes through `__call__`; its outputs are unpacked.
+        return self.get_bboxes(*self(x), img_metas=img_metas)
+
+    def aug_test_rpn(self, feats, img_metas):
+        """Test with augmentation, only for ``RPNHead`` and its variants,
+        e.g., ``GARPNHead``, etc.
+
+        Args:
+            feats (list): One entry per augmentation; each entry is
+                passed on to ``simple_test_rpn``.
+            img_metas (list[list[dict]]): Meta info, indexed as
+                ``img_metas[aug][image]``.
+
+        Returns:
+            list[Tensor]: Merged proposals of each image, each item has shape
+                (n, 5), where 5 represent (tl_x, tl_y, br_x, br_y, score).
+        """
+        samples_per_gpu = len(img_metas[0])
+        # aug_proposals[i] collects the proposals of image i over all augs.
+        aug_proposals = [[] for _ in range(samples_per_gpu)]
+        for x, img_meta in zip(feats, img_metas):
+            for img_idx, proposals in enumerate(
+                    self.simple_test_rpn(x, img_meta)):
+                aug_proposals[img_idx].append(proposals)
+        # Transpose `img_metas` from [aug][image] to [image][aug] so that it
+        # lines up with `aug_proposals`.
+        num_augs = len(img_metas)
+        aug_img_metas = [[img_metas[aug_idx][img_idx]
+                          for aug_idx in range(num_augs)]
+                         for img_idx in range(samples_per_gpu)]
+        # after merging, proposals will be rescaled to the original image size
+        merged_proposals = []
+        for proposals, aug_img_meta in zip(aug_proposals, aug_img_metas):
+            merged_proposals.append(
+                merge_aug_proposals(proposals, aug_img_meta, self.test_cfg))
+        return merged_proposals
+
+    if sys.version_info >= (3, 7):
+        # `completed` is only imported on Python >= 3.7 (see module top).
+        async def async_simple_test_rpn(self, x, img_metas):
+            """Async variant of ``simple_test_rpn``; returns the proposals."""
+            # `.get` (not `.pop`): keep the setting in test_cfg for later calls.
+            sleep_interval = self.test_cfg.get('async_sleep_interval', 0.025)
+            async with completed(
+                    __name__, 'rpn_head_forward',
+                    sleep_interval=sleep_interval):
+                rpn_outs = self(x)
+            return self.get_bboxes(*rpn_outs, img_metas=img_metas)
+
+    def merge_aug_bboxes(self, aug_bboxes, aug_scores, img_metas):
+        """Merge augmented detection bboxes and scores.
+
+        Args:
+            aug_bboxes (list[Tensor]): shape (n, 4*#class)
+            aug_scores (list[Tensor] or None): shape (n, #class)
+            img_metas (list[list[dict]]): Meta info per augmentation; only
+                the first dict of every augmentation is read.
+
+        Returns:
+            tuple[Tensor] | Tensor: ``bboxes`` mapped back with
+            ``bbox_mapping_back`` and concatenated, together with the
+            concatenated ``scores``; ``bboxes`` alone if ``aug_scores`` is
+            None.
+        """
+        recovered_bboxes = []
+        for bboxes, img_info in zip(aug_bboxes, img_metas):
+            # Map the boxes back using this view's first image meta.
+            meta = img_info[0]
+            recovered_bboxes.append(
+                bbox_mapping_back(bboxes, meta['img_shape'],
+                                  meta['scale_factor'], meta['flip'],
+                                  meta['flip_direction']))
+        merged_bboxes = torch.cat(recovered_bboxes, dim=0)
+        # Scores are optional; without them only the boxes are returned.
+        if aug_scores is None:
+            return merged_bboxes
+        return merged_bboxes, torch.cat(aug_scores, dim=0)
diff --git a/mmdet/models/dense_heads/detr_head.py b/mmdet/models/dense_heads/detr_head.py
new file mode 100755
index 0000000..de1913c
--- /dev/null
+++ b/mmdet/models/dense_heads/detr_head.py
@@ -0,0 +1,844 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Conv2d, Linear, build_activation_layer
+from mmcv.cnn.bricks.transformer import FFN, build_positional_encoding
+from mmcv.runner import force_fp32
+
+from mmdet.core import (bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh,
+                        build_assigner, build_sampler, multi_apply,
+                        reduce_mean)
+from mmdet.models.utils import build_transformer
+from ..builder import HEADS, build_loss
+from .anchor_free_head import AnchorFreeHead
+
+
+@HEADS.register_module()
+class DETRHead(AnchorFreeHead):
+    """Implements the DETR transformer head.
+
+    See `paper: End-to-End Object Detection with Transformers
+    <https://arxiv.org/pdf/2005.12872>`_ for details.
+
+    Args:
+        num_classes (int): Number of categories excluding the background.
+        in_channels (int): Number of channels in the input feature map.
+        num_query (int): Number of query in Transformer.
+        num_reg_fcs (int, optional): Number of fully-connected layers used in
+            `FFN`, which is then used for the regression head. Default 2.
+        transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer.
+            Default: None.
+        sync_cls_avg_factor (bool): Whether to sync the avg_factor of
+            all ranks. Default to False.
+        positional_encoding (obj:`mmcv.ConfigDict`|dict):
+            Config for position encoding.
+        loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the
+            classification loss. Default `CrossEntropyLoss`.
+        loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the
+            regression loss. Default `L1Loss`.
+        loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the
+            regression iou loss. Default `GIoULoss`.
+        train_cfg (obj:`mmcv.ConfigDict`|dict): Training config of
+            transformer head.
+        test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of
+            transformer head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    _version = 2
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 num_query=100,
+                 num_reg_fcs=2,
+                 transformer=None,
+                 sync_cls_avg_factor=False,
+                 positional_encoding=dict(
+                     type='SinePositionalEncoding',
+                     num_feats=128,
+                     normalize=True),
+                 loss_cls=dict(
+                     type='CrossEntropyLoss',
+                     bg_cls_weight=0.1,
+                     use_sigmoid=False,
+                     loss_weight=1.0,
+                     class_weight=1.0),
+                 loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+                 loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+                 train_cfg=dict(
+                     assigner=dict(
+                         type='HungarianAssigner',
+                         cls_cost=dict(type='ClassificationCost', weight=1.),
+                         reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+                         iou_cost=dict(
+                             type='IoUCost', iou_mode='giou', weight=2.0))),
+                 test_cfg=dict(max_per_img=100),
+                 init_cfg=None,
+                 **kwargs):
+        # NOTE: `super(AnchorFreeHead, self)` deliberately skips
+        # `AnchorFreeHead.__init__` and runs the next initializer in the MRO.
+        super(AnchorFreeHead, self).__init__(init_cfg)
+        # Shallow copy so the edits below never touch the default/caller dict.
+        loss_cls = dict(loss_cls)
+        self.bg_cls_weight = 0
+        self.sync_cls_avg_factor = sync_cls_avg_factor
+        class_weight = loss_cls.get('class_weight', None)
+        if class_weight is not None and (self.__class__ is DETRHead):
+            assert isinstance(class_weight, float), 'Expected ' \
+                'class_weight to have type float. Found ' \
+                f'{type(class_weight)}.'
+            # NOTE following the official DETR repo, bg_cls_weight means
+            # relative classification weight of the no-object class.
+            bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
+            assert isinstance(bg_cls_weight, float), 'Expected ' \
+                'bg_cls_weight to have type float. Found ' \
+                f'{type(bg_cls_weight)}.'
+            class_weight = torch.ones(num_classes + 1) * class_weight
+            # set background class as the last index
+            class_weight[num_classes] = bg_cls_weight
+            loss_cls.update({'class_weight': class_weight})
+            loss_cls.pop('bg_cls_weight', None)
+            self.bg_cls_weight = bg_cls_weight
+
+        if train_cfg:
+            assert 'assigner' in train_cfg, 'assigner should be provided '\
+                'when train_cfg is set.'
+            assigner = train_cfg['assigner']
+            assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \
+                'The classification weight for loss and matcher should be ' \
+                'exactly the same.'
+            assert loss_bbox['loss_weight'] == assigner['reg_cost'][
+                'weight'], 'The regression L1 weight for loss and matcher ' \
+                'should be exactly the same.'
+            assert loss_iou['loss_weight'] == assigner['iou_cost']['weight'], \
+                'The regression iou weight for loss and matcher should be ' \
+                'exactly the same.'
+            self.assigner = build_assigner(assigner)
+            # DETR sampling=False, so use PseudoSampler
+            sampler_cfg = dict(type='PseudoSampler')
+            self.sampler = build_sampler(sampler_cfg, context=self)
+        self.num_query = num_query
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.num_reg_fcs = num_reg_fcs
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.fp16_enabled = False
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_bbox = build_loss(loss_bbox)
+        self.loss_iou = build_loss(loss_iou)
+
+        if self.loss_cls.use_sigmoid:
+            self.cls_out_channels = num_classes
+        else:
+            self.cls_out_channels = num_classes + 1
+        self.act_cfg = transformer.get('act_cfg',
+                                       dict(type='ReLU', inplace=True))
+        self.activate = build_activation_layer(self.act_cfg)
+        self.positional_encoding = build_positional_encoding(
+            positional_encoding)
+        self.transformer = build_transformer(transformer)
+        self.embed_dims = self.transformer.embed_dims
+        assert 'num_feats' in positional_encoding
+        num_feats = positional_encoding['num_feats']
+        assert num_feats * 2 == self.embed_dims, 'embed_dims should' \
+            f' be exactly 2 times of num_feats. Found {self.embed_dims}' \
+            f' and {num_feats}.'
+        self._init_layers()
+
+    def _init_layers(self):
+        """Create the input projection, cls/reg heads and query embedding."""
+        dims = self.embed_dims
+        # 1x1 convolution from `in_channels` to the embedding width.
+        self.input_proj = Conv2d(self.in_channels, dims, kernel_size=1)
+        self.fc_cls = Linear(dims, self.cls_out_channels)
+        # Regression head: an FFN (no dropout, no residual) followed by a
+        # linear layer with 4 outputs.
+        self.reg_ffn = FFN(
+            dims, dims, self.num_reg_fcs, self.act_cfg,
+            dropout=0.0, add_residual=False)
+        self.fc_reg = Linear(dims, 4)
+        # One embedding vector per query.
+        self.query_embedding = nn.Embedding(self.num_query, dims)
+
+    def init_weights(self):
+        """Initialize weights of the transformer head."""
+        # Only the transformer's own initializer is invoked here.
+        self.transformer.init_weights()
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        """Load checkpoints, renaming parameters of pre-v2 DETR heads."""
+        # NOTE: `super(AnchorFreeHead, self)` is used at the end so that
+        # `AnchorFreeHead._load_from_state_dict` is skipped; the default
+        # `Module._load_from_state_dict` is enough.
+        # Names of some parameters have been changed since version 2.
+        version = local_metadata.get('version', None)
+        if (version is None or version < 2) and self.__class__ is DETRHead:
+            convert_dict = {
+                '.self_attn.': '.attentions.0.',
+                '.ffn.': '.ffns.0.',
+                '.multihead_attn.': '.attentions.1.',
+                '.decoder.norm.': '.decoder.post_norm.'
+            }
+            # Snapshot the keys: the dict is modified while renaming.
+            for old_key in list(state_dict.keys()):
+                new_key = old_key
+                # Apply every matching rename, then move the entry once, so
+                # a key matching several patterns cannot hit a deleted key.
+                for ori_key, convert_key in convert_dict.items():
+                    new_key = new_key.replace(ori_key, convert_key)
+                if new_key != old_key:
+                    state_dict[new_key] = state_dict.pop(old_key)
+
+        super(AnchorFreeHead,
+              self)._load_from_state_dict(state_dict, prefix, local_metadata,
+                                          strict, missing_keys,
+                                          unexpected_keys, error_msgs)
+
+    def forward(self, feats, img_metas):
+        """Run `forward_single` on every feature level.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+            img_metas (list[dict]): List of image information.
+
+        Returns:
+            tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels.
+
+                - all_cls_scores_list (list[Tensor]): Classification scores \
+                    for each scale level. Each is a 4D-tensor with shape \
+                    [nb_dec, bs, num_query, cls_out_channels]. Note \
+                    `cls_out_channels` should include background.
+                - all_bbox_preds_list (list[Tensor]): Sigmoid regression \
+                    outputs for each scale level. Each is a 4D-tensor with \
+                    normalized coordinate format (cx, cy, w, h) and shape \
+                    [nb_dec, bs, num_query, 4].
+        """
+        # every level is paired with the same (shared) list of image metas
+        metas_per_level = [img_metas] * len(feats)
+        return multi_apply(self.forward_single, feats, metas_per_level)
+
+    def forward_single(self, x, img_metas):
+        """Forward function for a single feature level.
+
+        Args:
+            x (Tensor): Input feature from backbone's single stage, shape
+                [bs, c, h, w].
+            img_metas (list[dict]): List of image information.
+
+        Returns:
+            all_cls_scores (Tensor): Outputs from the classification head,
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note
+                cls_out_channels should include background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression
+                head with normalized coordinate format (cx, cy, w, h).
+                Shape [nb_dec, bs, num_query, 4].
+        """
+        # Build the binary masks consumed by the transformer.
+        # NOTE following the official DETR repo, non-zero values mark
+        # ignored positions, while zero values mark valid positions.
+        num_imgs = x.size(0)
+        pad_h, pad_w = img_metas[0]['batch_input_shape']
+        masks = x.new_ones((num_imgs, pad_h, pad_w))
+        for idx in range(num_imgs):
+            valid_h, valid_w, _ = img_metas[idx]['img_shape']
+            masks[idx, :valid_h, :valid_w] = 0
+
+        x = self.input_proj(x)
+        # resize the masks to the spatial shape of the projected feature
+        masks = F.interpolate(masks.unsqueeze(1), size=x.shape[-2:])
+        masks = masks.to(torch.bool).squeeze(1)
+        # position encoding, [bs, embed_dim, h, w]
+        pos_embed = self.positional_encoding(masks)
+        # the query embedding table is handed to the transformer as-is
+        # outs_dec: [nb_dec, bs, num_query, embed_dim]
+        query_embed = self.query_embedding.weight
+        outs_dec, _ = self.transformer(x, masks, query_embed, pos_embed)
+
+        all_cls_scores = self.fc_cls(outs_dec)
+        reg_feats = self.activate(self.reg_ffn(outs_dec))
+        return all_cls_scores, self.fc_reg(reg_feats).sigmoid()
+
+    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
+    def loss(self,
+             all_cls_scores_list,
+             all_bbox_preds_list,
+             gt_bboxes_list,
+             gt_labels_list,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute the losses of every decoder layer.
+
+        Only outputs from the last feature level are used for computing
+        losses by default.
+
+        Args:
+            all_cls_scores_list (list[Tensor]): Classification outputs
+                for each feature level. Each is a 4D-tensor with shape
+                [nb_dec, bs, num_query, cls_out_channels].
+            all_bbox_preds_list (list[Tensor]): Sigmoid regression
+                outputs for each feature level. Each is a 4D-tensor with
+                normalized coordinate format (cx, cy, w, h) and shape
+                [nb_dec, bs, num_query, 4].
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            img_metas (list[dict]): List of image meta information.
+            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
+                which can be ignored for each image. Must be None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        # NOTE by default only the outputs of the last feature level are used.
+        all_cls_scores = all_cls_scores_list[-1]
+        all_bbox_preds = all_bbox_preds_list[-1]
+        assert gt_bboxes_ignore is None, \
+            'Only supports for gt_bboxes_ignore setting to None.'
+
+        # replicate the shared inputs once per decoder layer (the same
+        # objects are referenced each time; nothing is copied)
+        num_dec_layers = len(all_cls_scores)
+        all_gt_bboxes_list = [gt_bboxes_list] * num_dec_layers
+        all_gt_labels_list = [gt_labels_list] * num_dec_layers
+        all_gt_bboxes_ignore_list = [gt_bboxes_ignore] * num_dec_layers
+        img_metas_list = [img_metas] * num_dec_layers
+
+        losses_cls, losses_bbox, losses_iou = multi_apply(
+            self.loss_single, all_cls_scores, all_bbox_preds,
+            all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
+            all_gt_bboxes_ignore_list)
+
+        # the last decoder layer provides the un-prefixed entries
+        loss_dict = dict(
+            loss_cls=losses_cls[-1],
+            loss_bbox=losses_bbox[-1],
+            loss_iou=losses_iou[-1])
+
+        # the other decoder layers are stored under a `d{index}.` prefix
+        aux_losses = zip(losses_cls[:-1], losses_bbox[:-1], losses_iou[:-1])
+        for layer_idx, aux in enumerate(aux_losses):
+            aux_cls, aux_bbox, aux_iou = aux
+            loss_dict[f'd{layer_idx}.loss_cls'] = aux_cls
+            loss_dict[f'd{layer_idx}.loss_bbox'] = aux_bbox
+            loss_dict[f'd{layer_idx}.loss_iou'] = aux_iou
+
+        return loss_dict
+
+    def loss_single(self,
+                    cls_scores,
+                    bbox_preds,
+                    gt_bboxes_list,
+                    gt_labels_list,
+                    img_metas,
+                    gt_bboxes_ignore_list=None):
+        """Loss function for outputs from a single decoder layer of a single
+        feature level.
+
+        Args:
+            cls_scores (Tensor): Box score logits from a single decoder layer
+                for all images. Shape [bs, num_query, cls_out_channels].
+            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
+                for all images, with normalized coordinate (cx, cy, w, h) and
+                shape [bs, num_query, 4].
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            img_metas (list[dict]): List of image meta information.
+            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
+                boxes which can be ignored for each image. Default None.
+
+        Returns:
+            tuple: `(loss_cls, loss_bbox, loss_iou)` computed for the
+                outputs of a single decoder layer.
+        """
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
+        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
+                                           gt_bboxes_list, gt_labels_list,
+                                           img_metas, gt_bboxes_ignore_list)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        labels = torch.cat(labels_list, 0)
+        label_weights = torch.cat(label_weights_list, 0)
+        bbox_targets = torch.cat(bbox_targets_list, 0)
+        bbox_weights = torch.cat(bbox_weights_list, 0)
+
+        # classification loss (scores flattened over images and queries)
+        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
+        # construct weighted avg_factor to match with the official DETR repo
+        cls_avg_factor = num_total_pos * 1.0 + \
+            num_total_neg * self.bg_cls_weight
+        if self.sync_cls_avg_factor:
+            cls_avg_factor = reduce_mean(
+                cls_scores.new_tensor([cls_avg_factor]))
+        cls_avg_factor = max(cls_avg_factor, 1)
+
+        loss_cls = self.loss_cls(
+            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
+
+        # Compute the average number of gt boxes across all gpus, for
+        # normalization purposes (clamped to at least 1)
+        num_total_pos = loss_cls.new_tensor([num_total_pos])
+        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
+
+        # construct the per-query (w, h, w, h) factors used to rescale bboxes
+        factors = []
+        for img_meta, bbox_pred in zip(img_metas, bbox_preds):
+            img_h, img_w, _ = img_meta['img_shape']
+            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
+                                           img_h]).unsqueeze(0).repeat(
+                                               bbox_pred.size(0), 1)
+            factors.append(factor)
+        factors = torch.cat(factors, 0)
+
+        # DETR regresses the relative position of boxes (cxcywh) in the image,
+        # thus the learning target is normalized by the image size. So here
+        # we need to re-scale them for calculating the IoU loss
+        bbox_preds = bbox_preds.reshape(-1, 4)
+        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
+        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors
+
+        # regression IoU loss, GIoU loss by default
+        loss_iou = self.loss_iou(
+            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)
+
+        # regression L1 loss (on the normalized cxcywh values)
+        loss_bbox = self.loss_bbox(
+            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
+        return loss_cls, loss_bbox, loss_iou
+
+    def get_targets(self,
+                    cls_scores_list,
+                    bbox_preds_list,
+                    gt_bboxes_list,
+                    gt_labels_list,
+                    img_metas,
+                    gt_bboxes_ignore_list=None):
+        """Compute regression and classification targets for a batch of images.
+
+        Outputs from a single decoder layer of a single feature level are used.
+
+        Args:
+            cls_scores_list (list[Tensor]): Box score logits from a single
+                decoder layer for each image with shape [num_query,
+                cls_out_channels].
+            bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
+                decoder layer for each image, with normalized coordinate
+                (cx, cy, w, h) and shape [num_query, 4].
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            img_metas (list[dict]): List of image meta information.
+            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
+                boxes which can be ignored for each image. Must be None.
+
+        Returns:
+            tuple: a tuple containing the following targets.
+
+                - labels_list (list[Tensor]): Labels for all images.
+                - label_weights_list (list[Tensor]): Label weights for all \
+                    images.
+                - bbox_targets_list (list[Tensor]): BBox targets for all \
+                    images.
+                - bbox_weights_list (list[Tensor]): BBox weights for all \
+                    images.
+                - num_total_pos (int): Number of positive samples in all \
+                    images.
+                - num_total_neg (int): Number of negative samples in all \
+                    images.
+        """
+        assert gt_bboxes_ignore_list is None, \
+            'Only supports for gt_bboxes_ignore setting to None.'
+        # one `None` placeholder per image so `multi_apply` can pair it up
+        num_imgs = len(cls_scores_list)
+        ignore_per_img = [gt_bboxes_ignore_list] * num_imgs
+
+        per_img_targets = multi_apply(
+            self._get_target_single, cls_scores_list, bbox_preds_list,
+            gt_bboxes_list, gt_labels_list, img_metas, ignore_per_img)
+        (labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, pos_inds_list, neg_inds_list) = per_img_targets
+        num_total_pos = sum(inds.numel() for inds in pos_inds_list)
+        num_total_neg = sum(inds.numel() for inds in neg_inds_list)
+        return (labels_list, label_weights_list, bbox_targets_list,
+                bbox_weights_list, num_total_pos, num_total_neg)
+
+    def _get_target_single(self,
+                           cls_score,
+                           bbox_pred,
+                           gt_bboxes,
+                           gt_labels,
+                           img_meta,
+                           gt_bboxes_ignore=None):
+        """Compute regression and classification targets for one image.
+
+        Outputs from a single decoder layer of a single feature level are used.
+
+        Args:
+            cls_score (Tensor): Box score logits from a single decoder layer
+                for one image. Shape [num_query, cls_out_channels].
+            bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
+                for one image, with normalized coordinate (cx, cy, w, h) and
+                shape [num_query, 4].
+            gt_bboxes (Tensor): Ground truth bboxes for one image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (Tensor): Ground truth class indices for one image
+                with shape (num_gts, ).
+            img_meta (dict): Meta information for one image.
+            gt_bboxes_ignore (Tensor, optional): Bounding boxes
+                which can be ignored. Default None.
+
+        Returns:
+            tuple[Tensor]: a tuple containing the following for one image.
+
+                - labels (Tensor): Labels of each image.
+                - label_weights (Tensor): Label weights of each image.
+                - bbox_targets (Tensor): BBox targets of each image.
+                - bbox_weights (Tensor): BBox weights of each image.
+                - pos_inds (Tensor): Sampled positive indices for each image.
+                - neg_inds (Tensor): Sampled negative indices for each image.
+        """
+
+        num_queries = bbox_pred.size(0)
+        # run the assigner, then the sampler on its result
+        assign_result = self.assigner.assign(
+            bbox_pred, cls_score, gt_bboxes, gt_labels, img_meta,
+            gt_bboxes_ignore)
+        sampling_result = self.sampler.sample(
+            assign_result, bbox_pred, gt_bboxes)
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        # label targets: non-positive queries keep the `num_classes` label
+        labels = gt_bboxes.new_full(
+            (num_queries, ), self.num_classes, dtype=torch.long)
+        matched_gt_inds = sampling_result.pos_assigned_gt_inds
+        labels[pos_inds] = gt_labels[matched_gt_inds]
+        label_weights = gt_bboxes.new_ones(num_queries)
+
+        # bbox targets: only the positive rows get a non-zero weight
+        bbox_targets = torch.zeros_like(bbox_pred)
+        bbox_weights = torch.zeros_like(bbox_pred)
+        bbox_weights[pos_inds] = 1.0
+
+        # DETR regresses the relative position of boxes (cxcywh) in the image.
+        # Thus the learning target is normalized by the image size, and the
+        # box format is converted from x1y1x2y2 to cxcywh.
+        img_h, img_w, _ = img_meta['img_shape']
+        whwh = bbox_pred.new_tensor([img_w, img_h, img_w, img_h])
+        whwh = whwh.unsqueeze(0)
+        normalized_gt = sampling_result.pos_gt_bboxes / whwh
+        gt_cxcywh = bbox_xyxy_to_cxcywh(normalized_gt)
+        bbox_targets[pos_inds] = gt_cxcywh
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds)
+
+    # over-write because img_metas are needed as inputs for bbox_head.
+    def forward_train(self,
+                      x,
+                      img_metas,
+                      gt_bboxes,
+                      gt_labels=None,
+                      gt_bboxes_ignore=None,
+                      proposal_cfg=None,
+                      **kwargs):
+        """Run the head on `x` and compute the training losses.
+
+        Args:
+            x (list[Tensor]): Features from backbone.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes (Tensor): Ground truth bboxes of the image,
+                shape (num_gts, 4). Forwarded to `loss`.
+            gt_labels (Tensor): Ground truth labels of each box,
+                shape (num_gts,). Forwarded to `loss` when not None.
+            gt_bboxes_ignore (Tensor): Ground truth bboxes to be
+                ignored, shape (num_ignored_gts, 4).
+            proposal_cfg (mmcv.Config): Not used by this head; it must
+                be None (asserted).
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert proposal_cfg is None, '"proposal_cfg" must be None'
+        outs = self(x, img_metas)
+        # `gt_labels` is only forwarded to the loss when it was provided
+        targets = (gt_bboxes, img_metas)
+        if gt_labels is not None:
+            targets = (gt_bboxes, gt_labels, img_metas)
+        loss_inputs = outs + targets
+        return self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
+
+    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
+    def get_bboxes(self,
+                   all_cls_scores_list,
+                   all_bbox_preds_list,
+                   img_metas,
+                   rescale=False):
+        """Turn the network outputs of a batch into bbox predictions.
+
+        Args:
+            all_cls_scores_list (list[Tensor]): Classification outputs
+                for each feature level. Each is a 4D-tensor with shape
+                [nb_dec, bs, num_query, cls_out_channels].
+            all_bbox_preds_list (list[Tensor]): Sigmoid regression
+                outputs for each feature level. Each is a 4D-tensor with
+                normalized coordinate format (cx, cy, w, h) and shape
+                [nb_dec, bs, num_query, 4].
+            img_metas (list[dict]): Meta information of each image.
+            rescale (bool, optional): If True, return boxes in original
+                image space. Default False.
+
+        Returns:
+            list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \
+                The first item is an (n, 5) tensor, where the first 4 columns \
+                are bounding box positions (tl_x, tl_y, br_x, br_y) and the \
+                5-th column is a score between 0 and 1. The second item is a \
+                (n,) tensor where each item is the predicted class label of \
+                the corresponding box.
+        """
+        # NOTE by default only the outputs of the last feature level
+        # and of the last decoder layer are used.
+        cls_scores = all_cls_scores_list[-1][-1]
+        bbox_preds = all_bbox_preds_list[-1][-1]
+
+        result_list = []
+        for img_id, img_meta in enumerate(img_metas):
+            # decode the predictions of one image at a time
+            proposals = self._get_bboxes_single(
+                cls_scores[img_id],
+                bbox_preds[img_id],
+                img_meta['img_shape'],
+                img_meta['scale_factor'],
+                rescale)
+            result_list.append(proposals)
+
+        return result_list
+
+    def _get_bboxes_single(self,
+                           cls_score,
+                           bbox_pred,
+                           img_shape,
+                           scale_factor,
+                           rescale=False):
+        """Transform outputs from the last decoder layer into bbox predictions
+        for each image.
+
+        Args:
+            cls_score (Tensor): Box score logits from the last decoder layer
+                for each image. Shape [num_query, cls_out_channels].
+            bbox_pred (Tensor): Sigmoid outputs from the last decoder layer
+                for each image, with coordinate format (cx, cy, w, h) and
+                shape [num_query, 4].
+            img_shape (tuple[int]): Shape of input image, (height, width, 3).
+            scale_factor (ndarray, optional): Scale factor of the image,
+                arranged as (w_scale, h_scale, w_scale, h_scale).
+            rescale (bool, optional): If True, return boxes in original image
+                space. Default False.
+
+        Returns:
+            tuple[Tensor]: Results of detected bboxes and labels.
+
+                - det_bboxes: Predicted bboxes with shape [num_query, 5], \
+                    where the first 4 columns are bounding box positions \
+                    (tl_x, tl_y, br_x, br_y) and the 5-th column is a score \
+                    between 0 and 1.
+                - det_labels: Predicted labels of the corresponding box with \
+                    shape [num_query].
+        """
+        assert len(cls_score) == len(bbox_pred)
+        max_per_img = self.test_cfg.get('max_per_img', self.num_query)
+        # exclude background (the softmax branch drops the last channel)
+        if self.loss_cls.use_sigmoid:
+            cls_score = cls_score.sigmoid()
+            scores, indexes = cls_score.view(-1).topk(max_per_img)
+            det_labels = indexes % self.num_classes
+            bbox_index = indexes // self.num_classes
+            bbox_pred = bbox_pred[bbox_index]
+        else:
+            scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1)
+            scores, bbox_index = scores.topk(max_per_img)
+            bbox_pred = bbox_pred[bbox_index]
+            det_labels = det_labels[bbox_index]
+
+        det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
+        det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
+        det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
+        det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
+        det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])
+        if rescale:
+            det_bboxes /= det_bboxes.new_tensor(scale_factor)
+        det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(1)), -1)
+
+        return det_bboxes, det_labels
+
+    def simple_test_bboxes(self, feats, img_metas, rescale=False):
+        """Predict det bboxes without test-time augmentation.
+
+        Args:
+            feats (tuple[torch.Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            img_metas (list[dict]): List of image information.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
+                The first item is ``bboxes`` with shape (n, 5),
+                where 5 represent (tl_x, tl_y, br_x, br_y, score).
+                The second item is ``labels``
+                with shape (n,)
+        """
+        # the forward pass of this head takes `img_metas` as well
+        head_outs = self.forward(feats, img_metas)
+        detections = self.get_bboxes(*head_outs, img_metas, rescale=rescale)
+        return detections
+
+    def forward_onnx(self, feats, img_metas):
+        """Run `forward_single_onnx` on every feature level (ONNX export).
+
+        Over-write `forward` because: `masks` is directly created with
+        zero (valid position tag) and has the same spatial size as `x`.
+        Thus the construction of `masks` is different from that in `forward`.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+            img_metas (list[dict]): List of image information.
+
+        Returns:
+            tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels.
+
+                - all_cls_scores_list (list[Tensor]): Classification scores \
+                    for each scale level. Each is a 4D-tensor with shape \
+                    [nb_dec, bs, num_query, cls_out_channels]. Note \
+                    `cls_out_channels` should include background.
+                - all_bbox_preds_list (list[Tensor]): Sigmoid regression \
+                    outputs for each scale level. Each is a 4D-tensor with \
+                    normalized coordinate format (cx, cy, w, h) and shape \
+                    [nb_dec, bs, num_query, 4].
+        """
+        # every level is paired with the same (shared) list of image metas
+        metas_per_level = [img_metas] * len(feats)
+        return multi_apply(self.forward_single_onnx, feats, metas_per_level)
+
+    def forward_single_onnx(self, x, img_metas):
+        """Forward function for a single feature level with ONNX exportation.
+
+        Args:
+            x (Tensor): Input feature from backbone's single stage, shape
+                [bs, c, h, w].
+            img_metas (list[dict]): List of image information. Unused here.
+
+        Returns:
+            all_cls_scores (Tensor): Outputs from the classification head,
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note
+                cls_out_channels should include background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression
+                head with normalized coordinate format (cx, cy, w, h).
+                Shape [nb_dec, bs, num_query, 4].
+        """
+        # Note `img_shape` is not dynamically traceable to ONNX,
+        # since the related augmentation was done with numpy under
+        # CPU. Thus `masks` is directly created with zeros (valid tag)
+        # and the same spatial shape as `x`.
+        # The difference between torch and exported ONNX model may be
+        # ignored, since the same performance is achieved (e.g.
+        # 40.1 vs 40.1 for DETR)
+        batch_size = x.size(0)
+        h, w = x.size()[-2:]
+        masks = x.new_zeros((batch_size, h, w))  # [B,h,w]
+
+        x = self.input_proj(x)
+        # interpolate masks to have the same spatial shape as x
+        masks = F.interpolate(
+            masks.unsqueeze(1), size=x.shape[-2:]).to(torch.bool).squeeze(1)
+        pos_embed = self.positional_encoding(masks)
+        outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight,
+                                       pos_embed)
+
+        all_cls_scores = self.fc_cls(outs_dec)
+        all_bbox_preds = self.fc_reg(self.activate(
+            self.reg_ffn(outs_dec))).sigmoid()
+        return all_cls_scores, all_bbox_preds
+
+    def onnx_export(self, all_cls_scores_list, all_bbox_preds_list, img_metas):
+        """Transform network outputs into bbox predictions, with ONNX
+        exportation.
+
+        Args:
+            all_cls_scores_list (list[Tensor]): Classification outputs
+                for each feature level. Each is a 4D-tensor with shape
+                [nb_dec, bs, num_query, cls_out_channels].
+            all_bbox_preds_list (list[Tensor]): Sigmoid regression
+                outputs for each feature level. Each is a 4D-tensor with
+                normalized coordinate format (cx, cy, w, h) and shape
+                [nb_dec, bs, num_query, 4].
+            img_metas (list[dict]): Meta information of each image.
+
+        Returns:
+            tuple[Tensor, Tensor]: dets of shape [N, num_det, 5]
+                and class labels of shape [N, num_det].
+        """
+        assert len(img_metas) == 1, \
+            'Only support one input image while in exporting to ONNX'
+
+        cls_scores = all_cls_scores_list[-1][-1]
+        bbox_preds = all_bbox_preds_list[-1][-1]
+
+        # Note `img_shape` is not dynamically traceable to ONNX, so
+        # `img_shape_for_onnx` (padded shape of image tensor) is used.
+        img_shape = img_metas[0]['img_shape_for_onnx']
+        max_per_img = self.test_cfg.get('max_per_img', self.num_query)
+        batch_size = cls_scores.size(0)
+        # `batch_index_offset` shifts per-image query indices into the
+        # flattened [bs * num_query] tensors, so its stride is `num_query`.
+        batch_index_offset = torch.arange(batch_size).to(
+            cls_scores.device) * self.num_query
+        batch_index_offset = batch_index_offset.unsqueeze(1).expand(
+            batch_size, max_per_img)
+
+        # supports dynamical batch inference
+        if self.loss_cls.use_sigmoid:
+            cls_scores = cls_scores.sigmoid()
+            scores, indexes = cls_scores.view(batch_size, -1).topk(
+                max_per_img, dim=1)
+            det_labels = indexes % self.num_classes
+            bbox_index = indexes // self.num_classes
+            bbox_index = (bbox_index + batch_index_offset).view(-1)
+            bbox_preds = bbox_preds.view(-1, 4)[bbox_index]
+            bbox_preds = bbox_preds.view(batch_size, -1, 4)
+        else:
+            scores, det_labels = F.softmax(
+                cls_scores, dim=-1)[..., :-1].max(-1)
+            scores, bbox_index = scores.topk(max_per_img, dim=1)
+            bbox_index = (bbox_index + batch_index_offset).view(-1)
+            bbox_preds = bbox_preds.view(-1, 4)[bbox_index]
+            det_labels = det_labels.view(-1)[bbox_index]
+            bbox_preds = bbox_preds.view(batch_size, -1, 4)
+            det_labels = det_labels.view(batch_size, -1)
+
+        det_bboxes = bbox_cxcywh_to_xyxy(bbox_preds)
+        # use `img_shape_tensor` for dynamically exporting to ONNX
+        img_shape_tensor = img_shape.flip(0).repeat(2)  # [w,h,w,h]
+        img_shape_tensor = img_shape_tensor.unsqueeze(0).unsqueeze(0).expand(
+            batch_size, det_bboxes.size(1), 4)
+        det_bboxes = det_bboxes * img_shape_tensor
+        # dynamically clip bboxes
+        x1, y1, x2, y2 = det_bboxes.split((1, 1, 1, 1), dim=-1)
+        from mmdet.core.export import dynamic_clip_for_onnx
+        x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, img_shape)
+        det_bboxes = torch.cat([x1, y1, x2, y2], dim=-1)
+        det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(-1)), -1)
+
+        return det_bboxes, det_labels
diff --git a/mmdet/models/dense_heads/embedding_rpn_head.py b/mmdet/models/dense_heads/embedding_rpn_head.py
new file mode 100755
index 0000000..22060b9
--- /dev/null
+++ b/mmdet/models/dense_heads/embedding_rpn_head.py
@@ -0,0 +1,116 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.runner import BaseModule
+
+from mmdet.models.builder import HEADS
+from ...core import bbox_cxcywh_to_xyxy
+
+
+@HEADS.register_module()
+class EmbeddingRPNHead(BaseModule):
+    """RPNHead in the `Sparse R-CNN <https://arxiv.org/abs/2011.12450>`_ .
+
+    Unlike traditional RPNHead, this module does not use the values of its
+    FPN input: it scales the learned `init_proposal_bboxes` to each image and
+    expands `init_proposal_features` along a new batch dimension.
+
+    Args:
+        num_proposals (int): Number of init_proposals. Default 100.
+        proposal_feature_channel (int): Channel number of
+            init_proposal_feature. Defaults to 256.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Must be None (asserted in the constructor). Default: None
+    """
+
+    def __init__(self,
+                 num_proposals=100,
+                 proposal_feature_channel=256,
+                 init_cfg=None,
+                 **kwargs):
+        assert init_cfg is None, ('To prevent abnormal initialization behavior'
+                                  ', init_cfg is not allowed to be set')
+        super().__init__(init_cfg)
+        self.proposal_feature_channel = proposal_feature_channel
+        self.num_proposals = num_proposals
+        self._init_layers()
+
+    def _init_layers(self):
+        """Create the learnable proposal boxes and proposal features."""
+        num, channels = self.num_proposals, self.proposal_feature_channel
+        self.init_proposal_bboxes = nn.Embedding(num, 4)
+        self.init_proposal_features = nn.Embedding(num, channels)
+
+    def init_weights(self):
+        """Make every init proposal box cover the entire image.
+
+        Boxes are normalized [c_x, c_y, w, h]: centers become 0.5, sizes 1.
+        """
+        super().init_weights()
+        box_weight = self.init_proposal_bboxes.weight
+        nn.init.constant_(box_weight[:, :2], 0.5)
+        nn.init.constant_(box_weight[:, 2:], 1)
+
+    def _decode_init_proposals(self, imgs, img_metas):
+        """Scale the learned proposal boxes to every image and broadcast the
+        learned proposal features over the batch.
+
+        Args:
+            imgs (list[Tensor]): List of FPN features; only ``imgs[0]`` is
+                used (batch size, and as a factory for new tensors).
+            img_metas (list[dict]): Per-image meta-information; each
+                ``img_shape`` is unpacked as (h, w, _).
+
+        Returns:
+            Tuple(Tensor):
+
+                - proposals (Tensor): Decoded proposal bboxes,
+                  has shape (batch_size, num_proposals, 4).
+                - init_proposal_features (Tensor): Expanded proposal
+                  features, has shape
+                  (batch_size, num_proposals, proposal_feature_channel).
+                - imgs_whwh (Tensor): Shape (batch_size, 1, 4); the last
+                  dimension is [img_width, img_height, img_width, img_height].
+        """
+        # learned boxes are normalized cxcywh; convert to normalized xyxy
+        boxes = bbox_cxcywh_to_xyxy(self.init_proposal_bboxes.weight.clone())
+        ref = imgs[0]
+        num_imgs = len(ref)
+
+        # one [w, h, w, h] row per image, stacked to (batch_size, 1, 4)
+        img_shapes = (meta['img_shape'] for meta in img_metas)
+        whwh_rows = [ref.new_tensor([[w, h, w, h]]) for h, w, _ in img_shapes]
+        imgs_whwh = torch.cat(whwh_rows, dim=0)[:, None, :]
+
+        # broadcasting turns the (num_proposals, 4) boxes into
+        # (batch_size, num_proposals, 4) boxes at image scale
+        proposals = boxes * imgs_whwh
+
+        # every image gets the same cloned features through an expanded view
+        features = self.init_proposal_features.weight.clone()
+        init_proposal_features = features[None].expand(
+            num_imgs, *features.size())
+        return proposals, init_proposal_features, imgs_whwh
+
+    def forward_dummy(self, img, img_metas):
+        """Dummy forward function.
+
+        Used in flops calculation; same computation as ``forward_train``.
+        """
+        return self._decode_init_proposals(img, img_metas)
+
+    def forward_train(self, img, img_metas):
+        """Return the decoded init proposals for the training stage."""
+        return self._decode_init_proposals(img, img_metas)
+
+    def simple_test_rpn(self, img, img_metas):
+        """Return the decoded init proposals for the testing stage."""
+        return self._decode_init_proposals(img, img_metas)
+
+    def simple_test(self, img, img_metas):
+        """Not implemented for this head; always raises."""
+        raise NotImplementedError
+
+    def aug_test_rpn(self, feats, img_metas):
+        raise NotImplementedError(
+            'EmbeddingRPNHead does not support test-time augmentation')
diff --git a/mmdet/models/dense_heads/fcos_head.py b/mmdet/models/dense_heads/fcos_head.py
new file mode 100755
index 0000000..d72fb56
--- /dev/null
+++ b/mmdet/models/dense_heads/fcos_head.py
@@ -0,0 +1,455 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import Scale
+from mmcv.runner import force_fp32
+
+from mmdet.core import multi_apply, reduce_mean
+from ..builder import HEADS, build_loss
+from .anchor_free_head import AnchorFreeHead
+
+INF = 1e8
+
+
+@HEADS.register_module()
+class FCOSHead(AnchorFreeHead):
+    """Anchor-free head used in `FCOS <https://arxiv.org/abs/1904.01355>`_.
+
+    The FCOS head does not use anchor boxes. Instead bounding boxes are
+    predicted at each pixel and a centerness measure is used to suppress
+    low-quality predictions.
+    Here norm_on_bbox, centerness_on_reg, dcn_on_last_conv are training
+    tricks used in official repo, which will bring remarkable mAP gains
+    of up to 4.9. Please see https://github.com/tianzhi0549/FCOS for
+    more detail.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        strides (list[int] | list[tuple[int, int]]): Strides of points
+            in multiple feature levels. Default: (4, 8, 16, 32, 64).
+        regress_ranges (tuple[tuple[int, int]]): Regress range of multiple
+            level points.
+        center_sampling (bool): If true, use center sampling. Default: False.
+        center_sample_radius (float): Radius of center sampling. Default: 1.5.
+        norm_on_bbox (bool): If true, normalize the regression targets
+            with FPN strides. Default: False.
+        centerness_on_reg (bool): If true, position centerness on the
+            regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042.
+            Default: False.
+        conv_bias (bool | str): If specified as `auto`, it will be decided by the
+            norm_cfg. Bias of conv will be set as True if `norm_cfg` is None, otherwise
+            False. Default: "auto".
+        loss_cls (dict): Config of classification loss.
+        loss_bbox (dict): Config of localization loss.
+        loss_centerness (dict): Config of centerness loss.
+        norm_cfg (dict): dictionary to construct and config norm layer.
+            Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+
+    Example:
+        >>> self = FCOSHead(11, 7)
+        >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]]
+        >>> cls_score, bbox_pred, centerness = self.forward(feats)
+        >>> assert len(cls_score) == len(self.scales)
+    """  # noqa: E501
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 512),
+                                 (512, INF)),
+                 center_sampling=False,
+                 center_sample_radius=1.5,
+                 norm_on_bbox=False,
+                 centerness_on_reg=False,
+                 loss_cls=dict(
+                     type='FocalLoss',
+                     use_sigmoid=True,
+                     gamma=2.0,
+                     alpha=0.25,
+                     loss_weight=1.0),
+                 loss_bbox=dict(type='IoULoss', loss_weight=1.0),
+                 loss_centerness=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+                 init_cfg=dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='conv_cls',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs):
+        # FCOS-specific options, stored before the parent constructor runs
+        self.centerness_on_reg = centerness_on_reg
+        self.norm_on_bbox = norm_on_bbox
+        self.center_sample_radius = center_sample_radius
+        self.center_sampling = center_sampling
+        self.regress_ranges = regress_ranges
+        parent_cfg = dict(
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            norm_cfg=norm_cfg,
+            init_cfg=init_cfg)
+        super().__init__(num_classes, in_channels, **parent_cfg, **kwargs)
+        # cls/bbox losses are handed to the parent; centerness is built here
+        self.loss_centerness = build_loss(loss_centerness)
+
+    def _init_layers(self):
+        """Init parent layers, the centerness conv and one Scale per stride."""
+        super()._init_layers()
+        self.conv_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1)
+        self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])
+
+    def forward(self, feats):
+        """Run ``forward_single`` on every feature level.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple:
+                cls_scores (list[Tensor]): Box scores for each scale level,
+                    each is a 4D-tensor, the channel number is
+                    num_points * num_classes.
+                bbox_preds (list[Tensor]): Box energies / deltas for each
+                    scale level, each is a 4D-tensor, the channel number is
+                    num_points * 4.
+                centernesses (list[Tensor]): centerness for each scale level,
+                    each is a 4D-tensor, the channel number is num_points * 1.
+        """
+        per_level_args = (feats, self.scales, self.strides)
+        return multi_apply(self.forward_single, *per_level_args)
+
+    def forward_single(self, x, scale, stride):
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+            scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+            stride (int): The corresponding stride for feature maps, only
+                used to rescale the bbox prediction outside training mode
+                when self.norm_on_bbox is True.
+
+        Returns:
+            tuple: scores for each class, bbox predictions and centerness
+                predictions of input feature maps.
+        """
+        cls_score, bbox_pred, cls_feat, reg_feat = super().forward_single(x)
+        # the centerness conv reads either reg_feat or cls_feat
+        centerness_input = reg_feat if self.centerness_on_reg else cls_feat
+        centerness = self.conv_centerness(centerness_input)
+        # scale the bbox_pred of this level; float to avoid overflow when
+        # enabling FP16
+        bbox_pred = scale(bbox_pred).float()
+        if not self.norm_on_bbox:
+            return cls_score, bbox_pred.exp(), centerness
+
+        # bbox_pred needed for gradient computation has been modified
+        # by F.relu(bbox_pred) when run with PyTorch 1.10. So use
+        # bbox_pred.clamp(min=0) instead of F.relu(bbox_pred)
+        bbox_pred = bbox_pred.clamp(min=0)
+        if not self.training:
+            # the stride is applied only outside training mode
+            bbox_pred *= stride
+        return cls_score, bbox_pred, centerness
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'centernesses'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             centernesses,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute loss of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level,
+                each is a 4D-tensor, the channel number is
+                num_points * num_classes.
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_points * 4.
+            centernesses (list[Tensor]): centerness for each scale level, each
+                is a 4D-tensor, the channel number is num_points * 1.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc. Not read by this method.
+            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+                boxes can be ignored. Not read by this method.
+
+        Returns:
+            dict[str, Tensor]: loss_cls, loss_bbox and loss_centerness.
+        """
+        assert len(cls_scores) == len(bbox_preds) == len(centernesses)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        all_level_points = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=bbox_preds[0].dtype,
+            device=bbox_preds[0].device)
+        labels, bbox_targets = self.get_targets(all_level_points, gt_bboxes,
+                                                gt_labels)
+
+        num_imgs = cls_scores[0].size(0)
+        # per level: move dim 1 last, then flatten (all images together)
+        flatten_cls_scores = [
+            cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
+            for cls_score in cls_scores
+        ]
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
+            for bbox_pred in bbox_preds
+        ]
+        flatten_centerness = [
+            centerness.permute(0, 2, 3, 1).reshape(-1)
+            for centerness in centernesses
+        ]
+        flatten_cls_scores = torch.cat(flatten_cls_scores)
+        flatten_bbox_preds = torch.cat(flatten_bbox_preds)
+        flatten_centerness = torch.cat(flatten_centerness)
+        flatten_labels = torch.cat(labels)
+        flatten_bbox_targets = torch.cat(bbox_targets)
+        # repeat each level's points once per image to align with bbox_preds
+        flatten_points = torch.cat(
+            [points.repeat(num_imgs, 1) for points in all_level_points])
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = ((flatten_labels >= 0)
+                    & (flatten_labels < bg_class_ind)).nonzero().reshape(-1)
+        num_pos = torch.tensor(
+            len(pos_inds), dtype=torch.float, device=bbox_preds[0].device)
+        num_pos = max(reduce_mean(num_pos), 1.0)  # avg_factor, at least 1
+        loss_cls = self.loss_cls(
+            flatten_cls_scores, flatten_labels, avg_factor=num_pos)
+
+        pos_bbox_preds = flatten_bbox_preds[pos_inds]
+        pos_centerness = flatten_centerness[pos_inds]
+        pos_bbox_targets = flatten_bbox_targets[pos_inds]
+        pos_centerness_targets = self.centerness_target(pos_bbox_targets)
+        # bbox loss: centerness-weighted, normalized by the weight sum
+        centerness_denorm = max(
+            reduce_mean(pos_centerness_targets.sum().detach()), 1e-6)
+
+        if len(pos_inds) > 0:
+            pos_points = flatten_points[pos_inds]
+            pos_decoded_bbox_preds = self.bbox_coder.decode(
+                pos_points, pos_bbox_preds)
+            pos_decoded_target_preds = self.bbox_coder.decode(
+                pos_points, pos_bbox_targets)
+            loss_bbox = self.loss_bbox(
+                pos_decoded_bbox_preds,
+                pos_decoded_target_preds,
+                weight=pos_centerness_targets,
+                avg_factor=centerness_denorm)
+            loss_centerness = self.loss_centerness(
+                pos_centerness, pos_centerness_targets, avg_factor=num_pos)
+        else:  # no positives: sums over empty selections give zero losses
+            loss_bbox = pos_bbox_preds.sum()
+            loss_centerness = pos_centerness.sum()
+
+        return dict(
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            loss_centerness=loss_centerness)
+
+    def get_targets(self, points, gt_bboxes_list, gt_labels_list):
+        """Compute regression and classification targets for points in
+        multiple images.
+
+        Args:
+            points (list[Tensor]): Points of each fpn level, each has shape
+                (num_points, 2).
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
+                each has shape (num_gt, 4).
+            gt_labels_list (list[Tensor]): Ground truth labels of each box,
+                each has shape (num_gt,).
+
+        Returns:
+            tuple:
+                concat_lvl_labels (list[Tensor]): Labels of each level.
+                concat_lvl_bbox_targets (list[Tensor]): BBox targets of each
+                    level, divided by the stride when ``norm_on_bbox`` is set.
+        """
+        assert len(points) == len(self.regress_ranges)
+        num_levels = len(points)
+        # the number of points per level (shared by every image)
+        num_points = [lvl_points.size(0) for lvl_points in points]
+
+        # give every point the regress range of its level, then concat all
+        # levels so that a whole image is handled in one call
+        concat_regress_ranges = torch.cat([
+            lvl_points.new_tensor(lvl_range)[None].expand_as(lvl_points)
+            for lvl_points, lvl_range in zip(points, self.regress_ranges)
+        ], dim=0)
+        concat_points = torch.cat(points, dim=0)
+
+        # get labels and bbox_targets of each image
+        labels_list, bbox_targets_list = multi_apply(
+            self._get_target_single,
+            gt_bboxes_list,
+            gt_labels_list,
+            points=concat_points,
+            regress_ranges=concat_regress_ranges,
+            num_points_per_lvl=num_points)
+
+        # split to per img, per level
+        labels_list = [labels.split(num_points, 0) for labels in labels_list]
+        bbox_targets_list = [
+            bbox_targets.split(num_points, 0)
+            for bbox_targets in bbox_targets_list
+        ]
+
+        # regroup: for each level, concat that level's slice of every image
+        concat_lvl_labels = [
+            torch.cat([img_labels[lvl] for img_labels in labels_list])
+            for lvl in range(num_levels)
+        ]
+        concat_lvl_bbox_targets = []
+        for lvl in range(num_levels):
+            lvl_targets = torch.cat(
+                [img_targets[lvl] for img_targets in bbox_targets_list])
+            if self.norm_on_bbox:
+                lvl_targets = lvl_targets / self.strides[lvl]
+            concat_lvl_bbox_targets.append(lvl_targets)
+        return concat_lvl_labels, concat_lvl_bbox_targets
+
+    def _get_target_single(self, gt_bboxes, gt_labels, points, regress_ranges,
+                           num_points_per_lvl):
+        """Compute regression and classification targets for a single image."""
+        num_points = points.size(0)
+        num_gts = gt_labels.size(0)
+        if num_gts == 0:  # no gt: every point is BG with zero bbox targets
+            return gt_labels.new_full((num_points,), self.num_classes), \
+                   gt_bboxes.new_zeros((num_points, 4))
+
+        areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * (
+            gt_bboxes[:, 3] - gt_bboxes[:, 1])
+        # `repeat` (a real copy) rather than `expand` (a view): `areas` is
+        # written in place below, which needs one memory cell per entry
+        areas = areas[None].repeat(num_points, 1)
+        regress_ranges = regress_ranges[:, None, :].expand(
+            num_points, num_gts, 2)
+        gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4)
+        xs, ys = points[:, 0], points[:, 1]
+        xs = xs[:, None].expand(num_points, num_gts)
+        ys = ys[:, None].expand(num_points, num_gts)
+
+        left = xs - gt_bboxes[..., 0]
+        right = gt_bboxes[..., 2] - xs
+        top = ys - gt_bboxes[..., 1]
+        bottom = gt_bboxes[..., 3] - ys
+        bbox_targets = torch.stack((left, top, right, bottom), -1)
+
+        if self.center_sampling:
+            # condition1: inside a `center bbox` (center +/- stride, clipped)
+            radius = self.center_sample_radius
+            center_xs = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) / 2
+            center_ys = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) / 2
+            center_gts = torch.zeros_like(gt_bboxes)
+            stride = center_xs.new_zeros(center_xs.shape)
+
+            # per-point sampling half-size: stride of the point's level * radius
+            lvl_begin = 0
+            for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl):
+                lvl_end = lvl_begin + num_points_lvl
+                stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius
+                lvl_begin = lvl_end
+
+            x_mins = center_xs - stride
+            y_mins = center_ys - stride
+            x_maxs = center_xs + stride
+            y_maxs = center_ys + stride
+            center_gts[..., 0] = torch.where(x_mins > gt_bboxes[..., 0],
+                                             x_mins, gt_bboxes[..., 0])
+            center_gts[..., 1] = torch.where(y_mins > gt_bboxes[..., 1],
+                                             y_mins, gt_bboxes[..., 1])
+            center_gts[..., 2] = torch.where(x_maxs > gt_bboxes[..., 2],
+                                             gt_bboxes[..., 2], x_maxs)
+            center_gts[..., 3] = torch.where(y_maxs > gt_bboxes[..., 3],
+                                             gt_bboxes[..., 3], y_maxs)
+
+            cb_dist_left = xs - center_gts[..., 0]
+            cb_dist_right = center_gts[..., 2] - xs
+            cb_dist_top = ys - center_gts[..., 1]
+            cb_dist_bottom = center_gts[..., 3] - ys
+            center_bbox = torch.stack(
+                (cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1)
+            inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0
+        else:
+            # condition1: inside a gt bbox
+            inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0
+
+        # condition2: limit the regression range for each location
+        max_regress_distance = bbox_targets.max(-1)[0]
+        inside_regress_range = (
+            (max_regress_distance >= regress_ranges[..., 0])
+            & (max_regress_distance <= regress_ranges[..., 1]))
+
+        # if there are still more than one objects for a location,
+        # we choose the one with minimal area
+        areas[inside_gt_bbox_mask == 0] = INF
+        areas[inside_regress_range == 0] = INF
+        min_area, min_area_inds = areas.min(dim=1)
+
+        labels = gt_labels[min_area_inds]
+        labels[min_area == INF] = self.num_classes  # set as BG
+        bbox_targets = bbox_targets[range(num_points), min_area_inds]
+
+        return labels, bbox_targets
+
+    def centerness_target(self, pos_bbox_targets):
+        """Compute centerness targets.
+
+        Args:
+            pos_bbox_targets (Tensor): BBox targets of positive bboxes in shape
+                (num_pos, 4)
+
+        Returns:
+            Tensor: Centerness target, one value per row of the input.
+        """
+        # columns 0/2 and 1/3 are paired (left/right and top/bottom)
+        horizontal = pos_bbox_targets[:, [0, 2]]
+        vertical = pos_bbox_targets[:, [1, 3]]
+        if len(horizontal) == 0:
+            # nothing to reduce over: return an empty tensor
+            return torch.sqrt(horizontal[..., 0])
+        # only positive points are passed in, otherwise there may be nan
+        lr_ratio = horizontal.min(dim=-1)[0] / horizontal.max(dim=-1)[0]
+        tb_ratio = vertical.min(dim=-1)[0] / vertical.max(dim=-1)[0]
+        return torch.sqrt(lr_ratio * tb_ratio)
+
+    def _get_points_single(self,
+                           featmap_size,
+                           stride,
+                           dtype,
+                           device,
+                           flatten=False):
+        """Get points according to feature map size.
+
+        This function will be deprecated soon. ``flatten`` is not used here.
+        """
+        warnings.warn(
+            '`_get_points_single` in `FCOSHead` will be '
+            'deprecated soon, we support a multi level point generator now, '
+            'you can get points of a single level feature map '
+            'with `self.prior_generator.single_level_grid_priors` ')
+
+        y, x = super()._get_points_single(featmap_size, stride, dtype, device)
+        points = torch.stack((x.reshape(-1) * stride, y.reshape(-1) * stride),
+                             dim=-1) + stride // 2
+        return points
diff --git a/mmdet/models/dense_heads/fovea_head.py b/mmdet/models/dense_heads/fovea_head.py
new file mode 100755
index 0000000..8be7fc9
--- /dev/null
+++ b/mmdet/models/dense_heads/fovea_head.py
@@ -0,0 +1,385 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.ops import DeformConv2d
+from mmcv.runner import BaseModule
+
+from mmdet.core import multi_apply
+from mmdet.core.utils import filter_scores_and_topk
+from ..builder import HEADS
+from .anchor_free_head import AnchorFreeHead
+
+INF = 1e8
+
+
+class FeatureAlign(BaseModule):
+    """Deformable-conv feature adaption driven by a 4-channel shape map."""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 deform_groups=4,
+                 init_cfg=dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.1,
+                     override=dict(
+                         type='Normal', name='conv_adaption', std=0.01))):
+        super().__init__(init_cfg)
+        # 1x1 conv from the 4-channel shape map to the deform-conv offsets
+        num_offsets = deform_groups * (kernel_size * kernel_size * 2)
+        self.conv_offset = nn.Conv2d(4, num_offsets, 1, bias=False)
+        self.conv_adaption = DeformConv2d(
+            in_channels, out_channels,
+            kernel_size=kernel_size,
+            padding=(kernel_size - 1) // 2,
+            deform_groups=deform_groups)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x, shape):
+        """Predict offsets from ``shape`` and apply deform conv + ReLU."""
+        offsets = self.conv_offset(shape)
+        return self.relu(self.conv_adaption(x, offsets))
+
+
+@HEADS.register_module()
+class FoveaHead(AnchorFreeHead):
+    """FoveaBox: Beyond Anchor-based Object Detector
+    https://arxiv.org/abs/1904.03797
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 base_edge_list=(16, 32, 64, 128, 256),
+                 scale_ranges=((8, 32), (16, 64), (32, 128), (64, 256),
+                               (128, 512)),
+                 sigma=0.4,
+                 with_deform=False,
+                 deform_groups=4,
+                 init_cfg=dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='conv_cls',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs):
+        # head-specific options, stored before the parent constructor runs
+        self.deform_groups = deform_groups
+        self.with_deform = with_deform
+        self.sigma = sigma
+        self.base_edge_list, self.scale_ranges = base_edge_list, scale_ranges
+        super().__init__(num_classes, in_channels, init_cfg=init_cfg, **kwargs)
+
+    def _init_layers(self):
+        """Build the box branch and the (optionally deformable) cls branch."""
+        # box branch
+        super()._init_reg_convs()
+        self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
+
+        # cls branch without deformable feature adaption
+        if not self.with_deform:
+            super()._init_cls_convs()
+            self.conv_cls = nn.Conv2d(
+                self.feat_channels, self.cls_out_channels, 3, padding=1)
+            return
+
+        # cls branch with deformable feature adaption: two convs that widen
+        # to 4x ``feat_channels``
+        wide_channels = self.feat_channels * 4
+        shared_cfg = dict(
+            stride=1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            bias=self.norm_cfg is None)
+        self.cls_convs = nn.ModuleList()
+        self.cls_convs.append(
+            ConvModule(
+                self.feat_channels, wide_channels, 3, padding=1, **shared_cfg))
+        self.cls_convs.append(
+            ConvModule(wide_channels, wide_channels, 1, padding=0,
+                       **shared_cfg))
+        # applied to the cls input ahead of ``cls_convs`` in ``forward_single``
+        self.feature_adaption = FeatureAlign(
+            self.feat_channels,
+            self.feat_channels,
+            kernel_size=3,
+            deform_groups=self.deform_groups)
+        self.conv_cls = nn.Conv2d(
+            int(wide_channels),
+            self.cls_out_channels,
+            3,
+            padding=1)
+
+    def forward_single(self, x):
+        """Return ``(cls_score, bbox_pred)`` for one feature level."""
+        reg_feat = x
+        for conv in self.reg_convs:
+            reg_feat = conv(reg_feat)
+        bbox_pred = self.conv_reg(reg_feat)
+        cls_feat = x
+        if self.with_deform:
+            cls_feat = self.feature_adaption(x, bbox_pred.exp())
+        for conv in self.cls_convs:
+            cls_feat = conv(cls_feat)
+        return self.conv_cls(cls_feat), bbox_pred
+
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bbox_list,
+             gt_label_list,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute the classification and bbox regression losses.
+
+        ``img_metas`` and ``gt_bboxes_ignore`` are accepted but not read here.
+        """
+        assert len(cls_scores) == len(bbox_preds)
+
+        featmap_sizes = [score.size()[-2:] for score in cls_scores]
+        ref_pred = bbox_preds[0]
+        points = self.prior_generator.grid_priors(
+            featmap_sizes, dtype=ref_pred.dtype, device=ref_pred.device)
+        num_imgs = cls_scores[0].size(0)
+
+        # per level: move dim 1 last and flatten; then concat all levels
+        flatten_cls_scores = torch.cat([
+            score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
+            for score in cls_scores
+        ])
+        flatten_bbox_preds = torch.cat([
+            pred.permute(0, 2, 3, 1).reshape(-1, 4) for pred in bbox_preds
+        ])
+        flatten_labels, flatten_bbox_targets = self.get_targets(
+            gt_bbox_list, gt_label_list, featmap_sizes, points)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        is_fg = (flatten_labels >= 0) & (flatten_labels < self.num_classes)
+        pos_inds = is_fg.nonzero().view(-1)
+        num_pos = len(pos_inds)
+
+        loss_cls = self.loss_cls(
+            flatten_cls_scores, flatten_labels, avg_factor=num_pos + num_imgs)
+        if num_pos == 0:
+            # no positive location: a constant zero, detached from the preds
+            loss_bbox = torch.tensor(
+                0,
+                dtype=flatten_bbox_preds.dtype,
+                device=flatten_bbox_preds.device)
+            return dict(loss_cls=loss_cls, loss_bbox=loss_bbox)
+
+        pos_bbox_preds = flatten_bbox_preds[pos_inds]
+        pos_bbox_targets = flatten_bbox_targets[pos_inds]
+        pos_weights = pos_bbox_targets.new_zeros(pos_bbox_targets.size()) + 1.0
+        loss_bbox = self.loss_bbox(
+            pos_bbox_preds, pos_bbox_targets, pos_weights, avg_factor=num_pos)
+        return dict(loss_cls=loss_cls, loss_bbox=loss_bbox)
+
+    def get_targets(self, gt_bbox_list, gt_label_list, featmap_sizes, points):
+        """Build flattened label and box targets for the whole batch.
+
+        ``_get_target_single`` is dispatched through ``multi_apply`` over
+        the ground truth of each image; its per-level outputs are
+        concatenated over images within a level, then over levels, into a
+        1-D label tensor and an (N, 4) box-target tensor.
+        """
+        per_img_labels, per_img_bbox_targets = multi_apply(
+            self._get_target_single,
+            gt_bbox_list,
+            gt_label_list,
+            featmap_size_list=featmap_sizes,
+            point_list=points)
+        labels_per_level = [
+            torch.cat([lbl.flatten() for lbl in level_labels])
+            for level_labels in zip(*per_img_labels)]
+        bbox_targets_per_level = [
+            torch.cat([tgt.reshape(-1, 4) for tgt in level_targets])
+            for level_targets in zip(*per_img_bbox_targets)]
+        return torch.cat(labels_per_level), torch.cat(bbox_targets_per_level)
+
+    def _get_target_single(self,
+                           gt_bboxes_raw,
+                           gt_labels_raw,
+                           featmap_size_list=None,
+                           point_list=None):
+        """Compute per-level label and log-space box targets for one image.
+
+        Returns ``(label_list, bbox_target_list)`` with one entry per level;
+        a label equal to ``self.num_classes`` marks background.
+        """
+        gt_areas = torch.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) *
+                              (gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1]))
+        label_list = []
+        bbox_target_list = []
+        # for each pyramid, find the cls and box target
+        for base_len, (lower_bound, upper_bound), stride, featmap_size, \
+            points in zip(self.base_edge_list, self.scale_ranges,
+                          self.strides, featmap_size_list, point_list):
+            # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+            points = points.view(*featmap_size, 2)
+            x, y = points[..., 0], points[..., 1]
+            labels = gt_labels_raw.new_zeros(featmap_size) + self.num_classes
+            # new_ones instead of the uninitialised ``new(...) + 1``: cells
+            # no box writes to get a deterministic target of log(1) = 0.
+            bbox_targets = gt_bboxes_raw.new_ones((*featmap_size, 4))
+            # scale assignment
+            hit_indices = ((gt_areas >= lower_bound) &
+                           (gt_areas <= upper_bound)).nonzero().flatten()
+            if len(hit_indices) == 0:
+                label_list.append(labels)
+                bbox_target_list.append(torch.log(bbox_targets))
+                continue
+            _, hit_index_order = torch.sort(-gt_areas[hit_indices])
+            hit_indices = hit_indices[hit_index_order]
+            gt_bboxes = gt_bboxes_raw[hit_indices, :] / stride
+            gt_labels = gt_labels_raw[hit_indices]
+            half_w = 0.5 * (gt_bboxes[:, 2] - gt_bboxes[:, 0])
+            half_h = 0.5 * (gt_bboxes[:, 3] - gt_bboxes[:, 1])
+            # valid fovea area: left, right, top, down
+            w_max, h_max = featmap_size[1] - 1, featmap_size[0] - 1
+            inner, outer = 1 - self.sigma, 1 + self.sigma
+            pos_left = torch.ceil(
+                gt_bboxes[:, 0] + inner * half_w - 0.5).long().clamp(0, w_max)
+            pos_right = torch.floor(
+                gt_bboxes[:, 0] + outer * half_w - 0.5).long().clamp(0, w_max)
+            pos_top = torch.ceil(
+                gt_bboxes[:, 1] + inner * half_h - 0.5).long().clamp(0, h_max)
+            pos_down = torch.floor(
+                gt_bboxes[:, 1] + outer * half_h - 0.5).long().clamp(0, h_max)
+            for px1, py1, px2, py2, label, (gt_x1, gt_y1, gt_x2, gt_y2) in \
+                    zip(pos_left, pos_top, pos_right, pos_down, gt_labels,
+                        gt_bboxes_raw[hit_indices, :]):
+                ys, xs = slice(py1, py2 + 1), slice(px1, px2 + 1)
+                labels[ys, xs] = label
+                bbox_targets[ys, xs, 0] = (x[ys, xs] - gt_x1) / base_len
+                bbox_targets[ys, xs, 1] = (y[ys, xs] - gt_y1) / base_len
+                bbox_targets[ys, xs, 2] = (gt_x2 - x[ys, xs]) / base_len
+                bbox_targets[ys, xs, 3] = (gt_y2 - y[ys, xs]) / base_len
+            bbox_targets = bbox_targets.clamp(min=1. / 16, max=16.)
+            label_list.append(labels)
+            bbox_target_list.append(torch.log(bbox_targets))
+        return label_list, bbox_target_list
+
+    # Same as base_dense_head/_get_bboxes_single except self._bbox_decode
+    def _get_bboxes_single(self,
+                           cls_score_list,
+                           bbox_pred_list,
+                           score_factor_list,
+                           mlvl_priors,
+                           img_meta,
+                           cfg,
+                           rescale=False,
+                           with_nms=True,
+                           **kwargs):
+        """Turn the raw head outputs of one image into detections.
+
+        Each level goes through ``filter_scores_and_topk``, the retained
+        entries are decoded with ``self._bbox_decode``, and the per-level
+        results are handed to ``self._bbox_post_process``.
+
+        Args:
+            cls_score_list (list[Tensor]): Class scores of one image, one
+                tensor per level, each of shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box predictions of one image,
+                one tensor per level, each of shape (num_priors * 4, H, W).
+            score_factor_list (list[Tensor]): Present for interface
+                compatibility; this method never reads it.
+            mlvl_priors (list[Tensor]): Priors of every level, each of
+                shape (num_priors, 2).
+            img_meta (dict): Image meta info; the keys ``img_shape`` and
+                ``scale_factor`` are read.
+            cfg (mmcv.Config): Test / postprocessing configuration;
+                ``self.test_cfg`` is used when it is None.
+            rescale (bool): Forwarded to ``self._bbox_post_process``; the
+                original docs describe True as "return boxes in original
+                image space". Default: False.
+            with_nms (bool): Forwarded to ``self._bbox_post_process``; the
+                original docs describe True as "do nms before return
+                boxes". Default: True.
+            **kwargs: Accepted and ignored.
+
+        Returns:
+            tuple[Tensor]: The result of ``self._bbox_post_process``. For
+                ``with_nms=True`` the original docs describe it as
+
+                - det_bboxes (Tensor): Predicted bboxes with shape
+                  [num_bboxes, 5], where the first 4 columns are bounding
+                  box positions (tl_x, tl_y, br_x, br_y) and the 5-th
+                  column are scores between 0 and 1.
+                - det_labels (Tensor): Predicted labels of the
+                  corresponding box with shape [num_bboxes].
+        """
+        if cfg is None:
+            cfg = self.test_cfg
+        assert len(cls_score_list) == len(bbox_pred_list)
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bboxes, mlvl_scores, mlvl_labels = [], [], []
+        # ``stride`` is not used in the loop body, but ``self.strides``
+        # still takes part in the zip and so can bound the level count.
+        levels = zip(cls_score_list, bbox_pred_list, self.strides,
+                     self.base_edge_list, mlvl_priors)
+        for cls_score, bbox_pred, stride, base_len, priors in levels:
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            # Channels-last, then one row per prediction.
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+            scores = cls_score.permute(1, 2, 0).reshape(
+                -1, self.cls_out_channels).sigmoid()
+
+            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
+            # this operation keeps fewer bboxes under the same `nms_pre`.
+            # There is no difference in performance for most models. If you
+            # find a slight drop in performance, you can set a larger
+            # `nms_pre` than before.
+            scores, labels, _, kept = filter_scores_and_topk(
+                scores, cfg.score_thr, nms_pre,
+                dict(bbox_pred=bbox_pred, priors=priors))
+            # Only the retained candidates are decoded.
+            mlvl_bboxes.append(
+                self._bbox_decode(kept['priors'], kept['bbox_pred'],
+                                  base_len, img_shape))
+            mlvl_scores.append(scores)
+            mlvl_labels.append(labels)
+
+        return self._bbox_post_process(mlvl_scores, mlvl_labels, mlvl_bboxes,
+                                       img_meta['scale_factor'], cfg, rescale,
+                                       with_nms)
+
+    def _bbox_decode(self, priors, bbox_pred, base_len, max_shape):
+        """Decode log-space side distances around ``priors`` into boxes.
+
+        ``exp(bbox_pred) * base_len`` gives the left / top / right / bottom
+        offsets from each prior ``(x, y)``; x is clamped to
+        ``[0, max_shape[1] - 1]`` and y to ``[0, max_shape[0] - 1]``.
+        """
+        offsets = base_len * bbox_pred.exp()
+        ctr_x, ctr_y = priors[:, 0], priors[:, 1]
+        x_max, y_max = max_shape[1] - 1, max_shape[0] - 1
+        x1 = (ctr_x - offsets[:, 0]).clamp(min=0, max=x_max)
+        y1 = (ctr_y - offsets[:, 1]).clamp(min=0, max=y_max)
+        x2 = (ctr_x + offsets[:, 2]).clamp(min=0, max=x_max)
+        y2 = (ctr_y + offsets[:, 3]).clamp(min=0, max=y_max)
+        return torch.stack([x1, y1, x2, y2], -1)
+
+    def _get_points_single(self, *args, **kwargs):
+        """Return the parent's ``(y, x)`` points shifted by 0.5 (deprecated).
+
+        Warns that ``self.prior_generator`` should be used instead.
+        """
+        warnings.warn(
+            '`_get_points_single` in `FoveaHead` will be '
+            'deprecated soon, we support a multi level point generator now, '
+            'you can get points of a single level feature map '
+            'with `self.prior_generator.single_level_grid_priors` ')
+        y, x = super()._get_points_single(*args, **kwargs)
+        return y + 0.5, x + 0.5
diff --git a/mmdet/models/dense_heads/free_anchor_retina_head.py b/mmdet/models/dense_heads/free_anchor_retina_head.py
new file mode 100755
index 0000000..3acd25e
--- /dev/null
+++ b/mmdet/models/dense_heads/free_anchor_retina_head.py
@@ -0,0 +1,272 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn.functional as F
+
+from mmdet.core import bbox_overlaps
+from ..builder import HEADS
+from .retina_head import RetinaHead
+
+EPS = 1e-12
+
+
+@HEADS.register_module()
+class FreeAnchorRetinaHead(RetinaHead):
+    """FreeAnchor RetinaHead used in https://arxiv.org/abs/1909.02466.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        stacked_convs (int): Number of conv layers in cls and reg tower.
+            Default: 4.
+        conv_cfg (dict): dictionary to construct and config conv layer.
+            Default: None.
+        norm_cfg (dict): dictionary to construct and config norm layer,
+            e.g. dict(type='GN', num_groups=32, requires_grad=True).
+            Default: None.
+        pre_anchor_topk (int): Number of anchors taken into each bag.
+        bbox_thr (float): The threshold of the saturated linear function. It is
+            usually the same with the IoU threshold used in NMS.
+        gamma (float): Gamma parameter in focal loss.
+        alpha (float): Alpha parameter in focal loss.
+    """  # noqa: W605
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 stacked_convs=4,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 pre_anchor_topk=50,
+                 bbox_thr=0.6,
+                 gamma=2.0,
+                 alpha=0.5,
+                 **kwargs):
+        # Head-construction arguments are forwarded unchanged to RetinaHead.
+        super().__init__(num_classes, in_channels, stacked_convs, conv_cfg,
+                         norm_cfg, **kwargs)
+
+        self.pre_anchor_topk = pre_anchor_topk
+        self.bbox_thr = bbox_thr
+        self.gamma = gamma
+        self.alpha = alpha
+
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            gt_bboxes (list[Tensor]): each item are the truth boxes for each
+                image in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+        device = cls_scores[0].device
+        anchor_list, _ = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        anchors = [torch.cat(anchor) for anchor in anchor_list]
+
+        # concatenate each level
+        cls_scores = [
+            cls.permute(0, 2, 3,
+                        1).reshape(cls.size(0), -1, self.cls_out_channels)
+            for cls in cls_scores
+        ]
+        bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(bbox_pred.size(0), -1, 4)
+            for bbox_pred in bbox_preds
+        ]
+        cls_scores = torch.cat(cls_scores, dim=1)
+        bbox_preds = torch.cat(bbox_preds, dim=1)
+
+        cls_prob = torch.sigmoid(cls_scores)
+        box_prob = []
+        num_pos = 0
+        positive_losses = []
+        # One iteration per image of the batch.
+        for anchors_, gt_labels_, gt_bboxes_, cls_prob_, bbox_preds_ in zip(
+                anchors, gt_labels, gt_bboxes, cls_prob, bbox_preds):
+
+            with torch.no_grad():
+                if len(gt_bboxes_) == 0:
+                    image_box_prob = torch.zeros(
+                        anchors_.size(0),
+                        self.cls_out_channels).type_as(bbox_preds_)
+                else:
+                    # box_localization: a_{j}^{loc}, shape: [j, 4]
+                    pred_boxes = self.bbox_coder.decode(anchors_, bbox_preds_)
+
+                    # object_box_iou: IoU_{ij}^{loc}, shape: [i, j]
+                    object_box_iou = bbox_overlaps(gt_bboxes_, pred_boxes)
+
+                    # object_box_prob: P{a_{j} -> b_{i}}, shape: [i, j]
+                    t1 = self.bbox_thr
+                    t2 = object_box_iou.max(
+                        dim=1, keepdim=True).values.clamp(min=t1 + EPS)
+                    object_box_prob = ((object_box_iou - t1) /
+                                       (t2 - t1)).clamp(
+                                           min=0, max=1)
+
+                    # object_cls_box_prob: P{a_{j} -> b_{i}}, shape: [i, c, j]
+                    num_obj = gt_labels_.size(0)
+                    indices = torch.stack([
+                        torch.arange(num_obj).type_as(gt_labels_), gt_labels_
+                    ],
+                                          dim=0)
+                    object_cls_box_prob = torch.sparse_coo_tensor(
+                        indices, object_box_prob)
+
+                    # image_box_iou: P{a_{j} \in A_{+}}, shape: [c, j]
+                    # The code between "start" and "end" implements
+                    #
+                    #     image_box_iou = torch.sparse.max(
+                    #         object_cls_box_prob, dim=0).t()
+                    #
+                    # written out by hand with dense operations.
+                    # start
+                    box_cls_prob = torch.sparse.sum(
+                        object_cls_box_prob, dim=0).to_dense()
+
+                    indices = torch.nonzero(box_cls_prob, as_tuple=False).t_()
+                    if indices.numel() == 0:
+                        image_box_prob = torch.zeros(
+                            anchors_.size(0),
+                            self.cls_out_channels).type_as(object_box_prob)
+                    else:
+                        nonzero_box_prob = torch.where(
+                            (gt_labels_.unsqueeze(dim=-1) == indices[0]),
+                            object_box_prob[:, indices[1]],
+                            torch.tensor([
+                                0
+                            ]).type_as(object_box_prob)).max(dim=0).values
+
+                        # upmap to shape [j, c]
+                        image_box_prob = torch.sparse_coo_tensor(
+                            indices.flip([0]),
+                            nonzero_box_prob,
+                            size=(anchors_.size(0),
+                                  self.cls_out_channels)).to_dense()
+                    # end
+
+                box_prob.append(image_box_prob)
+
+            # construct bags for objects
+            match_quality_matrix = bbox_overlaps(gt_bboxes_, anchors_)
+            _, matched = torch.topk(
+                match_quality_matrix,
+                self.pre_anchor_topk,
+                dim=1,
+                sorted=False)
+            del match_quality_matrix
+
+            # matched_cls_prob: P_{ij}^{cls}
+            matched_cls_prob = torch.gather(
+                cls_prob_[matched], 2,
+                gt_labels_.view(-1, 1, 1).repeat(1, self.pre_anchor_topk,
+                                                 1)).squeeze(2)
+
+            # matched_box_prob: P_{ij}^{loc}
+            matched_anchors = anchors_[matched]
+            matched_object_targets = self.bbox_coder.encode(
+                matched_anchors,
+                gt_bboxes_.unsqueeze(dim=1).expand_as(matched_anchors))
+            loss_bbox = self.loss_bbox(
+                bbox_preds_[matched],
+                matched_object_targets,
+                reduction_override='none').sum(-1)
+            matched_box_prob = torch.exp(-loss_bbox)
+
+            # positive_losses: {-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )}
+            num_pos += len(gt_bboxes_)
+            positive_losses.append(
+                self.positive_bag_loss(matched_cls_prob, matched_box_prob))
+        positive_loss = torch.cat(positive_losses).sum() / max(1, num_pos)
+
+        # box_prob: P{a_{j} \in A_{+}}
+        box_prob = torch.stack(box_prob, dim=0)
+
+        # negative_loss:
+        # \sum_{j}{ FL((1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg})) } / n||B||
+        negative_loss = self.negative_bag_loss(cls_prob, box_prob).sum() / max(
+            1, num_pos * self.pre_anchor_topk)
+
+        # avoid the absence of gradients in regression subnet
+        # when no ground-truth in a batch
+        if num_pos == 0:
+            positive_loss = bbox_preds.sum() * 0
+
+        losses = {
+            'positive_bag_loss': positive_loss,
+            'negative_bag_loss': negative_loss
+        }
+        return losses
+
+    def positive_bag_loss(self, matched_cls_prob, matched_box_prob):
+        """Compute positive bag loss.
+
+        :math:`-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )`.
+
+        :math:`P_{ij}^{cls}`: matched_cls_prob, classification probability of matched samples.
+
+        :math:`P_{ij}^{loc}`: matched_box_prob, box probability of matched samples.
+
+        Args:
+            matched_cls_prob (Tensor): Classification probability of matched
+                samples in shape (num_gt, pre_anchor_topk).
+            matched_box_prob (Tensor): BBox probability of matched samples,
+                in shape (num_gt, pre_anchor_topk).
+
+        Returns:
+            Tensor: Positive bag loss in shape (num_gt,).
+        """  # noqa: E501, W605
+        # bag_prob = Mean-max(matched_prob)
+        matched_prob = matched_cls_prob * matched_box_prob
+        weight = 1 / torch.clamp(1 - matched_prob, EPS, None)
+        weight /= weight.sum(dim=1).unsqueeze(dim=-1)
+        bag_prob = (weight * matched_prob).sum(dim=1)
+        # positive_bag_loss = -self.alpha * log(bag_prob)
+        return self.alpha * F.binary_cross_entropy(
+            bag_prob, torch.ones_like(bag_prob), reduction='none')
+
+    def negative_bag_loss(self, cls_prob, box_prob):
+        r"""Compute negative bag loss.
+
+        :math:`FL((1 - P_{a_{j} \in A_{+}}) * (1 - P_{j}^{bg}))`.
+
+        :math:`P_{a_{j} \in A_{+}}`: Box_probability of matched samples.
+
+        :math:`P_{j}^{bg}`: Classification probability of negative samples.
+
+        Args:
+            cls_prob (Tensor): Classification probability, in shape
+                (num_img, num_anchors, num_classes).
+            box_prob (Tensor): Box probability, in shape
+                (num_img, num_anchors, num_classes).
+
+        Returns:
+            Tensor: Negative bag loss in shape (num_img, num_anchors, num_classes).
+        """  # noqa: E501, W605
+        prob = cls_prob * (1 - box_prob)
+        # There are some cases when neg_prob = 0.
+        # This will cause the neg_prob.log() to be inf without clamp.
+        prob = prob.clamp(min=EPS, max=1 - EPS)
+        negative_bag_loss = prob**self.gamma * F.binary_cross_entropy(
+            prob, torch.zeros_like(prob), reduction='none')
+        return (1 - self.alpha) * negative_bag_loss
diff --git a/mmdet/models/dense_heads/fsaf_head.py b/mmdet/models/dense_heads/fsaf_head.py
new file mode 100755
index 0000000..2d2b787
--- /dev/null
+++ b/mmdet/models/dense_heads/fsaf_head.py
@@ -0,0 +1,433 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from mmcv.runner import force_fp32
+
+from mmdet.core import (anchor_inside_flags, images_to_levels, multi_apply,
+                        unmap)
+from ..builder import HEADS
+from ..losses.accuracy import accuracy
+from ..losses.utils import weight_reduce_loss
+from .retina_head import RetinaHead
+
+
+@HEADS.register_module()
+class FSAFHead(RetinaHead):
+    """Anchor-free head used in `FSAF <https://arxiv.org/abs/1903.00621>`_.
+
+    The head contains two subnetworks. The first classifies anchor boxes and
+    the second regresses deltas for the anchors (num_anchors is 1 for anchor-
+    free methods)
+
+    Args:
+        *args: Same as its base class in :class:`RetinaHead`
+        score_threshold (float, optional): The score_threshold to calculate
+            positive recall. If given, prediction scores lower than this value
+            is counted as incorrect prediction. Default to None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+        **kwargs: Same as its base class in :class:`RetinaHead`
+
+    Example:
+        >>> import torch
+        >>> self = FSAFHead(11, 7)
+        >>> x = torch.rand(1, 7, 32, 32)
+        >>> cls_score, bbox_pred = self.forward_single(x)
+        >>> # Each anchor predicts a score for each class except background
+        >>> cls_per_anchor = cls_score.shape[1] / self.num_anchors
+        >>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors
+        >>> assert cls_per_anchor == self.num_classes
+        >>> assert box_per_anchor == 4
+    """
+
+    def __init__(self, *args, score_threshold=None, init_cfg=None, **kwargs):
+        """Initialise the head, supplying a default ``init_cfg`` if needed.
+
+        The default ``init_cfg`` gives the ``retina_reg`` conv a positive
+        bias (0.25); the intent is to prevent predicted boxes with 0 area.
+        ``score_threshold`` is only stored on the instance here.
+        """
+        if init_cfg is None:
+            cls_override = dict(
+                type='Normal', name='retina_cls', std=0.01, bias_prob=0.01)
+            reg_override = dict(
+                type='Normal', name='retina_reg', std=0.01, bias=0.25)
+            init_cfg = dict(
+                type='Normal',
+                layer='Conv2d',
+                std=0.01,
+                override=[cls_override, reg_override])
+        super().__init__(*args, init_cfg=init_cfg, **kwargs)
+        self.score_threshold = score_threshold
+
+    def forward_single(self, x):
+        """Run the parent head on one level and rectify the box output.
+
+        Args:
+            x (Tensor): Feature map of a single scale level.
+
+        Returns:
+            tuple (Tensor):
+                cls_score (Tensor): Class scores, passed through unchanged
+                    from the parent's ``forward_single``.
+                bbox_pred (Tensor): The parent's box prediction after
+                    ``self.relu`` (original note: the TBLR encoder only
+                    accepts positive bbox_pred).
+        """
+        scores, deltas = super().forward_single(x)
+        return scores, self.relu(deltas)
+
+    def _get_targets_single(self,
+                            flat_anchors,
+                            valid_flags,
+                            gt_bboxes,
+                            gt_bboxes_ignore,
+                            gt_labels,
+                            img_meta,
+                            label_channels=1,
+                            unmap_outputs=True):
+        """Compute regression and classification targets for anchors in a
+        single image.
+
+        Same as the base class :obj:`AnchorHead`, except that it also
+        returns ``pos_gt_inds``: the matched gt index of every anchor
+        (0 to num_gt-1), or -1 for anchors not matched to any gt.
+
+        Returns:
+            tuple: ``(labels, label_weights, bbox_targets, bbox_weights,
+            pos_inds, neg_inds, sampling_result, pos_gt_inds)``, or eight
+            ``None`` values when ``inside_flags`` is all False.
+        """
+        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                           img_meta['img_shape'][:2],
+                                           self.train_cfg.allowed_border)
+        if not inside_flags.any():
+            # One None per entry of the regular 8-tuple returned below.
+            return (None, ) * 8
+        # Assign gt and sample anchors
+        anchors = flat_anchors[inside_flags.type(torch.bool), :]
+        assign_result = self.assigner.assign(
+            anchors, gt_bboxes, gt_bboxes_ignore,
+            None if self.sampling else gt_labels)
+        sampling_result = self.sampler.sample(assign_result, anchors,
+                                              gt_bboxes)
+
+        num_valid_anchors = anchors.shape[0]
+        bbox_targets = torch.zeros_like(anchors)
+        bbox_weights = torch.zeros_like(anchors)
+        labels = anchors.new_full((num_valid_anchors, ), self.num_classes,
+                                  dtype=torch.long)
+        label_weights = anchors.new_zeros((num_valid_anchors, label_channels),
+                                          dtype=torch.float)
+        pos_gt_inds = anchors.new_full((num_valid_anchors, ), -1,
+                                       dtype=torch.long)
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        if len(pos_inds) > 0:
+            if not self.reg_decoded_bbox:
+                pos_bbox_targets = self.bbox_coder.encode(
+                    sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
+            else:
+                # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
+                # is applied directly on the decoded bounding boxes, both
+                # the predicted boxes and regression targets should be with
+                # absolute coordinate format.
+                pos_bbox_targets = sampling_result.pos_gt_bboxes
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1.0
+            # The assigned gt_index for each anchor. (0-based)
+            pos_gt_inds[pos_inds] = sampling_result.pos_assigned_gt_inds
+            if gt_labels is None:
+                # Only rpn gives gt_labels as None
+                # Foreground is the first class
+                labels[pos_inds] = 0
+            else:
+                labels[pos_inds] = gt_labels[
+                    sampling_result.pos_assigned_gt_inds]
+            if self.train_cfg.pos_weight <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg.pos_weight
+
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # shadowed_labels is a tensor composed of tuples
+        #  (anchor_inds, class_label) that indicate those anchors lying in the
+        #  outer region of a gt or overlapped by another gt with a smaller
+        #  area.
+        #
+        # Therefore, only the shadowed labels are ignored for loss calculation.
+        # the key `shadowed_labels` is defined in :obj:`CenterRegionAssigner`
+        shadowed_labels = assign_result.get_extra_property('shadowed_labels')
+        if shadowed_labels is not None and shadowed_labels.numel():
+            if len(shadowed_labels.shape) == 2:
+                idx_, label_ = shadowed_labels[:, 0], shadowed_labels[:, 1]
+                assert (labels[idx_] != label_).all(), \
+                    'One label cannot be both positive and ignored'
+                label_weights[idx_, label_] = 0
+            else:
+                label_weights[shadowed_labels] = 0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_anchors.size(0)
+            labels = unmap(labels, num_total_anchors, inside_flags)
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+            bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
+            bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
+            pos_gt_inds = unmap(
+                pos_gt_inds, num_total_anchors, inside_flags, fill=-1)
+
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds, sampling_result, pos_gt_inds)
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute the head loss, keeping only each gt's best FPN level.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level.
+                Has shape (N, num_points * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_points * 4, H, W). The entries of
+                this list are replaced by their clamped versions.
+            gt_bboxes (list[Tensor]): Ground-truth boxes of each image in
+                [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box.
+            img_metas (list[dict]): Meta information of each image.
+            gt_bboxes_ignore (None | list[Tensor]): Boxes to be ignored.
+
+        Returns:
+            dict | None: ``loss_cls``, ``loss_bbox``, ``num_pos`` and
+                ``pos_recall``; None when ``get_targets`` returns None.
+        """
+        for i in range(len(bbox_preds)):  # loop over fpn level
+            # avoid 0 area of the predicted bbox
+            bbox_preds[i] = bbox_preds[i].clamp(min=1e-4)
+        # TODO: It may directly use the base-class loss function.
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+        batch_size = len(gt_bboxes)
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels)
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg,
+         pos_assigned_gt_inds_list) = cls_reg_targets
+
+        num_gts = np.array(list(map(len, gt_labels)))
+        num_total_samples = (
+            num_total_pos + num_total_neg if self.sampling else num_total_pos)
+        # number of anchors at each fpn level (taken from the first image)
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        # concat all level anchors and flags to a single tensor
+        concat_anchor_list = []
+        for i in range(len(anchor_list)):
+            concat_anchor_list.append(torch.cat(anchor_list[i]))
+        all_anchor_list = images_to_levels(concat_anchor_list,
+                                           num_level_anchors)
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_single,
+            cls_scores,
+            bbox_preds,
+            all_anchor_list,
+            labels_list,
+            label_weights_list,
+            bbox_targets_list,
+            bbox_weights_list,
+            num_total_samples=num_total_samples)
+
+        # `pos_assigned_gt_inds_list` (length: fpn_levels) stores the assigned
+        # gt index of each anchor bbox in each fpn level.
+        cum_num_gts = list(np.cumsum(num_gts))  # length of batch_size
+        for i, assign in enumerate(pos_assigned_gt_inds_list):
+            # loop over fpn levels
+            for j in range(1, batch_size):
+                # loop over batch size
+                # Convert gt indices in each img to those in the batch
+                assign[j][assign[j] >= 0] += int(cum_num_gts[j - 1])
+            pos_assigned_gt_inds_list[i] = assign.flatten()
+            labels_list[i] = labels_list[i].flatten()
+        num_gts = sum(map(len, gt_labels))  # total number of gt in the batch
+        # The unique label index of each gt in the batch
+        label_sequence = torch.arange(num_gts, device=device)
+        # Collect the average loss of each gt in each level
+        with torch.no_grad():
+            loss_levels, = multi_apply(
+                self.collect_loss_level_single,
+                losses_cls,
+                losses_bbox,
+                pos_assigned_gt_inds_list,
+                labels_seq=label_sequence)
+            # Shape: (fpn_levels, num_gts). Loss of each gt at each fpn level
+            loss_levels = torch.stack(loss_levels, dim=0)
+            # Locate the best fpn level for loss back-propagation
+            if loss_levels.numel() == 0:  # zero gt
+                argmin = loss_levels.new_empty((num_gts, ), dtype=torch.long)
+            else:
+                _, argmin = loss_levels.min(dim=0)
+
+        # Reweight the loss of each (anchor, label) pair, so that only those
+        #  at the best gt level are back-propagated.
+        losses_cls, losses_bbox, pos_inds = multi_apply(
+            self.reweight_loss_single,
+            losses_cls,
+            losses_bbox,
+            pos_assigned_gt_inds_list,
+            labels_list,
+            list(range(len(losses_cls))),
+            min_levels=argmin)
+        num_pos = torch.cat(pos_inds, 0).sum().float()
+        pos_recall = self.calculate_pos_recall(cls_scores, labels_list,
+                                               pos_inds)
+
+        if num_pos == 0:  # No gt
+            avg_factor = num_pos + float(num_total_neg)
+        else:
+            avg_factor = num_pos
+        for i in range(len(losses_cls)):
+            losses_cls[i] /= avg_factor
+            losses_bbox[i] /= avg_factor
+        return dict(
+            loss_cls=losses_cls,
+            loss_bbox=losses_bbox,
+            num_pos=num_pos / batch_size,
+            pos_recall=pos_recall)
+
+    def calculate_pos_recall(self, cls_scores, labels_list, pos_inds):
+        """Measure how many positive anchors are recalled above a threshold.
+
+        Args:
+            cls_scores (list[Tensor]): Per-level classification maps, each
+                of shape (N, num_classes * num_anchors, H, W)
+            labels_list (list[Tensor]): Per-level labels assigned to the
+                anchors. Shape (N * H * W * num_anchors, )
+            pos_inds (list[Tensor]): Per-level bool masks selecting the
+                anchors that are assigned to a positive label.
+                Shape (N * H * W * num_anchors, )
+
+        Returns:
+            Tensor: A single float number indicating the positive recall.
+        """
+        with torch.no_grad():
+            num_classes = self.num_classes
+            pos_scores = []
+            for level_cls, level_pos in zip(cls_scores, pos_inds):
+                # move channels last, then flatten to rows of class scores
+                flat = level_cls.permute(0, 2, 3, 1).reshape(-1, num_classes)
+                pos_scores.append(flat[level_pos])
+            pos_labels = []
+            for level_label, level_pos in zip(labels_list, pos_inds):
+                pos_labels.append(level_label.reshape(-1)[level_pos])
+            pos_scores = torch.cat(pos_scores, dim=0)
+            pos_labels = torch.cat(pos_labels, dim=0)
+            # Convert the logits to probabilities before thresholding.
+            if self.use_sigmoid_cls:
+                probs = pos_scores.sigmoid()
+            else:
+                probs = pos_scores.softmax(dim=1)
+            return accuracy(probs, pos_labels, thresh=self.score_threshold)
+
+    def collect_loss_level_single(self, cls_loss, reg_loss, assigned_gt_inds,
+                                  labels_seq):
+        """Average the per-anchor loss of every gt at one FPN level.
+
+        Args:
+            cls_loss (Tensor): Classification loss of each feature map pixel,
+              shape (num_anchor, num_class)
+            reg_loss (Tensor): Regression loss of each feature map pixel,
+              shape (num_anchor, 4)
+            assigned_gt_inds (Tensor): Index of the gt each prior is
+              assigned to (0-based, -1: no assignment). shape (num_anchor)
+            labels_seq (Tensor): The gt indices to average over. shape (num_gt)
+
+        Returns:
+            tuple[Tensor]: 1-tuple with each gt's mean loss, shape (num_gt)
+        """
+        if reg_loss.dim() == 2:  # e.g. iou loss has shape (num_prior, 4)
+            reg_loss = reg_loss.sum(dim=-1)  # one value per anchor
+        if cls_loss.dim() == 2:
+            cls_loss = cls_loss.sum(dim=-1)  # one value per anchor
+        total_loss = cls_loss + reg_loss
+        assert total_loss.size(0) == assigned_gt_inds.size(0)
+        # A gt without any positive anchor at this level keeps the huge
+        #  default 1e6, so this level is never selected as its best level.
+        avg_losses = total_loss.new_full(labels_seq.shape, 1e6)
+        for slot, gt_ind in enumerate(labels_seq):
+            hits = assigned_gt_inds == gt_ind
+            if hits.any():
+                avg_losses[slot] = total_loss[hits].mean()
+        return avg_losses,
+
+    def reweight_loss_single(self, cls_loss, reg_loss, assigned_gt_inds,
+                             labels, level, min_levels):
+        """Mask out the losses of anchors outside their gt's best level.
+
+        A positive anchor only keeps its loss when ``level`` is the level
+        stored in ``min_levels`` for its gt. The masked losses are reduced.
+
+        Args:
+            cls_loss (Tensor): Element-wise classification loss.
+              Shape: (num_anchors, num_classes)
+            reg_loss (Tensor): Element-wise regression loss.
+              Shape: (num_anchors, 4)
+            assigned_gt_inds (Tensor): The gt index (0-based) each anchor
+              bbox is assigned to; -1 denotes a negative anchor.
+              Shape: (num_anchors, )
+            labels (Tensor): Label assigned to anchors. Shape: (num_anchors, ).
+            level (int): The current level index in the pyramid
+              (0-4 for RetinaNet)
+            min_levels (Tensor): The best-matching level for each gt.
+              Shape: (num_gts, )
+
+        Returns:
+            tuple:
+                - cls_loss: Reduced corrected classification loss. Scalar.
+                - reg_loss: Reduced corrected regression loss. Scalar.
+                - pos_flags (Tensor): Corrected bool tensor indicating the
+                  final positive anchors. Shape: (num_anchors, ).
+        """
+        is_pos = assigned_gt_inds >= 0  # anchors assigned to some gt
+        pos_idx = torch.nonzero(is_pos, as_tuple=False).flatten()
+        reg_weight = torch.ones_like(reg_loss)
+        cls_weight = torch.ones_like(cls_loss)
+
+        if is_pos.any():
+            # Positives whose gt has a different best level get dropped.
+            best_levels = min_levels[assigned_gt_inds[is_pos]]
+            dropped = pos_idx[best_levels != level]
+
+            if dropped.numel():
+                is_pos[dropped] = 0
+                reg_weight[dropped] = 0
+                # For classification only the entry of the assigned label
+                #  is zeroed; the other class entries keep weight 1.
+                dropped_labels = labels[dropped]
+                assert (dropped_labels >= 0).all()
+                cls_weight[dropped, dropped_labels] = 0
+
+        # Reduce both losses with reduction='sum' under the weights above
+        cls_loss = weight_reduce_loss(cls_loss, cls_weight, reduction='sum')
+        reg_loss = weight_reduce_loss(reg_loss, reg_weight, reduction='sum')
+
+        return cls_loss, reg_loss, is_pos
diff --git a/mmdet/models/dense_heads/ga_retina_head.py b/mmdet/models/dense_heads/ga_retina_head.py
new file mode 100755
index 0000000..6d9e874
--- /dev/null
+++ b/mmdet/models/dense_heads/ga_retina_head.py
@@ -0,0 +1,113 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.ops import MaskedConv2d
+
+from ..builder import HEADS
+from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead
+
+
+@HEADS.register_module()
+class GARetinaHead(GuidedAnchorHead):
+    """Guided-Anchor-based RetinaNet head.
+
+    Args:
+        num_classes (int): Forwarded to ``GuidedAnchorHead``.
+        in_channels (int): Forwarded to ``GuidedAnchorHead``.
+        stacked_convs (int): Number of ``ConvModule`` layers in each of
+            the cls and reg towers. Default: 4.
+        conv_cfg (dict, optional): ``conv_cfg`` of the tower layers.
+        norm_cfg (dict, optional): ``norm_cfg`` of the tower layers.
+        init_cfg (dict, optional): Initialization config. When None, a
+            'Normal' init (std=0.01) on 'Conv2d' layers is used, with
+            overrides for ``conv_loc`` and ``retina_cls``.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 stacked_convs=4,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 init_cfg=None,
+                 **kwargs):
+        if init_cfg is None:
+            # The two overridden layers share identical settings.
+            overrides = [
+                dict(type='Normal', name=name, std=0.01, bias_prob=0.01)
+                for name in ('conv_loc', 'retina_cls')
+            ]
+            init_cfg = dict(
+                type='Normal', layer='Conv2d', std=0.01, override=overrides)
+        # Stored before the parent constructor is invoked (presumably it
+        # calls `_init_layers`, which reads them -- confirm in the parent).
+        self.stacked_convs = stacked_convs
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        super(GARetinaHead, self).__init__(
+            num_classes, in_channels, init_cfg=init_cfg, **kwargs)
+
+    def _init_layers(self):
+        """Build the conv towers and the prediction branches."""
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        tower_cfg = dict(
+            stride=1,
+            padding=1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg)
+        for depth in range(self.stacked_convs):
+            # Only the first layer of a tower takes the input channels.
+            chn = self.feat_channels if depth > 0 else self.in_channels
+            self.cls_convs.append(
+                ConvModule(chn, self.feat_channels, 3, **tower_cfg))
+            self.reg_convs.append(
+                ConvModule(chn, self.feat_channels, 3, **tower_cfg))
+
+        # Location branch: a single output channel.
+        self.conv_loc = nn.Conv2d(self.feat_channels, 1, 1)
+        # Shape branch: two output channels per anchor.
+        shape_channels = self.num_anchors * 2
+        self.conv_shape = nn.Conv2d(self.feat_channels, shape_channels, 1)
+        adaption_cfg = dict(kernel_size=3, deform_groups=self.deform_groups)
+        self.feature_adaption_cls = FeatureAdaption(
+            self.feat_channels, self.feat_channels, **adaption_cfg)
+        self.feature_adaption_reg = FeatureAdaption(
+            self.feat_channels, self.feat_channels, **adaption_cfg)
+        cls_channels = self.num_base_priors * self.cls_out_channels
+        self.retina_cls = MaskedConv2d(
+            self.feat_channels, cls_channels, 3, padding=1)
+        self.retina_reg = MaskedConv2d(
+            self.feat_channels, self.num_base_priors * 4, 3, padding=1)
+
+    def forward_single(self, x):
+        """Forward feature map of a single scale level.
+
+        Args:
+            x (Tensor): Feature map of one level.
+
+        Returns:
+            tuple: ``(cls_score, bbox_pred, shape_pred, loc_pred)``.
+        """
+        cls_feat = reg_feat = x
+        for conv in self.cls_convs:
+            cls_feat = conv(cls_feat)
+        for conv in self.reg_convs:
+            reg_feat = conv(reg_feat)
+
+        # Both predictions are taken before the feature adaption below.
+        loc_pred = self.conv_loc(cls_feat)
+        shape_pred = self.conv_shape(reg_feat)
+
+        cls_feat = self.feature_adaption_cls(cls_feat, shape_pred)
+        reg_feat = self.feature_adaption_reg(reg_feat, shape_pred)
+
+        # Outside training the masked convs get a mask thresholded from the
+        # first batch entry of ``loc_pred``; during training no mask is used.
+        mask = None
+        if not self.training:
+            mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr
+        cls_score = self.retina_cls(cls_feat, mask)
+        bbox_pred = self.retina_reg(reg_feat, mask)
+        return cls_score, bbox_pred, shape_pred, loc_pred
diff --git a/mmdet/models/dense_heads/ga_rpn_head.py b/mmdet/models/dense_heads/ga_rpn_head.py
new file mode 100755
index 0000000..4123c8b
--- /dev/null
+++ b/mmdet/models/dense_heads/ga_rpn_head.py
@@ -0,0 +1,177 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv import ConfigDict
+from mmcv.ops import nms
+
+from ..builder import HEADS
+from .guided_anchor_head import GuidedAnchorHead
+
+
+@HEADS.register_module()
+class GARPNHead(GuidedAnchorHead):
+    """Guided-Anchor-based RPN head."""
+
+    def __init__(self,
+                 in_channels,
+                 init_cfg=dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='conv_loc',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs):
+        super(GARPNHead, self).__init__(
+            1, in_channels, init_cfg=init_cfg, **kwargs)
+
+    def _init_layers(self):
+        """Initialize layers of the head."""
+        self.rpn_conv = nn.Conv2d(
+            self.in_channels, self.feat_channels, 3, padding=1)
+        super(GARPNHead, self)._init_layers()
+
+    def forward_single(self, x):
+        """Forward feature of a single scale level."""
+        x = self.rpn_conv(x)
+        x = F.relu(x, inplace=True)
+        (cls_score, bbox_pred, shape_pred,
+         loc_pred) = super(GARPNHead, self).forward_single(x)
+        return cls_score, bbox_pred, shape_pred, loc_pred
+
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             shape_preds,
+             loc_preds,
+             gt_bboxes,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute the parent-class losses and return them under RPN keys.
+
+        ``None`` is passed to the parent in the slot after ``gt_bboxes``.
+        """
+        losses = super(GARPNHead, self).loss(
+            cls_scores, bbox_preds, shape_preds, loc_preds, gt_bboxes, None,
+            img_metas, gt_bboxes_ignore=gt_bboxes_ignore)
+        return dict(
+            loss_rpn_cls=losses['loss_cls'],
+            loss_rpn_bbox=losses['loss_bbox'],
+            loss_anchor_shape=losses['loss_shape'],
+            loss_anchor_loc=losses['loss_loc'])
+
+    def _get_bboxes_single(self,
+                           cls_scores,
+                           bbox_preds,
+                           mlvl_anchors,
+                           mlvl_masks,
+                           img_shape,
+                           scale_factor,
+                           cfg,
+                           rescale=False):
+        """Turn the multi-level predictions of one image into proposals.
+
+        Returns:
+            Tensor: Proposals with the score in the last column, shape
+                (n, 5); shape (0, 5) when every level is masked out.
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        cfg = copy.deepcopy(cfg)
+        # deprecate arguments warning
+        if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg:
+            warnings.warn(
+                'In rpn_proposal or test_cfg, '
+                'nms_thr has been moved to a dict named nms as '
+                'iou_threshold, max_num has been renamed as max_per_img, '
+                'name of original arguments and the way to specify '
+                'iou_threshold of NMS will be deprecated.')
+        if 'nms' not in cfg:
+            cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr))
+        if 'max_num' in cfg:
+            if 'max_per_img' in cfg:
+                assert cfg.max_num == cfg.max_per_img, f'You ' \
+                    f'set max_num and max_per_img at the same time, ' \
+                    f'but get {cfg.max_num} ' \
+                    f'and {cfg.max_per_img} respectively. ' \
+                    'Please delete max_num which will be deprecated.'
+            else:
+                cfg.max_per_img = cfg.max_num
+        if 'nms_thr' in cfg:
+            assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set ' \
+                f'iou_threshold in nms and ' \
+                f'nms_thr at the same time, but get ' \
+                f'{cfg.nms.iou_threshold} and {cfg.nms_thr}' \
+                f' respectively. Please delete the ' \
+                f'nms_thr which will be deprecated.'
+        assert cfg.nms.get('type', 'nms') == 'nms', 'GARPNHead only support ' \
+            'naive nms.'
+
+        mlvl_proposals = []
+        for idx in range(len(cls_scores)):
+            rpn_cls_score = cls_scores[idx]
+            rpn_bbox_pred = bbox_preds[idx]
+            anchors = mlvl_anchors[idx]
+            mask = mlvl_masks[idx]
+            assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
+            # if no location is kept, end.
+            if mask.sum() == 0:
+                continue
+            rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
+            if self.use_sigmoid_cls:
+                rpn_cls_score = rpn_cls_score.reshape(-1)
+                scores = rpn_cls_score.sigmoid()
+            else:
+                rpn_cls_score = rpn_cls_score.reshape(-1, 2)
+                # remind that we set FG labels to [0, num_class-1] since
+                # mmdet v2.0 (BG cat_id: num_class), so drop the last column
+                scores = rpn_cls_score.softmax(dim=1)[:, :-1]
+            # filter scores, bbox_pred by mask (anchors: see get_anchors())
+            scores = scores[mask]
+            rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1,
+                                                                   4)[mask, :]
+            if scores.dim() == 0:
+                rpn_bbox_pred = rpn_bbox_pred.unsqueeze(0)
+                anchors = anchors.unsqueeze(0)
+                scores = scores.unsqueeze(0)
+            # filter anchors, bbox_pred, scores w.r.t. scores
+            if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre:
+                _, topk_inds = scores.topk(cfg.nms_pre)
+                rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
+                anchors = anchors[topk_inds, :]
+                scores = scores[topk_inds]
+            # get proposals w.r.t. anchors and rpn_bbox_pred
+            proposals = self.bbox_coder.decode(
+                anchors, rpn_bbox_pred, max_shape=img_shape)
+            # filter out too small bboxes
+            if cfg.min_bbox_size >= 0:
+                w = proposals[:, 2] - proposals[:, 0]
+                h = proposals[:, 3] - proposals[:, 1]
+                valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size)
+                if not valid_mask.all():
+                    proposals = proposals[valid_mask]
+                    scores = scores[valid_mask]
+            # NMS in current level
+            proposals, _ = nms(proposals, scores, cfg.nms.iou_threshold)
+            proposals = proposals[:cfg.nms_post, :]
+            mlvl_proposals.append(proposals)
+        # Every level was skipped: torch.cat would fail on an empty list.
+        if not mlvl_proposals:
+            return cls_scores[0].new_zeros((0, 5))
+        proposals = torch.cat(mlvl_proposals, 0)
+        if cfg.get('nms_across_levels', False):
+            # NMS across multi levels
+            proposals, _ = nms(proposals[:, :4], proposals[:, -1],
+                               cfg.nms.iou_threshold)
+            proposals = proposals[:cfg.max_per_img, :]
+        else:
+            scores = proposals[:, 4]
+            num = min(cfg.max_per_img, proposals.shape[0])
+            _, topk_inds = scores.topk(num)
+            proposals = proposals[topk_inds, :]
+        return proposals
diff --git a/mmdet/models/dense_heads/gfl_head.py b/mmdet/models/dense_heads/gfl_head.py
new file mode 100755
index 0000000..12eb89d
--- /dev/null
+++ b/mmdet/models/dense_heads/gfl_head.py
@@ -0,0 +1,648 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, Scale
+from mmcv.runner import force_fp32
+
+from mmdet.core import (anchor_inside_flags, bbox_overlaps, build_assigner,
+                        build_sampler, images_to_levels, multi_apply,
+                        reduce_mean, unmap)
+from mmdet.core.utils import filter_scores_and_topk
+from ..builder import HEADS, build_loss
+from .anchor_head import AnchorHead
+
+
+class Integral(nn.Module):
+    """Parameter-free layer taking the expectation of a distribution.
+
+    The location is computed as ``sum_i P(y_i) * y_i``, where P(y_i) is the
+    softmax vector over the discrete set y_i = {0, 1, 2, ..., reg_max}.
+    The values y_i are kept in the ``project`` buffer.
+
+    Args:
+        reg_max (int): The maximal value of the discrete set. Default: 16. You
+            may want to reset it according to your new dataset or related
+            settings.
+    """
+
+    def __init__(self, reg_max=16):
+        super(Integral, self).__init__()
+        self.reg_max = reg_max
+        bins = torch.linspace(0, self.reg_max, self.reg_max + 1)
+        self.register_buffer('project', bins)
+
+    def forward(self, x):
+        """Reduce distribution logits to their expected values.
+
+        Args:
+            x (Tensor): Features of the regression head, shape (N, 4*(n+1)),
+                n is self.reg_max.
+
+        Returns:
+            x (Tensor): Integral result of box locations, i.e., distance
+                offsets from the box center in four directions, shape (N, 4).
+        """
+        num_bins = self.reg_max + 1
+        # softmax over the bins, then a dot product with the bin values
+        probs = F.softmax(x.reshape(-1, num_bins), dim=1)
+        return F.linear(probs, self.project.type_as(probs)).reshape(-1, 4)
+
+
+@HEADS.register_module()
+class GFLHead(AnchorHead):
+    """Generalized Focal Loss: Learning Qualified and Distributed Bounding
+    Boxes for Dense Object Detection.
+
+    GFL head structure is similar with ATSS, however GFL uses
+    1) joint representation for classification and localization quality, and
+    2) flexible General distribution for bounding box locations,
+    which are supervised by
+    Quality Focal Loss (QFL) and Distribution Focal Loss (DFL), respectively
+
+    https://arxiv.org/abs/2006.04388
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        stacked_convs (int): Number of conv layers in cls and reg tower.
+            Default: 4.
+        conv_cfg (dict): dictionary to construct and config conv layer.
+            Default: None.
+        norm_cfg (dict): dictionary to construct and config norm layer.
+            Default: dict(type='GN', num_groups=32, requires_grad=True).
+        loss_dfl (dict): Config of Distribution Focal Loss (DFL).
+        bbox_coder (dict): Config of bbox coder. Defaults
+            'DistancePointBBoxCoder'.
+        reg_max (int): Max value of integral set :math: `{0, ..., reg_max}`
+            in DFL setting. Default: 16.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    Example:
+        >>> self = GFLHead(11, 7)
+        >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]]
+        >>> cls_quality_score, bbox_pred = self.forward(feats)
+        >>> assert len(cls_quality_score) == len(self.scales)
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 stacked_convs=4,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+                 loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25),
+                 bbox_coder=dict(type='DistancePointBBoxCoder'),
+                 reg_max=16,
+                 init_cfg=dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='gfl_cls',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs):
+        # These settings are stored before the parent constructor is invoked.
+        self.stacked_convs = stacked_convs
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.reg_max = reg_max
+        super(GFLHead, self).__init__(
+            num_classes, in_channels, bbox_coder=bbox_coder,
+            init_cfg=init_cfg, **kwargs)
+
+        self.sampling = False
+        if self.train_cfg:
+            self.assigner = build_assigner(self.train_cfg.assigner)
+            # sampling is off, so the sampler is always the fixed
+            #  'PseudoSampler' rather than one taken from train_cfg
+            self.sampler = build_sampler(
+                dict(type='PseudoSampler'), context=self)
+
+        # Integral layer over reg_max + 1 bins, and the loss built from loss_dfl
+        self.integral = Integral(self.reg_max)
+        self.loss_dfl = build_loss(loss_dfl)
+
+    def _init_layers(self):
+        """Initialize layers of the head.
+
+        Builds the cls / reg conv towers, the ``gfl_cls`` and ``gfl_reg``
+        prediction convs and one ``Scale`` module per prior stride.
+        """
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        tower_cfg = dict(
+            stride=1,
+            padding=1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg)
+        for depth in range(self.stacked_convs):
+            # Only the first layer of a tower takes the input channels.
+            chn = self.feat_channels if depth > 0 else self.in_channels
+            self.cls_convs.append(
+                ConvModule(chn, self.feat_channels, 3, **tower_cfg))
+            self.reg_convs.append(
+                ConvModule(chn, self.feat_channels, 3, **tower_cfg))
+        assert self.num_anchors == 1, 'anchor free version'
+        # Score branch: cls_out_channels output maps.
+        self.gfl_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+        # Regression branch: (reg_max + 1) bins for each of the 4 sides.
+        reg_channels = 4 * (self.reg_max + 1)
+        self.gfl_reg = nn.Conv2d(
+            self.feat_channels, reg_channels, 3, padding=1)
+        # One Scale(1.0) per stride of the prior generator.
+        level_scales = [Scale(1.0) for _ in self.prior_generator.strides]
+        self.scales = nn.ModuleList(level_scales)
+
+    def forward(self, feats):
+        """Run ``forward_single`` on each level, paired with its ``Scale``.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Per-level outputs gathered by ``multi_apply``:
+                cls_scores (list[Tensor]): Classification and quality (IoU)
+                    joint scores, each a 4D-tensor with num_classes channels.
+                bbox_preds (list[Tensor]): Box distribution logits, each a
+                    4D-tensor with 4*(n+1) channels, n is max value of
+                    integral set.
+        """
+        outs = multi_apply(self.forward_single, feats, self.scales)
+        return outs
+
+    def forward_single(self, x, scale):
+        """Run the two conv towers and prediction convs on one level.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+            scale (:obj:`mmcv.cnn.Scale`): Scale module applied to the raw
+                output of ``gfl_reg``.
+
+        Returns:
+            tuple:
+                cls_score (Tensor): Cls and quality joint scores of this
+                    level; the channel number is num_classes.
+                bbox_pred (Tensor): Box distribution logits of this level,
+                    cast with ``.float()``; the channel number is 4*(n+1),
+                    n is max value of integral set.
+        """
+        cls_feat = reg_feat = x
+        for conv in self.cls_convs:
+            cls_feat = conv(cls_feat)
+        for conv in self.reg_convs:
+            reg_feat = conv(reg_feat)
+        cls_score = self.gfl_cls(cls_feat)
+        # Scale the raw regression output, then cast it with .float().
+        bbox_pred = scale(self.gfl_reg(reg_feat)).float()
+        return cls_score, bbox_pred
+
+    def anchor_center(self, anchors):
+        """Compute the center point of each anchor box.
+
+        Args:
+            anchors (Tensor): Anchor list with shape (N, 4), "xyxy" format.
+
+        Returns:
+            Tensor: Anchor centers with shape (N, 2), "xy" format.
+        """
+        # Midpoint of the two x coordinates and of the two y coordinates.
+        x1, y1, x2, y2 = (anchors[..., k] for k in range(4))
+        return torch.stack([(x2 + x1) / 2, (y2 + y1) / 2], dim=-1)
+
+    def loss_single(self, anchors, cls_score, bbox_pred, labels, label_weights,
+                    bbox_targets, stride, num_total_samples):
+        """Compute loss of a single scale level.
+
+        Args:
+            anchors (Tensor): Box reference for each scale level with shape
+                (N, num_total_anchors, 4).
+            cls_score (Tensor): Cls and quality joint scores for each scale
+                level, with shape (N, num_classes, H, W).
+            bbox_pred (Tensor): Box distribution logits for each scale
+                level with shape (N, 4*(n+1), H, W), n is max value of integral
+                set.
+            labels (Tensor): Labels of each anchor with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors).
+            bbox_targets (Tensor): BBox regression targets of each anchor
+                with shape (N, num_total_anchors, 4).
+            stride (tuple): Stride of this level; h and w strides must match.
+            num_total_samples (float): Number of positive samples that is
+                reduced over all GPUs; avg_factor of the classification loss.
+
+        Returns:
+            tuple: loss_cls, loss_bbox, loss_dfl and the sum of weight_targets.
+        """
+        assert stride[0] == stride[1], 'h stride is not equal to w stride!'
+        anchors = anchors.reshape(-1, 4)
+        cls_score = cls_score.permute(0, 2, 3,
+                                      1).reshape(-1, self.cls_out_channels)
+        bbox_pred = bbox_pred.permute(0, 2, 3,
+                                      1).reshape(-1, 4 * (self.reg_max + 1))
+        bbox_targets = bbox_targets.reshape(-1, 4)
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = ((labels >= 0)
+                    & (labels < bg_class_ind)).nonzero().squeeze(1)
+        score = label_weights.new_zeros(labels.shape)
+
+        if len(pos_inds) > 0:
+            pos_bbox_targets = bbox_targets[pos_inds]
+            pos_bbox_pred = bbox_pred[pos_inds]
+            pos_anchors = anchors[pos_inds]
+            pos_anchor_centers = self.anchor_center(pos_anchors) / stride[0]
+            # weight of each positive: its max sigmoid class score (detached)
+            weight_targets = cls_score.detach().sigmoid()
+            weight_targets = weight_targets.max(dim=1)[0][pos_inds]
+            pos_bbox_pred_corners = self.integral(pos_bbox_pred)
+            pos_decode_bbox_pred = self.bbox_coder.decode(
+                pos_anchor_centers, pos_bbox_pred_corners)
+            pos_decode_bbox_targets = pos_bbox_targets / stride[0]
+            score[pos_inds] = bbox_overlaps(
+                pos_decode_bbox_pred.detach(),
+                pos_decode_bbox_targets,
+                is_aligned=True)
+            pred_corners = pos_bbox_pred.reshape(-1, self.reg_max + 1)
+            target_corners = self.bbox_coder.encode(pos_anchor_centers,
+                                                    pos_decode_bbox_targets,
+                                                    self.reg_max).reshape(-1)
+
+            # regression loss (divided by the reduced weight sum in `loss`)
+            loss_bbox = self.loss_bbox(
+                pos_decode_bbox_pred,
+                pos_decode_bbox_targets,
+                weight=weight_targets,
+                avg_factor=1.0)
+
+            # dfl loss (divided by the reduced weight sum in `loss`)
+            loss_dfl = self.loss_dfl(
+                pred_corners,
+                target_corners,
+                weight=weight_targets[:, None].expand(-1, 4).reshape(-1),
+                avg_factor=4.0)
+        else:
+            loss_bbox = bbox_pred.sum() * 0  # zero, still tied to bbox_pred
+            loss_dfl = bbox_pred.sum() * 0  # zero, still tied to bbox_pred
+            weight_targets = bbox_pred.new_tensor(0)
+
+        # cls (qfl) loss
+        loss_cls = self.loss_cls(
+            cls_score, (labels, score),
+            weight=label_weights,
+            avg_factor=num_total_samples)
+
+        return loss_cls, loss_bbox, loss_dfl, weight_targets.sum()
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Cls and quality scores for each scale
+                level, with shape (N, num_classes, H, W).
+            bbox_preds (list[Tensor]): Box distribution logits for each scale
+                level with shape (N, 4*(n+1), H, W), n is max value of integral
+                set.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (list[Tensor] | None): specify which bounding
+                boxes can be ignored when computing the loss.
+
+        Returns:
+            dict | None: Per-level loss_cls/loss_bbox/loss_dfl; None if no targets.
+        """
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels)
+        if cls_reg_targets is None:
+            return None
+
+        (anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets
+
+        num_total_samples = reduce_mean(
+            torch.tensor(num_total_pos, dtype=torch.float,
+                         device=device)).item()
+        num_total_samples = max(num_total_samples, 1.0)  # never below 1
+
+        losses_cls, losses_bbox, losses_dfl,\
+            avg_factor = multi_apply(
+                self.loss_single,
+                anchor_list,
+                cls_scores,
+                bbox_preds,
+                labels_list,
+                label_weights_list,
+                bbox_targets_list,
+                self.prior_generator.strides,
+                num_total_samples=num_total_samples)
+
+        avg_factor = sum(avg_factor)  # per-level sums of weight_targets
+        avg_factor = reduce_mean(avg_factor).clamp_(min=1).item()
+        losses_bbox = list(map(lambda x: x / avg_factor, losses_bbox))
+        losses_dfl = list(map(lambda x: x / avg_factor, losses_dfl))
+        return dict(
+            loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dfl=losses_dfl)
+
+    def _get_bboxes_single(self,
+                           cls_score_list,
+                           bbox_pred_list,
+                           score_factor_list,
+                           mlvl_priors,
+                           img_meta,
+                           cfg,
+                           rescale=False,
+                           with_nms=True,
+                           **kwargs):
+        """Transform outputs of a single image into bbox predictions.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box predictions from all scale
+                levels of a single image; each item is reduced by
+                `self.integral` and scaled by the level stride before decoding.
+            score_factor_list (list[Tensor]): Score factor from all scale
+                levels of a single image. GFL head does not need this value.
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid, has shape
+                (num_priors, 4).
+            img_meta (dict): Image meta info; reads `img_shape`, `scale_factor`.
+            cfg (mmcv.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+
+        Returns:
+            tuple[Tensor]: Whatever `self._bbox_post_process` returns for the
+                per-level scores, labels and boxes collected here; no score
+                factor is used by this head. NOTE(review): the format below
+                is presumably what is produced when with_nms is True -
+                confirm against `_bbox_post_process`.
+
+                - det_bboxes (Tensor): Predicted bboxes with shape \
+                    [num_bboxes, 5], where the first 4 columns are bounding \
+                    box positions (tl_x, tl_y, br_x, br_y) and the 5-th \
+                    column are scores between 0 and 1.
+                - det_labels (Tensor): Predicted labels of the corresponding \
+                    box with shape [num_bboxes].
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_labels = []
+        for level_idx, (cls_score, bbox_pred, stride, priors) in enumerate(
+                zip(cls_score_list, bbox_pred_list,
+                    self.prior_generator.strides, mlvl_priors)):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            assert stride[0] == stride[1]
+
+            bbox_pred = bbox_pred.permute(1, 2, 0)
+            bbox_pred = self.integral(bbox_pred) * stride[0]
+
+            scores = cls_score.permute(1, 2, 0).reshape(
+                -1, self.cls_out_channels).sigmoid()
+
+            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
+            # this operation keeps fewer bboxes under the same `nms_pre`.
+            # There is no difference in performance for most models. If you
+            # find a slight drop in performance, you can set a larger
+            # `nms_pre` than before.
+            results = filter_scores_and_topk(
+                scores, cfg.score_thr, nms_pre,
+                dict(bbox_pred=bbox_pred, priors=priors))
+            scores, labels, _, filtered_results = results
+
+            bbox_pred = filtered_results['bbox_pred']
+            priors = filtered_results['priors']
+
+            bboxes = self.bbox_coder.decode(
+                self.anchor_center(priors), bbox_pred, max_shape=img_shape)
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_labels.append(labels)
+
+        return self._bbox_post_process(
+            mlvl_scores,
+            mlvl_labels,
+            mlvl_bboxes,
+            img_meta['scale_factor'],
+            cfg,
+            rescale=rescale,
+            with_nms=with_nms)
+
+    def get_targets(self,
+                    anchor_list,
+                    valid_flag_list,
+                    gt_bboxes_list,
+                    img_metas,
+                    gt_bboxes_ignore_list=None,
+                    gt_labels_list=None,
+                    label_channels=1,
+                    unmap_outputs=True):
+        """Build the per-level training targets of the GFL head.
+
+        Follows `AnchorHead.get_targets()`, but the returned tuple also
+        carries the anchors (split per level) as its first element.
+
+        Returns:
+            tuple | None: (anchors_list, labels_list, label_weights_list,
+                bbox_targets_list, bbox_weights_list, num_total_pos,
+                num_total_neg), or None if some image has no valid anchors.
+        """
+        num_imgs = len(img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        # per-level anchor counts, read from the first image
+        num_level_anchors = [lvl.size(0) for lvl in anchor_list[0]]
+        num_level_anchors_list = [num_level_anchors] * num_imgs
+
+        # flatten the levels of every image into one tensor (in place)
+        for img_idx in range(num_imgs):
+            assert len(anchor_list[img_idx]) == len(valid_flag_list[img_idx])
+            anchor_list[img_idx] = torch.cat(anchor_list[img_idx])
+            valid_flag_list[img_idx] = torch.cat(valid_flag_list[img_idx])
+
+        # missing optional annotations become one None per image
+        if gt_bboxes_ignore_list is None:
+            gt_bboxes_ignore_list = [None] * num_imgs
+        if gt_labels_list is None:
+            gt_labels_list = [None] * num_imgs
+        # compute targets for each image
+        per_image = multi_apply(
+            self._get_target_single,
+            anchor_list,
+            valid_flag_list,
+            num_level_anchors_list,
+            gt_bboxes_list,
+            gt_bboxes_ignore_list,
+            gt_labels_list,
+            img_metas,
+            label_channels=label_channels,
+            unmap_outputs=unmap_outputs)
+        (all_anchors, all_labels, all_label_weights, all_bbox_targets,
+         all_bbox_weights, pos_inds_list, neg_inds_list) = per_image
+        # no valid anchors
+        if any(labels is None for labels in all_labels):
+            return None
+        # sampled anchors of all images
+        num_total_pos = sum(max(inds.numel(), 1) for inds in pos_inds_list)
+        num_total_neg = sum(max(inds.numel(), 1) for inds in neg_inds_list)
+        # split targets to a list w.r.t. multiple levels
+        per_level = tuple(
+            images_to_levels(per_img, num_level_anchors)
+            for per_img in (all_anchors, all_labels, all_label_weights,
+                            all_bbox_targets, all_bbox_weights))
+        return per_level + (num_total_pos, num_total_neg)
+
+    def _get_target_single(self,
+                           flat_anchors,
+                           valid_flags,
+                           num_level_anchors,
+                           gt_bboxes,
+                           gt_bboxes_ignore,
+                           gt_labels,
+                           img_meta,
+                           label_channels=1,
+                           unmap_outputs=True):
+        """Compute regression, classification targets for anchors in a single
+        image.
+
+        Args:
+            flat_anchors (Tensor): Multi-level anchors of the image, which are
+                concatenated into a single tensor of shape (num_anchors, 4)
+            valid_flags (Tensor): Multi level valid flags of the image,
+                which are concatenated into a single tensor of
+                shape (num_anchors,).
+            num_level_anchors (list[int]): Anchor count of each scale level.
+            gt_bboxes (Tensor): Ground truth bboxes of the image,
+                shape (num_gts, 4).
+            gt_bboxes_ignore (Tensor): Ground truth bboxes to be
+                ignored, shape (num_ignored_gts, 4).
+            gt_labels (Tensor): Ground truth labels of each box,
+                shape (num_gts,).
+            img_meta (dict): Meta info of the image.
+            label_channels (int): Channel of label.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple: Seven ``None`` values if no anchor passes
+                `anchor_inside_flags`. Otherwise, with N the anchor count:
+                anchors (Tensor): All anchors in the image with shape (N, 4).
+                labels (Tensor): Labels of all anchors, with shape (N,).
+                label_weights (Tensor): Label weights of all anchors in the
+                    image with shape (N,).
+                bbox_targets (Tensor): BBox targets of all anchors in the
+                    image with shape (N, 4).
+                bbox_weights (Tensor): BBox weights of all anchors in the
+                    image with shape (N, 4).
+                pos_inds (Tensor): Indices of positive anchor with shape
+                    (num_pos,).
+                neg_inds (Tensor): Indices of negative anchor with shape
+                    (num_neg,).
+        """
+        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                           img_meta['img_shape'][:2],
+                                           self.train_cfg.allowed_border)
+        if not inside_flags.any():
+            return (None, ) * 7
+        # assign gt and sample anchors
+        anchors = flat_anchors[inside_flags, :]
+
+        num_level_anchors_inside = self.get_num_level_anchors_inside(
+            num_level_anchors, inside_flags)
+        assign_result = self.assigner.assign(anchors, num_level_anchors_inside,
+                                             gt_bboxes, gt_bboxes_ignore,
+                                             gt_labels)
+
+        sampling_result = self.sampler.sample(assign_result, anchors,
+                                              gt_bboxes)
+
+        num_valid_anchors = anchors.shape[0]
+        bbox_targets = torch.zeros_like(anchors)
+        bbox_weights = torch.zeros_like(anchors)
+        labels = anchors.new_full((num_valid_anchors, ),
+                                  self.num_classes,  # default: background id
+                                  dtype=torch.long)
+        label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            pos_bbox_targets = sampling_result.pos_gt_bboxes
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1.0
+            if gt_labels is None:
+                # Only rpn gives gt_labels as None
+                # Foreground is the first class
+                labels[pos_inds] = 0
+            else:
+                labels[pos_inds] = gt_labels[
+                    sampling_result.pos_assigned_gt_inds]
+            if self.train_cfg.pos_weight <= 0:  # non-positive: use weight 1
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg.pos_weight
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_anchors.size(0)
+            anchors = unmap(anchors, num_total_anchors, inside_flags)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags, fill=self.num_classes)
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+            bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
+            bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
+
+        return (anchors, labels, label_weights, bbox_targets, bbox_weights,
+                pos_inds, neg_inds)
+
+    def get_num_level_anchors_inside(self, num_level_anchors, inside_flags):
+        """Count, for each level, the anchors whose inside flag is set."""
+        counts = []
+        for level_flags in torch.split(inside_flags, num_level_anchors):
+            counts.append(int(level_flags.sum()))
+        return counts
diff --git a/mmdet/models/dense_heads/guided_anchor_head.py b/mmdet/models/dense_heads/guided_anchor_head.py
new file mode 100755
index 0000000..53e8cd8
--- /dev/null
+++ b/mmdet/models/dense_heads/guided_anchor_head.py
@@ -0,0 +1,868 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+import torch.nn as nn
+from mmcv.ops import DeformConv2d, MaskedConv2d
+from mmcv.runner import BaseModule, force_fp32
+
+from mmdet.core import (anchor_inside_flags, build_assigner, build_bbox_coder,
+                        build_prior_generator, build_sampler, calc_region,
+                        images_to_levels, multi_apply, multiclass_nms, unmap)
+from ..builder import HEADS, build_loss
+from .anchor_head import AnchorHead
+
+
+class FeatureAdaption(BaseModule):
+    """Feature adaption module built on a DCN v1 deformable conv.
+
+    The offsets of the deformable conv layer are predicted from the
+    anchor shape prediction (through a 1x1 conv) rather than from the
+    feature map.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        out_channels (int): Number of channels in the output feature map.
+        kernel_size (int): Deformable conv kernel size.
+        deform_groups (int): Deformable conv group size.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 deform_groups=4,
+                 init_cfg=dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.1,
+                     override=dict(
+                         type='Normal', name='conv_adaption', std=0.01))):
+        super().__init__(init_cfg)
+        # offset channels: 2 per kernel position, for every deform group
+        num_offsets = deform_groups * kernel_size * kernel_size * 2
+        self.conv_offset = nn.Conv2d(2, num_offsets, 1, bias=False)
+        self.conv_adaption = DeformConv2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            padding=(kernel_size - 1) // 2,
+            deform_groups=deform_groups)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x, shape):
+        # the shape prediction is detached before the offsets are predicted
+        offsets = self.conv_offset(shape.detach())
+        return self.relu(self.conv_adaption(x, offsets))
+
+
+@HEADS.register_module()
+class GuidedAnchorHead(AnchorHead):
+    """Guided-Anchor-based head (GA-RPN, GA-RetinaNet, etc.).
+
+    This GuidedAnchorHead will predict high-quality feature guided
+    anchors and locations where anchors will be kept in inference.
+    There are mainly 3 categories of bounding-boxes.
+
+    - Sampled 9 pairs for target assignment. (approxes)
+    - The square boxes where the predicted anchors are based on. (squares)
+    - Guided anchors.
+
+    Please refer to https://arxiv.org/abs/1901.03278 for more details.
+
+    Args:
+        num_classes (int): Number of classes.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels.
+        approx_anchor_generator (dict): Config dict for approx generator
+        square_anchor_generator (dict): Config dict for square generator
+        anchor_coder (dict): Config dict for anchor coder
+        bbox_coder (dict): Config dict for bbox coder
+        reg_decoded_bbox (bool): If true, the regression loss would be
+            applied directly on decoded bounding boxes, converting both
+            the predicted boxes and regression targets to absolute
+            coordinates format. Default False. It should be `True` when
+            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
+        deform_groups: (int): Group number of DCN in
+            FeatureAdaption module.
+        loc_filter_thr (float): Threshold to filter out unconcerned regions.
+        loss_loc (dict): Config of location loss.
+        loss_shape (dict): Config of anchor shape loss.
+        loss_cls (dict): Config of classification loss.
+        loss_bbox (dict): Config of bbox regression loss.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(
+            self,
+            num_classes,
+            in_channels,
+            feat_channels=256,
+            approx_anchor_generator=dict(
+                type='AnchorGenerator',
+                octave_base_scale=8,
+                scales_per_octave=3,
+                ratios=[0.5, 1.0, 2.0],
+                strides=[4, 8, 16, 32, 64]),
+            square_anchor_generator=dict(
+                type='AnchorGenerator',
+                ratios=[1.0],
+                scales=[8],
+                strides=[4, 8, 16, 32, 64]),
+            anchor_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[.0, .0, .0, .0],
+                target_stds=[1.0, 1.0, 1.0, 1.0]
+            ),
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[.0, .0, .0, .0],
+                target_stds=[1.0, 1.0, 1.0, 1.0]
+            ),
+            reg_decoded_bbox=False,
+            deform_groups=4,
+            loc_filter_thr=0.01,
+            train_cfg=None,
+            test_cfg=None,
+            loss_loc=dict(
+                type='FocalLoss',
+                use_sigmoid=True,
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0),
+            loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                           loss_weight=1.0),
+            init_cfg=dict(type='Normal', layer='Conv2d', std=0.01,
+                          override=dict(type='Normal',
+                                        name='conv_loc',
+                                        std=0.01,
+                                        bias_prob=0.01))):  # yapf: disable
+        super(AnchorHead, self).__init__(init_cfg)  # skips AnchorHead.__init__
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self.feat_channels = feat_channels
+        self.deform_groups = deform_groups
+        self.loc_filter_thr = loc_filter_thr
+
+        # build both anchor generators; base scale and strides must agree
+        assert (approx_anchor_generator['octave_base_scale'] ==
+                square_anchor_generator['scales'][0])
+        assert (approx_anchor_generator['strides'] ==
+                square_anchor_generator['strides'])
+        self.approx_anchor_generator = build_prior_generator(
+            approx_anchor_generator)
+        self.square_anchor_generator = build_prior_generator(
+            square_anchor_generator)
+        self.approxs_per_octave = self.approx_anchor_generator \
+            .num_base_priors[0]
+
+        self.reg_decoded_bbox = reg_decoded_bbox
+
+        # one anchor per location
+        self.num_base_priors = self.square_anchor_generator.num_base_priors[0]
+
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        self.loc_focal_loss = loss_loc['type'] in ['FocalLoss']
+        self.sampling = loss_cls['type'] not in ['FocalLoss']
+        self.ga_sampling = train_cfg is not None and hasattr(
+            train_cfg, 'ga_sampler')
+        if self.use_sigmoid_cls:
+            self.cls_out_channels = self.num_classes
+        else:
+            self.cls_out_channels = self.num_classes + 1
+
+        # build bbox_coder
+        self.anchor_coder = build_bbox_coder(anchor_coder)
+        self.bbox_coder = build_bbox_coder(bbox_coder)
+
+        # build losses
+        self.loss_loc = build_loss(loss_loc)
+        self.loss_shape = build_loss(loss_shape)
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_bbox = build_loss(loss_bbox)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        if self.train_cfg:  # assigners and samplers need a train_cfg
+            self.assigner = build_assigner(self.train_cfg.assigner)
+            # use PseudoSampler when sampling is False
+            if self.sampling and hasattr(self.train_cfg, 'sampler'):
+                sampler_cfg = self.train_cfg.sampler
+            else:
+                sampler_cfg = dict(type='PseudoSampler')
+            self.sampler = build_sampler(sampler_cfg, context=self)
+
+            self.ga_assigner = build_assigner(self.train_cfg.ga_assigner)
+            if self.ga_sampling:
+                ga_sampler_cfg = self.train_cfg.ga_sampler
+            else:
+                ga_sampler_cfg = dict(type='PseudoSampler')
+            self.ga_sampler = build_sampler(ga_sampler_cfg, context=self)
+
+        self.fp16_enabled = False
+
+        self._init_layers()
+
+    @property
+    def num_anchors(self):  # deprecated; same value as `num_base_priors`
+        warnings.warn('DeprecationWarning: `num_anchors` is deprecated, '
+                      'please use "num_base_priors" instead')
+        return self.square_anchor_generator.num_base_priors[0]
+
+    def _init_layers(self):
+        """Create the loc/shape convs, feature adaption and masked convs."""
+        self.relu = nn.ReLU(inplace=True)
+        self.conv_loc = nn.Conv2d(self.in_channels, 1, 1)
+        shape_channels = self.num_base_priors * 2
+        self.conv_shape = nn.Conv2d(self.in_channels, shape_channels, 1)
+        self.feature_adaption = FeatureAdaption(
+            self.in_channels,
+            self.feat_channels,
+            kernel_size=3,
+            deform_groups=self.deform_groups)
+        cls_channels = self.num_base_priors * self.cls_out_channels
+        self.conv_cls = MaskedConv2d(self.feat_channels, cls_channels, 1)
+        self.conv_reg = MaskedConv2d(
+            self.feat_channels, self.num_base_priors * 4, 1)
+
+    def forward_single(self, x):
+        """Forward one feature level; returns (cls, bbox, shape, loc)."""
+        loc_pred = self.conv_loc(x)
+        shape_pred = self.conv_shape(x)
+        adapted = self.feature_adaption(x, shape_pred)
+        # masked conv is only used during inference for speed-up
+        mask = None
+        if not self.training:
+            mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr
+        cls_score = self.conv_cls(adapted, mask)
+        bbox_pred = self.conv_reg(adapted, mask)
+        return cls_score, bbox_pred, shape_pred, loc_pred
+
+    def forward(self, feats):  # runs forward_single on each entry of feats
+        return multi_apply(self.forward_single, feats)
+
+    def get_sampled_approxs(self, featmap_sizes, img_metas, device='cuda'):
+        """Get sampled approxs and inside flags according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            img_metas (list[dict]): Image meta info; the `pad_shape` and
+                `img_shape` entries are used.
+            device (torch.device | str): device for returned tensors
+
+        Returns:
+            tuple: approxes of each image, inside flags of each image
+        """
+        num_imgs = len(img_metas)
+
+        # since feature map sizes of all images are the same, we only compute
+        # approxes for one time
+        multi_level_approxs = self.approx_anchor_generator.grid_priors(
+            featmap_sizes, device=device)
+        approxs_list = [multi_level_approxs for _ in range(num_imgs)]
+
+        # for each image, we compute inside flags of multi level approxes
+        inside_flag_list = []
+        for img_meta in img_metas:
+            multi_level_flags = []
+
+            # obtain valid flags for each approx first
+            multi_level_approx_flags = self.approx_anchor_generator \
+                .valid_flags(featmap_sizes,
+                             img_meta['pad_shape'],
+                             device=device)
+
+            for level_idx, flags in enumerate(multi_level_approx_flags):
+                approxs = multi_level_approxs[level_idx]
+                inside_flags_list = []
+                for offset in range(self.approxs_per_octave):
+                    split_valid_flags = flags[offset::self.approxs_per_octave]
+                    split_approxs = approxs[offset::self.approxs_per_octave, :]
+                    inside_flags = anchor_inside_flags(
+                        split_approxs, split_valid_flags,
+                        img_meta['img_shape'][:2],
+                        self.train_cfg.allowed_border)
+                    inside_flags_list.append(inside_flags)
+                # inside_flag for a position is true if any anchor in this
+                # position is true
+                inside_flags = (
+                    torch.stack(inside_flags_list, 0).sum(dim=0) > 0)
+                multi_level_flags.append(inside_flags)
+            inside_flag_list.append(multi_level_flags)
+        return approxs_list, inside_flag_list
+
+    def get_anchors(self,
+                    featmap_sizes,
+                    shape_preds,
+                    loc_preds,
+                    img_metas,
+                    use_loc_filter=False,
+                    device='cuda'):
+        """Generate square anchors and guided anchors for every image.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            shape_preds (list[tensor]): Multi-level shape predictions,
+                indexed as ``shape_preds[level][img_id]``.
+            loc_preds (list[tensor]): Multi-level location predictions,
+                indexed as ``loc_preds[level][img_id]``.
+            img_metas (list[dict]): Image meta info; only its length is used.
+            use_loc_filter (bool): Use loc filter or not.
+            device (torch.device | str): device for returned tensors
+
+        Returns:
+            tuple: three lists with one entry per image, each entry holding
+                one item per level: the squares, the guided anchors and the
+                loc masks.
+        """
+        num_imgs = len(img_metas)
+        num_levels = len(featmap_sizes)
+
+        # the squares depend only on the feature map sizes, so they are
+        # generated once and the same list is shared by all images
+        multi_level_squares = self.square_anchor_generator.grid_priors(
+            featmap_sizes, device=device)
+        squares_list = [multi_level_squares] * num_imgs
+
+        # guided anchors and loc masks are computed per image and per level
+        guided_anchors_list = []
+        loc_mask_list = []
+        for img_id in range(num_imgs):
+            per_level = [
+                self._get_guided_anchors_single(
+                    multi_level_squares[lvl],
+                    shape_preds[lvl][img_id],
+                    loc_preds[lvl][img_id],
+                    use_loc_filter=use_loc_filter)
+                for lvl in range(num_levels)
+            ]
+            # unzip the (guided_anchors, loc_mask) pairs of this image
+            guided_anchors_list.append([anchors for anchors, _ in per_level])
+            loc_mask_list.append([loc_mask for _, loc_mask in per_level])
+
+        return squares_list, guided_anchors_list, loc_mask_list
+
+    def _get_guided_anchors_single(self,
+                                   squares,
+                                   shape_pred,
+                                   loc_pred,
+                                   use_loc_filter=False):
+        """Get guided anchors and loc masks for a single level.
+
+        Args:
+            squares (tensor): Squares of a single level.
+            shape_pred (tensor): Shape predictions of a single level.
+            loc_pred (tensor): Loc predictions of a single level.
+            use_loc_filter (bool): Use loc filter or not.
+
+        Returns:
+            tuple: guided anchors of the kept locations, flattened loc mask
+        """
+        # calculate location filtering mask
+        loc_pred = loc_pred.sigmoid().detach()
+        if use_loc_filter:
+            loc_mask = loc_pred >= self.loc_filter_thr
+        else:
+            loc_mask = loc_pred >= 0.0  # sigmoid output is never negative
+        mask = loc_mask.permute(1, 2, 0).expand(-1, -1, self.num_base_priors)
+        mask = mask.contiguous().view(-1)
+        # calculate guided anchors: only the last two deltas are predicted
+        squares = squares[mask]
+        anchor_deltas = shape_pred.permute(1, 2, 0).contiguous().view(
+            -1, 2).detach()[mask]
+        bbox_deltas = anchor_deltas.new_full(squares.size(), 0)
+        bbox_deltas[:, 2:] = anchor_deltas
+        guided_anchors = self.anchor_coder.decode(
+            squares, bbox_deltas, wh_ratio_clip=1e-6)
+        return guided_anchors, mask
+
+    def ga_loc_targets(self, gt_bboxes_list, featmap_sizes):
+        """Compute location targets for guided anchoring.
+
+        Each feature map is divided into positive, negative and ignore regions.
+        - positive regions: target 1, weight 1
+        - ignore regions: target 0, weight 0
+        - negative regions: target 0, weight 0.1
+
+        Args:
+            gt_bboxes_list (list[Tensor]): Gt bboxes of each image.
+            featmap_sizes (list[tuple]): Multi level sizes of each feature
+                maps.
+
+        Returns:
+            tuple: per-level loc targets, per-level loc weights, loc_avg_factor
+        """
+        anchor_scale = self.approx_anchor_generator.octave_base_scale
+        anchor_strides = self.approx_anchor_generator.strides
+        # Currently only supports same stride in x and y direction.
+        for stride in anchor_strides:
+            assert (stride[0] == stride[1])
+        anchor_strides = [stride[0] for stride in anchor_strides]
+
+        center_ratio = self.train_cfg.center_ratio
+        ignore_ratio = self.train_cfg.ignore_ratio
+        img_per_gpu = len(gt_bboxes_list)
+        num_lvls = len(featmap_sizes)
+        r1 = (1 - center_ratio) / 2
+        r2 = (1 - ignore_ratio) / 2
+        all_loc_targets = []
+        all_loc_weights = []
+        all_ignore_map = []
+        for lvl_id in range(num_lvls):
+            h, w = featmap_sizes[lvl_id]
+            loc_targets = torch.zeros(
+                img_per_gpu,
+                1,
+                h,
+                w,
+                device=gt_bboxes_list[0].device,
+                dtype=torch.float32)
+            loc_weights = torch.full_like(loc_targets, -1)  # -1: unassigned
+            ignore_map = torch.zeros_like(loc_targets)
+            all_loc_targets.append(loc_targets)
+            all_loc_weights.append(loc_weights)
+            all_ignore_map.append(ignore_map)
+        for img_id in range(img_per_gpu):
+            gt_bboxes = gt_bboxes_list[img_id]
+            scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
+                               (gt_bboxes[:, 3] - gt_bboxes[:, 1]))
+            min_anchor_size = scale.new_full(
+                (1, ), float(anchor_scale * anchor_strides[0]))
+            # assign gt bboxes to different feature levels w.r.t. their scales
+            target_lvls = torch.floor(
+                torch.log2(scale) - torch.log2(min_anchor_size) + 0.5)
+            target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long()
+            for gt_id in range(gt_bboxes.size(0)):
+                lvl = target_lvls[gt_id].item()
+                # rescaled to corresponding feature map
+                gt_ = gt_bboxes[gt_id, :4] / anchor_strides[lvl]
+                # calculate ignore regions
+                ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
+                    gt_, r2, featmap_sizes[lvl])
+                # calculate positive (center) regions
+                ctr_x1, ctr_y1, ctr_x2, ctr_y2 = calc_region(
+                    gt_, r1, featmap_sizes[lvl])
+                all_loc_targets[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
+                                     ctr_x1:ctr_x2 + 1] = 1
+                all_loc_weights[lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
+                                     ignore_x1:ignore_x2 + 1] = 0
+                all_loc_weights[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
+                                     ctr_x1:ctr_x2 + 1] = 1
+                # calculate ignore map on nearby low level feature
+                if lvl > 0:
+                    d_lvl = lvl - 1
+                    # rescaled to corresponding feature map
+                    gt_ = gt_bboxes[gt_id, :4] / anchor_strides[d_lvl]
+                    ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
+                        gt_, r2, featmap_sizes[d_lvl])
+                    all_ignore_map[d_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
+                                          ignore_x1:ignore_x2 + 1] = 1
+                # calculate ignore map on nearby high level feature
+                if lvl < num_lvls - 1:
+                    u_lvl = lvl + 1
+                    # rescaled to corresponding feature map
+                    gt_ = gt_bboxes[gt_id, :4] / anchor_strides[u_lvl]
+                    ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
+                        gt_, r2, featmap_sizes[u_lvl])
+                    all_ignore_map[u_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
+                                          ignore_x1:ignore_x2 + 1] = 1
+        for lvl_id in range(num_lvls):
+            # ignore negative regions w.r.t. ignore map
+            all_loc_weights[lvl_id][(all_loc_weights[lvl_id] < 0)
+                                    & (all_ignore_map[lvl_id] > 0)] = 0
+            # set negative regions with weight 0.1
+            all_loc_weights[lvl_id][all_loc_weights[lvl_id] < 0] = 0.1
+        # loc average factor to balance loss: total locations / 200
+        loc_avg_factor = sum(
+            [t.size(0) * t.size(-1) * t.size(-2)
+             for t in all_loc_targets]) / 200
+        return all_loc_targets, all_loc_weights, loc_avg_factor
+
+    def _ga_shape_target_single(self,
+                                flat_approxs,
+                                inside_flags,
+                                flat_squares,
+                                gt_bboxes,
+                                gt_bboxes_ignore,
+                                img_meta,
+                                unmap_outputs=True):
+        """Compute guided anchoring targets.
+
+        This function returns sampled anchors and gt bboxes directly
+        rather than calculates regression targets.
+
+        Args:
+            flat_approxs (Tensor): flat approxs of a single image,
+                shape (approxs_per_octave * n, 4)
+            inside_flags (Tensor): inside flags of one image, shape (n, ).
+            flat_squares (Tensor): flat squares of a single image,
+                shape (n, 4)
+            gt_bboxes (Tensor): Ground truth bboxes of a single image.
+            gt_bboxes_ignore (Tensor | None): Gt bboxes to be ignored,
+                forwarded to the assigner.
+            img_meta (dict): Meta info of a single image (unused here).
+            unmap_outputs (bool): unmap outputs or not.
+
+        Returns:
+            tuple: bbox_anchors, bbox_gts, bbox_weights, pos_inds, neg_inds;
+                five ``None`` values when no entry of inside_flags is set.
+        """
+        if not inside_flags.any():
+            return (None, ) * 5
+        # assign gt and sample anchors
+        expand_inside_flags = inside_flags[:, None].expand(
+            -1, self.approxs_per_octave).reshape(-1)
+        approxs = flat_approxs[expand_inside_flags, :]
+        squares = flat_squares[inside_flags, :]
+
+        assign_result = self.ga_assigner.assign(approxs, squares,
+                                                self.approxs_per_octave,
+                                                gt_bboxes, gt_bboxes_ignore)
+        sampling_result = self.ga_sampler.sample(assign_result, squares,
+                                                 gt_bboxes)
+
+        bbox_anchors = torch.zeros_like(squares)
+        bbox_gts = torch.zeros_like(squares)
+        bbox_weights = torch.zeros_like(squares)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            bbox_anchors[pos_inds, :] = sampling_result.pos_bboxes
+            bbox_gts[pos_inds, :] = sampling_result.pos_gt_bboxes
+            bbox_weights[pos_inds, :] = 1.0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_squares.size(0)
+            bbox_anchors = unmap(bbox_anchors, num_total_anchors, inside_flags)
+            bbox_gts = unmap(bbox_gts, num_total_anchors, inside_flags)
+            bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
+
+        return (bbox_anchors, bbox_gts, bbox_weights, pos_inds, neg_inds)
+
+    def ga_shape_targets(self,
+                         approx_list,
+                         inside_flag_list,
+                         square_list,
+                         gt_bboxes_list,
+                         img_metas,
+                         gt_bboxes_ignore_list=None,
+                         unmap_outputs=True):
+        """Compute guided anchoring targets.
+
+        Args:
+            approx_list (list[list]): Multi level approxs of each image.
+            inside_flag_list (list[list]): Multi level inside flags per image.
+            square_list (list[list]): Multi level squares of each image.
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+            img_metas (list[dict]): Meta info of each image.
+            gt_bboxes_ignore_list (list[Tensor]): ignore list of gt bboxes.
+            unmap_outputs (bool): unmap outputs or not.
+
+        Returns:
+            tuple | None: bbox_anchors_list, bbox_gts_list, bbox_weights_list,
+                num_total_pos, num_total_neg; None if an image has no targets.
+        """
+        num_imgs = len(img_metas)
+        assert len(approx_list) == len(inside_flag_list) == len(
+            square_list) == num_imgs
+        # anchor number of multi levels
+        num_level_squares = [squares.size(0) for squares in square_list[0]]
+        # concat all level anchors and flags to a single tensor
+        inside_flag_flat_list = []
+        approx_flat_list = []
+        square_flat_list = []
+        for i in range(num_imgs):
+            assert len(square_list[i]) == len(inside_flag_list[i])
+            inside_flag_flat_list.append(torch.cat(inside_flag_list[i]))
+            approx_flat_list.append(torch.cat(approx_list[i]))
+            square_flat_list.append(torch.cat(square_list[i]))
+
+        # compute targets for each image
+        if gt_bboxes_ignore_list is None:
+            gt_bboxes_ignore_list = [None for _ in range(num_imgs)]
+        (all_bbox_anchors, all_bbox_gts, all_bbox_weights, pos_inds_list,
+         neg_inds_list) = multi_apply(
+             self._ga_shape_target_single,
+             approx_flat_list,
+             inside_flag_flat_list,
+             square_flat_list,
+             gt_bboxes_list,
+             gt_bboxes_ignore_list,
+             img_metas,
+             unmap_outputs=unmap_outputs)
+        # no valid anchors
+        if any([bbox_anchors is None for bbox_anchors in all_bbox_anchors]):
+            return None
+        # sampled anchors of all images (each image counts at least 1)
+        num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list])
+        num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list])
+        # split targets to a list w.r.t. multiple levels
+        bbox_anchors_list = images_to_levels(all_bbox_anchors,
+                                             num_level_squares)
+        bbox_gts_list = images_to_levels(all_bbox_gts, num_level_squares)
+        bbox_weights_list = images_to_levels(all_bbox_weights,
+                                             num_level_squares)
+        return (bbox_anchors_list, bbox_gts_list, bbox_weights_list,
+                num_total_pos, num_total_neg)
+
+    def loss_shape_single(self, shape_pred, bbox_anchors, bbox_gts,
+                          anchor_weights, anchor_total_num):
+        """Compute the anchor shape loss of a single level."""
+        shape_pred = shape_pred.permute(0, 2, 3, 1).contiguous().view(-1, 2)
+        bbox_anchors = bbox_anchors.contiguous().view(-1, 4)
+        bbox_gts = bbox_gts.contiguous().view(-1, 4)
+        anchor_weights = anchor_weights.contiguous().view(-1, 4)
+        bbox_deltas = bbox_anchors.new_full(bbox_anchors.size(), 0)
+        bbox_deltas[:, 2:] += shape_pred  # first two delta columns stay 0
+        inds = torch.nonzero(  # keep positive-weight rows (faster loss)
+            anchor_weights[:, 0] > 0, as_tuple=False).squeeze(1)
+        bbox_deltas_ = bbox_deltas[inds]
+        bbox_anchors_ = bbox_anchors[inds]
+        bbox_gts_ = bbox_gts[inds]
+        anchor_weights_ = anchor_weights[inds]
+        pred_anchors_ = self.anchor_coder.decode(
+            bbox_anchors_, bbox_deltas_, wh_ratio_clip=1e-6)
+        loss_shape = self.loss_shape(
+            pred_anchors_,
+            bbox_gts_,
+            anchor_weights_,
+            avg_factor=anchor_total_num)
+        return loss_shape
+
+    def loss_loc_single(self, loc_pred, loc_target, loc_weight,
+                        loc_avg_factor):
+        """Apply ``self.loss_loc`` to the flattened inputs of one level."""
+        flat_pred = loc_pred.reshape(-1, 1)
+        flat_target = loc_target.reshape(-1).long()
+        flat_weight = loc_weight.reshape(-1)
+        return self.loss_loc(
+            flat_pred, flat_target, flat_weight, avg_factor=loc_avg_factor)
+
+    @force_fp32(
+        apply_to=('cls_scores', 'bbox_preds', 'shape_preds', 'loc_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             shape_preds,
+             loc_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute the cls, bbox, shape and loc losses of the head."""
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.approx_anchor_generator.num_levels
+        device = cls_scores[0].device
+
+        # get loc targets
+        loc_targets, loc_weights, loc_avg_factor = self.ga_loc_targets(
+            gt_bboxes, featmap_sizes)
+
+        # get sampled approxes
+        approxs_list, inside_flag_list = self.get_sampled_approxs(
+            featmap_sizes, img_metas, device=device)
+        # get squares and guided anchors
+        squares_list, guided_anchors_list, _ = self.get_anchors(
+            featmap_sizes, shape_preds, loc_preds, img_metas, device=device)
+
+        # get shape targets (gt_bboxes_ignore is not forwarded here)
+        shape_targets = self.ga_shape_targets(approxs_list, inside_flag_list,
+                                              squares_list, gt_bboxes,
+                                              img_metas)
+        if shape_targets is None:
+            return None
+        (bbox_anchors_list, bbox_gts_list, anchor_weights_list, anchor_fg_num,
+         anchor_bg_num) = shape_targets
+        anchor_total_num = (
+            anchor_fg_num if not self.ga_sampling else anchor_fg_num +
+            anchor_bg_num)
+
+        # get anchor targets
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+        cls_reg_targets = self.get_targets(
+            guided_anchors_list,
+            inside_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels)
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        num_total_samples = (
+            num_total_pos + num_total_neg if self.sampling else num_total_pos)
+
+        # anchor number of multi levels
+        num_level_anchors = [
+            anchors.size(0) for anchors in guided_anchors_list[0]
+        ]
+        # concat all level anchors to a single tensor
+        concat_anchor_list = []
+        for i in range(len(guided_anchors_list)):
+            concat_anchor_list.append(torch.cat(guided_anchors_list[i]))
+        all_anchor_list = images_to_levels(concat_anchor_list,
+                                           num_level_anchors)
+
+        # get classification and bbox regression losses
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_single,
+            cls_scores,
+            bbox_preds,
+            all_anchor_list,
+            labels_list,
+            label_weights_list,
+            bbox_targets_list,
+            bbox_weights_list,
+            num_total_samples=num_total_samples)
+
+        # get anchor location loss
+        losses_loc = []
+        for i in range(len(loc_preds)):
+            loss_loc = self.loss_loc_single(
+                loc_preds[i],
+                loc_targets[i],
+                loc_weights[i],
+                loc_avg_factor=loc_avg_factor)
+            losses_loc.append(loss_loc)
+
+        # get anchor shape loss
+        losses_shape = []
+        for i in range(len(shape_preds)):
+            loss_shape = self.loss_shape_single(
+                shape_preds[i],
+                bbox_anchors_list[i],
+                bbox_gts_list[i],
+                anchor_weights_list[i],
+                anchor_total_num=anchor_total_num)
+            losses_shape.append(loss_shape)
+
+        return dict(
+            loss_cls=losses_cls,
+            loss_bbox=losses_bbox,
+            loss_shape=losses_shape,
+            loss_loc=losses_loc)
+
+    @force_fp32(
+        apply_to=('cls_scores', 'bbox_preds', 'shape_preds', 'loc_preds'))
+    def get_bboxes(self,
+                   cls_scores,
+                   bbox_preds,
+                   shape_preds,
+                   loc_preds,
+                   img_metas,
+                   cfg=None,
+                   rescale=False):
+        """Get the detection results of each image from the head outputs."""
+        assert len(cls_scores) == len(bbox_preds) == len(shape_preds) == len(
+            loc_preds)
+        num_levels = len(cls_scores)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        device = cls_scores[0].device
+        _, guided_anchors, loc_masks = self.get_anchors(  # guided anchors
+            featmap_sizes,
+            shape_preds,
+            loc_preds,
+            img_metas,
+            use_loc_filter=not self.training,  # filter only in eval mode
+            device=device)
+        result_list = []
+        for img_id in range(len(img_metas)):
+            cls_score_list = [
+                cls_scores[i][img_id].detach() for i in range(num_levels)
+            ]
+            bbox_pred_list = [
+                bbox_preds[i][img_id].detach() for i in range(num_levels)
+            ]
+            guided_anchor_list = [
+                guided_anchors[img_id][i].detach() for i in range(num_levels)
+            ]
+            loc_mask_list = [
+                loc_masks[img_id][i].detach() for i in range(num_levels)
+            ]
+            img_shape = img_metas[img_id]['img_shape']
+            scale_factor = img_metas[img_id]['scale_factor']
+            proposals = self._get_bboxes_single(cls_score_list, bbox_pred_list,
+                                                guided_anchor_list,
+                                                loc_mask_list, img_shape,
+                                                scale_factor, cfg, rescale)
+            result_list.append(proposals)
+        return result_list
+
+    def _get_bboxes_single(self,
+                           cls_scores,
+                           bbox_preds,
+                           mlvl_anchors,
+                           mlvl_masks,
+                           img_shape,
+                           scale_factor,
+                           cfg,
+                           rescale=False):
+        """Decode and NMS the guided-anchor predictions of a single image.
+
+        Levels whose loc mask keeps no location are skipped. Returns the
+        ``(det_bboxes, det_labels)`` pair from ``multiclass_nms``, or an
+        empty ``(0, 5)`` / ``(0, )`` pair when every level was skipped.
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
+        mlvl_bboxes = []
+        mlvl_scores = []
+        for cls_score, bbox_pred, anchors, mask in zip(
+                cls_scores, bbox_preds, mlvl_anchors, mlvl_masks):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            # skip this level if no location is kept
+            if mask.sum() == 0:
+                continue
+            # reshape scores and bbox_pred
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                scores = cls_score.softmax(-1)
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+            # keep masked rows; anchors were already filtered in get_anchors()
+            scores = scores[mask, :]
+            bbox_pred = bbox_pred[mask, :]
+            # filter anchors, bbox_pred, scores w.r.t. scores
+            nms_pre = cfg.get('nms_pre', -1)
+            if nms_pre > 0 and scores.shape[0] > nms_pre:
+                if self.use_sigmoid_cls:
+                    max_scores, _ = scores.max(dim=1)
+                else:
+                    # FG labels are [0, num_class-1] since mmdet v2.0 and the
+                    # BG cat_id is num_class, so drop the last column
+                    max_scores, _ = scores[:, :-1].max(dim=1)
+                _, topk_inds = max_scores.topk(nms_pre)
+                anchors = anchors[topk_inds, :]
+                bbox_pred = bbox_pred[topk_inds, :]
+                scores = scores[topk_inds, :]
+            bboxes = self.bbox_coder.decode(
+                anchors, bbox_pred, max_shape=img_shape)
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+        if not mlvl_bboxes:
+            # every level was skipped; torch.cat rejects an empty list, so
+            # return empty detections instead of raising
+            return (mlvl_anchors[0].new_zeros((0, 5)),
+                    mlvl_anchors[0].new_zeros((0, ), dtype=torch.long))
+        mlvl_bboxes = torch.cat(mlvl_bboxes)
+        if rescale:
+            mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
+        mlvl_scores = torch.cat(mlvl_scores)
+        if self.use_sigmoid_cls:
+            # Add a dummy background class to the backend when using sigmoid
+            # (FG labels are [0, num_class-1], BG cat_id is num_class)
+            padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
+            mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
+        return multiclass_nms(mlvl_bboxes, mlvl_scores, cfg.score_thr,
+                              cfg.nms, cfg.max_per_img)
diff --git a/mmdet/models/dense_heads/lad_head.py b/mmdet/models/dense_heads/lad_head.py
new file mode 100755
index 0000000..85273bc
--- /dev/null
+++ b/mmdet/models/dense_heads/lad_head.py
@@ -0,0 +1,232 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.runner import force_fp32
+
+from mmdet.core import bbox_overlaps, multi_apply
+from ..builder import HEADS
+from .paa_head import PAAHead, levels_to_images
+
+
+@HEADS.register_module()
+class LADHead(PAAHead):
+    """Label Assignment Head from the paper: `Improving Object Detection by
+    Label Assignment Distillation <https://arxiv.org/pdf/2108.10520.pdf>`_"""
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'iou_preds'))
+    def get_label_assignment(self,
+                             cls_scores,
+                             bbox_preds,
+                             iou_preds,
+                             gt_bboxes,
+                             gt_labels,
+                             img_metas,
+                             gt_bboxes_ignore=None):
+        """Get label assignment (from teacher).
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level.
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            iou_preds (list[Tensor]): iou_preds for each scale
+                level with shape (N, num_anchors * 1, H, W)
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (list[Tensor] | None): Specify which bounding
+                boxes can be ignored when are computing the loss.
+
+        Returns:
+            tuple: Returns a tuple containing label assignment variables.
+
+                - labels (Tensor): Labels of all anchors, each with
+                    shape (num_anchors,).
+                - labels_weight (Tensor): Label weights of all anchor.
+                    each with shape (num_anchors,).
+                - bboxes_target (Tensor): BBox targets of all anchors.
+                    each with shape (num_anchors, 4).
+                - bboxes_weight: BBox weights exactly as returned by
+                    ``get_targets`` (not reassigned or flattened here).
+                - pos_inds_flatten (Tensor): Contains all index of positive
+                    sample in all anchor.
+                - pos_anchors (Tensor | None): Positive anchors, if any.
+                - num_pos (int): Number of positive anchors.
+        """
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels,
+        )
+        (labels, labels_weight, bboxes_target, bboxes_weight, pos_inds,
+         pos_gt_index) = cls_reg_targets
+        cls_scores = levels_to_images(cls_scores)
+        cls_scores = [
+            item.reshape(-1, self.cls_out_channels) for item in cls_scores
+        ]
+        bbox_preds = levels_to_images(bbox_preds)
+        bbox_preds = [item.reshape(-1, 4) for item in bbox_preds]
+        pos_losses_list, = multi_apply(self.get_pos_loss, anchor_list,
+                                       cls_scores, bbox_preds, labels,
+                                       labels_weight, bboxes_target,
+                                       bboxes_weight, pos_inds)
+
+        with torch.no_grad():
+            reassign_labels, reassign_label_weight, \
+                reassign_bbox_weights, num_pos = multi_apply(
+                    self.paa_reassign,
+                    pos_losses_list,
+                    labels,
+                    labels_weight,
+                    bboxes_weight,
+                    pos_inds,
+                    pos_gt_index,
+                    anchor_list)
+            num_pos = sum(num_pos)
+        # flatten the per-image tensor lists into single tensors
+        labels = torch.cat(reassign_labels, 0).view(-1)
+        flatten_anchors = torch.cat(
+            [torch.cat(item, 0) for item in anchor_list])
+        labels_weight = torch.cat(reassign_label_weight, 0).view(-1)
+        bboxes_target = torch.cat(bboxes_target,
+                                  0).view(-1, bboxes_target[0].size(-1))
+
+        pos_inds_flatten = ((labels >= 0)
+                            &
+                            (labels < self.num_classes)).nonzero().reshape(-1)
+
+        if num_pos:
+            pos_anchors = flatten_anchors[pos_inds_flatten]
+        else:
+            pos_anchors = None
+
+        label_assignment_results = (labels, labels_weight, bboxes_target,
+                                    bboxes_weight, pos_inds_flatten,
+                                    pos_anchors, num_pos)
+        return label_assignment_results
+
+    def forward_train(self,
+                      x,
+                      label_assignment_results,
+                      img_metas,
+                      gt_bboxes,
+                      gt_labels=None,
+                      gt_bboxes_ignore=None,
+                      **kwargs):
+        """Forward train with the available label assignment (student receives
+        from teacher).
+
+        Args:
+            x (list[Tensor]): Features from FPN.
+            label_assignment_results (tuple): As the outputs defined in the
+                function `self.get_label_assignment`.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes (Tensor): Ground truth bboxes of the image,
+                shape (num_gts, 4).
+            gt_labels (Tensor): Ground truth labels of each box,
+                shape (num_gts,).
+            gt_bboxes_ignore (Tensor): Ground truth bboxes to be
+                ignored, shape (num_ignored_gts, 4).
+
+        Returns:
+            losses: (dict[str, Tensor]): A dictionary of loss components.
+        """
+        outs = self(x)
+        if gt_labels is None:
+            loss_inputs = outs + (gt_bboxes, img_metas)
+        else:
+            loss_inputs = outs + (gt_bboxes, gt_labels, img_metas)
+        losses = self.loss(
+            *loss_inputs,
+            gt_bboxes_ignore=gt_bboxes_ignore,
+            label_assignment_results=label_assignment_results)
+        return losses
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'iou_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             iou_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None,
+             label_assignment_results=None):
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            iou_preds (list[Tensor]): iou_preds for each scale
+                level with shape (N, num_anchors * 1, H, W)
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (list[Tensor] | None): Specify which bounding
+                boxes can be ignored when are computing the loss.
+            label_assignment_results (tuple): As the outputs defined in the
+                function `self.get_label_assignment`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+
+        (labels, labels_weight, bboxes_target, bboxes_weight, pos_inds_flatten,
+         pos_anchors, num_pos) = label_assignment_results
+
+        cls_scores = levels_to_images(cls_scores)
+        cls_scores = [
+            item.reshape(-1, self.cls_out_channels) for item in cls_scores
+        ]
+        bbox_preds = levels_to_images(bbox_preds)
+        bbox_preds = [item.reshape(-1, 4) for item in bbox_preds]
+        iou_preds = levels_to_images(iou_preds)
+        iou_preds = [item.reshape(-1, 1) for item in iou_preds]
+
+        # convert all tensor list to a flatten tensor
+        cls_scores = torch.cat(cls_scores, 0).view(-1, cls_scores[0].size(-1))
+        bbox_preds = torch.cat(bbox_preds, 0).view(-1, bbox_preds[0].size(-1))
+        iou_preds = torch.cat(iou_preds, 0).view(-1, iou_preds[0].size(-1))
+
+        losses_cls = self.loss_cls(
+            cls_scores,
+            labels,
+            labels_weight,
+            avg_factor=max(num_pos, len(img_metas)))  # avoid num_pos=0
+        if num_pos:
+            pos_bbox_pred = self.bbox_coder.decode(
+                pos_anchors, bbox_preds[pos_inds_flatten])
+            pos_bbox_target = bboxes_target[pos_inds_flatten]
+            iou_target = bbox_overlaps(
+                pos_bbox_pred.detach(), pos_bbox_target, is_aligned=True)
+            losses_iou = self.loss_centerness(
+                iou_preds[pos_inds_flatten],
+                iou_target.unsqueeze(-1),
+                avg_factor=num_pos)
+            losses_bbox = self.loss_bbox(
+                pos_bbox_pred, pos_bbox_target, avg_factor=num_pos)
+
+        else:
+            losses_iou = iou_preds.sum() * 0
+            losses_bbox = bbox_preds.sum() * 0
+
+        return dict(
+            loss_cls=losses_cls, loss_bbox=losses_bbox, loss_iou=losses_iou)
diff --git a/mmdet/models/dense_heads/ld_head.py b/mmdet/models/dense_heads/ld_head.py
new file mode 100755
index 0000000..c5a945f
--- /dev/null
+++ b/mmdet/models/dense_heads/ld_head.py
@@ -0,0 +1,261 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.runner import force_fp32
+
+from mmdet.core import bbox_overlaps, multi_apply, reduce_mean
+from ..builder import HEADS, build_loss
+from .gfl_head import GFLHead
+
+
+@HEADS.register_module()
+class LDHead(GFLHead):
+    """Localization distillation Head.
+
+    It utilizes the learned bbox distributions to transfer the localization
+    dark knowledge from teacher to student. Original paper: `Localization
+    Distillation for Object Detection. <https://arxiv.org/abs/2102.12252>`_
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        loss_ld (dict): Config of Localization Distillation Loss (LD),
+            T is the temperature for distillation.
+        **kwargs: Extra keyword arguments forwarded to ``GFLHead.__init__``.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 loss_ld=dict(
+                     type='LocalizationDistillationLoss',
+                     loss_weight=0.25,
+                     T=10),
+                 **kwargs):
+        super(LDHead, self).__init__(num_classes, in_channels, **kwargs)
+        self.loss_ld = build_loss(loss_ld)
+
+    def loss_single(self, anchors, cls_score, bbox_pred, labels, label_weights,
+                    bbox_targets, stride, soft_targets, num_total_samples):
+        """Compute loss of a single scale level.
+
+        Args:
+            anchors (Tensor): Box reference for each scale level with shape
+                (N, num_total_anchors, 4).
+            cls_score (Tensor): Cls and quality joint scores for each scale
+                level has shape (N, num_classes, H, W).
+            bbox_pred (Tensor): Box distribution logits for this level with
+                shape (N, 4*(n+1), H, W); n is max value of the integral set.
+            labels (Tensor): Labels of each anchor, (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor, same shape.
+            bbox_targets (Tensor): BBox regression targets of each anchor
+                with shape (N, num_total_anchors, 4).
+            stride (tuple): Stride in this scale level.
+            soft_targets (Tensor): Teacher box distribution logits for this
+                level; permuted and flattened exactly like ``bbox_pred``.
+            num_total_samples (int): Number of positive samples that is
+                reduced over all GPUs.
+
+        Returns:
+            tuple[Tensor]: ``(loss_cls, loss_bbox, loss_dfl, loss_ld,
+                weight_sum)`` where ``weight_sum`` is ``weight_targets.sum()``.
+        """
+        assert stride[0] == stride[1], 'h stride is not equal to w stride!'
+        anchors = anchors.reshape(-1, 4)
+        cls_score = cls_score.permute(0, 2, 3,
+                                      1).reshape(-1, self.cls_out_channels)
+        bbox_pred = bbox_pred.permute(0, 2, 3,
+                                      1).reshape(-1, 4 * (self.reg_max + 1))
+        soft_targets = soft_targets.permute(0, 2, 3,
+                                            1).reshape(-1,
+                                                       4 * (self.reg_max + 1))
+
+        bbox_targets = bbox_targets.reshape(-1, 4)
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = ((labels >= 0)
+                    & (labels < bg_class_ind)).nonzero().squeeze(1)
+        score = label_weights.new_zeros(labels.shape)
+
+        if len(pos_inds) > 0:
+            pos_bbox_targets = bbox_targets[pos_inds]
+            pos_bbox_pred = bbox_pred[pos_inds]
+            pos_anchors = anchors[pos_inds]
+            pos_anchor_centers = self.anchor_center(pos_anchors) / stride[0]
+
+            weight_targets = cls_score.detach().sigmoid()
+            weight_targets = weight_targets.max(dim=1)[0][pos_inds]
+            pos_bbox_pred_corners = self.integral(pos_bbox_pred)
+            pos_decode_bbox_pred = self.bbox_coder.decode(
+                pos_anchor_centers, pos_bbox_pred_corners)
+            pos_decode_bbox_targets = pos_bbox_targets / stride[0]
+            score[pos_inds] = bbox_overlaps(
+                pos_decode_bbox_pred.detach(),
+                pos_decode_bbox_targets,
+                is_aligned=True)
+            pred_corners = pos_bbox_pred.reshape(-1, self.reg_max + 1)
+            pos_soft_targets = soft_targets[pos_inds]
+            soft_corners = pos_soft_targets.reshape(-1, self.reg_max + 1)
+
+            target_corners = self.bbox_coder.encode(pos_anchor_centers,
+                                                    pos_decode_bbox_targets,
+                                                    self.reg_max).reshape(-1)
+
+            # regression loss; divided by the summed weights later in `loss`
+            loss_bbox = self.loss_bbox(
+                pos_decode_bbox_pred,
+                pos_decode_bbox_targets,
+                weight=weight_targets,
+                avg_factor=1.0)
+
+            # dfl loss; also divided by the summed weights later in `loss`
+            loss_dfl = self.loss_dfl(
+                pred_corners,
+                target_corners,
+                weight=weight_targets[:, None].expand(-1, 4).reshape(-1),
+                avg_factor=4.0)
+
+            # ld loss against the teacher logits; `loss` returns it as is
+            loss_ld = self.loss_ld(
+                pred_corners,
+                soft_corners,
+                weight=weight_targets[:, None].expand(-1, 4).reshape(-1),
+                avg_factor=4.0)
+        else:
+            loss_ld = bbox_pred.sum() * 0
+            loss_bbox = bbox_pred.sum() * 0
+            loss_dfl = bbox_pred.sum() * 0
+            weight_targets = bbox_pred.new_tensor(0)
+
+        # cls (qfl) loss
+        loss_cls = self.loss_cls(
+            cls_score, (labels, score),
+            weight=label_weights,
+            avg_factor=num_total_samples)
+
+        return loss_cls, loss_bbox, loss_dfl, loss_ld, weight_targets.sum()
+
+    def forward_train(self,
+                      x,
+                      out_teacher,
+                      img_metas,
+                      gt_bboxes,
+                      gt_labels=None,
+                      gt_bboxes_ignore=None,
+                      proposal_cfg=None,
+                      **kwargs):
+        """Forward student features and compute the distillation losses.
+
+        Args:
+            x (list[Tensor]): Features from FPN.
+            out_teacher (tuple): Teacher head outputs. Only ``out_teacher[1]``
+                is read; it is passed to :meth:`loss` as ``soft_target``.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes (Tensor): Ground truth bboxes of the image,
+                shape (num_gts, 4).
+            gt_labels (Tensor): Ground truth labels of each box,
+                shape (num_gts,). May be None.
+            gt_bboxes_ignore (Tensor): Ground truth bboxes to be
+                ignored, shape (num_ignored_gts, 4).
+            proposal_cfg (mmcv.Config): Test / postprocessing configuration.
+                If None, no proposals are generated.
+
+        Returns:
+            dict[str, Tensor] | tuple[dict, list]: The loss dict alone when
+                ``proposal_cfg`` is None, else ``(losses, proposal_list)``.
+        """
+        outs = self(x)
+        soft_target = out_teacher[1]
+        # ``loss`` requires ``gt_labels`` positionally, so always pass it (even
+        # None); dropping it would misalign ``soft_target``/``img_metas``.
+        loss_inputs = outs + (gt_bboxes, gt_labels, soft_target, img_metas)
+        losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
+        if proposal_cfg is None:
+            return losses
+        proposal_list = self.get_bboxes(*outs, img_metas, cfg=proposal_cfg)
+        return losses, proposal_list
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             soft_target,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Cls and quality scores for each scale
+                level has shape (N, num_classes, H, W).
+            bbox_preds (list[Tensor]): Box distribution logits for each scale
+                level, (N, 4*(n+1), H, W); n is max value of the integral set.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            soft_target (list[Tensor]): Teacher box distribution logits, one
+                entry per scale level, consumed alongside ``bbox_preds``.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (list[Tensor] | None): specify which bounding
+                boxes can be ignored when computing the loss.
+
+        Returns:
+            dict[str, Tensor] | None: A dictionary of loss components, or
+                None when ``get_targets`` returns None.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels)
+        if cls_reg_targets is None:
+            return None
+
+        (anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets
+
+        num_total_samples = reduce_mean(
+            torch.tensor(num_total_pos, dtype=torch.float,
+                         device=device)).item()
+        num_total_samples = max(num_total_samples, 1.0)
+
+        losses_cls, losses_bbox, losses_dfl, losses_ld, \
+            avg_factor = multi_apply(
+                self.loss_single,
+                anchor_list,
+                cls_scores,
+                bbox_preds,
+                labels_list,
+                label_weights_list,
+                bbox_targets_list,
+                self.prior_generator.strides,
+                soft_target,
+                num_total_samples=num_total_samples)
+
+        # only the bbox/dfl losses are divided by the summed weight targets
+        avg_factor = sum(avg_factor) + 1e-6
+        avg_factor = reduce_mean(avg_factor).item()
+        losses_bbox = [x / avg_factor for x in losses_bbox]
+        losses_dfl = [x / avg_factor for x in losses_dfl]
+        return dict(
+            loss_cls=losses_cls,
+            loss_bbox=losses_bbox,
+            loss_dfl=losses_dfl,
+            loss_ld=losses_ld)
diff --git a/mmdet/models/dense_heads/mask2former_head.py b/mmdet/models/dense_heads/mask2former_head.py
new file mode 100755
index 0000000..59047bd
--- /dev/null
+++ b/mmdet/models/dense_heads/mask2former_head.py
@@ -0,0 +1,430 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Conv2d, build_plugin_layer, caffe2_xavier_init
+from mmcv.cnn.bricks.transformer import (build_positional_encoding,
+                                         build_transformer_layer_sequence)
+from mmcv.ops import point_sample
+from mmcv.runner import ModuleList
+
+from mmdet.core import build_assigner, build_sampler, reduce_mean
+from mmdet.models.utils import get_uncertain_point_coords_with_randomness
+from ..builder import HEADS, build_loss
+from .anchor_free_head import AnchorFreeHead
+from .maskformer_head import MaskFormerHead
+
+
+@HEADS.register_module()
+class Mask2FormerHead(MaskFormerHead):
+    """Implements the Mask2Former head.
+
+    See `Masked-attention Mask Transformer for Universal Image
+    Segmentation <https://arxiv.org/pdf/2112.01527>`_ for details.
+
+    Args:
+        in_channels (list[int]): Number of channels in the input feature map.
+        feat_channels (int): Number of channels for features.
+        out_channels (int): Number of channels for output.
+        num_things_classes (int): Number of things. Defaults to 80.
+        num_stuff_classes (int): Number of stuff. Defaults to 53.
+        num_queries (int): Number of query in Transformer decoder.
+        num_transformer_feat_level (int): Number of feature levels fed to the
+            transformer decoder; must equal the pixel decoder's ``num_levels``.
+        pixel_decoder (:obj:`mmcv.ConfigDict` | dict): Config for pixel
+            decoder. Must be provided despite the None default.
+        enforce_decoder_input_project (bool, optional): Whether to add
+            a layer to change the embed_dim of transformer encoder in
+            pixel decoder to the embed_dim of transformer decoder.
+            Defaults to False.
+        transformer_decoder (:obj:`mmcv.ConfigDict` | dict): Config for
+            transformer decoder. Must be provided despite the None default.
+        positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for
+            transformer decoder position encoding. Defaults to None.
+        loss_cls (:obj:`mmcv.ConfigDict` | dict): Config of the classification
+            loss. Must be provided; its ``class_weight`` is read directly.
+        loss_mask (:obj:`mmcv.ConfigDict` | dict): Config of the mask loss.
+            Defaults to None.
+        loss_dice (:obj:`mmcv.ConfigDict` | dict): Config of the dice loss.
+            Defaults to None.
+        train_cfg (:obj:`mmcv.ConfigDict` | dict): Training config of the head.
+        test_cfg (:obj:`mmcv.ConfigDict` | dict): Testing config of the head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 feat_channels,
+                 out_channels,
+                 num_things_classes=80,
+                 num_stuff_classes=53,
+                 num_queries=100,
+                 num_transformer_feat_level=3,
+                 pixel_decoder=None,
+                 enforce_decoder_input_project=False,
+                 transformer_decoder=None,
+                 positional_encoding=None,
+                 loss_cls=None,
+                 loss_mask=None,
+                 loss_dice=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None,
+                 **kwargs):
+        # bypass AnchorFreeHead.__init__: resolve __init__ past it in the MRO
+        super(AnchorFreeHead, self).__init__(init_cfg)
+        self.num_things_classes = num_things_classes
+        self.num_stuff_classes = num_stuff_classes
+        self.num_classes = self.num_things_classes + self.num_stuff_classes
+        self.num_queries = num_queries
+        self.num_transformer_feat_level = num_transformer_feat_level
+        self.num_heads = transformer_decoder.transformerlayers.\
+            attn_cfgs.num_heads
+        self.num_transformer_decoder_layers = transformer_decoder.num_layers
+        assert pixel_decoder.encoder.transformerlayers.\
+            attn_cfgs.num_levels == num_transformer_feat_level
+        pixel_decoder_ = copy.deepcopy(pixel_decoder)
+        pixel_decoder_.update(
+            in_channels=in_channels,
+            feat_channels=feat_channels,
+            out_channels=out_channels)
+        self.pixel_decoder = build_plugin_layer(pixel_decoder_)[1]
+        self.transformer_decoder = build_transformer_layer_sequence(
+            transformer_decoder)
+        self.decoder_embed_dims = self.transformer_decoder.embed_dims
+
+        self.decoder_input_projs = ModuleList()
+        # one projection per level, from low resolution to high resolution
+        for _ in range(num_transformer_feat_level):
+            if (self.decoder_embed_dims != feat_channels
+                    or enforce_decoder_input_project):
+                self.decoder_input_projs.append(
+                    Conv2d(
+                        feat_channels, self.decoder_embed_dims, kernel_size=1))
+            else:
+                self.decoder_input_projs.append(nn.Identity())
+        self.decoder_positional_encoding = build_positional_encoding(
+            positional_encoding)
+        self.query_embed = nn.Embedding(self.num_queries, feat_channels)
+        self.query_feat = nn.Embedding(self.num_queries, feat_channels)
+        # one embedding per level, from low resolution to high resolution
+        self.level_embed = nn.Embedding(self.num_transformer_feat_level,
+                                        feat_channels)
+
+        self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1)
+        self.mask_embed = nn.Sequential(
+            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
+            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
+            nn.Linear(feat_channels, out_channels))
+
+        self.test_cfg = test_cfg
+        self.train_cfg = train_cfg
+        # assigner/sampler and point-sampling settings exist only with train_cfg
+        if train_cfg:
+            self.assigner = build_assigner(self.train_cfg.assigner)
+            self.sampler = build_sampler(self.train_cfg.sampler, context=self)
+            self.num_points = self.train_cfg.get('num_points', 12544)
+            self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0)
+            self.importance_sample_ratio = self.train_cfg.get(
+                'importance_sample_ratio', 0.75)
+
+        self.class_weight = loss_cls.class_weight
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_mask = build_loss(loss_mask)
+        self.loss_dice = build_loss(loss_dice)
+
+    def init_weights(self):
+        """Init input projections, the pixel decoder and decoder weights."""
+        for m in self.decoder_input_projs:
+            if isinstance(m, Conv2d):
+                caffe2_xavier_init(m, bias=0)
+
+        self.pixel_decoder.init_weights()
+
+        for p in self.transformer_decoder.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_normal_(p)
+
+    def _get_target_single(self, cls_score, mask_pred, gt_labels, gt_masks,
+                           img_metas):
+        """Compute classification and mask targets for one image.
+
+        Args:
+            cls_score (Tensor): Mask score logits from a single decoder layer
+                for one image. Shape (num_queries, cls_out_channels).
+            mask_pred (Tensor): Mask logits for a single decoder layer for one
+                image. Shape (num_queries, h, w).
+            gt_labels (Tensor): Ground truth class indices for one image with
+                shape (num_gts, ).
+            gt_masks (Tensor): Ground truth mask for each image, each with
+                shape (num_gts, h, w).
+            img_metas (dict): Image information.
+
+        Returns:
+            tuple[Tensor]: A tuple containing the following for one image.
+
+                - labels (Tensor): Labels of the image, (num_queries, ).
+                - label_weights (Tensor): All-ones weights, (num_queries, ).
+                - mask_targets (Tensor): ``gt_masks`` indexed by the \
+                    assigned gt indices of the positive samples.
+                - mask_weights (Tensor): 1.0 at positive queries, else 0, \
+                    shape (num_queries, ).
+                - pos_inds (Tensor): Sampled positive indices of the image.
+                - neg_inds (Tensor): Sampled negative indices of the image.
+        """
+        # sample one shared set of random points for predictions and gts
+        num_queries = cls_score.shape[0]
+        num_gts = gt_labels.shape[0]
+
+        point_coords = torch.rand((1, self.num_points, 2),
+                                  device=cls_score.device)
+        # shape (num_queries, num_points)
+        mask_points_pred = point_sample(
+            mask_pred.unsqueeze(1), point_coords.repeat(num_queries, 1,
+                                                        1)).squeeze(1)
+        # shape (num_gts, num_points)
+        gt_points_masks = point_sample(
+            gt_masks.unsqueeze(1).float(), point_coords.repeat(num_gts, 1,
+                                                               1)).squeeze(1)
+
+        # assign on the sampled points, then sample on the full masks
+        assign_result = self.assigner.assign(cls_score, mask_points_pred,
+                                             gt_labels, gt_points_masks,
+                                             img_metas)
+        sampling_result = self.sampler.sample(assign_result, mask_pred,
+                                              gt_masks)
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        # label target: num_classes everywhere except the positive queries
+        labels = gt_labels.new_full((self.num_queries, ),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
+        label_weights = gt_labels.new_ones((self.num_queries, ))
+
+        # mask target
+        mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds]
+        mask_weights = mask_pred.new_zeros((self.num_queries, ))
+        mask_weights[pos_inds] = 1.0
+
+        return (labels, label_weights, mask_targets, mask_weights, pos_inds,
+                neg_inds)
+
+    def loss_single(self, cls_scores, mask_preds, gt_labels_list,
+                    gt_masks_list, img_metas):
+        """Loss function for outputs from a single decoder layer.
+
+        Args:
+            cls_scores (Tensor): Mask score logits from a single decoder layer
+                for all images. Shape (batch_size, num_queries,
+                cls_out_channels). Note `cls_out_channels` should include
+                background.
+            mask_preds (Tensor): Mask logits for a pixel decoder for all
+                images. Shape (batch_size, num_queries, h, w).
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image, each with shape (num_gts, ).
+            gt_masks_list (list[Tensor]): Ground truth mask for each image,
+                each with shape (num_gts, h, w).
+            img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            tuple[Tensor]: ``(loss_cls, loss_mask, loss_dice)`` for outputs \
+                from a single decoder layer.
+        """
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        mask_preds_list = [mask_preds[i] for i in range(num_imgs)]
+        (labels_list, label_weights_list, mask_targets_list, mask_weights_list,
+         num_total_pos,
+         num_total_neg) = self.get_targets(cls_scores_list, mask_preds_list,
+                                           gt_labels_list, gt_masks_list,
+                                           img_metas)
+        # shape (batch_size, num_queries)
+        labels = torch.stack(labels_list, dim=0)
+        # shape (batch_size, num_queries)
+        label_weights = torch.stack(label_weights_list, dim=0)
+        # shape (num_total_gts, h, w)
+        mask_targets = torch.cat(mask_targets_list, dim=0)
+        # shape (batch_size, num_queries)
+        mask_weights = torch.stack(mask_weights_list, dim=0)
+
+        # classification loss
+        # merge the batch and query dims: (batch_size * num_queries, ...)
+        cls_scores = cls_scores.flatten(0, 1)
+        labels = labels.flatten(0, 1)
+        label_weights = label_weights.flatten(0, 1)
+
+        class_weight = cls_scores.new_tensor(self.class_weight)
+        loss_cls = self.loss_cls(
+            cls_scores,
+            labels,
+            label_weights,
+            avg_factor=class_weight[labels].sum())
+
+        num_total_masks = reduce_mean(cls_scores.new_tensor([num_total_pos]))
+        num_total_masks = max(num_total_masks, 1)
+
+        # extract positive ones
+        # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w)
+        mask_preds = mask_preds[mask_weights > 0]
+
+        if mask_targets.shape[0] == 0:
+            # zero match: no positives, so mask_preds should be empty (sum 0)
+            loss_dice = mask_preds.sum()
+            loss_mask = mask_preds.sum()
+            return loss_cls, loss_mask, loss_dice
+
+        with torch.no_grad():
+            points_coords = get_uncertain_point_coords_with_randomness(
+                mask_preds.unsqueeze(1), None, self.num_points,
+                self.oversample_ratio, self.importance_sample_ratio)
+            # shape (num_total_gts, h, w) -> (num_total_gts, num_points)
+            mask_point_targets = point_sample(
+                mask_targets.unsqueeze(1).float(), points_coords).squeeze(1)
+        # shape (num_total_gts, h, w) -> (num_total_gts, num_points)
+        mask_point_preds = point_sample(
+            mask_preds.unsqueeze(1), points_coords).squeeze(1)
+
+        # dice loss
+        loss_dice = self.loss_dice(
+            mask_point_preds, mask_point_targets, avg_factor=num_total_masks)
+
+        # mask loss
+        # shape (num_total_gts, num_points) -> (num_total_gts * num_points, )
+        mask_point_preds = mask_point_preds.reshape(-1)
+        # shape (num_total_gts, num_points) -> (num_total_gts * num_points, )
+        mask_point_targets = mask_point_targets.reshape(-1)
+        loss_mask = self.loss_mask(
+            mask_point_preds,
+            mask_point_targets,
+            avg_factor=num_total_masks * self.num_points)
+
+        return loss_cls, loss_mask, loss_dice
+
+    def forward_head(self, decoder_out, mask_feature, attn_mask_target_size):
+        """Forward for head part which is called after every decoder layer.
+
+        Args:
+            decoder_out (Tensor): in shape (num_queries, batch_size, c).
+            mask_feature (Tensor): in shape (batch_size, c, h, w).
+            attn_mask_target_size (tuple[int, int]): target attention
+                mask size.
+
+        Returns:
+            tuple: A tuple containing three elements.
+
+            - cls_pred (Tensor): Classification scores in shape \
+                (batch_size, num_queries, cls_out_channels). \
+                Note `cls_out_channels` should include background.
+            - mask_pred (Tensor): Mask scores in shape \
+                (batch_size, num_queries,h, w).
+            - attn_mask (Tensor): Boolean attention mask in shape \
+                (batch_size * num_heads, num_queries, h*w).
+        """
+        decoder_out = self.transformer_decoder.post_norm(decoder_out)
+        decoder_out = decoder_out.transpose(0, 1)
+        # decoder_out is now (batch_size, num_queries, c)
+        cls_pred = self.cls_embed(decoder_out)
+        # shape (batch_size, num_queries, c)
+        mask_embed = self.mask_embed(decoder_out)
+        # shape (batch_size, num_queries, h, w)
+        mask_pred = torch.einsum('bqc,bchw->bqhw', mask_embed, mask_feature)
+        attn_mask = F.interpolate(
+            mask_pred,
+            attn_mask_target_size,
+            mode='bilinear',
+            align_corners=False)
+        # shape (batch_size, num_queries, h, w) ->
+        #   (batch_size * num_head, num_queries, h*w)
+        attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat(
+            (1, self.num_heads, 1, 1)).flatten(0, 1)
+        attn_mask = attn_mask.sigmoid() < 0.5
+        attn_mask = attn_mask.detach()
+
+        return cls_pred, mask_pred, attn_mask
+
+    def forward(self, feats, img_metas):
+        """Forward function.
+
+        Args:
+            feats (list[Tensor]): Multi scale Features from the
+                upstream network, each is a 4D-tensor.
+            img_metas (list[dict]): Image information; only its length is used.
+
+        Returns:
+            tuple: Two lists; one entry per decoder layer plus an initial one.
+
+            - cls_pred_list (list[Tensor]): Classification logits \
+                for each decoder layer. Each is a 3D-tensor with shape \
+                (batch_size, num_queries, cls_out_channels). \
+                Note `cls_out_channels` should include background.
+            - mask_pred_list (list[Tensor]): Mask logits for each \
+                decoder layer. Each with shape (batch_size, num_queries, \
+                 h, w).
+        """
+        batch_size = len(img_metas)
+        mask_features, multi_scale_memorys = self.pixel_decoder(feats)
+        # multi_scale_memorys (from low resolution to high resolution)
+        decoder_inputs = []
+        decoder_positional_encodings = []
+        for i in range(self.num_transformer_feat_level):
+            decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i])
+            # shape (batch_size, c, h, w) -> (h*w, batch_size, c)
+            decoder_input = decoder_input.flatten(2).permute(2, 0, 1)
+            level_embed = self.level_embed.weight[i].view(1, 1, -1)
+            decoder_input = decoder_input + level_embed
+            # all-False (batch_size, h, w) mask for the positional encoding
+            mask = decoder_input.new_zeros(
+                (batch_size, ) + multi_scale_memorys[i].shape[-2:],
+                dtype=torch.bool)
+            decoder_positional_encoding = self.decoder_positional_encoding(
+                mask)
+            decoder_positional_encoding = decoder_positional_encoding.flatten(
+                2).permute(2, 0, 1)
+            decoder_inputs.append(decoder_input)
+            decoder_positional_encodings.append(decoder_positional_encoding)
+        # shape (num_queries, c) -> (num_queries, batch_size, c)
+        query_feat = self.query_feat.weight.unsqueeze(1).repeat(
+            (1, batch_size, 1))
+        query_embed = self.query_embed.weight.unsqueeze(1).repeat(
+            (1, batch_size, 1))
+
+        cls_pred_list = []
+        mask_pred_list = []
+        # predictions from the learned queries before any decoder layer
+        cls_pred, mask_pred, attn_mask = self.forward_head(
+            query_feat, mask_features, multi_scale_memorys[0].shape[-2:])
+        cls_pred_list.append(cls_pred)
+        mask_pred_list.append(mask_pred)
+
+        for i in range(self.num_transformer_decoder_layers):
+            level_idx = i % self.num_transformer_feat_level
+            # if a mask is all True (all background), then set it all False.
+            attn_mask[torch.where(
+                attn_mask.sum(-1) == attn_mask.shape[-1])] = False
+
+            # cross_attn + self_attn
+            layer = self.transformer_decoder.layers[i]
+            attn_masks = [attn_mask, None]
+            query_feat = layer(
+                query=query_feat,
+                key=decoder_inputs[level_idx],
+                value=decoder_inputs[level_idx],
+                query_pos=query_embed,
+                key_pos=decoder_positional_encodings[level_idx],
+                attn_masks=attn_masks,
+                query_key_padding_mask=None,
+                # here we do not apply masking on padded region
+                key_padding_mask=None)
+            cls_pred, mask_pred, attn_mask = self.forward_head(
+                query_feat, mask_features, multi_scale_memorys[
+                    (i + 1) % self.num_transformer_feat_level].shape[-2:])
+
+            cls_pred_list.append(cls_pred)
+            mask_pred_list.append(mask_pred)
+
+        return cls_pred_list, mask_pred_list
diff --git a/mmdet/models/dense_heads/maskformer_head.py b/mmdet/models/dense_heads/maskformer_head.py
new file mode 100755
index 0000000..566dc07
--- /dev/null
+++ b/mmdet/models/dense_heads/maskformer_head.py
@@ -0,0 +1,556 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Conv2d, build_plugin_layer, caffe2_xavier_init
+from mmcv.cnn.bricks.transformer import (build_positional_encoding,
+                                         build_transformer_layer_sequence)
+from mmcv.runner import force_fp32
+
+from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean
+from mmdet.models.utils import preprocess_panoptic_gt
+from ..builder import HEADS, build_loss
+from .anchor_free_head import AnchorFreeHead
+
+
+@HEADS.register_module()
+class MaskFormerHead(AnchorFreeHead):
+    """Implements the MaskFormer head.
+
+    See `Per-Pixel Classification is Not All You Need for Semantic
+    Segmentation <https://arxiv.org/pdf/2107.06278>`_ for details.
+
+    Args:
+        in_channels (list[int]): Number of channels in the input feature map.
+        feat_channels (int): Number of channels for feature.
+        out_channels (int): Number of channels for output.
+        num_things_classes (int): Number of things.
+        num_stuff_classes (int): Number of stuff.
+        num_queries (int): Number of query in Transformer.
+        pixel_decoder (:obj:`mmcv.ConfigDict` | dict): Config for pixel
+            decoder. Defaults to None.
+        enforce_decoder_input_project (bool, optional): Whether to add a layer
+            to change the embed_dim of transformer encoder in pixel decoder to
+            the embed_dim of transformer decoder. Defaults to False.
+        transformer_decoder (:obj:`mmcv.ConfigDict` | dict): Config for
+            transformer decoder. Defaults to None.
+        positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for
+            transformer decoder position encoding. Defaults to None.
+        loss_cls (:obj:`mmcv.ConfigDict` | dict): Config of the classification
+            loss. Defaults to `CrossEntropyLoss`.
+        loss_mask (:obj:`mmcv.ConfigDict` | dict): Config of the mask loss.
+            Defaults to `FocalLoss`.
+        loss_dice (:obj:`mmcv.ConfigDict` | dict): Config of the dice loss.
+            Defaults to `DiceLoss`.
+        train_cfg (:obj:`mmcv.ConfigDict` | dict): Training config of
+            Maskformer head.
+        test_cfg (:obj:`mmcv.ConfigDict` | dict): Testing config of Maskformer
+            head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 feat_channels,
+                 out_channels,
+                 num_things_classes=80,
+                 num_stuff_classes=53,
+                 num_queries=100,
+                 pixel_decoder=None,
+                 enforce_decoder_input_project=False,
+                 transformer_decoder=None,
+                 positional_encoding=None,
+                 loss_cls=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=1.0,
+                     class_weight=[1.0] * 133 + [0.1]),
+                 loss_mask=dict(
+                     type='FocalLoss',
+                     use_sigmoid=True,
+                     gamma=2.0,
+                     alpha=0.25,
+                     loss_weight=20.0),
+                 loss_dice=dict(
+                     type='DiceLoss',
+                     use_sigmoid=True,
+                     activate=True,
+                     naive_dice=True,
+                     loss_weight=1.0),
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None,
+                 **kwargs):
+        super(AnchorFreeHead, self).__init__(init_cfg)  # skips parent __init__
+        self.num_things_classes = num_things_classes
+        self.num_stuff_classes = num_stuff_classes
+        self.num_classes = self.num_things_classes + self.num_stuff_classes
+        self.num_queries = num_queries
+        # NOTE: this writes the channel sizes into the caller's config dict
+        pixel_decoder.update(
+            in_channels=in_channels,
+            feat_channels=feat_channels,
+            out_channels=out_channels)
+        self.pixel_decoder = build_plugin_layer(pixel_decoder)[1]
+        self.transformer_decoder = build_transformer_layer_sequence(
+            transformer_decoder)
+        self.decoder_embed_dims = self.transformer_decoder.embed_dims
+        pixel_decoder_type = pixel_decoder.get('type')
+        if pixel_decoder_type == 'PixelDecoder' and (
+                self.decoder_embed_dims != in_channels[-1]
+                or enforce_decoder_input_project):
+            self.decoder_input_proj = Conv2d(
+                in_channels[-1], self.decoder_embed_dims, kernel_size=1)
+        else:
+            self.decoder_input_proj = nn.Identity()
+        self.decoder_pe = build_positional_encoding(positional_encoding)
+        self.query_embed = nn.Embedding(self.num_queries, out_channels)
+        # one logit per class plus one extra slot (index num_classes)
+        self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1)
+        self.mask_embed = nn.Sequential(
+            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
+            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
+            nn.Linear(feat_channels, out_channels))
+        # the assigner and sampler are only built when train_cfg is given
+        self.test_cfg = test_cfg
+        self.train_cfg = train_cfg
+        if train_cfg:
+            self.assigner = build_assigner(train_cfg.get('assigner', None))
+            self.sampler = build_sampler(
+                train_cfg.get('sampler', None), context=self)
+        # class_weight is kept as a plain attribute for use in loss_single
+        self.class_weight = loss_cls.get('class_weight', None)
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_mask = build_loss(loss_mask)
+        self.loss_dice = build_loss(loss_dice)
+
+    def init_weights(self):
+        """Initialize the input projection, pixel decoder and decoder."""
+        input_proj = self.decoder_input_proj
+        if isinstance(input_proj, Conv2d):
+            caffe2_xavier_init(input_proj, bias=0)
+        self.pixel_decoder.init_weights()
+        for param in self.transformer_decoder.parameters():
+            if param.dim() > 1:
+                nn.init.xavier_uniform_(param)
+
+    def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs,
+                      img_metas):
+        """Convert raw annotations into per-image label and mask targets.
+
+        Every image is handed to ``preprocess_panoptic_gt`` together with
+        the numbers of thing and stuff classes of this head.
+
+        Args:
+            gt_labels_list (list[Tensor]): Per-image ground truth labels
+                of each bbox, each with shape (num_gts, ).
+            gt_masks_list (list[BitmapMasks]): Per-image ground truth
+                instance masks, each with shape (num_gts, h, w).
+            gt_semantic_segs (Tensor | None): Ground truth of semantic
+                segmentation with the shape (batch_size, n, h, w).
+                [0, num_thing_class - 1] means things,
+                [num_thing_class, num_class-1] means stuff,
+                255 means VOID. It's None when training instance segmentation.
+            img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            tuple: ``(labels, masks)``, where ``labels`` is a
+                list[Tensor] of class indices with shape (n, ) each and
+                ``masks`` a list[Tensor] with shape (n, h, w) each; n is
+                the number of stuff types plus instances in an image.
+        """
+        num_imgs = len(gt_labels_list)
+        if gt_semantic_segs is None:
+            # no semantic map was given: pass None for every image
+            gt_semantic_segs = [None] * num_imgs
+        # the class counts are repeated so multi_apply sees one per image
+        labels, masks = multi_apply(
+            preprocess_panoptic_gt, gt_labels_list, gt_masks_list,
+            gt_semantic_segs, [self.num_things_classes] * num_imgs,
+            [self.num_stuff_classes] * num_imgs, img_metas)
+        return labels, masks
+
+    def get_targets(self, cls_scores_list, mask_preds_list, gt_labels_list,
+                    gt_masks_list, img_metas):
+        """Build the targets of one decoder layer for the whole batch.
+
+        ``_get_target_single`` is applied to every image and the per-image
+        positive / negative indices are reduced to two batch-wide counts.
+
+        Args:
+            cls_scores_list (list[Tensor]): Class logits of a single
+                decoder layer, one entry per image with shape
+                (num_queries, cls_out_channels).
+            mask_preds_list (list[Tensor]): Mask logits of a single
+                decoder layer, one entry per image with shape
+                (num_queries, h, w).
+            gt_labels_list (list[Tensor]): Ground truth class indices,
+                one entry per image with shape (n, ), n being the number
+                of stuff types plus instances in that image.
+            gt_masks_list (list[Tensor]): Ground truth masks, one entry
+                per image with shape (n, h, w).
+            img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            tuple: the following six items, in this order.
+                - labels_list (list[Tensor]): Labels of all images.
+                - label_weights_list (list[Tensor]): Label weights of
+                    all images.
+                - mask_targets_list (list[Tensor]): Mask targets of all
+                    images.
+                - mask_weights_list (list[Tensor]): Mask weights of all
+                    images.
+                - num_total_pos (int): Positive samples in all images.
+                - num_total_neg (int): Negative samples in all images.
+        """
+        targets = multi_apply(self._get_target_single, cls_scores_list,
+                              mask_preds_list, gt_labels_list, gt_masks_list,
+                              img_metas)
+        (labels_list, label_weights_list, mask_targets_list, mask_weights_list,
+         pos_inds_list, neg_inds_list) = targets
+        # the index tensors are only needed for the batch-wide counts
+        num_total_pos = sum(inds.numel() for inds in pos_inds_list)
+        num_total_neg = sum(inds.numel() for inds in neg_inds_list)
+        return (labels_list, label_weights_list, mask_targets_list,
+                mask_weights_list, num_total_pos, num_total_neg)
+
+    def _get_target_single(self, cls_score, mask_pred, gt_labels, gt_masks,
+                           img_metas):
+        """Compute classification and mask targets for one image.
+
+        Args:
+            cls_score (Tensor): Class logits from a single decoder layer
+                for one image. Shape (num_queries, cls_out_channels).
+            mask_pred (Tensor): Mask logits for a single decoder layer for one
+                image. Shape (num_queries, h, w).
+            gt_labels (Tensor): Ground truth class indices for one image with
+                shape (n, ). n is the sum of number of stuff type and number
+                of instances in an image.
+            gt_masks (Tensor): Ground truth mask for each image, each with
+                shape (n, h, w).
+            img_metas (dict): Image information.
+
+        Returns:
+            tuple[Tensor]: a tuple containing the following for one image.
+                - labels (Tensor): Labels of each image.
+                    shape (num_queries, ).
+                - label_weights (Tensor): Label weights of each image.
+                    shape (num_queries, ).
+                - mask_targets (Tensor): GT masks picked by
+                    pos_assigned_gt_inds, shape (num_pos, h, w).
+                - mask_weights (Tensor): Mask weights of each image.
+                    shape (num_queries, ).
+                - pos_inds (Tensor): Sampled positive indices for each image.
+                - neg_inds (Tensor): Sampled negative indices for each image.
+        """
+        target_shape = mask_pred.shape[-2:]
+        if gt_masks.shape[0] > 0:
+            gt_masks_downsampled = F.interpolate(
+                gt_masks.unsqueeze(1).float(), target_shape,
+                mode='nearest').squeeze(1).long()
+        else:
+            gt_masks_downsampled = gt_masks
+
+        # assign on downsampled GT masks, sample with the full-size ones
+        assign_result = self.assigner.assign(cls_score, mask_pred, gt_labels,
+                                             gt_masks_downsampled, img_metas)
+        sampling_result = self.sampler.sample(assign_result, mask_pred,
+                                              gt_masks)
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        # label target: unmatched queries keep the label num_classes
+        labels = gt_labels.new_full((self.num_queries, ),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
+        label_weights = gt_labels.new_ones(self.num_queries)
+
+        # mask target: only positive queries get a non-zero mask weight
+        mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds]
+        mask_weights = mask_pred.new_zeros((self.num_queries, ))
+        mask_weights[pos_inds] = 1.0
+
+        return (labels, label_weights, mask_targets, mask_weights, pos_inds,
+                neg_inds)
+
+    @force_fp32(apply_to=('all_cls_scores', 'all_mask_preds'))
+    def loss(self, all_cls_scores, all_mask_preds, gt_labels_list,
+             gt_masks_list, img_metas):
+        """Compute the losses of every decoder layer.
+
+        Args:
+            all_cls_scores (Tensor): Classification scores of all decoder
+                layers with shape (num_decoder, batch_size, num_queries,
+                cls_out_channels). Note `cls_out_channels` should include
+                background.
+            all_mask_preds (Tensor): Mask scores of all decoder layers with
+                shape (num_decoder, batch_size, num_queries, h, w).
+            gt_labels_list (list[Tensor]): Ground truth class indices of each
+                image with shape (n, ). n is the number of stuff types plus
+                the number of instances in an image.
+            gt_masks_list (list[Tensor]): Ground truth masks of each image
+                with shape (n, h, w).
+            img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        num_layers = len(all_cls_scores)
+        # every decoder layer is supervised with the same ground truth
+        losses_cls, losses_mask, losses_dice = multi_apply(
+            self.loss_single, all_cls_scores, all_mask_preds,
+            [gt_labels_list] * num_layers, [gt_masks_list] * num_layers,
+            [img_metas] * num_layers)
+
+        # the last decoder layer reports under the un-prefixed keys
+        loss_dict = {
+            'loss_cls': losses_cls[-1],
+            'loss_mask': losses_mask[-1],
+            'loss_dice': losses_dice[-1],
+        }
+
+        # earlier decoder layers are reported as d0.*, d1.*, ...
+        earlier = zip(losses_cls[:-1], losses_mask[:-1], losses_dice[:-1])
+        for num_dec_layer, (cls_i, mask_i, dice_i) in enumerate(earlier):
+            loss_dict[f'd{num_dec_layer}.loss_cls'] = cls_i
+            loss_dict[f'd{num_dec_layer}.loss_mask'] = mask_i
+            loss_dict[f'd{num_dec_layer}.loss_dice'] = dice_i
+
+        return loss_dict
+
+    def loss_single(self, cls_scores, mask_preds, gt_labels_list,
+                    gt_masks_list, img_metas):
+        """Loss function for outputs from a single decoder layer.
+
+        Args:
+            cls_scores (Tensor): Class logits from a single decoder layer
+                for all images. Shape (batch_size, num_queries,
+                cls_out_channels). Note `cls_out_channels` should include
+                background.
+            mask_preds (Tensor): Mask logits for a pixel decoder for all
+                images. Shape (batch_size, num_queries, h, w).
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image, each with shape (n, ). n is the sum of number of stuff
+                types and number of instances in an image.
+            gt_masks_list (list[Tensor]): Ground truth mask for each image,
+                each with shape (n, h, w).
+            img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            tuple[Tensor]: Loss components for outputs from a single decoder\
+                layer.
+        """
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        mask_preds_list = [mask_preds[i] for i in range(num_imgs)]
+
+        (labels_list, label_weights_list, mask_targets_list, mask_weights_list,
+         num_total_pos,
+         num_total_neg) = self.get_targets(cls_scores_list, mask_preds_list,
+                                           gt_labels_list, gt_masks_list,
+                                           img_metas)
+        # shape (batch_size, num_queries)
+        labels = torch.stack(labels_list, dim=0)
+        # shape (batch_size, num_queries)
+        label_weights = torch.stack(label_weights_list, dim=0)
+        # shape (num_total_gts, h, w)
+        mask_targets = torch.cat(mask_targets_list, dim=0)
+        # shape (batch_size, num_queries)
+        mask_weights = torch.stack(mask_weights_list, dim=0)
+
+        # classification loss
+        # shape (batch_size * num_queries, )
+        cls_scores = cls_scores.flatten(0, 1)
+        labels = labels.flatten(0, 1)
+        label_weights = label_weights.flatten(0, 1)
+        # NOTE(review): class_weight is None if loss_cls has no such key
+        class_weight = cls_scores.new_tensor(self.class_weight)
+        loss_cls = self.loss_cls(
+            cls_scores,
+            labels,
+            label_weights,
+            avg_factor=class_weight[labels].sum())
+
+        num_total_masks = reduce_mean(cls_scores.new_tensor([num_total_pos]))
+        num_total_masks = max(num_total_masks, 1)
+
+        # extract positive ones
+        # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w)
+        mask_preds = mask_preds[mask_weights > 0]
+        target_shape = mask_targets.shape[-2:]
+
+        if mask_targets.shape[0] == 0:
+            # zero match
+            loss_dice = mask_preds.sum()
+            loss_mask = mask_preds.sum()
+            return loss_cls, loss_mask, loss_dice
+
+        # upsample to shape of target
+        # shape (num_total_gts, h, w)
+        mask_preds = F.interpolate(
+            mask_preds.unsqueeze(1),
+            target_shape,
+            mode='bilinear',
+            align_corners=False).squeeze(1)
+
+        # dice loss
+        loss_dice = self.loss_dice(
+            mask_preds, mask_targets, avg_factor=num_total_masks)
+
+        # mask loss
+        # FocalLoss support input of shape (n, num_class)
+        h, w = mask_preds.shape[-2:]
+        # shape (num_total_gts, h, w) -> (num_total_gts * h * w, 1)
+        mask_preds = mask_preds.reshape(-1, 1)
+        # shape (num_total_gts, h, w) -> (num_total_gts * h * w)
+        mask_targets = mask_targets.reshape(-1)
+        # target is (1 - mask_targets) !!!
+        loss_mask = self.loss_mask(
+            mask_preds, 1 - mask_targets, avg_factor=num_total_masks * h * w)
+
+        return loss_cls, loss_mask, loss_dice
+
+    def forward(self, feats, img_metas):
+        """Forward function.
+
+        Args:
+            feats (list[Tensor]): Features from the upstream network, each
+                is a 4D-tensor.
+            img_metas (list[dict]): List of image information.
+
+        Returns:
+            tuple: a tuple contains two elements.
+                - all_cls_scores (Tensor): Classification scores for each\
+                    decoder layer. A 4D-tensor with shape\
+                    (num_decoder, batch_size, num_queries, cls_out_channels).\
+                    Note `cls_out_channels` should include background.
+                - all_mask_preds (Tensor): Mask scores for each decoder\
+                    layer. Each with shape (num_decoder, batch_size,\
+                    num_queries, h, w).
+        """
+        batch_size = len(img_metas)
+        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
+        padding_mask = feats[-1].new_ones(
+            (batch_size, input_img_h, input_img_w), dtype=torch.float32)
+        for i in range(batch_size):
+            img_h, img_w, _ = img_metas[i]['img_shape']
+            padding_mask[i, :img_h, :img_w] = 0
+        padding_mask = F.interpolate(
+            padding_mask.unsqueeze(1),
+            size=feats[-1].shape[-2:],
+            mode='nearest').to(torch.bool).squeeze(1)
+        # when backbone is swin, memory is output of last stage of swin.
+        # when backbone is r50, memory is output of transformer encoder.
+        mask_features, memory = self.pixel_decoder(feats, img_metas)
+        pos_embed = self.decoder_pe(padding_mask)
+        memory = self.decoder_input_proj(memory)
+        # shape (batch_size, c, h, w) -> (h*w, batch_size, c)
+        memory = memory.flatten(2).permute(2, 0, 1)
+        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
+        # shape (batch_size, h * w)
+        padding_mask = padding_mask.flatten(1)
+        # shape = (num_queries, embed_dims)
+        query_embed = self.query_embed.weight
+        # shape = (num_queries, batch_size, embed_dims)
+        query_embed = query_embed.unsqueeze(1).repeat(1, batch_size, 1)
+        target = torch.zeros_like(query_embed)
+        # shape (num_decoder, num_queries, batch_size, embed_dims)
+        out_dec = self.transformer_decoder(
+            query=target,
+            key=memory,
+            value=memory,
+            key_pos=pos_embed,
+            query_pos=query_embed,
+            key_padding_mask=padding_mask)
+        # shape (num_decoder, batch_size, num_queries, embed_dims)
+        out_dec = out_dec.transpose(1, 2)
+
+        # cls_scores
+        all_cls_scores = self.cls_embed(out_dec)
+
+        # mask_preds
+        mask_embed = self.mask_embed(out_dec)
+        all_mask_preds = torch.einsum('lbqc,bchw->lbqhw', mask_embed,
+                                      mask_features)
+
+        return all_cls_scores, all_mask_preds
+
+    def forward_train(self,
+                      feats,
+                      img_metas,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_masks,
+                      gt_semantic_seg,
+                      gt_bboxes_ignore=None):
+        """Run the head on a training batch and compute its losses.
+
+        Args:
+            feats (list[Tensor]): Multi-level features from the upstream
+                network, each is a 4D-tensor.
+            img_metas (list[Dict]): List of image information.
+            gt_bboxes (list[Tensor]): Per-image ground truth bboxes with
+                shape (num_gts, 4). Not used here.
+            gt_labels (list[Tensor]): Per-image ground truth labels of
+                each box, shape (num_gts,).
+            gt_masks (list[BitmapMasks]): Per-image instance masks,
+                shape (num_gts, h, w).
+            gt_semantic_seg (list[tensor] | None): Per-image semantic
+                segmentation ground truth with the shape (N, H, W).
+                [0, num_thing_class - 1] means things,
+                [num_thing_class, num_class-1] means stuff,
+                255 means VOID. It's None when training instance segmentation.
+            gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be
+                ignored. Must be None, which is also the default.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        # ignored boxes are not supported by this head
+        assert gt_bboxes_ignore is None
+
+        # class and mask predictions of every decoder layer
+        cls_scores, mask_preds = self(feats, img_metas)
+
+        # turn the annotations into per-image label and mask targets
+        labels, masks = self.preprocess_gt(gt_labels, gt_masks,
+                                           gt_semantic_seg, img_metas)
+
+        # arguments are passed positionally, in the order of the
+        # signature of `loss`
+        loss_dict = self.loss(cls_scores, mask_preds, labels, masks, img_metas)
+
+        return loss_dict
+
+    def simple_test(self, feats, img_metas, **kwargs):
+        """Predict on a batch without test-time augmentation.
+
+        Args:
+            feats (list[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            img_metas (list[dict]): List of image information.
+
+        Returns:
+            tuple: Two tensors taken from the last decoder layer.
+
+            - mask_cls_results (Tensor): Mask classification logits,
+                shape (batch_size, num_queries, cls_out_channels).
+                Note `cls_out_channels` should include background.
+            - mask_pred_results (Tensor): Mask logits resized to the
+                batch input shape, (batch_size, num_queries, h, w).
+        """
+        all_cls_scores, all_mask_preds = self(feats, img_metas)
+        # only the last decoder layer is used at test time
+        last_cls, last_masks = all_cls_scores[-1], all_mask_preds[-1]
+
+        # resize the mask logits to the batch input shape
+        input_shape = img_metas[0]['batch_input_shape']
+        target_size = (input_shape[0], input_shape[1])
+        upsampled = F.interpolate(
+            last_masks, size=target_size, mode='bilinear',
+            align_corners=False)
+
+        # (class logits, resized mask logits)
+        return last_cls, upsampled
diff --git a/mmdet/models/dense_heads/nasfcos_head.py b/mmdet/models/dense_heads/nasfcos_head.py
new file mode 100755
index 0000000..380c912
--- /dev/null
+++ b/mmdet/models/dense_heads/nasfcos_head.py
@@ -0,0 +1,80 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule, Scale
+
+from mmdet.models.dense_heads.fcos_head import FCOSHead
+from ..builder import HEADS
+
+
+@HEADS.register_module()
+class NASFCOSHead(FCOSHead):
+    """Anchor-free head used in `NASFCOS <https://arxiv.org/abs/1906.04423>`_.
+
+    It is quite similar to FCOS head, except for the searched structure of
+    classification branch and bbox regression branch, where a structure of
+    "dconv3x3, conv3x3, dconv3x3, conv1x1" is utilized instead.
+    """
+
+    def __init__(self, *args, init_cfg=None, **kwargs):
+        if init_cfg is None:  # fall back to the default init scheme below
+            init_cfg = [
+                dict(type='Caffe2Xavier', layer=['ConvModule', 'Conv2d']),
+                dict(
+                    type='Normal',
+                    std=0.01,
+                    override=[
+                        dict(name='conv_reg'),
+                        dict(name='conv_centerness'),
+                        dict(
+                            name='conv_cls',
+                            type='Normal',
+                            std=0.01,
+                            bias_prob=0.01)
+                    ]),
+            ]
+        super(NASFCOSHead, self).__init__(*args, init_cfg=init_cfg, **kwargs)
+
+    def _init_layers(self):
+        """Initialize layers of the head."""
+        dconv3x3_config = dict(
+            type='DCNv2',
+            kernel_size=3,
+            use_bias=True,
+            deform_groups=2,
+            padding=1)
+        conv3x3_config = dict(type='Conv', kernel_size=3, padding=1)
+        conv1x1_config = dict(type='Conv', kernel_size=1)
+
+        self.arch_config = [
+            dconv3x3_config, conv3x3_config, dconv3x3_config, conv1x1_config
+        ]
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i, op_ in enumerate(self.arch_config):
+            op = copy.deepcopy(op_)  # pops below must not touch arch_config
+            chn = self.in_channels if i == 0 else self.feat_channels
+            assert isinstance(op, dict)
+            use_bias = op.pop('use_bias', False)
+            padding = op.pop('padding', 0)
+            kernel_size = op.pop('kernel_size')
+            module = ConvModule(
+                chn,
+                self.feat_channels,
+                kernel_size,
+                stride=1,
+                padding=padding,
+                norm_cfg=self.norm_cfg,
+                bias=use_bias,
+                conv_cfg=op)
+            # separate deep copies: the two branches do not share weights
+            self.cls_convs.append(copy.deepcopy(module))
+            self.reg_convs.append(copy.deepcopy(module))
+
+        self.conv_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+        self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
+        self.conv_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1)
+
+        self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])
diff --git a/mmdet/models/dense_heads/paa_head.py b/mmdet/models/dense_heads/paa_head.py
new file mode 100755
index 0000000..d79b5b9
--- /dev/null
+++ b/mmdet/models/dense_heads/paa_head.py
@@ -0,0 +1,756 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from mmcv.runner import force_fp32
+
+from mmdet.core import multi_apply, multiclass_nms
+from mmdet.core.bbox.iou_calculators import bbox_overlaps
+from mmdet.models import HEADS
+from mmdet.models.dense_heads import ATSSHead
+
+EPS = 1e-12
+try:
+    import sklearn.mixture as skm
+except ImportError:
+    skm = None
+
+
+def levels_to_images(mlvl_tensor):
+    """Concat multi-level feature maps by image.
+
+    [feature_level0, feature_level1...] -> [feature_image0, feature_image1...]
+    Convert the shape of each element in mlvl_tensor from (N, C, H, W) to
+    (N, H*W, C), then split the element to N elements with shape (H*W, C), and
+    concat elements of the same image from all levels along the first dim.
+
+    Args:
+        mlvl_tensor (list[torch.Tensor]): list of Tensor which collect from
+            corresponding level. Each element is of shape (N, C, H, W)
+
+    Returns:
+        list[torch.Tensor]: A list that contains N tensors and each tensor is
+            of shape (num_elements, C)
+    """
+    batch_size, channels = mlvl_tensor[0].shape[:2]
+    batch_list = [[] for _ in range(batch_size)]
+    for t in mlvl_tensor:
+        # reshape (not view): a permuted strided input may not be viewable.
+        t = t.permute(0, 2, 3, 1)
+        t = t.reshape(batch_size, -1, channels).contiguous()
+        for img in range(batch_size):
+            batch_list[img].append(t[img])
+    return [torch.cat(item, 0) for item in batch_list]
+
+
+@HEADS.register_module()
+class PAAHead(ATSSHead):
+    """Head of PAAAssignment: Probabilistic Anchor Assignment with IoU
+    Prediction for Object Detection.
+
+    Code is modified from the `official github repo
+    <https://github.com/kkhoot/PAA/blob/master/paa_core
+    /modeling/rpn/paa/loss.py>`_.
+
+    More details can be found in the `paper
+    <https://arxiv.org/abs/2007.08103>`_ .
+
+    Args:
+        topk (int): Select topk samples with smallest loss in
+            each level.
+        score_voting (bool): Whether to use score voting in post-process.
+        covariance_type : String describing the type of covariance parameters
+            to be used in :class:`sklearn.mixture.GaussianMixture`.
+            It must be one of:
+
+            - 'full': each component has its own general covariance matrix
+            - 'tied': all components share the same general covariance matrix
+            - 'diag': each component has its own diagonal covariance matrix
+            - 'spherical': each component has its own single variance
+            Default: 'diag'. From 'full' to 'spherical', the gmm fitting
+            process is faster yet the performance could be influenced. For most
+            cases, 'diag' should be a good choice.
+    """
+
+    def __init__(self,
+                 *args,
+                 topk=9,
+                 score_voting=True,
+                 covariance_type='diag',
+                 **kwargs):
+        # PAA-specific options are stored before the parent constructor runs
+        self.covariance_type = covariance_type
+        self.with_score_voting = score_voting
+        self.topk = topk
+        super().__init__(*args, **kwargs)
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'iou_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             iou_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute cls, bbox and IoU losses after PAA re-assignment.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            iou_preds (list[Tensor]): iou_preds for each scale
+                level with shape (N, num_anchors * 1, H, W)
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (list[Tensor] | None): Specify which bounding
+                boxes can be ignored when computing the loss.
+
+        Returns:
+            dict[str, Tensor]: Losses keyed loss_cls, loss_bbox, loss_iou.
+        """
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels,
+        )
+        (labels, labels_weight, bboxes_target, bboxes_weight, pos_inds,
+         pos_gt_index) = cls_reg_targets
+        cls_scores = levels_to_images(cls_scores)
+        cls_scores = [
+            item.reshape(-1, self.cls_out_channels) for item in cls_scores
+        ]
+        bbox_preds = levels_to_images(bbox_preds)
+        bbox_preds = [item.reshape(-1, 4) for item in bbox_preds]
+        iou_preds = levels_to_images(iou_preds)
+        iou_preds = [item.reshape(-1, 1) for item in iou_preds]
+        pos_losses_list, = multi_apply(self.get_pos_loss, anchor_list,
+                                       cls_scores, bbox_preds, labels,
+                                       labels_weight, bboxes_target,
+                                       bboxes_weight, pos_inds)
+        # Re-assign the positives per image; target generation needs no grad.
+        with torch.no_grad():
+            reassign_labels, reassign_label_weight, \
+                reassign_bbox_weights, num_pos = multi_apply(
+                    self.paa_reassign,
+                    pos_losses_list,
+                    labels,
+                    labels_weight,
+                    bboxes_weight,
+                    pos_inds,
+                    pos_gt_index,
+                    anchor_list)
+            num_pos = sum(num_pos)  # positives summed over all images
+        # flatten every per-image list into a single tensor
+        cls_scores = torch.cat(cls_scores, 0).view(-1, cls_scores[0].size(-1))
+        bbox_preds = torch.cat(bbox_preds, 0).view(-1, bbox_preds[0].size(-1))
+        iou_preds = torch.cat(iou_preds, 0).view(-1, iou_preds[0].size(-1))
+        labels = torch.cat(reassign_labels, 0).view(-1)
+        flatten_anchors = torch.cat(
+            [torch.cat(item, 0) for item in anchor_list])
+        labels_weight = torch.cat(reassign_label_weight, 0).view(-1)
+        bboxes_target = torch.cat(bboxes_target,
+                                  0).view(-1, bboxes_target[0].size(-1))
+        # positives after re-assignment: labels in [0, num_classes)
+        pos_inds_flatten = ((labels >= 0)
+                            &
+                            (labels < self.num_classes)).nonzero().reshape(-1)
+
+        losses_cls = self.loss_cls(
+            cls_scores,
+            labels,
+            labels_weight,
+            avg_factor=max(num_pos, len(img_metas)))  # avoid num_pos=0
+        if num_pos:
+            pos_bbox_pred = self.bbox_coder.decode(
+                flatten_anchors[pos_inds_flatten],
+                bbox_preds[pos_inds_flatten])
+            pos_bbox_target = bboxes_target[pos_inds_flatten]
+            iou_target = bbox_overlaps(
+                pos_bbox_pred.detach(), pos_bbox_target, is_aligned=True)
+            losses_iou = self.loss_centerness(
+                iou_preds[pos_inds_flatten],
+                iou_target.unsqueeze(-1),
+                avg_factor=num_pos)
+            losses_bbox = self.loss_bbox(
+                pos_bbox_pred,
+                pos_bbox_target,
+                iou_target.clamp(min=EPS),
+                avg_factor=iou_target.sum())
+        else:
+            losses_iou = iou_preds.sum() * 0  # zero, derived from the preds
+            losses_bbox = bbox_preds.sum() * 0
+
+        return dict(
+            loss_cls=losses_cls, loss_bbox=losses_bbox, loss_iou=losses_iou)
+
+    def get_pos_loss(self, anchors, cls_score, bbox_pred, label, label_weight,
+                     bbox_target, bbox_weight, pos_inds):
+        """Compute an unreduced loss for every initially positive anchor of
+        one image.
+
+        The positives are those selected by the first assignment step; their
+        per-anchor losses are later consumed by `paa_reassign`.
+
+        Args:
+            anchors (list[Tensor]): Per-level anchors of the image; they are
+                concatenated along dim 0 before indexing.
+            cls_score (Tensor): Classification scores of the image, shape
+                (num_anchors, num_classes).
+            bbox_pred (Tensor): Box energies / deltas of the image, shape
+                (num_anchors, 4).
+            label (Tensor): Classification target of each anchor, shape
+                (num_anchors,).
+            label_weight (Tensor): Classification loss weight of each
+                anchor, shape (num_anchors,).
+            bbox_target (Tensor): Regression target of each anchor, shape
+                (num_anchors, 4).
+            bbox_weight (Tensor): Bbox weight of each anchor, shape
+                (num_anchors, 4).
+            pos_inds (Tensor): Indices of the positive anchors from the
+                first assignment.
+
+        Returns:
+            tuple[Tensor]: A 1-tuple holding the summed classification and
+                regression loss of each positive anchor. When `pos_inds` is
+                empty the tensor is empty and no loss module is called.
+        """
+        if len(pos_inds) == 0:
+            return (cls_score.new([]), )
+
+        flat_anchors = torch.cat(anchors, 0)
+        decoded_pred = self.bbox_coder.decode(flat_anchors[pos_inds],
+                                              bbox_pred[pos_inds])
+
+        # reduction is disabled so every positive keeps its own loss entry
+        per_anchor_cls = self.loss_cls(
+            cls_score[pos_inds],
+            label[pos_inds],
+            label_weight[pos_inds],
+            avg_factor=1.0,
+            reduction_override='none')
+
+        per_anchor_bbox = self.loss_bbox(
+            decoded_pred,
+            bbox_target[pos_inds],
+            bbox_weight[pos_inds],
+            avg_factor=1.0,  # keep same loss weight before reassign
+            reduction_override='none')
+
+        # collapse the class axis, then add the regression term
+        return (per_anchor_bbox + per_anchor_cls.sum(-1), )
+
+    def paa_reassign(self, pos_losses, label, label_weight, bbox_weight,
+                     pos_inds, pos_gt_inds, anchors):
+        """Fit a two-component GMM to the losses of each gt's candidates and
+        split positive, ignore and negative samples again with it.
+
+        Args:
+            pos_losses (Tensor): Losses of all positive samples in
+                single image.
+            label (Tensor): classification target of each anchor with
+                shape (num_anchors,)
+            label_weight (Tensor): Classification loss weight of each
+                anchor with shape (num_anchors).
+            bbox_weight (Tensor): Bbox weight of each anchor with shape
+                (num_anchors, 4).
+            pos_inds (Tensor): Index of all positive samples got from
+                first assign process.
+            pos_gt_inds (Tensor): Gt_index of all positive samples got
+                from first assign process.
+            anchors (list[Tensor]): Anchors of each scale.
+
+        Returns:
+            tuple: Usually returns a tuple containing learning targets.
+
+                - label (Tensor): classification target of each anchor after
+                  paa assign, with shape (num_anchors,)
+                - label_weight (Tensor): Classification loss weight of each
+                  anchor after paa assign, with shape (num_anchors).
+                - bbox_weight (Tensor): Bbox weight of each anchor with shape
+                  (num_anchors, 4).
+                - num_pos (int): The number of positive samples after paa
+                  assign.
+        """
+        if not len(pos_inds):
+            return label, label_weight, bbox_weight, 0
+        label = label.clone()
+        label_weight = label_weight.clone()
+        bbox_weight = bbox_weight.clone()
+        num_gt = pos_gt_inds.max() + 1
+        num_level = len(anchors)
+        num_anchors_each_level = [item.size(0) for item in anchors]
+        num_anchors_each_level.insert(0, 0)
+        inds_level_interval = np.cumsum(num_anchors_each_level)
+        pos_level_mask = []
+        for i in range(num_level):
+            mask = (pos_inds >= inds_level_interval[i]) & (
+                pos_inds < inds_level_interval[i + 1])
+            pos_level_mask.append(mask)
+        pos_inds_after_paa = [label.new_tensor([])]
+        ignore_inds_after_paa = [label.new_tensor([])]
+        for gt_ind in range(num_gt):
+            pos_inds_gmm = []
+            pos_loss_gmm = []
+            gt_mask = pos_gt_inds == gt_ind
+            for level in range(num_level):
+                level_mask = pos_level_mask[level]
+                level_gt_mask = level_mask & gt_mask
+                value, topk_inds = pos_losses[level_gt_mask].topk(
+                    min(level_gt_mask.sum(), self.topk), largest=False)
+                pos_inds_gmm.append(pos_inds[level_gt_mask][topk_inds])
+                pos_loss_gmm.append(value)
+            pos_inds_gmm = torch.cat(pos_inds_gmm)
+            pos_loss_gmm = torch.cat(pos_loss_gmm)
+            # GMM needs >= 2 samples; skipped candidates end up as background
+            if len(pos_inds_gmm) < 2:
+                continue
+            device = pos_inds_gmm.device
+            pos_loss_gmm, sort_inds = pos_loss_gmm.sort()
+            pos_inds_gmm = pos_inds_gmm[sort_inds]
+            pos_loss_gmm = pos_loss_gmm.view(-1, 1).cpu().numpy()
+            min_loss, max_loss = pos_loss_gmm.min(), pos_loss_gmm.max()
+            means_init = np.array([min_loss, max_loss]).reshape(2, 1)
+            weights_init = np.array([0.5, 0.5])
+            precisions_init = np.array([1.0, 1.0]).reshape(2, 1, 1)  # full
+            if self.covariance_type == 'spherical':
+                precisions_init = precisions_init.reshape(2)
+            elif self.covariance_type == 'diag':
+                precisions_init = precisions_init.reshape(2, 1)
+            elif self.covariance_type == 'tied':
+                precisions_init = np.array([[1.0]])
+            if skm is None:
+                raise ImportError('Please run "pip install scikit-learn" '
+                                  'to install sklearn first.')
+            gmm = skm.GaussianMixture(
+                2,
+                weights_init=weights_init,
+                means_init=means_init,
+                precisions_init=precisions_init,
+                covariance_type=self.covariance_type)
+            gmm.fit(pos_loss_gmm)
+            gmm_assignment = gmm.predict(pos_loss_gmm)
+            scores = gmm.score_samples(pos_loss_gmm)
+            gmm_assignment = torch.from_numpy(gmm_assignment).to(device)
+            scores = torch.from_numpy(scores).to(device)
+
+            pos_inds_temp, ignore_inds_temp = self.gmm_separation_scheme(
+                gmm_assignment, scores, pos_inds_gmm)
+            pos_inds_after_paa.append(pos_inds_temp)
+            ignore_inds_after_paa.append(ignore_inds_temp)
+
+        pos_inds_after_paa = torch.cat(pos_inds_after_paa)
+        ignore_inds_after_paa = torch.cat(ignore_inds_after_paa)
+        # initial positives that were not kept are relabelled num_classes
+        reassign_mask = (pos_inds.unsqueeze(1) != pos_inds_after_paa).all(1)
+        reassign_ids = pos_inds[reassign_mask]
+        label[reassign_ids] = self.num_classes
+        label_weight[ignore_inds_after_paa] = 0
+        bbox_weight[reassign_ids] = 0
+        num_pos = len(pos_inds_after_paa)
+        return label, label_weight, bbox_weight, num_pos
+
+    def gmm_separation_scheme(self, gmm_assignment, scores, pos_inds_gmm):
+        """Split the GMM-fitted candidates into positive and ignored indices.
+
+        This default scheme keeps a prefix of the candidates assigned to
+        component 0 and never ignores anything; subclasses may override it
+        to implement other separation schemes.
+
+        Args:
+            gmm_assignment (Tensor): Component index (0/1) predicted by the
+                GMM for each candidate, shape (num_samples,).
+            scores (Tensor): Score of each candidate under the fitted GMM,
+                shape (num_samples,).
+            pos_inds_gmm (Tensor): Indices of the candidates that were used
+                to fit the GMM, shape (num_samples,).
+
+        Returns:
+            tuple[Tensor]: The indices of positive and ignored samples.
+
+                - pos_inds_temp (Tensor): Indices of positive samples.
+                - ignore_inds_temp (Tensor): Indices of ignore samples
+                  (always empty in this scheme).
+        """
+        # This corresponds to (c) of Fig.3 in the paper rather than (b); see
+        # https://github.com/kkhoot/PAA/issues/8 and
+        # https://github.com/kkhoot/PAA/issues/9.
+        in_first = gmm_assignment == 0
+        if not in_first.nonzero().numel():
+            # no candidate fell into component 0 -> nothing is positive
+            no_inds = in_first.new_tensor([], dtype=torch.long)
+            return no_inds, in_first.new_tensor([], dtype=torch.long)
+        # keep component-0 candidates up to (and including) the top-scored
+        _, peak = scores[in_first].topk(1)
+        kept = pos_inds_gmm[in_first][:peak + 1]
+        return kept, pos_inds_gmm.new_tensor([])
+
+    def get_targets(
+        self,
+        anchor_list,
+        valid_flag_list,
+        gt_bboxes_list,
+        img_metas,
+        gt_bboxes_ignore_list=None,
+        gt_labels_list=None,
+        label_channels=1,
+        unmap_outputs=True,
+    ):
+        """Get targets for PAA head.
+
+        This method is almost the same as `AnchorHead.get_targets()`. We
+        directly return the results from _get_targets_single instead of
+        mapping them to levels with the images_to_levels function.
+
+        Args:
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, 4).
+            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
+                each image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, )
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+            img_metas (list[dict]): Meta info of each image.
+            gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be
+                ignored.
+            gt_labels_list (list[Tensor]): Ground truth labels of each box.
+            label_channels (int): Channel of label.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple: Usually returns a tuple containing learning targets.
+
+                - labels (list[Tensor]): Labels of all anchors, each with
+                    shape (num_anchors,).
+                - label_weights (list[Tensor]): Label weights of all anchors,
+                    each with shape (num_anchors,).
+                - bbox_targets (list[Tensor]): BBox targets of all anchors,
+                    each with shape (num_anchors, 4).
+                - bbox_weights (list[Tensor]): BBox weights of all anchors,
+                    each with shape (num_anchors, 4).
+                - pos_inds (list[Tensor]): Indices of the positive samples
+                    among all anchors, one tensor per image.
+                - gt_inds (list[Tensor]): Assigned gt index of each positive
+                    sample, one tensor per image.
+        """
+
+        num_imgs = len(img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+        concat_anchor_list = []
+        concat_valid_flag_list = []
+        for i in range(num_imgs):
+            assert len(anchor_list[i]) == len(valid_flag_list[i])
+            concat_anchor_list.append(torch.cat(anchor_list[i]))
+            concat_valid_flag_list.append(torch.cat(valid_flag_list[i]))
+
+        # compute targets for each image
+        if gt_bboxes_ignore_list is None:
+            gt_bboxes_ignore_list = [None for _ in range(num_imgs)]
+        if gt_labels_list is None:
+            gt_labels_list = [None for _ in range(num_imgs)]
+        results = multi_apply(
+            self._get_targets_single,
+            concat_anchor_list,
+            concat_valid_flag_list,
+            gt_bboxes_list,
+            gt_bboxes_ignore_list,
+            gt_labels_list,
+            img_metas,
+            label_channels=label_channels,
+            unmap_outputs=unmap_outputs)
+
+        (labels, label_weights, bbox_targets, bbox_weights, valid_pos_inds,
+         valid_neg_inds, sampling_result) = results
+
+        # Because of the anchors' valid flags, the positive indices are
+        # recomputed from the labels w.r.t. the original anchor set.
+        pos_inds = []
+        for i, single_labels in enumerate(labels):
+            pos_mask = (0 <= single_labels) & (
+                single_labels < self.num_classes)
+            pos_inds.append(pos_mask.nonzero().view(-1))
+
+        gt_inds = [item.pos_assigned_gt_inds for item in sampling_result]
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                gt_inds)
+
+    def _get_targets_single(self,
+                            flat_anchors,
+                            valid_flags,
+                            gt_bboxes,
+                            gt_bboxes_ignore,
+                            gt_labels,
+                            img_meta,
+                            label_channels=1,
+                            unmap_outputs=True):
+        """Compute regression and classification targets for anchors in a
+        single image.
+
+        Delegates to the `_get_targets_single` found after `ATSSHead` in the
+        MRO (described upstream as `AnchorHead._get_targets_single()`).
+        """
+        assert unmap_outputs, 'We must map outputs back to the original ' \
+                              'set of anchors in PAAhead'
+        return super(ATSSHead, self)._get_targets_single(
+            flat_anchors,
+            valid_flags,
+            gt_bboxes,
+            gt_bboxes_ignore,
+            gt_labels,
+            img_meta,
+            label_channels=1,  # NOTE(review): caller's value not forwarded
+            unmap_outputs=True)
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def get_bboxes(self,
+                   cls_scores,
+                   bbox_preds,
+                   score_factors=None,
+                   img_metas=None,
+                   cfg=None,
+                   rescale=False,
+                   with_nms=True,
+                   **kwargs):  # thin wrapper: validate, then delegate
+        assert with_nms, 'PAA only supports "with_nms=True" now and it ' \
+                         'means PAAHead does not support ' \
+                         'test-time augmentation'
+        return super(ATSSHead, self).get_bboxes(cls_scores, bbox_preds,
+                                                score_factors, img_metas, cfg,
+                                                rescale, with_nms, **kwargs)
+
+    def _get_bboxes_single(self,
+                           cls_score_list,
+                           bbox_pred_list,
+                           score_factor_list,
+                           mlvl_priors,
+                           img_meta,
+                           cfg,
+                           rescale=False,
+                           with_nms=True,
+                           **kwargs):
+        """Transform outputs of a single image into bbox predictions.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            score_factor_list (list[Tensor]): Score factors from all scale
+                levels of a single image, each item has shape
+                (num_priors * 1, H, W).
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid, has shape
+                (num_priors, 4).
+            img_meta (dict): Image meta info.
+            cfg (mmcv.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+
+        Returns:
+            tuple[Tensor]: Whatever `_bbox_post_process` returns. As
+                overridden in this class it does not consult `with_nms`
+                and always runs NMS, so the generic "with_nms=False"
+                return format does not apply here. The result has the
+                following format
+
+                - det_bboxes (Tensor): Predicted bboxes with shape \
+                    [num_bboxes, 5], where the first 4 columns are bounding \
+                    box positions (tl_x, tl_y, br_x, br_y) and the 5-th \
+                    column are scores between 0 and 1.
+                - det_labels (Tensor): Predicted labels of the corresponding \
+                    box with shape [num_bboxes].
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+        # NOTE: level_idx from enumerate below is not used in the loop body.
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_score_factors = []
+        for level_idx, (cls_score, bbox_pred, score_factor, priors) in \
+                enumerate(zip(cls_score_list, bbox_pred_list,
+                              score_factor_list, mlvl_priors)):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            # flatten each map to one row per prior; sigmoid on the scores
+            scores = cls_score.permute(1, 2, 0).reshape(
+                -1, self.cls_out_channels).sigmoid()
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+            score_factor = score_factor.permute(1, 2, 0).reshape(-1).sigmoid()
+            # keep the nms_pre priors with the largest sqrt(score * factor)
+            if 0 < nms_pre < scores.shape[0]:
+                max_scores, _ = (scores *
+                                 score_factor[:, None]).sqrt().max(dim=1)
+                _, topk_inds = max_scores.topk(nms_pre)
+                priors = priors[topk_inds, :]
+                bbox_pred = bbox_pred[topk_inds, :]
+                scores = scores[topk_inds, :]
+                score_factor = score_factor[topk_inds]
+            # decode w.r.t. the priors; img_shape is handed over as max_shape
+            bboxes = self.bbox_coder.decode(
+                priors, bbox_pred, max_shape=img_shape)
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_score_factors.append(score_factor)
+
+        return self._bbox_post_process(mlvl_scores, mlvl_bboxes,
+                                       img_meta['scale_factor'], cfg, rescale,
+                                       with_nms, mlvl_score_factors, **kwargs)
+
+    def _bbox_post_process(self,
+                           mlvl_scores,
+                           mlvl_bboxes,
+                           scale_factor,
+                           cfg,
+                           rescale=False,
+                           with_nms=True,
+                           mlvl_score_factors=None,
+                           **kwargs):
+        """bbox post-processing method.
+
+        The boxes are optionally rescaled to the original image scale, then
+        multi-class NMS (and optionally score voting) is applied.
+
+        Args:
+            mlvl_scores (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_bboxes, num_class).
+            mlvl_bboxes (list[Tensor]): Decoded bboxes from all scale
+                levels of a single image, each item has shape (num_bboxes, 4).
+            scale_factor (ndarray, optional): Scale factor of the image,
+                arranged as (w_scale, h_scale, w_scale, h_scale).
+            cfg (mmcv.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+            mlvl_score_factors (list[Tensor], optional): Score factor from
+                all scale levels of a single image, each item has shape
+                (num_bboxes, ). Default: None.
+
+        Returns:
+            tuple[Tensor]: Detected bboxes and labels. NMS is always
+                applied: `with_nms` is not consulted in this method. Score
+                voting follows when `self.with_score_voting` is set and
+                at least one box survived NMS. The result has the
+                following format
+
+                - det_bboxes (Tensor): Predicted bboxes with shape \
+                    [num_bboxes, 5], where the first 4 columns are bounding \
+                    box positions (tl_x, tl_y, br_x, br_y) and the 5-th \
+                    column are scores between 0 and 1.
+                - det_labels (Tensor): Predicted labels of the corresponding \
+                    box with shape [num_bboxes].
+        """
+        mlvl_bboxes = torch.cat(mlvl_bboxes)
+        if rescale:
+            mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
+        mlvl_scores = torch.cat(mlvl_scores)
+        # Append a dummy background column after the sigmoid class scores.
+        # FG labels are [0, num_class-1] since mmdet v2.0, so the
+        # BG cat_id is num_class.
+        padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
+        mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
+
+        mlvl_iou_preds = torch.cat(mlvl_score_factors)  # must not be None
+        mlvl_nms_scores = (mlvl_scores * mlvl_iou_preds[:, None]).sqrt()
+        det_bboxes, det_labels = multiclass_nms(
+            mlvl_bboxes,
+            mlvl_nms_scores,
+            cfg.score_thr,
+            cfg.nms,
+            cfg.max_per_img,
+            score_factors=None)
+        if self.with_score_voting and len(det_bboxes) > 0:
+            det_bboxes, det_labels = self.score_voting(det_bboxes, det_labels,
+                                                       mlvl_bboxes,
+                                                       mlvl_nms_scores,
+                                                       cfg.score_thr)
+
+        return det_bboxes, det_labels
+
+    def score_voting(self, det_bboxes, det_labels, mlvl_bboxes,
+                     mlvl_nms_scores, score_thr):
+        """Refine every box kept by NMS with a score-weighted average of the
+        same-class candidate boxes that overlap it.
+
+        Args:
+            det_bboxes (Tensor): Remaining boxes after NMS procedure,
+                with shape (k, 5), each dimension means
+                (x1, y1, x2, y2, score).
+            det_labels (Tensor): The label of remaining boxes, with shape
+                (k,). Labels are 0-based.
+            mlvl_bboxes (Tensor): All boxes before the NMS procedure,
+                with shape (num_anchors, 4).
+            mlvl_nms_scores (Tensor): The scores of all boxes which were
+                used in the NMS procedure, one column per class.
+            score_thr (float): The score threshold of bboxes.
+
+        Returns:
+            tuple: The voting results, grouped by class.
+
+                - det_bboxes_voted (Tensor): Voted boxes with their original
+                    scores, shape (k', 5); empty (0 rows) when no class
+                    has a candidate above `score_thr`.
+                - det_labels_voted (Tensor): Label of each voted box,
+                    shape (k',).
+        """
+        candidate_mask = mlvl_nms_scores > score_thr
+        candidate_mask_nonzeros = candidate_mask.nonzero(as_tuple=False)
+        candidate_inds = candidate_mask_nonzeros[:, 0]
+        candidate_labels = candidate_mask_nonzeros[:, 1]
+        candidate_bboxes = mlvl_bboxes[candidate_inds]
+        candidate_scores = mlvl_nms_scores[candidate_mask]
+        det_bboxes_voted = []
+        det_labels_voted = []
+        for cls in range(self.cls_out_channels):
+            candidate_cls_mask = candidate_labels == cls
+            if not candidate_cls_mask.any():
+                continue
+            candidate_cls_scores = candidate_scores[candidate_cls_mask]
+            candidate_cls_bboxes = candidate_bboxes[candidate_cls_mask]
+            det_cls_mask = det_labels == cls
+            det_cls_bboxes = det_bboxes[det_cls_mask].view(
+                -1, det_bboxes.size(-1))
+            det_candidate_ious = bbox_overlaps(det_cls_bboxes[:, :4],
+                                               candidate_cls_bboxes)
+            for det_ind in range(len(det_cls_bboxes)):
+                single_det_ious = det_candidate_ious[det_ind]
+                pos_ious_mask = single_det_ious > 0.01
+                pos_ious = single_det_ious[pos_ious_mask]
+                pos_bboxes = candidate_cls_bboxes[pos_ious_mask]
+                pos_scores = candidate_cls_scores[pos_ious_mask]
+                pis = (torch.exp(-(1 - pos_ious)**2 / 0.025) *
+                       pos_scores)[:, None]  # per-candidate voting weight
+                voted_box = (pis * pos_bboxes).sum(dim=0) / pis.sum(dim=0)
+                voted_score = det_cls_bboxes[det_ind][-1:][None, :]
+                det_bboxes_voted.append(
+                    torch.cat((voted_box[None, :], voted_score), dim=1))
+                det_labels_voted.append(cls)
+        if not det_bboxes_voted:
+            # Nothing was voted (e.g. no detections): torch.cat([]) raises.
+            return det_bboxes[:0], det_labels[:0]
+        det_bboxes_voted = torch.cat(det_bboxes_voted, dim=0)
+        det_labels_voted = det_labels.new_tensor(det_labels_voted)
+        return det_bboxes_voted, det_labels_voted
diff --git a/mmdet/models/dense_heads/pisa_retinanet_head.py b/mmdet/models/dense_heads/pisa_retinanet_head.py
new file mode 100755
index 0000000..8654ef4
--- /dev/null
+++ b/mmdet/models/dense_heads/pisa_retinanet_head.py
@@ -0,0 +1,155 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.runner import force_fp32
+
+from mmdet.core import images_to_levels
+from ..builder import HEADS
+from ..losses import carl_loss, isr_p
+from .retina_head import RetinaHead
+
+
+@HEADS.register_module()
+class PISARetinaHead(RetinaHead):
+    """PISA Retinanet Head.
+
+    The head has the same structure as the Retinanet Head, but differs in two
+        aspects:
+        1. Importance-based Sample Reweighting Positive (ISR-P) is applied to
+            change the positive loss weights.
+        2. Classification-aware regression loss is adopted as a third loss.
+    """
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            gt_bboxes (list[Tensor]): Ground truth bboxes of each image
+                with shape (num_obj, 4).
+            gt_labels (list[Tensor]): Ground truth labels of each image
+                with shape (num_obj, ).
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (list[Tensor]): Ignored gt bboxes of each image.
+                Default: None.
+
+        Returns:
+            dict: Loss dict with ``loss_cls`` and ``loss_bbox``, plus the CARL
+                loss terms when ``train_cfg.carl`` is set; None if no targets.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels,
+            return_sampling_results=True)
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg, sampling_results_list) = cls_reg_targets
+        num_total_samples = (
+            num_total_pos + num_total_neg if self.sampling else num_total_pos)
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        # concat all level anchors of each image to a single tensor
+        concat_anchor_list = []
+        for i in range(len(anchor_list)):
+            concat_anchor_list.append(torch.cat(anchor_list[i]))
+        all_anchor_list = images_to_levels(concat_anchor_list,
+                                           num_level_anchors)
+
+        num_imgs = len(img_metas)
+        flatten_cls_scores = [
+            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, label_channels)
+            for cls_score in cls_scores
+        ]
+        flatten_cls_scores = torch.cat(
+            flatten_cls_scores, dim=1).reshape(-1,
+                                               flatten_cls_scores[0].size(-1))
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
+            for bbox_pred in bbox_preds
+        ]
+        flatten_bbox_preds = torch.cat(
+            flatten_bbox_preds, dim=1).view(-1, flatten_bbox_preds[0].size(-1))
+        flatten_labels = torch.cat(labels_list, dim=1).reshape(-1)
+        flatten_label_weights = torch.cat(
+            label_weights_list, dim=1).reshape(-1)
+        flatten_anchors = torch.cat(all_anchor_list, dim=1).reshape(-1, 4)
+        flatten_bbox_targets = torch.cat(
+            bbox_targets_list, dim=1).reshape(-1, 4)
+        flatten_bbox_weights = torch.cat(
+            bbox_weights_list, dim=1).reshape(-1, 4)
+
+        # Apply ISR-P (only when configured; runs without gradient tracking)
+        isr_cfg = self.train_cfg.get('isr', None)
+        if isr_cfg is not None:
+            all_targets = (flatten_labels, flatten_label_weights,
+                           flatten_bbox_targets, flatten_bbox_weights)
+            with torch.no_grad():
+                all_targets = isr_p(
+                    flatten_cls_scores,
+                    flatten_bbox_preds,
+                    all_targets,
+                    flatten_anchors,
+                    sampling_results_list,
+                    bbox_coder=self.bbox_coder,
+                    loss_cls=self.loss_cls,
+                    num_class=self.num_classes,
+                    **self.train_cfg.isr)
+            (flatten_labels, flatten_label_weights, flatten_bbox_targets,
+             flatten_bbox_weights) = all_targets
+
+        # For convenience we compute loss once instead separating by fpn level,
+        # so that we don't need to separate the weights by level again.
+        # The result should be the same
+        losses_cls = self.loss_cls(
+            flatten_cls_scores,
+            flatten_labels,
+            flatten_label_weights,
+            avg_factor=num_total_samples)
+        losses_bbox = self.loss_bbox(
+            flatten_bbox_preds,
+            flatten_bbox_targets,
+            flatten_bbox_weights,
+            avg_factor=num_total_samples)
+        loss_dict = dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
+
+        # CARL Loss (only when configured; averaged over positives only)
+        carl_cfg = self.train_cfg.get('carl', None)
+        if carl_cfg is not None:
+            loss_carl = carl_loss(
+                flatten_cls_scores,
+                flatten_labels,
+                flatten_bbox_preds,
+                flatten_bbox_targets,
+                self.loss_bbox,
+                **self.train_cfg.carl,
+                avg_factor=num_total_pos,
+                sigmoid=True,
+                num_class=self.num_classes)
+            loss_dict.update(loss_carl)
+
+        return loss_dict
diff --git a/mmdet/models/dense_heads/pisa_ssd_head.py b/mmdet/models/dense_heads/pisa_ssd_head.py
new file mode 100755
index 0000000..86b67ab
--- /dev/null
+++ b/mmdet/models/dense_heads/pisa_ssd_head.py
@@ -0,0 +1,140 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet.core import multi_apply
+from ..builder import HEADS
+from ..losses import CrossEntropyLoss, SmoothL1Loss, carl_loss, isr_p
+from .ssd_head import SSDHead
+
+
+# TODO: add loss evaluator for SSD
+@HEADS.register_module()
+class PISASSDHead(SSDHead):
+
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            gt_bboxes (list[Tensor]): Ground truth bboxes of each image
+                with shape (num_obj, 4).
+            gt_labels (list[Tensor]): Ground truth labels of each image
+                with shape (num_obj, ).
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (list[Tensor]): Ignored gt bboxes of each image.
+                Default: None.
+
+        Returns:
+            dict: Loss dict with ``loss_cls`` and ``loss_bbox``, plus the CARL
+                loss terms when ``train_cfg.carl`` is set; None if no targets.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=1,
+            unmap_outputs=False,
+            return_sampling_results=True)
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg, sampling_results_list) = cls_reg_targets
+
+        num_images = len(img_metas)
+        all_cls_scores = torch.cat([
+            s.permute(0, 2, 3, 1).reshape(
+                num_images, -1, self.cls_out_channels) for s in cls_scores
+        ], 1)
+        all_labels = torch.cat(labels_list, -1).view(num_images, -1)
+        all_label_weights = torch.cat(label_weights_list,
+                                      -1).view(num_images, -1)
+        all_bbox_preds = torch.cat([
+            b.permute(0, 2, 3, 1).reshape(num_images, -1, 4)
+            for b in bbox_preds
+        ], -2)
+        all_bbox_targets = torch.cat(bbox_targets_list,
+                                     -2).view(num_images, -1, 4)
+        all_bbox_weights = torch.cat(bbox_weights_list,
+                                     -2).view(num_images, -1, 4)
+
+        # concat all level anchors of each image to a single tensor
+        all_anchors = []
+        for i in range(num_images):
+            all_anchors.append(torch.cat(anchor_list[i]))
+
+        isr_cfg = self.train_cfg.get('isr', None)
+        all_targets = (all_labels.view(-1), all_label_weights.view(-1),
+                       all_bbox_targets.view(-1,
+                                             4), all_bbox_weights.view(-1, 4))
+        # apply ISR-P (only when configured) on the flattened targets
+        if isr_cfg is not None:
+            all_targets = isr_p(
+                all_cls_scores.view(-1, all_cls_scores.size(-1)),
+                all_bbox_preds.view(-1, 4),
+                all_targets,
+                torch.cat(all_anchors),
+                sampling_results_list,
+                loss_cls=CrossEntropyLoss(),
+                bbox_coder=self.bbox_coder,
+                **self.train_cfg.isr,
+                num_class=self.num_classes)
+            (new_labels, new_label_weights, new_bbox_targets,
+             new_bbox_weights) = all_targets
+            all_labels = new_labels.view(all_labels.shape)
+            all_label_weights = new_label_weights.view(all_label_weights.shape)
+            all_bbox_targets = new_bbox_targets.view(all_bbox_targets.shape)
+            all_bbox_weights = new_bbox_weights.view(all_bbox_weights.shape)
+
+        # add CARL loss (uses the ISR-updated flat targets when ISR ran)
+        carl_loss_cfg = self.train_cfg.get('carl', None)
+        if carl_loss_cfg is not None:
+            loss_carl = carl_loss(
+                all_cls_scores.view(-1, all_cls_scores.size(-1)),
+                all_targets[0],
+                all_bbox_preds.view(-1, 4),
+                all_targets[2],
+                SmoothL1Loss(beta=1.),
+                **self.train_cfg.carl,
+                avg_factor=num_total_pos,
+                num_class=self.num_classes)
+
+        # check NaN and Inf
+        assert torch.isfinite(all_cls_scores).all().item(), \
+            'classification scores become infinite or NaN!'
+        assert torch.isfinite(all_bbox_preds).all().item(), \
+            'bbox predications become infinite or NaN!'
+
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_single,
+            all_cls_scores,
+            all_bbox_preds,
+            all_anchors,
+            all_labels,
+            all_label_weights,
+            all_bbox_targets,
+            all_bbox_weights,
+            num_total_samples=num_total_pos)
+        loss_dict = dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
+        if carl_loss_cfg is not None:
+            loss_dict.update(loss_carl)
+        return loss_dict
diff --git a/mmdet/models/dense_heads/reppoints_head.py b/mmdet/models/dense_heads/reppoints_head.py
new file mode 100755
index 0000000..f720414
--- /dev/null
+++ b/mmdet/models/dense_heads/reppoints_head.py
@@ -0,0 +1,764 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.ops import DeformConv2d
+
+from mmdet.core import (build_assigner, build_sampler, images_to_levels,
+                        multi_apply, unmap)
+from mmdet.core.anchor.point_generator import MlvlPointGenerator
+from mmdet.core.utils import filter_scores_and_topk
+from ..builder import HEADS, build_loss
+from .anchor_free_head import AnchorFreeHead
+
+
+@HEADS.register_module()
+class RepPointsHead(AnchorFreeHead):
+    """RepPoint head.
+
+    Args:
+        point_feat_channels (int): Number of channels of points features.
+        gradient_mul (float): The multiplier to gradients from
+            points refinement and recognition.
+        point_strides (Iterable): points strides.
+        point_base_scale (int): bbox scale for assigning labels.
+        loss_cls (dict): Config of classification loss.
+        loss_bbox_init (dict): Config of initial points loss.
+        loss_bbox_refine (dict): Config of points loss in refinement.
+        use_grid_points (bool): If we use bounding box representation, the
+        reppoints is represented as grid points on the bounding box.
+        center_init (bool): Whether to use center point assignment.
+        transform_method (str): The methods to transform RepPoints to bbox.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """  # noqa: W605
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 point_feat_channels=256,
+                 num_points=9,
+                 gradient_mul=0.1,
+                 point_strides=[8, 16, 32, 64, 128],
+                 point_base_scale=4,
+                 loss_cls=dict(
+                     type='FocalLoss',
+                     use_sigmoid=True,
+                     gamma=2.0,
+                     alpha=0.25,
+                     loss_weight=1.0),
+                 loss_bbox_init=dict(
+                     type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.5),
+                 loss_bbox_refine=dict(
+                     type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+                 use_grid_points=False,
+                 center_init=True,
+                 transform_method='moment',
+                 moment_mul=0.01,
+                 init_cfg=dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='reppoints_cls_out',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs):
+        self.num_points = num_points
+        self.point_feat_channels = point_feat_channels
+        self.use_grid_points = use_grid_points
+        self.center_init = center_init
+
+        # deform conv extracts point features; its kernel is sqrt(num_points)
+        self.dcn_kernel = int(np.sqrt(num_points))
+        self.dcn_pad = int((self.dcn_kernel - 1) / 2)
+        assert self.dcn_kernel * self.dcn_kernel == num_points, \
+            'The points number should be a square number.'
+        assert self.dcn_kernel % 2 == 1, \
+            'The points number should be an odd square number.'
+        dcn_base = np.arange(-self.dcn_pad,
+                             self.dcn_pad + 1).astype(np.float64)
+        dcn_base_y = np.repeat(dcn_base, self.dcn_kernel)
+        dcn_base_x = np.tile(dcn_base, self.dcn_kernel)
+        dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape(
+            (-1))
+        self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1)
+
+        super().__init__(
+            num_classes,
+            in_channels,
+            loss_cls=loss_cls,
+            init_cfg=init_cfg,
+            **kwargs)
+
+        self.gradient_mul = gradient_mul
+        self.point_base_scale = point_base_scale
+        self.point_strides = point_strides
+        self.prior_generator = MlvlPointGenerator(
+            self.point_strides, offset=0.)
+
+        self.sampling = loss_cls['type'] not in ['FocalLoss']
+        if self.train_cfg:
+            self.init_assigner = build_assigner(self.train_cfg.init.assigner)
+            self.refine_assigner = build_assigner(
+                self.train_cfg.refine.assigner)
+            # use PseudoSampler if not sampling or no sampler is configured
+            if self.sampling and hasattr(self.train_cfg, 'sampler'):
+                sampler_cfg = self.train_cfg.sampler
+            else:
+                sampler_cfg = dict(type='PseudoSampler')
+            self.sampler = build_sampler(sampler_cfg, context=self)
+        self.transform_method = transform_method
+        if self.transform_method == 'moment':
+            self.moment_transfer = nn.Parameter(
+                data=torch.zeros(2), requires_grad=True)
+            self.moment_mul = moment_mul
+
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        if self.use_sigmoid_cls:
+            self.cls_out_channels = self.num_classes
+        else:
+            self.cls_out_channels = self.num_classes + 1
+        self.loss_bbox_init = build_loss(loss_bbox_init)
+        self.loss_bbox_refine = build_loss(loss_bbox_refine)
+
+    def _init_layers(self):
+        """Initialize the conv towers, deformable convs and output convs."""
+        self.relu = nn.ReLU(inplace=True)  # reused by every output branch
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+            self.reg_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        pts_out_dim = 4 if self.use_grid_points else 2 * self.num_points
+        self.reppoints_cls_conv = DeformConv2d(self.feat_channels,
+                                               self.point_feat_channels,
+                                               self.dcn_kernel, 1,
+                                               self.dcn_pad)
+        self.reppoints_cls_out = nn.Conv2d(self.point_feat_channels,
+                                           self.cls_out_channels, 1, 1, 0)
+        self.reppoints_pts_init_conv = nn.Conv2d(self.feat_channels,
+                                                 self.point_feat_channels, 3,
+                                                 1, 1)
+        self.reppoints_pts_init_out = nn.Conv2d(self.point_feat_channels,
+                                                pts_out_dim, 1, 1, 0)
+        self.reppoints_pts_refine_conv = DeformConv2d(self.feat_channels,
+                                                      self.point_feat_channels,
+                                                      self.dcn_kernel, 1,
+                                                      self.dcn_pad)
+        self.reppoints_pts_refine_out = nn.Conv2d(self.point_feat_channels,
+                                                  pts_out_dim, 1, 1, 0)
+
+    def points2bbox(self, pts, y_first=True):
+        """Convert point sets into bounding boxes.
+
+        :param pts: the input point sets (fields), each point
+            set (field) is represented as 2n scalars.
+        :param y_first: if y_first=True, the point set is represented as
+            [y1, x1, y2, x2 ... yn, xn], otherwise the point set is
+            represented as [x1, y1, x2, y2 ... xn, yn].
+        :return: each point set converted to a bbox [x1, y1, x2, y2].
+        """
+        pts_reshape = pts.view(pts.shape[0], -1, 2, *pts.shape[2:])
+        pts_y = pts_reshape[:, :, 0, ...] if y_first else pts_reshape[:, :, 1,
+                                                                      ...]
+        pts_x = pts_reshape[:, :, 1, ...] if y_first else pts_reshape[:, :, 0,
+                                                                      ...]
+        if self.transform_method == 'minmax':  # box spanning all points
+            bbox_left = pts_x.min(dim=1, keepdim=True)[0]
+            bbox_right = pts_x.max(dim=1, keepdim=True)[0]
+            bbox_up = pts_y.min(dim=1, keepdim=True)[0]
+            bbox_bottom = pts_y.max(dim=1, keepdim=True)[0]
+            bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom],
+                             dim=1)
+        elif self.transform_method == 'partial_minmax':  # first 4 points only
+            pts_y = pts_y[:, :4, ...]
+            pts_x = pts_x[:, :4, ...]
+            bbox_left = pts_x.min(dim=1, keepdim=True)[0]
+            bbox_right = pts_x.max(dim=1, keepdim=True)[0]
+            bbox_up = pts_y.min(dim=1, keepdim=True)[0]
+            bbox_bottom = pts_y.max(dim=1, keepdim=True)[0]
+            bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom],
+                             dim=1)
+        elif self.transform_method == 'moment':  # mean +/- scaled std
+            pts_y_mean = pts_y.mean(dim=1, keepdim=True)
+            pts_x_mean = pts_x.mean(dim=1, keepdim=True)
+            pts_y_std = torch.std(pts_y - pts_y_mean, dim=1, keepdim=True)
+            pts_x_std = torch.std(pts_x - pts_x_mean, dim=1, keepdim=True)
+            moment_transfer = (self.moment_transfer * self.moment_mul) + (
+                self.moment_transfer.detach() * (1 - self.moment_mul))
+            moment_width_transfer = moment_transfer[0]
+            moment_height_transfer = moment_transfer[1]
+            half_width = pts_x_std * torch.exp(moment_width_transfer)
+            half_height = pts_y_std * torch.exp(moment_height_transfer)
+            bbox = torch.cat([
+                pts_x_mean - half_width, pts_y_mean - half_height,
+                pts_x_mean + half_width, pts_y_mean + half_height
+            ],
+                             dim=1)
+        else:
+            raise NotImplementedError
+        return bbox
+
+    def gen_grid_from_reg(self, reg, previous_boxes):
+        """Based on the previous bboxes and regression values, we compute the
+        regressed bboxes and generate the grids on the bboxes.
+
+        :param reg: the regression value to previous bboxes.
+        :param previous_boxes: previous bboxes.
+        :return: the (y, x) grid points and the regressed bboxes.
+        """
+        b, _, h, w = reg.shape
+        bxy = (previous_boxes[:, :2, ...] + previous_boxes[:, 2:, ...]) / 2.
+        bwh = (previous_boxes[:, 2:, ...] -
+               previous_boxes[:, :2, ...]).clamp(min=1e-6)
+        grid_topleft = bxy + bwh * reg[:, :2, ...] - 0.5 * bwh * torch.exp(
+            reg[:, 2:, ...])
+        grid_wh = bwh * torch.exp(reg[:, 2:, ...])
+        grid_left = grid_topleft[:, [0], ...]
+        grid_top = grid_topleft[:, [1], ...]
+        grid_width = grid_wh[:, [0], ...]
+        grid_height = grid_wh[:, [1], ...]
+        intervel = torch.linspace(0., 1., self.dcn_kernel).view(
+            1, self.dcn_kernel, 1, 1).type_as(reg)
+        grid_x = grid_left + grid_width * intervel
+        grid_x = grid_x.unsqueeze(1).repeat(1, self.dcn_kernel, 1, 1, 1)
+        grid_x = grid_x.view(b, -1, h, w)
+        grid_y = grid_top + grid_height * intervel
+        grid_y = grid_y.unsqueeze(2).repeat(1, 1, self.dcn_kernel, 1, 1)
+        grid_y = grid_y.view(b, -1, h, w)
+        grid_yx = torch.stack([grid_y, grid_x], dim=2)
+        grid_yx = grid_yx.view(b, -1, h, w)
+        regressed_bbox = torch.cat([
+            grid_left, grid_top, grid_left + grid_width, grid_top + grid_height
+        ], 1)
+        return grid_yx, regressed_bbox
+
+    def forward(self, feats):
+        return multi_apply(self.forward_single, feats)  # one call per level
+
+    def forward_single(self, x):
+        """Forward feature map of a single FPN level."""
+        dcn_base_offset = self.dcn_base_offset.type_as(x)
+        # If we use center_init, the initial reppoints is from center points.
+        # If we use bounding bbox representation, the initial reppoints is
+        #   from regular grid placed on a pre-defined bbox.
+        if self.use_grid_points or not self.center_init:
+            scale = self.point_base_scale / 2
+            points_init = dcn_base_offset / dcn_base_offset.max() * scale
+            bbox_init = x.new_tensor([-scale, -scale, scale,
+                                      scale]).view(1, 4, 1, 1)
+        else:
+            points_init = 0
+        cls_feat = x
+        pts_feat = x
+        for cls_conv in self.cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in self.reg_convs:
+            pts_feat = reg_conv(pts_feat)
+        # initialize reppoints
+        pts_out_init = self.reppoints_pts_init_out(
+            self.relu(self.reppoints_pts_init_conv(pts_feat)))
+        if self.use_grid_points:
+            pts_out_init, bbox_out_init = self.gen_grid_from_reg(
+                pts_out_init, bbox_init.detach())
+        else:
+            pts_out_init = pts_out_init + points_init
+        # refine and classify reppoints (init grads scaled by gradient_mul)
+        pts_out_init_grad_mul = (1 - self.gradient_mul) * pts_out_init.detach(
+        ) + self.gradient_mul * pts_out_init
+        dcn_offset = pts_out_init_grad_mul - dcn_base_offset
+        cls_out = self.reppoints_cls_out(
+            self.relu(self.reppoints_cls_conv(cls_feat, dcn_offset)))
+        pts_out_refine = self.reppoints_pts_refine_out(
+            self.relu(self.reppoints_pts_refine_conv(pts_feat, dcn_offset)))
+        if self.use_grid_points:
+            pts_out_refine, bbox_out_refine = self.gen_grid_from_reg(
+                pts_out_refine, bbox_out_init.detach())
+        else:
+            pts_out_refine = pts_out_refine + pts_out_init.detach()
+
+        if self.training:  # training keeps raw point sets for the losses
+            return cls_out, pts_out_init, pts_out_refine
+        else:  # inference only needs boxes decoded from the refined points
+            return cls_out, self.points2bbox(pts_out_refine)
+
+    def get_points(self, featmap_sizes, img_metas, device):
+        """Get points according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            img_metas (list[dict]): Image meta info.
+            device (torch.device | str): Device for the points and flags.
+
+        Returns:
+            tuple: points of each image, valid flags of each image
+        """
+        num_imgs = len(img_metas)
+
+        # feature map sizes are shared by all images, so compute points once
+        multi_level_points = self.prior_generator.grid_priors(
+            featmap_sizes, device=device, with_stride=True)
+        points_list = [[point.clone() for point in multi_level_points]
+                       for _ in range(num_imgs)]
+
+        # per-image valid flags, created on the same device as the points
+        valid_flag_list = []
+        for img_meta in img_metas:
+            multi_level_flags = self.prior_generator.valid_flags(
+                featmap_sizes, img_meta['pad_shape'], device=device)
+            valid_flag_list.append(multi_level_flags)
+
+        return points_list, valid_flag_list
+
+    def centers_to_bboxes(self, point_list):
+        """Get square bboxes centered on the points of each level.
+
+        Only used in :class:`MaxIoUAssigner`.
+        """
+        bbox_list = []
+        for i_img, point in enumerate(point_list):
+            bbox = []
+            for i_lvl in range(len(self.point_strides)):
+                scale = self.point_base_scale * self.point_strides[i_lvl] * 0.5
+                bbox_shift = torch.Tensor([-scale, -scale, scale,
+                                           scale]).view(1, 4).type_as(point[0])
+                bbox_center = torch.cat(
+                    [point[i_lvl][:, :2], point[i_lvl][:, :2]], dim=1)
+                bbox.append(bbox_center + bbox_shift)  # center -/+ scale
+            bbox_list.append(bbox)
+        return bbox_list
+
+    def offset_to_pts(self, center_list, pred_list):
+        """Change from point offset to point coordinate."""
+        pts_list = []
+        for i_lvl in range(len(self.point_strides)):
+            pts_lvl = []
+            for i_img in range(len(center_list)):
+                pts_center = center_list[i_img][i_lvl][:, :2].repeat(
+                    1, self.num_points)
+                pts_shift = pred_list[i_lvl][i_img]
+                yx_pts_shift = pts_shift.permute(1, 2, 0).view(
+                    -1, 2 * self.num_points)
+                y_pts_shift = yx_pts_shift[..., 0::2]  # even channels: y
+                x_pts_shift = yx_pts_shift[..., 1::2]  # odd channels: x
+                xy_pts_shift = torch.stack([x_pts_shift, y_pts_shift], -1)
+                xy_pts_shift = xy_pts_shift.view(*yx_pts_shift.shape[:-1], -1)
+                pts = xy_pts_shift * self.point_strides[i_lvl] + pts_center
+                pts_lvl.append(pts)
+            pts_lvl = torch.stack(pts_lvl, 0)  # stack images of this level
+            pts_list.append(pts_lvl)
+        return pts_list
+
+    def _point_target_single(self,
+                             flat_proposals,
+                             valid_flags,
+                             gt_bboxes,
+                             gt_bboxes_ignore,
+                             gt_labels,
+                             stage='init',
+                             unmap_outputs=True):
+        inside_flags = valid_flags
+        if not inside_flags.any():
+            return (None, ) * 7  # no valid proposal, nothing to assign
+        # assign gt and sample proposals
+        proposals = flat_proposals[inside_flags, :]
+
+        if stage == 'init':
+            assigner = self.init_assigner
+            pos_weight = self.train_cfg.init.pos_weight
+        else:  # any other stage value uses the refine settings
+            assigner = self.refine_assigner
+            pos_weight = self.train_cfg.refine.pos_weight
+        assign_result = assigner.assign(proposals, gt_bboxes, gt_bboxes_ignore,
+                                        None if self.sampling else gt_labels)
+        sampling_result = self.sampler.sample(assign_result, proposals,
+                                              gt_bboxes)
+
+        num_valid_proposals = proposals.shape[0]
+        bbox_gt = proposals.new_zeros([num_valid_proposals, 4])
+        pos_proposals = torch.zeros_like(proposals)
+        proposals_weights = proposals.new_zeros([num_valid_proposals, 4])
+        labels = proposals.new_full((num_valid_proposals, ),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        label_weights = proposals.new_zeros(
+            num_valid_proposals, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            pos_gt_bboxes = sampling_result.pos_gt_bboxes
+            bbox_gt[pos_inds, :] = pos_gt_bboxes
+            pos_proposals[pos_inds, :] = proposals[pos_inds, :]
+            proposals_weights[pos_inds, :] = 1.0
+            if gt_labels is None:
+                # Only rpn gives gt_labels as None
+                # Foreground is the first class
+                labels[pos_inds] = 0
+            else:
+                labels[pos_inds] = gt_labels[
+                    sampling_result.pos_assigned_gt_inds]
+            if pos_weight <= 0:  # non-positive weight falls back to 1.0
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = pos_weight
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # map up to original set of proposals
+        if unmap_outputs:
+            num_total_proposals = flat_proposals.size(0)
+            labels = unmap(labels, num_total_proposals, inside_flags)
+            label_weights = unmap(label_weights, num_total_proposals,
+                                  inside_flags)
+            bbox_gt = unmap(bbox_gt, num_total_proposals, inside_flags)
+            pos_proposals = unmap(pos_proposals, num_total_proposals,
+                                  inside_flags)
+            proposals_weights = unmap(proposals_weights, num_total_proposals,
+                                      inside_flags)
+
+        return (labels, label_weights, bbox_gt, pos_proposals,
+                proposals_weights, pos_inds, neg_inds)
+
+    def get_targets(self,
+                    proposals_list,
+                    valid_flag_list,
+                    gt_bboxes_list,
+                    img_metas,
+                    gt_bboxes_ignore_list=None,
+                    gt_labels_list=None,
+                    stage='init',
+                    label_channels=1,
+                    unmap_outputs=True):
+        """Compute corresponding GT box and classification targets for
+        proposals.
+
+        Args:
+            proposals_list (list[list]): Multi level points/bboxes of each
+                image. Each entry is replaced in place by its concatenation.
+            valid_flag_list (list[list]): Multi level valid flags of each
+                image. Each entry is replaced in place by its concatenation.
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+            img_metas (list[dict]): Meta info of each image.
+            gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be
+                ignored.
+            gt_labels_list (list[Tensor]): Ground truth labels of each box.
+            stage (str): `init` or `refine`. Generate target for init stage or
+                refine stage
+            label_channels (int): Channel of label. Unused in this method.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple | None: ``None`` if any image yields no targets, else:
+                - labels_list (list[Tensor]): Labels of each level.
+                - label_weights_list (list[Tensor]): Label weights of each level.  # noqa: E501
+                - bbox_gt_list (list[Tensor]): Ground truth bbox of each level.
+                - proposal_list (list[Tensor]): Proposals(points/bboxes) of each level.  # noqa: E501
+                - proposal_weights_list (list[Tensor]): Proposal weights of each level.  # noqa: E501
+                - num_total_pos (int): Number of positive samples in all images.  # noqa: E501
+                - num_total_neg (int): Number of negative samples in all images.  # noqa: E501
+        """
+        assert stage in ['init', 'refine']
+        num_imgs = len(img_metas)
+        assert len(proposals_list) == len(valid_flag_list) == num_imgs
+
+        # number of points per level, read from the first image
+        num_level_proposals = [points.size(0) for points in proposals_list[0]]
+
+        # concat all level points and flags to a single tensor (in place)
+        for i in range(num_imgs):
+            assert len(proposals_list[i]) == len(valid_flag_list[i])
+            proposals_list[i] = torch.cat(proposals_list[i])
+            valid_flag_list[i] = torch.cat(valid_flag_list[i])
+
+        # compute targets for each image
+        if gt_bboxes_ignore_list is None:
+            gt_bboxes_ignore_list = [None for _ in range(num_imgs)]
+        if gt_labels_list is None:
+            gt_labels_list = [None for _ in range(num_imgs)]
+        (all_labels, all_label_weights, all_bbox_gt, all_proposals,
+         all_proposal_weights, pos_inds_list, neg_inds_list) = multi_apply(
+             self._point_target_single,
+             proposals_list,
+             valid_flag_list,
+             gt_bboxes_list,
+             gt_bboxes_ignore_list,
+             gt_labels_list,
+             stage=stage,
+             unmap_outputs=unmap_outputs)
+        # at least one image has no valid points
+        if any([labels is None for labels in all_labels]):
+            return None
+        # sampled points of all images; every image contributes at least 1
+        num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list])
+        num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list])
+        labels_list = images_to_levels(all_labels, num_level_proposals)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_proposals)
+        bbox_gt_list = images_to_levels(all_bbox_gt, num_level_proposals)
+        proposals_list = images_to_levels(all_proposals, num_level_proposals)
+        proposal_weights_list = images_to_levels(all_proposal_weights,
+                                                 num_level_proposals)
+        return (labels_list, label_weights_list, bbox_gt_list, proposals_list,
+                proposal_weights_list, num_total_pos, num_total_neg)
+
+    def loss_single(self, cls_score, pts_pred_init, pts_pred_refine, labels,
+                    label_weights, bbox_gt_init, bbox_weights_init,
+                    bbox_gt_refine, bbox_weights_refine, stride,
+                    num_total_samples_init, num_total_samples_refine):
+        """Compute classification and point losses of a single scale level.
+
+        Point predictions of both stages are turned into boxes with
+        ``self.points2bbox``; predicted and GT boxes are divided by
+        ``self.point_base_scale * stride`` before the box losses.
+
+        Returns:
+            tuple: ``(loss_cls, loss_pts_init, loss_pts_refine)``.
+        """
+        # classification loss, averaged over the refine-stage sample count
+        flat_scores = cls_score.permute(0, 2, 3, 1)
+        flat_scores = flat_scores.reshape(-1, self.cls_out_channels)
+        loss_cls = self.loss_cls(
+            flat_scores.contiguous(), labels.reshape(-1),
+            label_weights.reshape(-1), avg_factor=num_total_samples_refine)
+
+        # convert the point sets of both stages to boxes
+        box_init = self.points2bbox(
+            pts_pred_init.reshape(-1, 2 * self.num_points), y_first=False)
+        box_refine = self.points2bbox(
+            pts_pred_refine.reshape(-1, 2 * self.num_points), y_first=False)
+
+        # points loss on stride-normalized boxes
+        scale = self.point_base_scale * stride
+        loss_pts_init = self.loss_bbox_init(
+            box_init / scale, bbox_gt_init.reshape(-1, 4) / scale,
+            bbox_weights_init.reshape(-1, 4),
+            avg_factor=num_total_samples_init)
+        loss_pts_refine = self.loss_bbox_refine(
+            box_refine / scale, bbox_gt_refine.reshape(-1, 4) / scale,
+            bbox_weights_refine.reshape(-1, 4),
+            avg_factor=num_total_samples_refine)
+        return loss_cls, loss_pts_init, loss_pts_refine
+
+    def loss(self,
+             cls_scores,
+             pts_preds_init,
+             pts_preds_refine,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        device = cls_scores[0].device
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+
+        # target for initial stage (NOTE: get_targets may return None)
+        center_list, valid_flag_list = self.get_points(featmap_sizes,
+                                                       img_metas, device)
+        pts_coordinate_preds_init = self.offset_to_pts(center_list,
+                                                       pts_preds_init)
+        if self.train_cfg.init.assigner['type'] == 'PointAssigner':
+            # Assign target for center list
+            candidate_list = center_list
+        else:
+            # transform center list to bbox list and
+            #   assign target for bbox list
+            bbox_list = self.centers_to_bboxes(center_list)
+            candidate_list = bbox_list
+        cls_reg_targets_init = self.get_targets(
+            candidate_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            stage='init',
+            label_channels=label_channels)
+        (*_, bbox_gt_list_init, candidate_list_init, bbox_weights_list_init,
+         num_total_pos_init, num_total_neg_init) = cls_reg_targets_init
+        num_total_samples_init = (
+            num_total_pos_init +
+            num_total_neg_init if self.sampling else num_total_pos_init)
+
+        # refine-stage targets; get_targets() above flattened its input lists
+        center_list, valid_flag_list = self.get_points(featmap_sizes,
+                                                       img_metas, device)
+        pts_coordinate_preds_refine = self.offset_to_pts(
+            center_list, pts_preds_refine)
+        bbox_list = []  # detached init-stage boxes, used as refine candidates
+        for i_img, center in enumerate(center_list):
+            bbox = []
+            for i_lvl in range(len(pts_preds_refine)):
+                bbox_preds_init = self.points2bbox(
+                    pts_preds_init[i_lvl].detach())
+                bbox_shift = bbox_preds_init * self.point_strides[i_lvl]
+                bbox_center = torch.cat(
+                    [center[i_lvl][:, :2], center[i_lvl][:, :2]], dim=1)
+                bbox.append(bbox_center +
+                            bbox_shift[i_img].permute(1, 2, 0).reshape(-1, 4))
+            bbox_list.append(bbox)
+        cls_reg_targets_refine = self.get_targets(
+            bbox_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            stage='refine',
+            label_channels=label_channels)
+        (labels_list, label_weights_list, bbox_gt_list_refine,
+         candidate_list_refine, bbox_weights_list_refine, num_total_pos_refine,
+         num_total_neg_refine) = cls_reg_targets_refine
+        num_total_samples_refine = (
+            num_total_pos_refine +
+            num_total_neg_refine if self.sampling else num_total_pos_refine)
+
+        # compute loss; labels and label weights come from the refine stage
+        losses_cls, losses_pts_init, losses_pts_refine = multi_apply(
+            self.loss_single,
+            cls_scores,
+            pts_coordinate_preds_init,
+            pts_coordinate_preds_refine,
+            labels_list,
+            label_weights_list,
+            bbox_gt_list_init,
+            bbox_weights_list_init,
+            bbox_gt_list_refine,
+            bbox_weights_list_refine,
+            self.point_strides,
+            num_total_samples_init=num_total_samples_init,
+            num_total_samples_refine=num_total_samples_refine)
+        loss_dict_all = {
+            'loss_cls': losses_cls,
+            'loss_pts_init': losses_pts_init,
+            'loss_pts_refine': losses_pts_refine
+        }
+        return loss_dict_all
+
+    # Same as base_dense_head/_get_bboxes_single except self._bbox_decode
+    def _get_bboxes_single(self,
+                           cls_score_list,
+                           bbox_pred_list,
+                           score_factor_list,
+                           mlvl_priors,
+                           img_meta,
+                           cfg,
+                           rescale=False,
+                           with_nms=True,
+                           **kwargs):
+        """Transform outputs of a single image into bbox predictions.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            score_factor_list (list[Tensor]): Score factor from all scale
+                levels of a single image. RepPoints head does not need
+                this value.
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid, has shape
+                (num_priors, 2).
+            img_meta (dict): Image meta info.
+            cfg (mmcv.Config): Test / postprocessing configuration.
+                If None, ``self.test_cfg`` is used.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True. Additional ``**kwargs`` are ignored.
+
+        Returns:
+            tuple[Tensor]: Results of detected bboxes and labels. If with_nms
+                is False and mlvl_score_factor is None, return mlvl_bboxes and
+                mlvl_scores, else return mlvl_bboxes, mlvl_scores and
+                mlvl_score_factor. Usually with_nms is False is used for aug
+                test. If with_nms is True, then return the following format
+
+                - det_bboxes (Tensor): Predicted bboxes with shape \
+                    [num_bboxes, 5], where the first 4 columns are bounding \
+                    box positions (tl_x, tl_y, br_x, br_y) and the 5-th \
+                    column are scores between 0 and 1.
+                - det_labels (Tensor): Predicted labels of the corresponding \
+                    box with shape [num_bboxes].
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        assert len(cls_score_list) == len(bbox_pred_list)
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)  # -1 when not configured
+
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_labels = []
+        for level_idx, (cls_score, bbox_pred, priors) in enumerate(
+                zip(cls_score_list, bbox_pred_list, mlvl_priors)):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                scores = cls_score.softmax(-1)[:, :-1]
+
+            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
+            # this operation keeps fewer bboxes under the same `nms_pre`.
+            # There is no difference in performance for most models. If you
+            # find a slight drop in performance, you can set a larger
+            # `nms_pre` than before.
+            results = filter_scores_and_topk(
+                scores, cfg.score_thr, nms_pre,
+                dict(bbox_pred=bbox_pred, priors=priors))
+            scores, labels, _, filtered_results = results
+
+            bbox_pred = filtered_results['bbox_pred']
+            priors = filtered_results['priors']
+
+            bboxes = self._bbox_decode(priors, bbox_pred,
+                                       self.point_strides[level_idx],
+                                       img_shape)
+
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_labels.append(labels)
+
+        return self._bbox_post_process(
+            mlvl_scores,
+            mlvl_labels,
+            mlvl_bboxes,
+            img_meta['scale_factor'],
+            cfg,
+            rescale=rescale,
+            with_nms=with_nms)
+
+    def _bbox_decode(self, points, bbox_pred, stride, max_shape):
+        """Decode point-relative offsets into boxes clipped to max_shape."""
+        xy = points[:, :2]
+        decoded = bbox_pred * stride + torch.cat([xy, xy], dim=1)
+        limits = (max_shape[1], max_shape[0], max_shape[1], max_shape[0])
+        clipped = [
+            decoded[:, k].clamp(min=0, max=hi) for k, hi in enumerate(limits)
+        ]
+        return torch.stack(clipped, dim=-1)
diff --git a/mmdet/models/dense_heads/retina_head.py b/mmdet/models/dense_heads/retina_head.py
new file mode 100755
index 0000000..a48720c
--- /dev/null
+++ b/mmdet/models/dense_heads/retina_head.py
@@ -0,0 +1,115 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+
+from ..builder import HEADS
+from .anchor_head import AnchorHead
+
+
+@HEADS.register_module()
+class RetinaHead(AnchorHead):
+    r"""An anchor-based head used in `RetinaNet
+    <https://arxiv.org/pdf/1708.02002.pdf>`_.
+
+    The head contains two subnetworks. The first classifies anchor boxes and
+    the second regresses deltas for the anchors.
+
+    Example:
+        >>> import torch
+        >>> self = RetinaHead(11, 7)
+        >>> x = torch.rand(1, 7, 32, 32)
+        >>> cls_score, bbox_pred = self.forward_single(x)
+        >>> # Each anchor predicts a score for each class except background
+        >>> cls_per_anchor = cls_score.shape[1] / self.num_anchors
+        >>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors
+        >>> assert cls_per_anchor == (self.num_classes)
+        >>> assert box_per_anchor == 4
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 stacked_convs=4,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 anchor_generator=dict(
+                     type='AnchorGenerator',
+                     octave_base_scale=4,
+                     scales_per_octave=3,
+                     ratios=[0.5, 1.0, 2.0],
+                     strides=[8, 16, 32, 64, 128]),
+                 init_cfg=dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='retina_cls',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs):
+        self.stacked_convs = stacked_convs  # read by _init_layers()
+        self.conv_cfg = conv_cfg  # forwarded to every ConvModule
+        self.norm_cfg = norm_cfg  # forwarded to every ConvModule
+        super(RetinaHead, self).__init__(
+            num_classes,
+            in_channels,
+            anchor_generator=anchor_generator,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def _init_layers(self):
+        """Build the cls / reg conv towers and the 3x3 prediction convs."""
+        self.relu = nn.ReLU(inplace=True)  # not used by forward_single
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+            self.reg_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        self.retina_cls = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * self.cls_out_channels,
+            3,
+            padding=1)
+        self.retina_reg = nn.Conv2d(
+            self.feat_channels, self.num_base_priors * 4, 3, padding=1)
+
+    def forward_single(self, x):
+        """Forward feature of a single scale level.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+
+        Returns:
+            tuple:
+                cls_score (Tensor): Cls scores for a single scale level
+                    the channels number is num_base_priors * cls_out_channels.
+                bbox_pred (Tensor): Box energies / deltas for a single scale
+                    level, the channels number is num_base_priors * 4.
+        """
+        cls_feat = x
+        reg_feat = x
+        for cls_conv in self.cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in self.reg_convs:
+            reg_feat = reg_conv(reg_feat)
+        cls_score = self.retina_cls(cls_feat)
+        bbox_pred = self.retina_reg(reg_feat)
+        return cls_score, bbox_pred
diff --git a/mmdet/models/dense_heads/retina_sepbn_head.py b/mmdet/models/dense_heads/retina_sepbn_head.py
new file mode 100755
index 0000000..b385c61
--- /dev/null
+++ b/mmdet/models/dense_heads/retina_sepbn_head.py
@@ -0,0 +1,118 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
+
+from ..builder import HEADS
+from .anchor_head import AnchorHead
+
+
+@HEADS.register_module()
+class RetinaSepBNHead(AnchorHead):
+    """RetinaHead with separate BN.
+
+    In RetinaHead, conv/norm layers are shared across different FPN levels,
+    while in RetinaSepBNHead, conv layers are shared across different FPN
+    levels, but BN layers are separated.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 num_ins,
+                 in_channels,
+                 stacked_convs=4,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 init_cfg=None,
+                 **kwargs):
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        self.stacked_convs = stacked_convs
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.num_ins = num_ins  # one conv tower is built per input level
+        super(RetinaSepBNHead, self).__init__(
+            num_classes, in_channels, init_cfg=init_cfg, **kwargs)
+
+    def _init_layers(self):
+        """Build per-level conv towers that share their conv layers."""
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for _ in range(self.num_ins):
+            cls_convs = nn.ModuleList()
+            reg_convs = nn.ModuleList()
+            for i in range(self.stacked_convs):
+                chn = self.in_channels if i == 0 else self.feat_channels
+                cls_convs.append(
+                    ConvModule(
+                        chn,
+                        self.feat_channels,
+                        3,
+                        stride=1,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg))
+                reg_convs.append(
+                    ConvModule(
+                        chn,
+                        self.feat_channels,
+                        3,
+                        stride=1,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg))
+            self.cls_convs.append(cls_convs)
+            self.reg_convs.append(reg_convs)
+        for i in range(self.stacked_convs):
+            for j in range(1, self.num_ins):  # reuse the level-0 conv modules
+                self.cls_convs[j][i].conv = self.cls_convs[0][i].conv
+                self.reg_convs[j][i].conv = self.reg_convs[0][i].conv
+        self.retina_cls = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * self.cls_out_channels,
+            3,
+            padding=1)
+        self.retina_reg = nn.Conv2d(
+            self.feat_channels, self.num_base_priors * 4, 3, padding=1)
+
+    def init_weights(self):
+        """Initialize weights of the head."""
+        super(RetinaSepBNHead, self).init_weights()
+        for m in self.cls_convs[0]:  # convs are shared, so level 0 suffices
+            normal_init(m.conv, std=0.01)
+        for m in self.reg_convs[0]:
+            normal_init(m.conv, std=0.01)
+        bias_cls = bias_init_with_prob(0.01)
+        normal_init(self.retina_cls, std=0.01, bias=bias_cls)
+        normal_init(self.retina_reg, std=0.01)
+
+    def forward(self, feats):
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Usually a tuple of classification scores and bbox prediction
+                cls_scores (list[Tensor]): Classification scores for all scale
+                    levels, each is a 4D-tensor, the channels number is
+                    num_base_priors * cls_out_channels.
+                bbox_preds (list[Tensor]): Box energies / deltas for all scale
+                    levels, each is a 4D-tensor, the channels number is
+                    num_base_priors * 4.
+        """
+        cls_scores = []
+        bbox_preds = []
+        for i, x in enumerate(feats):
+            cls_feat = x
+            reg_feat = x
+            for cls_conv in self.cls_convs[i]:
+                cls_feat = cls_conv(cls_feat)
+            for reg_conv in self.reg_convs[i]:
+                reg_feat = reg_conv(reg_feat)
+            cls_score = self.retina_cls(cls_feat)
+            bbox_pred = self.retina_reg(reg_feat)
+            cls_scores.append(cls_score)
+            bbox_preds.append(bbox_pred)
+        return cls_scores, bbox_preds
diff --git a/mmdet/models/dense_heads/rpn_head.py b/mmdet/models/dense_heads/rpn_head.py
new file mode 100755
index 0000000..54cd39a
--- /dev/null
+++ b/mmdet/models/dense_heads/rpn_head.py
@@ -0,0 +1,265 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.ops import batched_nms
+
+from ..builder import HEADS
+from .anchor_head import AnchorHead
+
+
+@HEADS.register_module()
+class RPNHead(AnchorHead):
+    """RPN head.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+        num_convs (int): Number of convolution layers in the head. Default 1.
+    """  # noqa: W605
+
+    def __init__(self,
+                 in_channels,
+                 init_cfg=dict(type='Normal', layer='Conv2d', std=0.01),
+                 num_convs=1,
+                 **kwargs):
+        self.num_convs = num_convs  # read by _init_layers()
+        super(RPNHead, self).__init__(
+            1, in_channels, init_cfg=init_cfg, **kwargs)  # num_classes=1
+
+    def _init_layers(self):
+        """Build the 3x3 conv stem and the 1x1 cls / reg predictors."""
+        if self.num_convs > 1:
+            rpn_convs = []
+            for i in range(self.num_convs):
+                if i == 0:
+                    in_channels = self.in_channels
+                else:
+                    in_channels = self.feat_channels
+                # use ``inplace=False`` to avoid error: one of the variables
+                # needed for gradient computation has been modified by an
+                # inplace operation.
+                rpn_convs.append(
+                    ConvModule(
+                        in_channels,
+                        self.feat_channels,
+                        3,
+                        padding=1,
+                        inplace=False))
+            self.rpn_conv = nn.Sequential(*rpn_convs)
+        else:
+            self.rpn_conv = nn.Conv2d(
+                self.in_channels, self.feat_channels, 3, padding=1)
+        self.rpn_cls = nn.Conv2d(self.feat_channels,
+                                 self.num_base_priors * self.cls_out_channels,
+                                 1)
+        self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_base_priors * 4,
+                                 1)
+
+    def forward_single(self, x):
+        """Return ``(rpn_cls_score, rpn_bbox_pred)`` for one scale level."""
+        # out-of-place ReLU on the output of the conv stem
+        feat = F.relu(self.rpn_conv(x), inplace=False)
+        # both predictors consume the same activated feature map
+        cls_map, reg_map = self.rpn_cls(feat), self.rpn_reg(feat)
+        return cls_map, reg_map
+
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss.
+
+        Returns:
+            dict[str, Tensor]: ``loss_rpn_cls`` and ``loss_rpn_bbox`` entries.
+        """
+        losses = super(RPNHead, self).loss(
+            cls_scores,
+            bbox_preds,
+            gt_bboxes,
+            None,  # presumably the gt_labels slot of the parent loss; confirm
+            img_metas,
+            gt_bboxes_ignore=gt_bboxes_ignore)
+        return dict(
+            loss_rpn_cls=losses['loss_cls'], loss_rpn_bbox=losses['loss_bbox'])
+
+    def _get_bboxes_single(self,
+                           cls_score_list,
+                           bbox_pred_list,
+                           score_factor_list,
+                           mlvl_anchors,
+                           img_meta,
+                           cfg,
+                           rescale=False,
+                           with_nms=True,
+                           **kwargs):
+        """Transform outputs of a single image into bbox predictions.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_anchors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has
+                shape (num_anchors * 4, H, W).
+            score_factor_list (list[Tensor]): Score factor from all scale
+                levels of a single image. RPN head does not need this value.
+            mlvl_anchors (list[Tensor]): Anchors of all scale level
+                each item has shape (num_anchors, 4).
+            img_meta (dict): Image meta info.
+            cfg (mmcv.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): Accepted but not referenced by this method.
+                Default: False.
+            with_nms (bool): Accepted but not referenced by this method.
+                Default: True.
+
+        Returns:
+            Tensor: Labeled boxes in shape (n, 5), where the first 4 columns
+                are bounding box positions (tl_x, tl_y, br_x, br_y) and the
+                5-th column is a score between 0 and 1.
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        cfg = copy.deepcopy(cfg)  # work on a private copy of cfg
+        img_shape = img_meta['img_shape']
+
+        # bboxes from different level should be independent during NMS,
+        # level_ids are used as labels for batched NMS to separate them
+        level_ids = []
+        mlvl_scores = []
+        mlvl_bbox_preds = []
+        mlvl_valid_anchors = []
+        nms_pre = cfg.get('nms_pre', -1)  # -1 when not configured
+        for level_idx in range(len(cls_score_list)):
+            rpn_cls_score = cls_score_list[level_idx]
+            rpn_bbox_pred = bbox_pred_list[level_idx]
+            assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
+            rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
+            if self.use_sigmoid_cls:
+                rpn_cls_score = rpn_cls_score.reshape(-1)
+                scores = rpn_cls_score.sigmoid()
+            else:
+                rpn_cls_score = rpn_cls_score.reshape(-1, 2)
+                # We set FG labels to [0, num_class-1] and BG label to
+                # num_class in RPN head since mmdet v2.5, which is unified to
+                # be consistent with other head since mmdet v2.0. In mmdet v2.0
+                # to v2.4 we keep BG label as 0 and FG label as 1 in rpn head.
+                scores = rpn_cls_score.softmax(dim=1)[:, 0]
+            rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+
+            anchors = mlvl_anchors[level_idx]
+            if 0 < nms_pre < scores.shape[0]:
+                # sort is faster than topk
+                # _, topk_inds = scores.topk(cfg.nms_pre)
+                ranked_scores, rank_inds = scores.sort(descending=True)
+                topk_inds = rank_inds[:nms_pre]
+                scores = ranked_scores[:nms_pre]
+                rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
+                anchors = anchors[topk_inds, :]
+
+            mlvl_scores.append(scores)
+            mlvl_bbox_preds.append(rpn_bbox_pred)
+            mlvl_valid_anchors.append(anchors)
+            level_ids.append(
+                scores.new_full((scores.size(0), ),
+                                level_idx,
+                                dtype=torch.long))
+
+        return self._bbox_post_process(mlvl_scores, mlvl_bbox_preds,
+                                       mlvl_valid_anchors, level_ids, cfg,
+                                       img_shape)
+
+    def _bbox_post_process(self, mlvl_scores, mlvl_bboxes, mlvl_valid_anchors,
+                           level_ids, cfg, img_shape, **kwargs):
+        """bbox post-processing method.
+
+        Decode boxes, drop too-small ones, then run NMS within each level.
+
+        Args:
+            mlvl_scores (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_bboxes, ).
+            mlvl_bboxes (list[Tensor]): Undecoded box deltas from all scale
+                levels of a single image, each item has shape (num_bboxes, 4).
+            mlvl_valid_anchors (list[Tensor]): Anchors of all scale levels,
+                each item has shape (num_bboxes, 4).
+            level_ids (list[Tensor]): Indexes from all scale levels of a
+                single image, each item has shape (num_bboxes, ).
+            cfg (mmcv.Config): Test / postprocessing configuration. It is
+                read directly here; a None value is not replaced.
+            img_shape (tuple(int)): The shape of model's input image.
+
+        Returns:
+            Tensor: Labeled boxes in shape (n, 5), where the first 4 columns
+                are bounding box positions (tl_x, tl_y, br_x, br_y) and the
+                5-th column is a score between 0 and 1.
+        """
+        scores = torch.cat(mlvl_scores)
+        anchors = torch.cat(mlvl_valid_anchors)
+        rpn_bbox_pred = torch.cat(mlvl_bboxes)
+        proposals = self.bbox_coder.decode(
+            anchors, rpn_bbox_pred, max_shape=img_shape)
+        ids = torch.cat(level_ids)
+
+        if cfg.min_bbox_size >= 0:
+            w = proposals[:, 2] - proposals[:, 0]
+            h = proposals[:, 3] - proposals[:, 1]
+            valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size)
+            if not valid_mask.all():
+                proposals = proposals[valid_mask]
+                scores = scores[valid_mask]
+                ids = ids[valid_mask]
+
+        if proposals.numel() > 0:
+            dets, _ = batched_nms(proposals, scores, ids, cfg.nms)
+        else:
+            return proposals.new_zeros(0, 5)
+
+        return dets[:cfg.max_per_img]
+
+    def onnx_export(self, x, img_metas):
+        """Forward features and post-process proposals for ONNX export.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+            img_metas (list[dict]): Meta info of each image.
+        Returns:
+            Tensor: dets of shape [N, num_det, 5].
+        """
+        cls_scores, bbox_preds = self(x)
+
+        assert len(cls_scores) == len(bbox_preds)
+
+        batch_bboxes, batch_scores = super(RPNHead, self).onnx_export(
+            cls_scores, bbox_preds, img_metas=img_metas, with_nms=False)
+        # Use ONNX::NonMaxSuppression in deployment
+        from mmdet.core.export import add_dummy_nms_for_onnx
+        cfg = copy.deepcopy(self.test_cfg)
+        score_threshold = cfg.nms.get('score_thr', 0.0)
+        nms_pre = cfg.get('deploy_nms_pre', -1)
+        # Different from the normal forward doing NMS level by level,
+        # we do NMS across all levels when exporting ONNX.
+        dets, _ = add_dummy_nms_for_onnx(batch_bboxes, batch_scores,
+                                         cfg.max_per_img,
+                                         cfg.nms.iou_threshold,
+                                         score_threshold, nms_pre,
+                                         cfg.max_per_img)
+        return dets
diff --git a/mmdet/models/dense_heads/sabl_retina_head.py b/mmdet/models/dense_heads/sabl_retina_head.py
new file mode 100755
index 0000000..4fede71
--- /dev/null
+++ b/mmdet/models/dense_heads/sabl_retina_head.py
@@ -0,0 +1,630 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.runner import force_fp32
+
+from mmdet.core import (build_assigner, build_bbox_coder,
+                        build_prior_generator, build_sampler, images_to_levels,
+                        multi_apply, unmap)
+from mmdet.core.utils import filter_scores_and_topk
+from ..builder import HEADS, build_loss
+from .base_dense_head import BaseDenseHead
+from .dense_test_mixins import BBoxTestMixin
+from .guided_anchor_head import GuidedAnchorHead
+
+
+@HEADS.register_module()
+class SABLRetinaHead(BaseDenseHead, BBoxTestMixin):
+    """Side-Aware Boundary Localization (SABL) for RetinaNet.
+
+    The anchor generation, assigning and sampling in SABLRetinaHead
+    are the same as GuidedAnchorHead for guided anchoring.
+
+    Please refer to https://arxiv.org/abs/1912.04260 for more details.
+
+    Args:
+        num_classes (int): Number of classes.
+        in_channels (int): Number of channels in the input feature map.
+        stacked_convs (int): Number of Convs for classification \
+            and regression branches. Defaults to 4.
+        feat_channels (int): Number of hidden channels. \
+            Defaults to 256.
+        approx_anchor_generator (dict): Config dict for approx generator.
+        square_anchor_generator (dict): Config dict for square generator.
+        conv_cfg (dict): Config dict for ConvModule. Defaults to None.
+        norm_cfg (dict): Config dict for Norm Layer. Defaults to None.
+        bbox_coder (dict): Config dict for bbox coder.
+        reg_decoded_bbox (bool): If true, the regression loss would be
+            applied directly on decoded bounding boxes, converting both
+            the predicted boxes and regression targets to absolute
+            coordinates format. Default False. It should be `True` when
+            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
+        train_cfg (dict): Training config of SABLRetinaHead.
+        test_cfg (dict): Testing config of SABLRetinaHead.
+        loss_cls (dict): Config of classification loss.
+        loss_bbox_cls (dict): Config of classification loss for bbox branch.
+        loss_bbox_reg (dict): Config of regression loss for bbox branch.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 stacked_convs=4,
+                 feat_channels=256,
+                 approx_anchor_generator=dict(
+                     type='AnchorGenerator',
+                     octave_base_scale=4,
+                     scales_per_octave=3,
+                     ratios=[0.5, 1.0, 2.0],
+                     strides=[8, 16, 32, 64, 128]),
+                 square_anchor_generator=dict(
+                     type='AnchorGenerator',
+                     ratios=[1.0],
+                     scales=[4],
+                     strides=[8, 16, 32, 64, 128]),
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 bbox_coder=dict(
+                     type='BucketingBBoxCoder',
+                     num_buckets=14,
+                     scale_factor=3.0),
+                 reg_decoded_bbox=False,
+                 train_cfg=None,
+                 test_cfg=None,
+                 loss_cls=dict(
+                     type='FocalLoss',
+                     use_sigmoid=True,
+                     gamma=2.0,
+                     alpha=0.25,
+                     loss_weight=1.0),
+                 loss_bbox_cls=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.5),
+                 loss_bbox_reg=dict(
+                     type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5),
+                 init_cfg=dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='retina_cls',
+                         std=0.01,
+                         bias_prob=0.01))):
+        super(SABLRetinaHead, self).__init__(init_cfg)
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self.feat_channels = feat_channels
+        self.num_buckets = bbox_coder['num_buckets']
+        self.side_num = int(np.ceil(self.num_buckets / 2))
+
+        assert (approx_anchor_generator['octave_base_scale'] ==
+                square_anchor_generator['scales'][0])
+        assert (approx_anchor_generator['strides'] ==
+                square_anchor_generator['strides'])
+
+        self.approx_anchor_generator = build_prior_generator(
+            approx_anchor_generator)
+        self.square_anchor_generator = build_prior_generator(
+            square_anchor_generator)
+        self.approxs_per_octave = (
+            self.approx_anchor_generator.num_base_priors[0])
+
+        # one anchor per location
+        self.num_base_priors = self.square_anchor_generator.num_base_priors[0]
+
+        self.stacked_convs = stacked_convs
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+
+        self.reg_decoded_bbox = reg_decoded_bbox
+
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        self.sampling = loss_cls['type'] not in [
+            'FocalLoss', 'GHMC', 'QualityFocalLoss'
+        ]
+        if self.use_sigmoid_cls:
+            self.cls_out_channels = num_classes
+        else:
+            self.cls_out_channels = num_classes + 1
+
+        self.bbox_coder = build_bbox_coder(bbox_coder)
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_bbox_cls = build_loss(loss_bbox_cls)
+        self.loss_bbox_reg = build_loss(loss_bbox_reg)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        if self.train_cfg:
+            self.assigner = build_assigner(self.train_cfg.assigner)
+            # use PseudoSampler when sampling is False
+            if self.sampling and hasattr(self.train_cfg, 'sampler'):
+                sampler_cfg = self.train_cfg.sampler
+            else:
+                sampler_cfg = dict(type='PseudoSampler')
+            self.sampler = build_sampler(sampler_cfg, context=self)
+
+        self.fp16_enabled = False
+        self._init_layers()
+
+    @property
+    def num_anchors(self):
+        warnings.warn('DeprecationWarning: `num_anchors` is deprecated, '
+                      'please use "num_base_priors" instead')
+        return self.square_anchor_generator.num_base_priors[0]
+
+    def _init_layers(self):
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+            self.reg_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        self.retina_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+        self.retina_bbox_reg = nn.Conv2d(
+            self.feat_channels, self.side_num * 4, 3, padding=1)
+        self.retina_bbox_cls = nn.Conv2d(
+            self.feat_channels, self.side_num * 4, 3, padding=1)
+
+    def forward_single(self, x):
+        cls_feat = x
+        reg_feat = x
+        for cls_conv in self.cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in self.reg_convs:
+            reg_feat = reg_conv(reg_feat)
+        cls_score = self.retina_cls(cls_feat)
+        bbox_cls_pred = self.retina_bbox_cls(reg_feat)
+        bbox_reg_pred = self.retina_bbox_reg(reg_feat)
+        bbox_pred = (bbox_cls_pred, bbox_reg_pred)
+        return cls_score, bbox_pred
+
+    def forward(self, feats):
+        return multi_apply(self.forward_single, feats)
+
+    def get_anchors(self, featmap_sizes, img_metas, device='cuda'):
+        """Get square anchors according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            img_metas (list[dict]): Image meta info.
+            device (torch.device | str): device for returned tensors
+
+        Returns:
+            list: multi-level square anchors, one entry per image
+        """
+        num_imgs = len(img_metas)
+
+        # since feature map sizes of all images are the same, we only compute
+        # the squares once and share them across images
+        multi_level_squares = self.square_anchor_generator.grid_priors(
+            featmap_sizes, device=device)
+        squares_list = [multi_level_squares for _ in range(num_imgs)]
+
+        return squares_list
+
+    def get_target(self,
+                   approx_list,
+                   inside_flag_list,
+                   square_list,
+                   gt_bboxes_list,
+                   img_metas,
+                   gt_bboxes_ignore_list=None,
+                   gt_labels_list=None,
+                   label_channels=None,
+                   sampling=True,
+                   unmap_outputs=True):
+        """Compute bucketing targets.
+        Args:
+            approx_list (list[list]): Multi level approxs of each image.
+            inside_flag_list (list[list]): Multi level inside flags of each
+                image.
+            square_list (list[list]): Multi level squares of each image.
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+            img_metas (list[dict]): Meta info of each image.
+            gt_bboxes_ignore_list (list[Tensor]): ignore list of gt bboxes.
+            gt_labels_list (list[Tensor]): Gt labels of each image.
+            label_channels (int): Channel of label.
+            sampling (bool): Sample Anchors or not.
+            unmap_outputs (bool): unmap outputs or not.
+
+        Returns:
+            tuple: Returns a tuple containing learning targets.
+
+                - labels_list (list[Tensor]): Labels of each level.
+                - label_weights_list (list[Tensor]): Label weights of each \
+                    level.
+                - bbox_cls_targets_list (list[Tensor]): BBox cls targets of \
+                    each level.
+                - bbox_cls_weights_list (list[Tensor]): BBox cls weights of \
+                    each level.
+                - bbox_reg_targets_list (list[Tensor]): BBox reg targets of \
+                    each level.
+                - bbox_reg_weights_list (list[Tensor]): BBox reg weights of \
+                    each level.
+                - num_total_pos (int): Number of positive samples in all \
+                    images.
+                - num_total_neg (int): Number of negative samples in all \
+                    images.
+        """
+        num_imgs = len(img_metas)
+        assert len(approx_list) == len(inside_flag_list) == len(
+            square_list) == num_imgs
+        # anchor number of multi levels
+        num_level_squares = [squares.size(0) for squares in square_list[0]]
+        # concat all level anchors and flags to a single tensor
+        inside_flag_flat_list = []
+        approx_flat_list = []
+        square_flat_list = []
+        for i in range(num_imgs):
+            assert len(square_list[i]) == len(inside_flag_list[i])
+            inside_flag_flat_list.append(torch.cat(inside_flag_list[i]))
+            approx_flat_list.append(torch.cat(approx_list[i]))
+            square_flat_list.append(torch.cat(square_list[i]))
+
+        # compute targets for each image
+        if gt_bboxes_ignore_list is None:
+            gt_bboxes_ignore_list = [None for _ in range(num_imgs)]
+        if gt_labels_list is None:
+            gt_labels_list = [None for _ in range(num_imgs)]
+        (all_labels, all_label_weights, all_bbox_cls_targets,
+         all_bbox_cls_weights, all_bbox_reg_targets, all_bbox_reg_weights,
+         pos_inds_list, neg_inds_list) = multi_apply(
+             self._get_target_single,
+             approx_flat_list,
+             inside_flag_flat_list,
+             square_flat_list,
+             gt_bboxes_list,
+             gt_bboxes_ignore_list,
+             gt_labels_list,
+             img_metas,
+             label_channels=label_channels,
+             sampling=sampling,
+             unmap_outputs=unmap_outputs)
+        # no valid anchors
+        if any([labels is None for labels in all_labels]):
+            return None
+        # sampled anchors of all images
+        num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list])
+        num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list])
+        # split targets to a list w.r.t. multiple levels
+        labels_list = images_to_levels(all_labels, num_level_squares)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_squares)
+        bbox_cls_targets_list = images_to_levels(all_bbox_cls_targets,
+                                                 num_level_squares)
+        bbox_cls_weights_list = images_to_levels(all_bbox_cls_weights,
+                                                 num_level_squares)
+        bbox_reg_targets_list = images_to_levels(all_bbox_reg_targets,
+                                                 num_level_squares)
+        bbox_reg_weights_list = images_to_levels(all_bbox_reg_weights,
+                                                 num_level_squares)
+        return (labels_list, label_weights_list, bbox_cls_targets_list,
+                bbox_cls_weights_list, bbox_reg_targets_list,
+                bbox_reg_weights_list, num_total_pos, num_total_neg)
+
+    def _get_target_single(self,
+                           flat_approxs,
+                           inside_flags,
+                           flat_squares,
+                           gt_bboxes,
+                           gt_bboxes_ignore,
+                           gt_labels,
+                           img_meta,
+                           label_channels=None,
+                           sampling=True,
+                           unmap_outputs=True):
+        """Compute regression and classification targets for anchors in a
+        single image.
+
+        Args:
+            flat_approxs (Tensor): flat approxs of a single image,
+                shape (approxs_per_octave * n, 4)
+            inside_flags (Tensor): inside flags of a single image,
+                shape (n, ).
+            flat_squares (Tensor): flat squares of a single image,
+                shape (n, 4)
+            gt_bboxes (Tensor): Ground truth bboxes of a single image, \
+                shape (num_gts, 4).
+            gt_bboxes_ignore (Tensor): Ground truth bboxes to be
+                ignored, shape (num_ignored_gts, 4).
+            gt_labels (Tensor): Ground truth labels of each box,
+                shape (num_gts,).
+            img_meta (dict): Meta info of the image.
+            label_channels (int): Channel of label.
+            sampling (bool): Sample Anchors or not.
+            unmap_outputs (bool): unmap outputs or not.
+
+        Returns:
+            tuple:
+
+                - labels (Tensor): Labels in a single image
+                - label_weights (Tensor): Label weights in a single image
+                - bbox_cls_targets (Tensor): BBox cls targets in a single image
+                - bbox_cls_weights (Tensor): BBox cls weights in a single image
+                - bbox_reg_targets (Tensor): BBox reg targets in a single image
+                - bbox_reg_weights (Tensor): BBox reg weights in a single image
+                - pos_inds (Tensor): Indices of positive samples \
+                    in a single image
+                - neg_inds (Tensor): Indices of negative samples \
+                    in a single image
+        """
+        if not inside_flags.any():
+            return (None, ) * 8
+        # assign gt and sample anchors
+        expand_inside_flags = inside_flags[:, None].expand(
+            -1, self.approxs_per_octave).reshape(-1)
+        approxs = flat_approxs[expand_inside_flags, :]
+        squares = flat_squares[inside_flags, :]
+
+        assign_result = self.assigner.assign(approxs, squares,
+                                             self.approxs_per_octave,
+                                             gt_bboxes, gt_bboxes_ignore)
+        sampling_result = self.sampler.sample(assign_result, squares,
+                                              gt_bboxes)
+
+        num_valid_squares = squares.shape[0]
+        bbox_cls_targets = squares.new_zeros(
+            (num_valid_squares, self.side_num * 4))
+        bbox_cls_weights = squares.new_zeros(
+            (num_valid_squares, self.side_num * 4))
+        bbox_reg_targets = squares.new_zeros(
+            (num_valid_squares, self.side_num * 4))
+        bbox_reg_weights = squares.new_zeros(
+            (num_valid_squares, self.side_num * 4))
+        labels = squares.new_full((num_valid_squares, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = squares.new_zeros(num_valid_squares, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            (pos_bbox_reg_targets, pos_bbox_reg_weights, pos_bbox_cls_targets,
+             pos_bbox_cls_weights) = self.bbox_coder.encode(
+                 sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
+
+            bbox_cls_targets[pos_inds, :] = pos_bbox_cls_targets
+            bbox_reg_targets[pos_inds, :] = pos_bbox_reg_targets
+            bbox_cls_weights[pos_inds, :] = pos_bbox_cls_weights
+            bbox_reg_weights[pos_inds, :] = pos_bbox_reg_weights
+            if gt_labels is None:
+                # Only rpn gives gt_labels as None
+                # Foreground is the first class
+                labels[pos_inds] = 0
+            else:
+                labels[pos_inds] = gt_labels[
+                    sampling_result.pos_assigned_gt_inds]
+            if self.train_cfg.pos_weight <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg.pos_weight
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_squares.size(0)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags, fill=self.num_classes)
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+            bbox_cls_targets = unmap(bbox_cls_targets, num_total_anchors,
+                                     inside_flags)
+            bbox_cls_weights = unmap(bbox_cls_weights, num_total_anchors,
+                                     inside_flags)
+            bbox_reg_targets = unmap(bbox_reg_targets, num_total_anchors,
+                                     inside_flags)
+            bbox_reg_weights = unmap(bbox_reg_weights, num_total_anchors,
+                                     inside_flags)
+        return (labels, label_weights, bbox_cls_targets, bbox_cls_weights,
+                bbox_reg_targets, bbox_reg_weights, pos_inds, neg_inds)
+
+    def loss_single(self, cls_score, bbox_pred, labels, label_weights,
+                    bbox_cls_targets, bbox_cls_weights, bbox_reg_targets,
+                    bbox_reg_weights, num_total_samples):
+        # classification loss
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+        cls_score = cls_score.permute(0, 2, 3,
+                                      1).reshape(-1, self.cls_out_channels)
+        loss_cls = self.loss_cls(
+            cls_score, labels, label_weights, avg_factor=num_total_samples)
+        # regression loss
+        bbox_cls_targets = bbox_cls_targets.reshape(-1, self.side_num * 4)
+        bbox_cls_weights = bbox_cls_weights.reshape(-1, self.side_num * 4)
+        bbox_reg_targets = bbox_reg_targets.reshape(-1, self.side_num * 4)
+        bbox_reg_weights = bbox_reg_weights.reshape(-1, self.side_num * 4)
+        (bbox_cls_pred, bbox_reg_pred) = bbox_pred
+        bbox_cls_pred = bbox_cls_pred.permute(0, 2, 3, 1).reshape(
+            -1, self.side_num * 4)
+        bbox_reg_pred = bbox_reg_pred.permute(0, 2, 3, 1).reshape(
+            -1, self.side_num * 4)
+        loss_bbox_cls = self.loss_bbox_cls(
+            bbox_cls_pred,
+            bbox_cls_targets.long(),
+            bbox_cls_weights,
+            avg_factor=num_total_samples * 4 * self.side_num)
+        loss_bbox_reg = self.loss_bbox_reg(
+            bbox_reg_pred,
+            bbox_reg_targets,
+            bbox_reg_weights,
+            avg_factor=num_total_samples * 4 * self.bbox_coder.offset_topk)
+        return loss_cls, loss_bbox_cls, loss_bbox_reg
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.approx_anchor_generator.num_levels
+
+        device = cls_scores[0].device
+
+        # get sampled approxes
+        approxs_list, inside_flag_list = GuidedAnchorHead.get_sampled_approxs(
+            self, featmap_sizes, img_metas, device=device)
+
+        square_list = self.get_anchors(featmap_sizes, img_metas, device=device)
+
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+
+        cls_reg_targets = self.get_target(
+            approxs_list,
+            inside_flag_list,
+            square_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels,
+            sampling=self.sampling)
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_cls_targets_list,
+         bbox_cls_weights_list, bbox_reg_targets_list, bbox_reg_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        num_total_samples = (
+            num_total_pos + num_total_neg if self.sampling else num_total_pos)
+        losses_cls, losses_bbox_cls, losses_bbox_reg = multi_apply(
+            self.loss_single,
+            cls_scores,
+            bbox_preds,
+            labels_list,
+            label_weights_list,
+            bbox_cls_targets_list,
+            bbox_cls_weights_list,
+            bbox_reg_targets_list,
+            bbox_reg_weights_list,
+            num_total_samples=num_total_samples)
+        return dict(
+            loss_cls=losses_cls,
+            loss_bbox_cls=losses_bbox_cls,
+            loss_bbox_reg=losses_bbox_reg)
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def get_bboxes(self,
+                   cls_scores,
+                   bbox_preds,
+                   img_metas,
+                   cfg=None,
+                   rescale=False):
+        assert len(cls_scores) == len(bbox_preds)
+        num_levels = len(cls_scores)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+
+        device = cls_scores[0].device
+        mlvl_anchors = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        result_list = []
+        for img_id in range(len(img_metas)):
+            cls_score_list = [
+                cls_scores[i][img_id].detach() for i in range(num_levels)
+            ]
+            bbox_cls_pred_list = [
+                bbox_preds[i][0][img_id].detach() for i in range(num_levels)
+            ]
+            bbox_reg_pred_list = [
+                bbox_preds[i][1][img_id].detach() for i in range(num_levels)
+            ]
+            img_shape = img_metas[img_id]['img_shape']
+            scale_factor = img_metas[img_id]['scale_factor']
+            proposals = self._get_bboxes_single(
+                cls_score_list, bbox_cls_pred_list, bbox_reg_pred_list,
+                mlvl_anchors[img_id], img_shape, scale_factor, cfg, rescale)
+            result_list.append(proposals)
+        return result_list
+
+    def _get_bboxes_single(self,
+                           cls_scores,
+                           bbox_cls_preds,
+                           bbox_reg_preds,
+                           mlvl_anchors,
+                           img_shape,
+                           scale_factor,
+                           cfg,
+                           rescale=False):
+        cfg = self.test_cfg if cfg is None else cfg
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_confids = []
+        mlvl_labels = []
+        assert len(cls_scores) == len(bbox_cls_preds) == len(
+            bbox_reg_preds) == len(mlvl_anchors)
+        for cls_score, bbox_cls_pred, bbox_reg_pred, anchors in zip(
+                cls_scores, bbox_cls_preds, bbox_reg_preds, mlvl_anchors):
+            assert cls_score.size()[-2:] == bbox_cls_pred.size(
+            )[-2:] == bbox_reg_pred.size()[-2::]
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                scores = cls_score.softmax(-1)[:, :-1]
+            bbox_cls_pred = bbox_cls_pred.permute(1, 2, 0).reshape(
+                -1, self.side_num * 4)
+            bbox_reg_pred = bbox_reg_pred.permute(1, 2, 0).reshape(
+                -1, self.side_num * 4)
+
+            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
+            # this operation keeps fewer bboxes under the same `nms_pre`.
+            # There is no difference in performance for most models. If you
+            # find a slight drop in performance, you can set a larger
+            # `nms_pre` than before.
+            results = filter_scores_and_topk(
+                scores, cfg.score_thr, nms_pre,
+                dict(
+                    anchors=anchors,
+                    bbox_cls_pred=bbox_cls_pred,
+                    bbox_reg_pred=bbox_reg_pred))
+            scores, labels, _, filtered_results = results
+
+            anchors = filtered_results['anchors']
+            bbox_cls_pred = filtered_results['bbox_cls_pred']
+            bbox_reg_pred = filtered_results['bbox_reg_pred']
+
+            bbox_preds = [
+                bbox_cls_pred.contiguous(),
+                bbox_reg_pred.contiguous()
+            ]
+            bboxes, confids = self.bbox_coder.decode(
+                anchors.contiguous(), bbox_preds, max_shape=img_shape)
+
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_confids.append(confids)
+            mlvl_labels.append(labels)
+        return self._bbox_post_process(mlvl_scores, mlvl_labels, mlvl_bboxes,
+                                       scale_factor, cfg, rescale, True,
+                                       mlvl_confids)
diff --git a/mmdet/models/dense_heads/solo_head.py b/mmdet/models/dense_heads/solo_head.py
new file mode 100755
index 0000000..e89aacb
--- /dev/null
+++ b/mmdet/models/dense_heads/solo_head.py
@@ -0,0 +1,1197 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+
+from mmdet.core import InstanceData, mask_matrix_nms, multi_apply
+from mmdet.core.utils import center_of_mass, generate_coordinate
+from mmdet.models.builder import HEADS, build_loss
+from mmdet.utils.misc import floordiv
+from .base_mask_head import BaseMaskHead
+
+
+@HEADS.register_module()
+class SOLOHead(BaseMaskHead):
+    """SOLO mask head used in `SOLO: Segmenting Objects by Locations.
+
+    <https://arxiv.org/abs/1912.04488>`_
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels. Used in child classes.
+            Default: 256.
+        stacked_convs (int): Number of stacking convs of the head.
+            Default: 4.
+        strides (tuple): Downsample factor of each feature map.
+        scale_ranges (tuple[tuple[int, int]]): Area range of multiple
+            level masks, in the format [(min1, max1), (min2, max2), ...].
+            A range of (16, 64) keeps gts with sqrt(bbox area) in [16, 64].
+        pos_scale (float): Constant scale factor to control the center region.
+        num_grids (list[int]): Divide the image into uniform grids; each
+            feature map has its own grid number. The number of mask output
+            channels is grid ** 2. Default: [40, 36, 24, 16, 12].
+        cls_down_index (int): The index of downsample operation in
+            classification branch. Default: 0.
+        loss_mask (dict): Config of mask loss.
+        loss_cls (dict): Config of classification loss.
+        norm_cfg (dict): dictionary to construct and config norm layer.
+            Default: norm_cfg=dict(type='GN', num_groups=32,
+                                   requires_grad=True).
+        train_cfg (dict): Training config of head.
+        test_cfg (dict): Testing config of head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        num_classes,
+        in_channels,
+        feat_channels=256,
+        stacked_convs=4,
+        strides=(4, 8, 16, 32, 64),
+        scale_ranges=((8, 32), (16, 64), (32, 128), (64, 256), (128, 512)),
+        pos_scale=0.2,
+        num_grids=[40, 36, 24, 16, 12],
+        cls_down_index=0,
+        loss_mask=None,
+        loss_cls=None,
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+        train_cfg=None,
+        test_cfg=None,
+        init_cfg=[
+            dict(type='Normal', layer='Conv2d', std=0.01),
+            dict(
+                type='Normal',
+                std=0.01,
+                bias_prob=0.01,
+                override=dict(name='conv_mask_list')),
+            dict(
+                type='Normal',
+                std=0.01,
+                bias_prob=0.01,
+                override=dict(name='conv_cls'))
+        ],
+    ):
+        super(SOLOHead, self).__init__(init_cfg)
+        self.num_classes = num_classes
+        self.cls_out_channels = self.num_classes
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.stacked_convs = stacked_convs
+        self.strides = strides
+        self.num_grids = num_grids
+        # number of FPN feats
+        self.num_levels = len(strides)
+        assert self.num_levels == len(scale_ranges) == len(num_grids)
+        self.scale_ranges = scale_ranges
+        self.pos_scale = pos_scale
+
+        self.cls_down_index = cls_down_index
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_mask = build_loss(loss_mask)
+        self.norm_cfg = norm_cfg
+        self.init_cfg = init_cfg
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self._init_layers()
+
+    def _init_layers(self):
+        self.mask_convs = nn.ModuleList()
+        self.cls_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels + 2 if i == 0 else self.feat_channels
+            self.mask_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    norm_cfg=self.norm_cfg))
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    norm_cfg=self.norm_cfg))
+        self.conv_mask_list = nn.ModuleList()
+        for num_grid in self.num_grids:
+            self.conv_mask_list.append(
+                nn.Conv2d(self.feat_channels, num_grid**2, 1))
+
+        self.conv_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+
+    def resize_feats(self, feats):
+        """Downsample the first feat and upsample last feat in feats."""
+        out = []
+        for i in range(len(feats)):
+            if i == 0:
+                out.append(
+                    F.interpolate(
+                        feats[0],
+                        size=feats[i + 1].shape[-2:],
+                        mode='bilinear',
+                        align_corners=False))
+            elif i == len(feats) - 1:
+                out.append(
+                    F.interpolate(
+                        feats[i],
+                        size=feats[i - 1].shape[-2:],
+                        mode='bilinear',
+                        align_corners=False))
+            else:
+                out.append(feats[i])
+        return out
+
+    def forward(self, feats):
+        assert len(feats) == self.num_levels
+        feats = self.resize_feats(feats)
+        mlvl_mask_preds = []
+        mlvl_cls_preds = []
+        for i in range(self.num_levels):
+            x = feats[i]
+            mask_feat = x
+            cls_feat = x
+            # generate and concat the coordinate
+            coord_feat = generate_coordinate(mask_feat.size(),
+                                             mask_feat.device)
+            mask_feat = torch.cat([mask_feat, coord_feat], 1)
+
+            for mask_layer in (self.mask_convs):
+                mask_feat = mask_layer(mask_feat)
+
+            mask_feat = F.interpolate(
+                mask_feat, scale_factor=2, mode='bilinear')
+            mask_pred = self.conv_mask_list[i](mask_feat)
+
+            # cls branch
+            for j, cls_layer in enumerate(self.cls_convs):
+                if j == self.cls_down_index:
+                    num_grid = self.num_grids[i]
+                    cls_feat = F.interpolate(
+                        cls_feat, size=num_grid, mode='bilinear')
+                cls_feat = cls_layer(cls_feat)
+
+            cls_pred = self.conv_cls(cls_feat)
+
+            if not self.training:
+                feat_wh = feats[0].size()[-2:]
+                upsampled_size = (feat_wh[0] * 2, feat_wh[1] * 2)
+                mask_pred = F.interpolate(
+                    mask_pred.sigmoid(), size=upsampled_size, mode='bilinear')
+                cls_pred = cls_pred.sigmoid()
+                # get local maximum
+                local_max = F.max_pool2d(cls_pred, 2, stride=1, padding=1)
+                keep_mask = local_max[:, :, :-1, :-1] == cls_pred
+                cls_pred = cls_pred * keep_mask
+
+            mlvl_mask_preds.append(mask_pred)
+            mlvl_cls_preds.append(cls_pred)
+        return mlvl_mask_preds, mlvl_cls_preds
+
+    def loss(self,
+             mlvl_mask_preds,
+             mlvl_cls_preds,
+             gt_labels,
+             gt_masks,
+             img_metas,
+             gt_bboxes=None,
+             **kwargs):
+        """Calculate the loss of total batch.
+
+        Args:
+            mlvl_mask_preds (list[Tensor]): Multi-level mask prediction.
+                Each element in the list has shape
+                (batch_size, num_grids**2 ,h ,w).
+            mlvl_cls_preds (list[Tensor]): Multi-level scores. Each element
+                in the list has shape
+                (batch_size, num_classes, num_grids ,num_grids).
+            gt_labels (list[Tensor]): Labels of multiple images.
+            gt_masks (list[Tensor]): Ground truth masks of multiple images.
+                Each has shape (num_instances, h, w).
+            img_metas (list[dict]): Meta information of multiple images.
+            gt_bboxes (list[Tensor]): Ground truth bboxes of multiple
+                images. Default: None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        num_levels = self.num_levels
+        num_imgs = len(gt_labels)
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in mlvl_mask_preds]
+
+        # Each `BoolTensor` in `pos_masks` indicates
+        # whether the corresponding grid point is
+        # positive
+        pos_mask_targets, labels, pos_masks = multi_apply(
+            self._get_targets_single,
+            gt_bboxes,
+            gt_labels,
+            gt_masks,
+            featmap_sizes=featmap_sizes)
+
+        # change from the outside list meaning multi images
+        # to the outside list meaning multi levels
+        mlvl_pos_mask_targets = [[] for _ in range(num_levels)]
+        mlvl_pos_mask_preds = [[] for _ in range(num_levels)]
+        mlvl_pos_masks = [[] for _ in range(num_levels)]
+        mlvl_labels = [[] for _ in range(num_levels)]
+        for img_id in range(num_imgs):
+            assert num_levels == len(pos_mask_targets[img_id])
+            for lvl in range(num_levels):
+                mlvl_pos_mask_targets[lvl].append(
+                    pos_mask_targets[img_id][lvl])
+                mlvl_pos_mask_preds[lvl].append(
+                    mlvl_mask_preds[lvl][img_id, pos_masks[img_id][lvl], ...])
+                mlvl_pos_masks[lvl].append(pos_masks[img_id][lvl].flatten())
+                mlvl_labels[lvl].append(labels[img_id][lvl].flatten())
+
+        # cat multiple image
+        temp_mlvl_cls_preds = []
+        for lvl in range(num_levels):
+            mlvl_pos_mask_targets[lvl] = torch.cat(
+                mlvl_pos_mask_targets[lvl], dim=0)
+            mlvl_pos_mask_preds[lvl] = torch.cat(
+                mlvl_pos_mask_preds[lvl], dim=0)
+            mlvl_pos_masks[lvl] = torch.cat(mlvl_pos_masks[lvl], dim=0)
+            mlvl_labels[lvl] = torch.cat(mlvl_labels[lvl], dim=0)
+            temp_mlvl_cls_preds.append(mlvl_cls_preds[lvl].permute(
+                0, 2, 3, 1).reshape(-1, self.cls_out_channels))
+
+        num_pos = sum(item.sum() for item in mlvl_pos_masks)
+        # dice loss
+        loss_mask = []
+        for pred, target in zip(mlvl_pos_mask_preds, mlvl_pos_mask_targets):
+            if pred.size()[0] == 0:
+                loss_mask.append(pred.sum().unsqueeze(0))
+                continue
+            loss_mask.append(
+                self.loss_mask(pred, target, reduction_override='none'))
+        if num_pos > 0:
+            loss_mask = torch.cat(loss_mask).sum() / num_pos
+        else:
+            loss_mask = torch.cat(loss_mask).mean()
+
+        flatten_labels = torch.cat(mlvl_labels)
+        flatten_cls_preds = torch.cat(temp_mlvl_cls_preds)
+        loss_cls = self.loss_cls(
+            flatten_cls_preds, flatten_labels, avg_factor=num_pos + 1)
+        return dict(loss_mask=loss_mask, loss_cls=loss_cls)
+
+    def _get_targets_single(self,
+                            gt_bboxes,
+                            gt_labels,
+                            gt_masks,
+                            featmap_sizes=None):
+        """Compute targets for predictions of single image.
+
+        Args:
+            gt_bboxes (Tensor): Ground truth bbox of each instance,
+                shape (num_gts, 4).
+            gt_labels (Tensor): Ground truth label of each instance,
+                shape (num_gts,).
+            gt_masks (Tensor): Ground truth mask of each instance,
+                shape (num_gts, h, w).
+            featmap_sizes (list[:obj:`torch.size`]): Size of each
+                feature map from feature pyramid, each element
+                means (feat_h, feat_w). Default: None.
+
+        Returns:
+            Tuple: Usually returns a tuple containing targets for predictions.
+
+                - mlvl_pos_mask_targets (list[Tensor]): Each element represent
+                  the binary mask targets for positive points in this
+                  level, has shape (num_pos, out_h, out_w).
+                - mlvl_labels (list[Tensor]): Each element is
+                  classification labels for all
+                  points in this level, has shape
+                  (num_grid, num_grid).
+                - mlvl_pos_masks (list[Tensor]): Each element is
+                  a `BoolTensor` to represent whether the
+                  corresponding point in single level
+                  is positive, has shape (num_grid **2).
+        """
+        device = gt_labels.device
+        gt_areas = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
+                              (gt_bboxes[:, 3] - gt_bboxes[:, 1]))
+
+        mlvl_pos_mask_targets = []
+        mlvl_labels = []
+        mlvl_pos_masks = []
+        for (lower_bound, upper_bound), stride, featmap_size, num_grid \
+                in zip(self.scale_ranges, self.strides,
+                       featmap_sizes, self.num_grids):
+
+            mask_target = torch.zeros(
+                [num_grid**2, featmap_size[0], featmap_size[1]],
+                dtype=torch.uint8,
+                device=device)
+            # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+            labels = torch.zeros([num_grid, num_grid],
+                                 dtype=torch.int64,
+                                 device=device) + self.num_classes
+            pos_mask = torch.zeros([num_grid**2],
+                                   dtype=torch.bool,
+                                   device=device)
+
+            gt_inds = ((gt_areas >= lower_bound) &
+                       (gt_areas <= upper_bound)).nonzero().flatten()
+            if len(gt_inds) == 0:
+                mlvl_pos_mask_targets.append(
+                    mask_target.new_zeros(0, featmap_size[0], featmap_size[1]))
+                mlvl_labels.append(labels)
+                mlvl_pos_masks.append(pos_mask)
+                continue
+            hit_gt_bboxes = gt_bboxes[gt_inds]
+            hit_gt_labels = gt_labels[gt_inds]
+            hit_gt_masks = gt_masks[gt_inds, ...]
+
+            pos_w_ranges = 0.5 * (hit_gt_bboxes[:, 2] -
+                                  hit_gt_bboxes[:, 0]) * self.pos_scale
+            pos_h_ranges = 0.5 * (hit_gt_bboxes[:, 3] -
+                                  hit_gt_bboxes[:, 1]) * self.pos_scale
+
+            # Make sure hit_gt_masks has a value
+            valid_mask_flags = hit_gt_masks.sum(dim=-1).sum(dim=-1) > 0
+            output_stride = stride / 2
+
+            for gt_mask, gt_label, pos_h_range, pos_w_range, \
+                valid_mask_flag in \
+                    zip(hit_gt_masks, hit_gt_labels, pos_h_ranges,
+                        pos_w_ranges, valid_mask_flags):
+                if not valid_mask_flag:
+                    continue
+                upsampled_size = (featmap_sizes[0][0] * 4,
+                                  featmap_sizes[0][1] * 4)
+                center_h, center_w = center_of_mass(gt_mask)
+
+                coord_w = int(
+                    floordiv((center_w / upsampled_size[1]), (1. / num_grid),
+                             rounding_mode='trunc'))
+                coord_h = int(
+                    floordiv((center_h / upsampled_size[0]), (1. / num_grid),
+                             rounding_mode='trunc'))
+
+                # left, top, right, down
+                top_box = max(
+                    0,
+                    int(
+                        floordiv(
+                            (center_h - pos_h_range) / upsampled_size[0],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+                down_box = min(
+                    num_grid - 1,
+                    int(
+                        floordiv(
+                            (center_h + pos_h_range) / upsampled_size[0],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+                left_box = max(
+                    0,
+                    int(
+                        floordiv(
+                            (center_w - pos_w_range) / upsampled_size[1],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+                right_box = min(
+                    num_grid - 1,
+                    int(
+                        floordiv(
+                            (center_w + pos_w_range) / upsampled_size[1],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+
+                top = max(top_box, coord_h - 1)
+                down = min(down_box, coord_h + 1)
+                left = max(coord_w - 1, left_box)
+                right = min(right_box, coord_w + 1)
+
+                labels[top:(down + 1), left:(right + 1)] = gt_label
+                # instance mask target
+                gt_mask = np.uint8(gt_mask.cpu().numpy())
+                # Follow the original implementation: use mmcv.imrescale,
+                # because F.interpolate gives different results.
+                gt_mask = mmcv.imrescale(gt_mask, scale=1. / output_stride)
+                gt_mask = torch.from_numpy(gt_mask).to(device=device)
+
+                for i in range(top, down + 1):
+                    for j in range(left, right + 1):
+                        index = int(i * num_grid + j)
+                        mask_target[index, :gt_mask.shape[0], :gt_mask.
+                                    shape[1]] = gt_mask
+                        pos_mask[index] = True
+            mlvl_pos_mask_targets.append(mask_target[pos_mask])
+            mlvl_labels.append(labels)
+            mlvl_pos_masks.append(pos_mask)
+        return mlvl_pos_mask_targets, mlvl_labels, mlvl_pos_masks
+
+    def get_results(self, mlvl_mask_preds, mlvl_cls_scores, img_metas,
+                    **kwargs):
+        """Get multi-image mask results.
+
+        Args:
+            mlvl_mask_preds (list[Tensor]): Multi-level mask prediction.
+                Each element in the list has shape
+                (batch_size, num_grids**2 ,h ,w).
+            mlvl_cls_scores (list[Tensor]): Multi-level scores. Each element
+                in the list has shape
+                (batch_size, num_classes, num_grids ,num_grids).
+            img_metas (list[dict]): Meta information of all images.
+
+        Returns:
+            list[:obj:`InstanceData`]: Processed results of multiple
+            images. Each :obj:`InstanceData` usually contains
+            following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instance,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+        mlvl_cls_scores = [
+            item.permute(0, 2, 3, 1) for item in mlvl_cls_scores
+        ]
+        assert len(mlvl_mask_preds) == len(mlvl_cls_scores)
+        num_levels = len(mlvl_cls_scores)
+
+        results_list = []
+        for img_id in range(len(img_metas)):
+            cls_pred_list = [
+                mlvl_cls_scores[lvl][img_id].view(-1, self.cls_out_channels)
+                for lvl in range(num_levels)
+            ]
+            mask_pred_list = [
+                mlvl_mask_preds[lvl][img_id] for lvl in range(num_levels)
+            ]
+
+            cls_pred_list = torch.cat(cls_pred_list, dim=0)
+            mask_pred_list = torch.cat(mask_pred_list, dim=0)
+
+            results = self._get_results_single(
+                cls_pred_list, mask_pred_list, img_meta=img_metas[img_id])
+            results_list.append(results)
+
+        return results_list
+
+    def _get_results_single(self, cls_scores, mask_preds, img_meta, cfg=None):
+        """Get processed mask related results of single image.
+
+        Args:
+            cls_scores (Tensor): Classification score of all points
+                in single image, has shape (num_points, num_classes).
+            mask_preds (Tensor): Mask prediction of all points in
+                single image, has shape (num_points, feat_h, feat_w).
+            img_meta (dict): Meta information of corresponding image.
+            cfg (dict, optional): Config used in test phase.
+                Default: None.
+
+        Returns:
+            :obj:`InstanceData`: Processed results of single image.
+             It usually contains the following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instance,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+
+        def empty_results(results, cls_scores):
+            """Generate an empty result."""
+            results.scores = cls_scores.new_ones(0)
+            results.masks = cls_scores.new_zeros(0, *results.ori_shape[:2])
+            results.labels = cls_scores.new_ones(0)
+            return results
+
+        cfg = self.test_cfg if cfg is None else cfg
+        assert len(cls_scores) == len(mask_preds)
+        results = InstanceData(img_meta)
+
+        featmap_size = mask_preds.size()[-2:]
+
+        img_shape = results.img_shape
+        ori_shape = results.ori_shape
+
+        h, w, _ = img_shape
+        upsampled_size = (featmap_size[0] * 4, featmap_size[1] * 4)
+
+        score_mask = (cls_scores > cfg.score_thr)
+        cls_scores = cls_scores[score_mask]
+        if len(cls_scores) == 0:
+            return empty_results(results, cls_scores)
+
+        inds = score_mask.nonzero()
+        cls_labels = inds[:, 1]
+
+        # Filter out masks whose area is not larger than the
+        # stride of the corresponding feature level
+        lvl_interval = cls_labels.new_tensor(self.num_grids).pow(2).cumsum(0)
+        strides = cls_scores.new_ones(lvl_interval[-1])
+        strides[:lvl_interval[0]] *= self.strides[0]
+        for lvl in range(1, self.num_levels):
+            strides[lvl_interval[lvl -
+                                 1]:lvl_interval[lvl]] *= self.strides[lvl]
+        strides = strides[inds[:, 0]]
+        mask_preds = mask_preds[inds[:, 0]]
+
+        masks = mask_preds > cfg.mask_thr
+        sum_masks = masks.sum((1, 2)).float()
+        keep = sum_masks > strides
+        if keep.sum() == 0:
+            return empty_results(results, cls_scores)
+        masks = masks[keep]
+        mask_preds = mask_preds[keep]
+        sum_masks = sum_masks[keep]
+        cls_scores = cls_scores[keep]
+        cls_labels = cls_labels[keep]
+
+        # maskness: mean predicted mask value over foreground pixels.
+        mask_scores = (mask_preds * masks).sum((1, 2)) / sum_masks
+        cls_scores *= mask_scores
+
+        scores, labels, _, keep_inds = mask_matrix_nms(
+            masks,
+            cls_labels,
+            cls_scores,
+            mask_area=sum_masks,
+            nms_pre=cfg.nms_pre,
+            max_num=cfg.max_per_img,
+            kernel=cfg.kernel,
+            sigma=cfg.sigma,
+            filter_thr=cfg.filter_thr)
+        mask_preds = mask_preds[keep_inds]
+        mask_preds = F.interpolate(
+            mask_preds.unsqueeze(0), size=upsampled_size,
+            mode='bilinear')[:, :, :h, :w]
+        mask_preds = F.interpolate(
+            mask_preds, size=ori_shape[:2], mode='bilinear').squeeze(0)
+        masks = mask_preds > cfg.mask_thr
+
+        results.masks = masks
+        results.labels = labels
+        results.scores = scores
+
+        return results
+
+
+@HEADS.register_module()
+class DecoupledSOLOHead(SOLOHead):
+    """Decoupled SOLO mask head used in `SOLO: Segmenting Objects by Locations.
+
+    <https://arxiv.org/abs/1912.04488>`_
+
+    Args:
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 *args,
+                 init_cfg=[
+                     dict(type='Normal', layer='Conv2d', std=0.01),
+                     dict(
+                         type='Normal',
+                         std=0.01,
+                         bias_prob=0.01,
+                         override=dict(name='conv_mask_list_x')),
+                     dict(
+                         type='Normal',
+                         std=0.01,
+                         bias_prob=0.01,
+                         override=dict(name='conv_mask_list_y')),
+                     dict(
+                         type='Normal',
+                         std=0.01,
+                         bias_prob=0.01,
+                         override=dict(name='conv_cls'))
+                 ],
+                 **kwargs):
+        super(DecoupledSOLOHead, self).__init__(
+            *args, init_cfg=init_cfg, **kwargs)
+
+    def _init_layers(self):
+        self.mask_convs_x = nn.ModuleList()
+        self.mask_convs_y = nn.ModuleList()
+        self.cls_convs = nn.ModuleList()
+
+        for i in range(self.stacked_convs):
+            chn = self.in_channels + 1 if i == 0 else self.feat_channels
+            self.mask_convs_x.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    norm_cfg=self.norm_cfg))
+            self.mask_convs_y.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    norm_cfg=self.norm_cfg))
+
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    norm_cfg=self.norm_cfg))
+
+        self.conv_mask_list_x = nn.ModuleList()
+        self.conv_mask_list_y = nn.ModuleList()
+        for num_grid in self.num_grids:
+            self.conv_mask_list_x.append(
+                nn.Conv2d(self.feat_channels, num_grid, 3, padding=1))
+            self.conv_mask_list_y.append(
+                nn.Conv2d(self.feat_channels, num_grid, 3, padding=1))
+        self.conv_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+
+    def forward(self, feats):
+        assert len(feats) == self.num_levels
+        feats = self.resize_feats(feats)
+        mask_preds_x = []
+        mask_preds_y = []
+        cls_preds = []
+        for i in range(self.num_levels):
+            x = feats[i]
+            mask_feat = x
+            cls_feat = x
+            # generate and concat the coordinate
+            coord_feat = generate_coordinate(mask_feat.size(),
+                                             mask_feat.device)
+            mask_feat_x = torch.cat([mask_feat, coord_feat[:, 0:1, ...]], 1)
+            mask_feat_y = torch.cat([mask_feat, coord_feat[:, 1:2, ...]], 1)
+
+            for mask_layer_x, mask_layer_y in \
+                    zip(self.mask_convs_x, self.mask_convs_y):
+                mask_feat_x = mask_layer_x(mask_feat_x)
+                mask_feat_y = mask_layer_y(mask_feat_y)
+
+            mask_feat_x = F.interpolate(
+                mask_feat_x, scale_factor=2, mode='bilinear')
+            mask_feat_y = F.interpolate(
+                mask_feat_y, scale_factor=2, mode='bilinear')
+
+            mask_pred_x = self.conv_mask_list_x[i](mask_feat_x)
+            mask_pred_y = self.conv_mask_list_y[i](mask_feat_y)
+
+            # cls branch
+            for j, cls_layer in enumerate(self.cls_convs):
+                if j == self.cls_down_index:
+                    num_grid = self.num_grids[i]
+                    cls_feat = F.interpolate(
+                        cls_feat, size=num_grid, mode='bilinear')
+                cls_feat = cls_layer(cls_feat)
+
+            cls_pred = self.conv_cls(cls_feat)
+
+            if not self.training:
+                feat_wh = feats[0].size()[-2:]
+                upsampled_size = (feat_wh[0] * 2, feat_wh[1] * 2)
+                mask_pred_x = F.interpolate(
+                    mask_pred_x.sigmoid(),
+                    size=upsampled_size,
+                    mode='bilinear')
+                mask_pred_y = F.interpolate(
+                    mask_pred_y.sigmoid(),
+                    size=upsampled_size,
+                    mode='bilinear')
+                cls_pred = cls_pred.sigmoid()
+                # get local maximum
+                local_max = F.max_pool2d(cls_pred, 2, stride=1, padding=1)
+                keep_mask = local_max[:, :, :-1, :-1] == cls_pred
+                cls_pred = cls_pred * keep_mask
+
+            mask_preds_x.append(mask_pred_x)
+            mask_preds_y.append(mask_pred_y)
+            cls_preds.append(cls_pred)
+        return mask_preds_x, mask_preds_y, cls_preds
+
+    def loss(self,
+             mlvl_mask_preds_x,
+             mlvl_mask_preds_y,
+             mlvl_cls_preds,
+             gt_labels,
+             gt_masks,
+             img_metas,
+             gt_bboxes=None,
+             **kwargs):
+        """Calculate the loss of total batch.
+
+        Args:
+            mlvl_mask_preds_x (list[Tensor]): Multi-level mask prediction
+                from x branch. Each element in the list has shape
+                (batch_size, num_grids ,h ,w).
+            mlvl_mask_preds_y (list[Tensor]): Multi-level mask prediction
+                from y branch. Each element in the list has shape
+                (batch_size, num_grids ,h ,w).
+            mlvl_cls_preds (list[Tensor]): Multi-level scores. Each element
+                in the list has shape
+                (batch_size, num_classes, num_grids ,num_grids).
+            gt_labels (list[Tensor]): Labels of multiple images.
+            gt_masks (list[Tensor]): Ground truth masks of multiple images.
+                Each has shape (num_instances, h, w).
+            img_metas (list[dict]): Meta information of multiple images.
+            gt_bboxes (list[Tensor]): Ground truth bboxes of multiple
+                images. Default: None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        num_levels = self.num_levels
+        num_imgs = len(gt_labels)
+        featmap_sizes = [featmap.size()[-2:] for featmap in mlvl_mask_preds_x]
+
+        pos_mask_targets, labels, \
+            xy_pos_indexes = \
+            multi_apply(self._get_targets_single,
+                        gt_bboxes,
+                        gt_labels,
+                        gt_masks,
+                        featmap_sizes=featmap_sizes)
+
+        # change from the outside list meaning multi images
+        # to the outside list meaning multi levels
+        mlvl_pos_mask_targets = [[] for _ in range(num_levels)]
+        mlvl_pos_mask_preds_x = [[] for _ in range(num_levels)]
+        mlvl_pos_mask_preds_y = [[] for _ in range(num_levels)]
+        mlvl_labels = [[] for _ in range(num_levels)]
+        for img_id in range(num_imgs):
+
+            for lvl in range(num_levels):
+                mlvl_pos_mask_targets[lvl].append(
+                    pos_mask_targets[img_id][lvl])
+                mlvl_pos_mask_preds_x[lvl].append(
+                    mlvl_mask_preds_x[lvl][img_id,
+                                           xy_pos_indexes[img_id][lvl][:, 1]])
+                mlvl_pos_mask_preds_y[lvl].append(
+                    mlvl_mask_preds_y[lvl][img_id,
+                                           xy_pos_indexes[img_id][lvl][:, 0]])
+                mlvl_labels[lvl].append(labels[img_id][lvl].flatten())
+
+        # cat multiple image
+        temp_mlvl_cls_preds = []
+        for lvl in range(num_levels):
+            mlvl_pos_mask_targets[lvl] = torch.cat(
+                mlvl_pos_mask_targets[lvl], dim=0)
+            mlvl_pos_mask_preds_x[lvl] = torch.cat(
+                mlvl_pos_mask_preds_x[lvl], dim=0)
+            mlvl_pos_mask_preds_y[lvl] = torch.cat(
+                mlvl_pos_mask_preds_y[lvl], dim=0)
+            mlvl_labels[lvl] = torch.cat(mlvl_labels[lvl], dim=0)
+            temp_mlvl_cls_preds.append(mlvl_cls_preds[lvl].permute(
+                0, 2, 3, 1).reshape(-1, self.cls_out_channels))
+
+        num_pos = 0.
+        # dice loss
+        loss_mask = []
+        for pred_x, pred_y, target in \
+                zip(mlvl_pos_mask_preds_x,
+                    mlvl_pos_mask_preds_y, mlvl_pos_mask_targets):
+            num_masks = pred_x.size(0)
+            if num_masks == 0:
+                # make sure can get grad
+                loss_mask.append((pred_x.sum() + pred_y.sum()).unsqueeze(0))
+                continue
+            num_pos += num_masks
+            pred_mask = pred_y.sigmoid() * pred_x.sigmoid()
+            loss_mask.append(
+                self.loss_mask(pred_mask, target, reduction_override='none'))
+        if num_pos > 0:
+            loss_mask = torch.cat(loss_mask).sum() / num_pos
+        else:
+            loss_mask = torch.cat(loss_mask).mean()
+
+        # cate
+        flatten_labels = torch.cat(mlvl_labels)
+        flatten_cls_preds = torch.cat(temp_mlvl_cls_preds)
+
+        loss_cls = self.loss_cls(
+            flatten_cls_preds, flatten_labels, avg_factor=num_pos + 1)
+        return dict(loss_mask=loss_mask, loss_cls=loss_cls)
+
+    def _get_targets_single(self,
+                            gt_bboxes,
+                            gt_labels,
+                            gt_masks,
+                            featmap_sizes=None):
+        """Compute targets for predictions of a single image.
+
+        Args:
+            gt_bboxes (Tensor): Ground truth bbox of each instance,
+                shape (num_gts, 4).
+            gt_labels (Tensor): Ground truth label of each instance,
+                shape (num_gts,).
+            gt_masks (Tensor): Ground truth mask of each instance,
+                shape (num_gts, h, w).
+            featmap_sizes (list[:obj:`torch.Size`]): Size of each
+                feature map from feature pyramid, each element
+                means (feat_h, feat_w). Default: None.
+
+        Returns:
+            Tuple: Usually returns a tuple containing targets for predictions.
+
+                - mlvl_pos_mask_targets (list[Tensor]): Each element represents
+                  the binary mask targets for positive points in this
+                  level, has shape (num_pos, out_h, out_w).
+                - mlvl_labels (list[Tensor]): Each element is
+                  classification labels for all
+                  points in this level, has shape
+                  (num_grid, num_grid).
+                - mlvl_xy_pos_indexes (list[Tensor]): Each element holds the
+                  `nonzero()` indices of the label entries that differ from
+                  `self.num_classes` in that level, shape (num_pos, 2);
+                  `loss` uses column 1 for x and column 0 for y branch.
+        """
+        mlvl_pos_mask_targets, mlvl_labels, \
+            mlvl_pos_masks = \
+            super()._get_targets_single(gt_bboxes, gt_labels, gt_masks,
+                                        featmap_sizes=featmap_sizes)
+
+        mlvl_xy_pos_indexes = [(item - self.num_classes).nonzero()
+                               for item in mlvl_labels]
+
+        return mlvl_pos_mask_targets, mlvl_labels, mlvl_xy_pos_indexes
+
+    def get_results(self,
+                    mlvl_mask_preds_x,
+                    mlvl_mask_preds_y,
+                    mlvl_cls_scores,
+                    img_metas,
+                    rescale=None,
+                    **kwargs):
+        """Get multi-image mask results.
+
+        Args:
+            mlvl_mask_preds_x (list[Tensor]): Multi-level mask prediction
+                from x branch, each of shape (batch_size, num_grids, h, w).
+            mlvl_mask_preds_y (list[Tensor]): Multi-level mask prediction
+                from y branch, each of shape (batch_size, num_grids, h, w).
+            mlvl_cls_scores (list[Tensor]): Multi-level scores. Each element
+                in the list has shape
+                (batch_size, num_classes, num_grids, num_grids).
+            img_metas (list[dict]): Meta information of all images.
+            rescale: Accepted but not read by this method.
+                Default: None.
+
+        Returns:
+            list[:obj:`InstanceData`]: Processed results of multiple
+            images. Each :obj:`InstanceData` usually contains the
+            following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instances,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+        mlvl_cls_scores = [
+            item.permute(0, 2, 3, 1) for item in mlvl_cls_scores
+        ]
+        assert len(mlvl_mask_preds_x) == len(mlvl_cls_scores)
+        num_levels = len(mlvl_cls_scores)
+
+        results_list = []
+        for img_id in range(len(img_metas)):
+            cls_pred_list = [
+                mlvl_cls_scores[i][img_id].view(
+                    -1, self.cls_out_channels).detach()
+                for i in range(num_levels)
+            ]
+            mask_pred_list_x = [
+                mlvl_mask_preds_x[i][img_id] for i in range(num_levels)
+            ]
+            mask_pred_list_y = [
+                mlvl_mask_preds_y[i][img_id] for i in range(num_levels)
+            ]
+
+            cls_pred_list = torch.cat(cls_pred_list, dim=0)
+            mask_pred_list_x = torch.cat(mask_pred_list_x, dim=0)
+            mask_pred_list_y = torch.cat(mask_pred_list_y, dim=0)
+
+            results = self._get_results_single(
+                cls_pred_list,
+                mask_pred_list_x,
+                mask_pred_list_y,
+                img_meta=img_metas[img_id],
+                cfg=self.test_cfg)
+            results_list.append(results)
+        return results_list
+
+    def _get_results_single(self, cls_scores, mask_preds_x, mask_preds_y,
+                            img_meta, cfg):
+        """Get processed mask related results of a single image.
+
+        Args:
+            cls_scores (Tensor): Classification score of all points
+                in single image, has shape (num_points, num_classes).
+            mask_preds_x (Tensor): Mask prediction of x branch of
+                all points in single image, has shape
+                (sum_num_grids, feat_h, feat_w).
+            mask_preds_y (Tensor): Mask prediction of y branch of
+                all points in single image, has shape
+                (sum_num_grids, feat_h, feat_w).
+            img_meta (dict): Meta information of corresponding image.
+            cfg (dict): Config used in test phase; None uses self.test_cfg.
+
+        Returns:
+            :obj:`InstanceData`: Processed results of single image.
+            It usually contains the following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instances,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+
+        def empty_results(results, cls_scores):
+            """Fill ``results`` with empty scores, masks and labels."""
+            results.scores = cls_scores.new_ones(0)
+            results.masks = cls_scores.new_zeros(0, *results.ori_shape[:2])
+            results.labels = cls_scores.new_ones(0)
+            return results
+
+        cfg = self.test_cfg if cfg is None else cfg
+
+        results = InstanceData(img_meta)
+        img_shape = results.img_shape
+        ori_shape = results.ori_shape
+        h, w, _ = img_shape
+        featmap_size = mask_preds_x.size()[-2:]
+        upsampled_size = (featmap_size[0] * 4, featmap_size[1] * 4)
+
+        score_mask = (cls_scores > cfg.score_thr)
+        cls_scores = cls_scores[score_mask]
+        inds = score_mask.nonzero()
+        lvl_interval = inds.new_tensor(self.num_grids).pow(2).cumsum(0)
+        num_all_points = lvl_interval[-1]
+        lvl_start_index = inds.new_ones(num_all_points)
+        num_grids = inds.new_ones(num_all_points)
+        seg_size = inds.new_tensor(self.num_grids).cumsum(0)
+        mask_lvl_start_index = inds.new_ones(num_all_points)
+        strides = inds.new_ones(num_all_points)
+
+        lvl_start_index[:lvl_interval[0]] *= 0
+        mask_lvl_start_index[:lvl_interval[0]] *= 0
+        num_grids[:lvl_interval[0]] *= self.num_grids[0]
+        strides[:lvl_interval[0]] *= self.strides[0]
+
+        for lvl in range(1, self.num_levels):
+            lvl_start_index[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \
+                lvl_interval[lvl - 1]
+            mask_lvl_start_index[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \
+                seg_size[lvl - 1]
+            num_grids[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \
+                self.num_grids[lvl]
+            strides[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \
+                self.strides[lvl]
+
+        lvl_start_index = lvl_start_index[inds[:, 0]]
+        mask_lvl_start_index = mask_lvl_start_index[inds[:, 0]]
+        num_grids = num_grids[inds[:, 0]]
+        strides = strides[inds[:, 0]]
+
+        y_lvl_offset = (inds[:, 0] - lvl_start_index) // num_grids
+        x_lvl_offset = (inds[:, 0] - lvl_start_index) % num_grids
+        y_inds = mask_lvl_start_index + y_lvl_offset
+        x_inds = mask_lvl_start_index + x_lvl_offset
+
+        cls_labels = inds[:, 1]
+        mask_preds = mask_preds_x[x_inds, ...] * mask_preds_y[y_inds, ...]
+
+        masks = mask_preds > cfg.mask_thr
+        sum_masks = masks.sum((1, 2)).float()
+        keep = sum_masks > strides
+        if keep.sum() == 0:
+            return empty_results(results, cls_scores)
+
+        masks = masks[keep]
+        mask_preds = mask_preds[keep]
+        sum_masks = sum_masks[keep]
+        cls_scores = cls_scores[keep]
+        cls_labels = cls_labels[keep]
+
+        # maskness: mean mask_preds value over the pixels of each binary mask
+        mask_scores = (mask_preds * masks).sum((1, 2)) / sum_masks
+        cls_scores *= mask_scores
+
+        scores, labels, _, keep_inds = mask_matrix_nms(
+            masks,
+            cls_labels,
+            cls_scores,
+            mask_area=sum_masks,
+            nms_pre=cfg.nms_pre,
+            max_num=cfg.max_per_img,
+            kernel=cfg.kernel,
+            sigma=cfg.sigma,
+            filter_thr=cfg.filter_thr)
+        mask_preds = mask_preds[keep_inds]
+        mask_preds = F.interpolate(
+            mask_preds.unsqueeze(0), size=upsampled_size,
+            mode='bilinear')[:, :, :h, :w]
+        mask_preds = F.interpolate(
+            mask_preds, size=ori_shape[:2], mode='bilinear').squeeze(0)
+        masks = mask_preds > cfg.mask_thr
+
+        results.masks = masks
+        results.labels = labels
+        results.scores = scores
+
+        return results
+
+
+@HEADS.register_module()
+class DecoupledSOLOLightHead(DecoupledSOLOHead):
+    """Decoupled Light SOLO mask head used in `SOLO: Segmenting Objects by
+    Locations <https://arxiv.org/abs/1912.04488>`_
+
+    Args:
+        dcn_cfg (dict, optional): Conv config passed to the last layer of
+            mask_convs and cls_convs. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 *args,
+                 dcn_cfg=None,
+                 init_cfg=[
+                     dict(type='Normal', layer='Conv2d', std=0.01),
+                     dict(
+                         type='Normal',
+                         std=0.01,
+                         bias_prob=0.01,
+                         override=dict(name='conv_mask_list_x')),
+                     dict(
+                         type='Normal',
+                         std=0.01,
+                         bias_prob=0.01,
+                         override=dict(name='conv_mask_list_y')),
+                     dict(
+                         type='Normal',
+                         std=0.01,
+                         bias_prob=0.01,
+                         override=dict(name='conv_cls'))
+                 ],
+                 **kwargs):
+        assert dcn_cfg is None or isinstance(dcn_cfg, dict)
+        self.dcn_cfg = dcn_cfg
+        super(DecoupledSOLOLightHead, self).__init__(
+            *args, init_cfg=init_cfg, **kwargs)
+
+    def _init_layers(self):
+        self.mask_convs = nn.ModuleList()
+        self.cls_convs = nn.ModuleList()
+
+        for i in range(self.stacked_convs):
+            if self.dcn_cfg is not None\
+                    and i == self.stacked_convs - 1:
+                conv_cfg = self.dcn_cfg
+            else:
+                conv_cfg = None
+
+            chn = self.in_channels + 2 if i == 0 else self.feat_channels
+            self.mask_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=self.norm_cfg))
+
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=self.norm_cfg))
+
+        self.conv_mask_list_x = nn.ModuleList()
+        self.conv_mask_list_y = nn.ModuleList()
+        for num_grid in self.num_grids:
+            self.conv_mask_list_x.append(
+                nn.Conv2d(self.feat_channels, num_grid, 3, padding=1))
+            self.conv_mask_list_y.append(
+                nn.Conv2d(self.feat_channels, num_grid, 3, padding=1))
+        self.conv_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+
+    def forward(self, feats):
+        assert len(feats) == self.num_levels
+        feats = self.resize_feats(feats)
+        mask_preds_x = []
+        mask_preds_y = []
+        cls_preds = []
+        for i in range(self.num_levels):
+            x = feats[i]
+            mask_feat = x
+            cls_feat = x
+            # generate the coordinate feature and concat it to the mask input
+            coord_feat = generate_coordinate(mask_feat.size(),
+                                             mask_feat.device)
+            mask_feat = torch.cat([mask_feat, coord_feat], 1)
+
+            for mask_layer in self.mask_convs:
+                mask_feat = mask_layer(mask_feat)
+
+            mask_feat = F.interpolate(
+                mask_feat, scale_factor=2, mode='bilinear')
+
+            mask_pred_x = self.conv_mask_list_x[i](mask_feat)
+            mask_pred_y = self.conv_mask_list_y[i](mask_feat)
+
+            # cls branch: resized to the grid size before layer cls_down_index
+            for j, cls_layer in enumerate(self.cls_convs):
+                if j == self.cls_down_index:
+                    num_grid = self.num_grids[i]
+                    cls_feat = F.interpolate(
+                        cls_feat, size=num_grid, mode='bilinear')
+                cls_feat = cls_layer(cls_feat)
+
+            cls_pred = self.conv_cls(cls_feat)
+
+            if not self.training:
+                feat_wh = feats[0].size()[-2:]
+                upsampled_size = (feat_wh[0] * 2, feat_wh[1] * 2)
+                mask_pred_x = F.interpolate(
+                    mask_pred_x.sigmoid(),
+                    size=upsampled_size,
+                    mode='bilinear')
+                mask_pred_y = F.interpolate(
+                    mask_pred_y.sigmoid(),
+                    size=upsampled_size,
+                    mode='bilinear')
+                cls_pred = cls_pred.sigmoid()
+                # keep only scores equal to their 2x2 max-pooled value
+                local_max = F.max_pool2d(cls_pred, 2, stride=1, padding=1)
+                keep_mask = local_max[:, :, :-1, :-1] == cls_pred
+                cls_pred = cls_pred * keep_mask
+
+            mask_preds_x.append(mask_pred_x)
+            mask_preds_y.append(mask_pred_y)
+            cls_preds.append(cls_pred)
+        return mask_preds_x, mask_preds_y, cls_preds
diff --git a/mmdet/models/dense_heads/solov2_head.py b/mmdet/models/dense_heads/solov2_head.py
new file mode 100755
index 0000000..975306c
--- /dev/null
+++ b/mmdet/models/dense_heads/solov2_head.py
@@ -0,0 +1,766 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import mmcv
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule, auto_fp16, force_fp32
+
+from mmdet.core import InstanceData, mask_matrix_nms, multi_apply
+from mmdet.core.utils import center_of_mass, generate_coordinate
+from mmdet.models.builder import HEADS
+from mmdet.utils.misc import floordiv
+from .solo_head import SOLOHead
+
+
+class MaskFeatModule(BaseModule):
+    """SOLOv2 mask feature map branch used in `SOLOv2: Dynamic and Fast
+    Instance Segmentation. <https://arxiv.org/pdf/2003.10152>`_
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels of the mask feature
+             map branch.
+        start_level (int): The starting feature map level from RPN that
+             will be used to predict the mask feature map.
+        end_level (int): The ending feature map level from rpn that
+             will be used to predict the mask feature map.
+        out_channels (int): Number of output channels of the mask feature
+             map branch. This is the channel count of the mask
+             feature map that is to be dynamically convolved with the
+             predicted kernel.
+        mask_stride (int): Downsample factor of the mask feature map output.
+            Default: 4.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 feat_channels,
+                 start_level,
+                 end_level,
+                 out_channels,
+                 mask_stride=4,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 init_cfg=[dict(type='Normal', layer='Conv2d', std=0.01)]):
+        super().__init__(init_cfg=init_cfg)
+
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.start_level = start_level
+        self.end_level = end_level
+        self.mask_stride = mask_stride
+        assert start_level >= 0 and end_level >= start_level
+        self.out_channels = out_channels
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self._init_layers()
+        self.fp16_enabled = False
+
+    def _init_layers(self):
+        self.convs_all_levels = nn.ModuleList()
+        for i in range(self.start_level, self.end_level + 1):
+            convs_per_level = nn.Sequential()
+            if i == 0:
+                convs_per_level.add_module(
+                    f'conv{i}',
+                    ConvModule(
+                        self.in_channels,
+                        self.feat_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        inplace=False))
+                self.convs_all_levels.append(convs_per_level)
+                continue
+
+            for j in range(i):
+                if j == 0:
+                    if i == self.end_level:
+                        chn = self.in_channels + 2
+                    else:
+                        chn = self.in_channels
+                    convs_per_level.add_module(
+                        f'conv{j}',
+                        ConvModule(
+                            chn,
+                            self.feat_channels,
+                            3,
+                            padding=1,
+                            conv_cfg=self.conv_cfg,
+                            norm_cfg=self.norm_cfg,
+                            inplace=False))
+                    convs_per_level.add_module(
+                        f'upsample{j}',
+                        nn.Upsample(
+                            scale_factor=2,
+                            mode='bilinear',
+                            align_corners=False))
+                    continue
+
+                convs_per_level.add_module(
+                    f'conv{j}',
+                    ConvModule(
+                        self.feat_channels,
+                        self.feat_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        inplace=False))
+                convs_per_level.add_module(
+                    f'upsample{j}',
+                    nn.Upsample(
+                        scale_factor=2, mode='bilinear', align_corners=False))
+
+            self.convs_all_levels.append(convs_per_level)
+
+        self.conv_pred = ConvModule(
+            self.feat_channels,
+            self.out_channels,
+            1,
+            padding=0,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg)
+
+    @auto_fp16()
+    def forward(self, feats):
+        inputs = feats[self.start_level:self.end_level + 1]
+        assert len(inputs) == (self.end_level - self.start_level + 1)
+        feature_add_all_level = self.convs_all_levels[0](inputs[0])
+        for i in range(1, len(inputs)):
+            input_p = inputs[i]
+            if i == len(inputs) - 1:
+                coord_feat = generate_coordinate(input_p.size(),
+                                                 input_p.device)
+                input_p = torch.cat([input_p, coord_feat], 1)
+
+            # use "a = a + b": in-place "+=" raises an error in PyTorch 1.10
+            feature_add_all_level = feature_add_all_level + \
+                self.convs_all_levels[i](input_p)
+
+        feature_pred = self.conv_pred(feature_add_all_level)
+        return feature_pred
+
+
+@HEADS.register_module()
+class SOLOV2Head(SOLOHead):
+    """SOLOv2 mask head used in `SOLOv2: Dynamic and Fast Instance
+    Segmentation. <https://arxiv.org/pdf/2003.10152>`_
+
+    Args:
+        mask_feature_head (dict): Config of SOLOv2MaskFeatHead.
+        dynamic_conv_size (int): Dynamic Conv kernel size. Default: 1.
+        dcn_cfg (dict): Dcn conv configurations in kernel_convs and cls_conv.
+            default: None.
+        dcn_apply_to_all_conv (bool): Whether to use dcn in every layer of
+            kernel_convs and cls_convs, or only the last layer. It shall be set
+            `True` for the normal version of SOLOv2 and `False` for the
+            light-weight version. default: True.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 *args,
+                 mask_feature_head,
+                 dynamic_conv_size=1,
+                 dcn_cfg=None,
+                 dcn_apply_to_all_conv=True,
+                 init_cfg=[
+                     dict(type='Normal', layer='Conv2d', std=0.01),
+                     dict(
+                         type='Normal',
+                         std=0.01,
+                         bias_prob=0.01,
+                         override=dict(name='conv_cls'))
+                 ],
+                 **kwargs):
+        assert dcn_cfg is None or isinstance(dcn_cfg, dict)
+        self.dcn_cfg = dcn_cfg
+        self.with_dcn = dcn_cfg is not None
+        self.dcn_apply_to_all_conv = dcn_apply_to_all_conv
+        self.dynamic_conv_size = dynamic_conv_size
+        mask_out_channels = mask_feature_head.get('out_channels')
+        self.kernel_out_channels = \
+            mask_out_channels * self.dynamic_conv_size * self.dynamic_conv_size
+
+        super().__init__(*args, init_cfg=init_cfg, **kwargs)
+
+        # force mask_feature_head.in_channels to match self.in_channels
+        if mask_feature_head.get('in_channels', None) is not None:
+            if mask_feature_head.in_channels != self.in_channels:
+                warnings.warn('The `in_channels` of SOLOv2MaskFeatHead and '
+                              'SOLOv2Head should be same, changing '
+                              'mask_feature_head.in_channels to '
+                              f'{self.in_channels}')
+                mask_feature_head.update(in_channels=self.in_channels)
+        else:
+            mask_feature_head.update(in_channels=self.in_channels)
+
+        self.mask_feature_head = MaskFeatModule(**mask_feature_head)
+        self.mask_stride = self.mask_feature_head.mask_stride
+        self.fp16_enabled = False
+
+    def _init_layers(self):
+        self.cls_convs = nn.ModuleList()
+        self.kernel_convs = nn.ModuleList()
+        conv_cfg = None
+        for i in range(self.stacked_convs):
+            if self.with_dcn:
+                if self.dcn_apply_to_all_conv:
+                    conv_cfg = self.dcn_cfg
+                elif i == self.stacked_convs - 1:
+                    # light head: dcn_cfg only on the last stacked conv
+                    conv_cfg = self.dcn_cfg
+
+            chn = self.in_channels + 2 if i == 0 else self.feat_channels
+            self.kernel_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.norm_cfg is None))
+
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.norm_cfg is None))
+
+        self.conv_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+
+        self.conv_kernel = nn.Conv2d(
+            self.feat_channels, self.kernel_out_channels, 3, padding=1)
+
+    @auto_fp16()
+    def forward(self, feats):
+        """Predict per-level kernels and class scores plus mask features.
+
+        Args:
+            feats (Sequence[Tensor]): Multi-level features; the length
+                must equal ``self.num_levels``.
+
+        Returns:
+            tuple: ``(mlvl_kernel_preds, mlvl_cls_preds, mask_feats)``.
+        """
+        assert len(feats) == self.num_levels
+        mask_feats = self.mask_feature_head(feats)
+        feats = self.resize_feats(feats)
+        mlvl_kernel_preds = []
+        mlvl_cls_preds = []
+        for i in range(self.num_levels):
+            ins_kernel_feat = feats[i]
+            # ins branch: concat the coordinate feature to the input
+            coord_feat = generate_coordinate(ins_kernel_feat.size(),
+                                             ins_kernel_feat.device)
+            ins_kernel_feat = torch.cat([ins_kernel_feat, coord_feat], 1)
+            # kernel branch: resize to this level's grid size
+            kernel_feat = F.interpolate(
+                ins_kernel_feat, size=self.num_grids[i],
+                mode='bilinear', align_corners=False)
+            # cate branch input drops the last two (appended) channels
+            cate_feat = kernel_feat[:, :-2, :, :].contiguous()
+            kernel_feat = kernel_feat.contiguous()
+            for kernel_conv in self.kernel_convs:
+                kernel_feat = kernel_conv(kernel_feat)
+            kernel_pred = self.conv_kernel(kernel_feat)
+            # cate branch
+            for cls_conv in self.cls_convs:
+                cate_feat = cls_conv(cate_feat)
+            cate_pred = self.conv_cls(cate_feat)
+            mlvl_kernel_preds.append(kernel_pred)
+            mlvl_cls_preds.append(cate_pred)
+
+        return mlvl_kernel_preds, mlvl_cls_preds, mask_feats
+
+    def _get_targets_single(self,
+                            gt_bboxes,
+                            gt_labels,
+                            gt_masks,
+                            featmap_size=None):
+        """Compute targets for predictions of single image.
+
+        Args:
+            gt_bboxes (Tensor): Ground truth bbox of each instance,
+                shape (num_gts, 4).
+            gt_labels (Tensor): Ground truth label of each instance,
+                shape (num_gts,).
+            gt_masks (Tensor): Ground truth mask of each instance,
+                shape (num_gts, h, w).
+            featmap_size (:obj:`torch.Size`): Size of unified mask
+                feature map used to generate instance segmentation
+                masks by dynamic convolution, each element means
+                (feat_h, feat_w). Default: None.
+
+        Returns:
+            Tuple: Usually returns a tuple containing targets for predictions.
+
+                - mlvl_pos_mask_targets (list[Tensor]): Each element represent
+                  the binary mask targets for positive points in this
+                  level, has shape (num_pos, out_h, out_w).
+                - mlvl_labels (list[Tensor]): Each element is
+                  classification labels for all
+                  points in this level, has shape
+                  (num_grid, num_grid).
+                - mlvl_pos_masks  (list[Tensor]): Each element is
+                  a `BoolTensor` to represent whether the
+                  corresponding point in single level
+                  is positive, has shape (num_grid **2).
+                - mlvl_pos_indexes  (list[list]): Each element
+                  in the list contains the positive index in
+                  corresponding level, has shape (num_pos).
+        """
+
+        device = gt_labels.device
+        gt_areas = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
+                              (gt_bboxes[:, 3] - gt_bboxes[:, 1]))
+
+        mlvl_pos_mask_targets = []
+        mlvl_pos_indexes = []
+        mlvl_labels = []
+        mlvl_pos_masks = []
+        for (lower_bound, upper_bound), num_grid \
+                in zip(self.scale_ranges, self.num_grids):
+            mask_target = []
+            # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+            pos_index = []
+            labels = torch.zeros([num_grid, num_grid],
+                                 dtype=torch.int64,
+                                 device=device) + self.num_classes
+            pos_mask = torch.zeros([num_grid**2],
+                                   dtype=torch.bool,
+                                   device=device)
+
+            gt_inds = ((gt_areas >= lower_bound) &
+                       (gt_areas <= upper_bound)).nonzero().flatten()
+            if len(gt_inds) == 0:
+                mlvl_pos_mask_targets.append(
+                    torch.zeros([0, featmap_size[0], featmap_size[1]],
+                                dtype=torch.uint8,
+                                device=device))
+                mlvl_labels.append(labels)
+                mlvl_pos_masks.append(pos_mask)
+                mlvl_pos_indexes.append([])
+                continue
+            hit_gt_bboxes = gt_bboxes[gt_inds]
+            hit_gt_labels = gt_labels[gt_inds]
+            hit_gt_masks = gt_masks[gt_inds, ...]
+
+            pos_w_ranges = 0.5 * (hit_gt_bboxes[:, 2] -
+                                  hit_gt_bboxes[:, 0]) * self.pos_scale
+            pos_h_ranges = 0.5 * (hit_gt_bboxes[:, 3] -
+                                  hit_gt_bboxes[:, 1]) * self.pos_scale
+
+            # Make sure hit_gt_masks has a value
+            valid_mask_flags = hit_gt_masks.sum(dim=-1).sum(dim=-1) > 0
+
+            for gt_mask, gt_label, pos_h_range, pos_w_range, \
+                valid_mask_flag in \
+                    zip(hit_gt_masks, hit_gt_labels, pos_h_ranges,
+                        pos_w_ranges, valid_mask_flags):
+                if not valid_mask_flag:
+                    continue
+                upsampled_size = (featmap_size[0] * self.mask_stride,
+                                  featmap_size[1] * self.mask_stride)
+                center_h, center_w = center_of_mass(gt_mask)
+
+                coord_w = int(
+                    floordiv((center_w / upsampled_size[1]), (1. / num_grid),
+                             rounding_mode='trunc'))
+                coord_h = int(
+                    floordiv((center_h / upsampled_size[0]), (1. / num_grid),
+                             rounding_mode='trunc'))
+
+                # left, top, right, down
+                top_box = max(
+                    0,
+                    int(
+                        floordiv(
+                            (center_h - pos_h_range) / upsampled_size[0],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+                down_box = min(
+                    num_grid - 1,
+                    int(
+                        floordiv(
+                            (center_h + pos_h_range) / upsampled_size[0],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+                left_box = max(
+                    0,
+                    int(
+                        floordiv(
+                            (center_w - pos_w_range) / upsampled_size[1],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+                right_box = min(
+                    num_grid - 1,
+                    int(
+                        floordiv(
+                            (center_w + pos_w_range) / upsampled_size[1],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+
+                top = max(top_box, coord_h - 1)
+                down = min(down_box, coord_h + 1)
+                left = max(coord_w - 1, left_box)
+                right = min(right_box, coord_w + 1)
+
+                labels[top:(down + 1), left:(right + 1)] = gt_label
+                # ins
+                gt_mask = np.uint8(gt_mask.cpu().numpy())
+                # Follow the original implementation, F.interpolate is
+                # different from cv2 and opencv
+                gt_mask = mmcv.imrescale(gt_mask, scale=1. / self.mask_stride)
+                gt_mask = torch.from_numpy(gt_mask).to(device=device)
+
+                for i in range(top, down + 1):
+                    for j in range(left, right + 1):
+                        index = int(i * num_grid + j)
+                        this_mask_target = torch.zeros(
+                            [featmap_size[0], featmap_size[1]],
+                            dtype=torch.uint8,
+                            device=device)
+                        this_mask_target[:gt_mask.shape[0], :gt_mask.
+                                         shape[1]] = gt_mask
+                        mask_target.append(this_mask_target)
+                        pos_mask[index] = True
+                        pos_index.append(index)
+            if len(mask_target) == 0:
+                mask_target = torch.zeros(
+                    [0, featmap_size[0], featmap_size[1]],
+                    dtype=torch.uint8,
+                    device=device)
+            else:
+                mask_target = torch.stack(mask_target, 0)
+            mlvl_pos_mask_targets.append(mask_target)
+            mlvl_labels.append(labels)
+            mlvl_pos_masks.append(pos_mask)
+            mlvl_pos_indexes.append(pos_index)
+        return (mlvl_pos_mask_targets, mlvl_labels, mlvl_pos_masks,
+                mlvl_pos_indexes)
+
+    @force_fp32(apply_to=('mlvl_kernel_preds', 'mlvl_cls_preds', 'mask_feats'))
+    def loss(self,
+             mlvl_kernel_preds,
+             mlvl_cls_preds,
+             mask_feats,
+             gt_labels,
+             gt_masks,
+             img_metas,
+             gt_bboxes=None,
+             **kwargs):
+        """Calculate the mask and classification losses of a whole batch.
+
+        Args:
+            mlvl_kernel_preds (list[Tensor]): Multi-level dynamic kernel
+                prediction. The kernel is used to generate instance
+                segmentation masks by dynamic convolution. Each element in the
+                list has shape
+                (batch_size, kernel_out_channels, num_grids, num_grids).
+            mlvl_cls_preds (list[Tensor]): Multi-level scores. Each element
+                in the list has shape
+                (batch_size, num_classes, num_grids, num_grids).
+            mask_feats (Tensor): Unified mask feature map used to generate
+                instance segmentation masks by dynamic convolution. Has shape
+                (batch_size, mask_out_channels, h, w).
+            gt_labels (list[Tensor]): Labels of multiple images.
+            gt_masks (list[Tensor]): Ground truth masks of multiple images.
+                Each has shape (num_instances, h, w).
+            img_metas (list[dict]): Meta information of images (unused here).
+            gt_bboxes (list[Tensor]): Ground truth bboxes of multiple
+                images. Default: None.
+
+        Returns:
+            dict[str, Tensor]: Losses under `loss_mask` and `loss_cls`.
+        """
+        featmap_size = mask_feats.size()[-2:]
+
+        pos_mask_targets, labels, pos_masks, pos_indexes = multi_apply(
+            self._get_targets_single,
+            gt_bboxes,
+            gt_labels,
+            gt_masks,
+            featmap_size=featmap_size)
+
+        mlvl_mask_targets = [
+            torch.cat(lvl_mask_targets, 0)
+            for lvl_mask_targets in zip(*pos_mask_targets)
+        ]
+
+        mlvl_pos_kernel_preds = []
+        for lvl_kernel_preds, lvl_pos_indexes in zip(mlvl_kernel_preds,
+                                                     zip(*pos_indexes)):
+            lvl_pos_kernel_preds = []
+            for img_lvl_kernel_preds, img_lvl_pos_indexes in zip(
+                    lvl_kernel_preds, lvl_pos_indexes):
+                img_lvl_pos_kernel_preds = img_lvl_kernel_preds.view(
+                    img_lvl_kernel_preds.shape[0], -1)[:, img_lvl_pos_indexes]
+                lvl_pos_kernel_preds.append(img_lvl_pos_kernel_preds)
+            mlvl_pos_kernel_preds.append(lvl_pos_kernel_preds)
+
+        # build per-level mask predictions with dynamic convolution
+        mlvl_mask_preds = []
+        for lvl_pos_kernel_preds in mlvl_pos_kernel_preds:
+            lvl_mask_preds = []
+            for img_id, img_lvl_pos_kernel_pred in enumerate(
+                    lvl_pos_kernel_preds):
+                if img_lvl_pos_kernel_pred.size()[-1] == 0:
+                    continue  # no positive cell for this image/level
+                img_mask_feats = mask_feats[[img_id]]
+                h, w = img_mask_feats.shape[-2:]
+                num_kernel = img_lvl_pos_kernel_pred.shape[1]
+                img_lvl_mask_pred = F.conv2d(
+                    img_mask_feats,
+                    img_lvl_pos_kernel_pred.permute(1, 0).view(
+                        num_kernel, -1, self.dynamic_conv_size,
+                        self.dynamic_conv_size),
+                    stride=1).view(-1, h, w)
+                lvl_mask_preds.append(img_lvl_mask_pred)
+            if len(lvl_mask_preds) == 0:
+                lvl_mask_preds = None
+            else:
+                lvl_mask_preds = torch.cat(lvl_mask_preds, 0)
+            mlvl_mask_preds.append(lvl_mask_preds)
+        # mask loss, summed and averaged over all positive grid cells
+        num_pos = 0
+        for img_pos_masks in pos_masks:
+            for lvl_img_pos_masks in img_pos_masks:
+                num_pos += lvl_img_pos_masks.count_nonzero()
+
+        loss_mask = []
+        for lvl_mask_preds, lvl_mask_targets in zip(mlvl_mask_preds,
+                                                    mlvl_mask_targets):
+            if lvl_mask_preds is None:
+                continue
+            loss_mask.append(
+                self.loss_mask(
+                    lvl_mask_preds,
+                    lvl_mask_targets,
+                    reduction_override='none'))
+        if num_pos > 0:
+            loss_mask = torch.cat(loss_mask).sum() / num_pos
+        else:
+            loss_mask = mask_feats.sum() * 0  # zero, derived from mask_feats
+
+        # classification loss over every grid cell of every level
+        flatten_labels = [
+            torch.cat(
+                [img_lvl_labels.flatten() for img_lvl_labels in lvl_labels])
+            for lvl_labels in zip(*labels)
+        ]
+        flatten_labels = torch.cat(flatten_labels)
+
+        flatten_cls_preds = [
+            lvl_cls_preds.permute(0, 2, 3, 1).reshape(-1, self.num_classes)
+            for lvl_cls_preds in mlvl_cls_preds
+        ]
+        flatten_cls_preds = torch.cat(flatten_cls_preds)
+
+        loss_cls = self.loss_cls(
+            flatten_cls_preds, flatten_labels, avg_factor=num_pos + 1)
+        return dict(loss_mask=loss_mask, loss_cls=loss_cls)
+
+    @force_fp32(
+        apply_to=('mlvl_kernel_preds', 'mlvl_cls_scores', 'mask_feats'))
+    def get_results(self, mlvl_kernel_preds, mlvl_cls_scores, mask_feats,
+                    img_metas, **kwargs):
+        """Get multi-image mask results.
+
+        Args:
+            mlvl_kernel_preds (list[Tensor]): Multi-level dynamic kernel
+                prediction. The kernel is used to generate instance
+                segmentation masks by dynamic convolution. Each element in the
+                list has shape
+                (batch_size, kernel_out_channels, num_grids, num_grids).
+            mlvl_cls_scores (list[Tensor]): Multi-level scores. Each element
+                has shape (batch_size, num_classes, num_grids, num_grids).
+                The list entries are overwritten in place by this method.
+            mask_feats (Tensor): Unified mask feature map used to generate
+                instance segmentation masks by dynamic convolution. Has shape
+                (batch_size, mask_out_channels, h, w).
+            img_metas (list[dict]): Meta information of all images.
+
+        Returns:
+            list[:obj:`InstanceData`]: Processed results of multiple
+            images. Each :obj:`InstanceData` usually contains the
+            following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instances,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+        num_levels = len(mlvl_cls_scores)
+        assert len(mlvl_kernel_preds) == len(mlvl_cls_scores)
+
+        for lvl in range(num_levels):
+            cls_scores = mlvl_cls_scores[lvl]
+            cls_scores = cls_scores.sigmoid()
+            local_max = F.max_pool2d(cls_scores, 2, stride=1, padding=1)
+            keep_mask = local_max[:, :, :-1, :-1] == cls_scores  # local peaks
+            cls_scores = cls_scores * keep_mask
+            mlvl_cls_scores[lvl] = cls_scores.permute(0, 2, 3, 1)
+
+        result_list = []
+        for img_id in range(len(img_metas)):
+            img_cls_pred = [
+                mlvl_cls_scores[lvl][img_id].view(-1, self.cls_out_channels)
+                for lvl in range(num_levels)
+            ]
+            img_mask_feats = mask_feats[[img_id]]
+            img_kernel_pred = [
+                mlvl_kernel_preds[lvl][img_id].permute(1, 2, 0).view(
+                    -1, self.kernel_out_channels) for lvl in range(num_levels)
+            ]
+            img_cls_pred = torch.cat(img_cls_pred, dim=0)
+            img_kernel_pred = torch.cat(img_kernel_pred, dim=0)
+            result = self._get_results_single(
+                img_kernel_pred,
+                img_cls_pred,
+                img_mask_feats,
+                img_meta=img_metas[img_id])
+            result_list.append(result)
+        return result_list
+
+    def _get_results_single(self,
+                            kernel_preds,
+                            cls_scores,
+                            mask_feats,
+                            img_meta,
+                            cfg=None):
+        """Get processed mask related results of single image.
+
+        Args:
+            kernel_preds (Tensor): Dynamic kernel prediction of all points
+                in single image, has shape
+                (num_points, kernel_out_channels).
+            cls_scores (Tensor): Classification score of all points
+                in single image, has shape (num_points, num_classes).
+            mask_feats (Tensor): Mask feature map of a single image, used
+                as conv input; expected shape (1, mask_out_channels, h, w).
+            img_meta (dict): Meta information of corresponding image.
+            cfg (dict, optional): Config used in test phase.
+                Default: None.
+
+        Returns:
+            :obj:`InstanceData`: Processed results of single image.
+            It usually contains the following keys.
+                - scores (Tensor): Classification scores, has shape
+                  (num_instances,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+
+        def empty_results(results, cls_scores):
+            """Fill ``results`` with empty scores, masks and labels."""
+            results.scores = cls_scores.new_ones(0)
+            results.masks = cls_scores.new_zeros(0, *results.ori_shape[:2])
+            results.labels = cls_scores.new_ones(0)
+            return results
+
+        cfg = self.test_cfg if cfg is None else cfg
+        assert len(kernel_preds) == len(cls_scores)
+        results = InstanceData(img_meta)
+
+        featmap_size = mask_feats.size()[-2:]
+
+        img_shape = results.img_shape
+        ori_shape = results.ori_shape
+
+        # sizes used to crop and resize the masks
+        h, w, _ = img_shape
+        upsampled_size = (featmap_size[0] * self.mask_stride,
+                          featmap_size[1] * self.mask_stride)
+
+        # keep only (point, class) pairs scoring above the threshold
+        score_mask = (cls_scores > cfg.score_thr)
+        cls_scores = cls_scores[score_mask]
+        if len(cls_scores) == 0:
+            return empty_results(results, cls_scores)
+
+        # class labels & kernels of the kept pairs
+        inds = score_mask.nonzero()
+        cls_labels = inds[:, 1]
+        kernel_preds = kernel_preds[inds[:, 0]]
+
+        # per-point stride table, filled level by level
+        lvl_interval = cls_labels.new_tensor(self.num_grids).pow(2).cumsum(0)
+        strides = kernel_preds.new_ones(lvl_interval[-1])
+
+        strides[:lvl_interval[0]] *= self.strides[0]
+        for lvl in range(1, self.num_levels):
+            strides[lvl_interval[lvl -
+                                 1]:lvl_interval[lvl]] *= self.strides[lvl]
+        strides = strides[inds[:, 0]]
+
+        # dynamic convolution: one sigmoid mask per kept kernel
+        kernel_preds = kernel_preds.view(
+            kernel_preds.size(0), -1, self.dynamic_conv_size,
+            self.dynamic_conv_size)
+        mask_preds = F.conv2d(
+            mask_feats, kernel_preds, stride=1).squeeze(0).sigmoid()
+        # binarize, then drop masks whose area does not exceed the stride
+        masks = mask_preds > cfg.mask_thr
+        sum_masks = masks.sum((1, 2)).float()
+        keep = sum_masks > strides
+        if keep.sum() == 0:
+            return empty_results(results, cls_scores)
+        masks = masks[keep]
+        mask_preds = mask_preds[keep]
+        sum_masks = sum_masks[keep]
+        cls_scores = cls_scores[keep]
+        cls_labels = cls_labels[keep]
+
+        # maskness: mean mask probability inside the binarized mask
+        mask_scores = (mask_preds * masks).sum((1, 2)) / sum_masks
+        cls_scores *= mask_scores
+
+        scores, labels, _, keep_inds = mask_matrix_nms(
+            masks,
+            cls_labels,
+            cls_scores,
+            mask_area=sum_masks,
+            nms_pre=cfg.nms_pre,
+            max_num=cfg.max_per_img,
+            kernel=cfg.kernel,
+            sigma=cfg.sigma,
+            filter_thr=cfg.filter_thr)
+        mask_preds = mask_preds[keep_inds]
+        mask_preds = F.interpolate(
+            mask_preds.unsqueeze(0),
+            size=upsampled_size,
+            mode='bilinear',
+            align_corners=False)[:, :, :h, :w]
+        mask_preds = F.interpolate(
+            mask_preds,
+            size=ori_shape[:2],
+            mode='bilinear',
+            align_corners=False).squeeze(0)
+        masks = mask_preds > cfg.mask_thr
+
+        results.masks = masks
+        results.labels = labels
+        results.scores = scores
+
+        return results
diff --git a/mmdet/models/dense_heads/ssd_head.py b/mmdet/models/dense_heads/ssd_head.py
new file mode 100755
index 0000000..e362fd8
--- /dev/null
+++ b/mmdet/models/dense_heads/ssd_head.py
@@ -0,0 +1,357 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmcv.runner import force_fp32
+
+from mmdet.core import (build_assigner, build_bbox_coder,
+                        build_prior_generator, build_sampler, multi_apply)
+from ..builder import HEADS
+from ..losses import smooth_l1_loss
+from .anchor_head import AnchorHead
+
+
+# TODO: add loss evaluator for SSD
+@HEADS.register_module()
+class SSDHead(AnchorHead):
+    """SSD head used in https://arxiv.org/abs/1512.02325.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        stacked_convs (int): Number of conv layers in cls and reg tower.
+            Default: 0.
+        feat_channels (int): Number of hidden channels when stacked_convs
+            > 0. Default: 256.
+        use_depthwise (bool): Whether to use DepthwiseSeparableConv.
+            Default: False.
+        conv_cfg (dict): Dictionary to construct and config conv layer.
+            Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: None.
+        act_cfg (dict): Dictionary to construct and config activation layer.
+            Default: None.
+        anchor_generator (dict): Config dict for anchor generator
+        bbox_coder (dict): Config of bounding box coder.
+        reg_decoded_bbox (bool): If true, the regression loss would be
+            applied directly on decoded bounding boxes, converting both
+            the predicted boxes and regression targets to absolute
+            coordinates format. Default False. It should be `True` when
+            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
+        train_cfg (dict): Training config of anchor head.
+        test_cfg (dict): Testing config of anchor head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """  # noqa: W605
+
+    def __init__(self,
+                 num_classes=80,
+                 in_channels=(512, 1024, 512, 256, 256, 256),
+                 stacked_convs=0,
+                 feat_channels=256,
+                 use_depthwise=False,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 act_cfg=None,
+                 anchor_generator=dict(
+                     type='SSDAnchorGenerator',
+                     scale_major=False,
+                     input_size=300,
+                     strides=[8, 16, 32, 64, 100, 300],
+                     ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]),
+                     basesize_ratio_range=(0.1, 0.9)),
+                 bbox_coder=dict(
+                     type='DeltaXYWHBBoxCoder',
+                     clip_border=True,
+                     target_means=[.0, .0, .0, .0],
+                     target_stds=[1.0, 1.0, 1.0, 1.0],
+                 ),
+                 reg_decoded_bbox=False,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=dict(
+                     type='Xavier',
+                     layer='Conv2d',
+                     distribution='uniform',
+                     bias=0)):
+        super(AnchorHead, self).__init__(init_cfg)  # skips AnchorHead.__init__
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.stacked_convs = stacked_convs
+        self.feat_channels = feat_channels
+        self.use_depthwise = use_depthwise
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+
+        self.cls_out_channels = num_classes + 1  # add background class
+        self.prior_generator = build_prior_generator(anchor_generator)
+
+        # Usually the numbers of anchors for each level are the same
+        # except SSD detectors. So it is an int in the most dense
+        # heads but a list of int in SSDHead
+        self.num_base_priors = self.prior_generator.num_base_priors
+
+        self._init_layers()
+
+        self.bbox_coder = build_bbox_coder(bbox_coder)
+        self.reg_decoded_bbox = reg_decoded_bbox
+        self.use_sigmoid_cls = False
+        self.cls_focal_loss = False
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        # set sampling=False for anchor_target
+        self.sampling = False
+        if self.train_cfg:
+            self.assigner = build_assigner(self.train_cfg.assigner)
+            # SSD sampling=False so use PseudoSampler
+            sampler_cfg = dict(type='PseudoSampler')
+            self.sampler = build_sampler(sampler_cfg, context=self)
+        self.fp16_enabled = False
+
+    @property
+    def num_anchors(self):
+        """Deprecated alias of ``num_base_priors``; warns on every access.
+
+        Returns list[int]: base anchors on each point of each level.
+        """
+        warnings.warn('DeprecationWarning: `num_anchors` is deprecated, '
+                      'please use "num_base_priors" instead')
+        return self.num_base_priors
+
+    def _init_layers(self):
+        """Build one cls and one reg conv branch per input feature level."""
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        # TODO: Use registry to choose ConvModule type
+        conv = DepthwiseSeparableConvModule \
+            if self.use_depthwise else ConvModule
+
+        for channel, num_base_priors in zip(self.in_channels,
+                                            self.num_base_priors):
+            cls_layers = []
+            reg_layers = []
+            in_channel = channel
+            # build stacked conv tower, not used in default ssd
+            for i in range(self.stacked_convs):
+                cls_layers.append(
+                    conv(
+                        in_channel,
+                        self.feat_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+                reg_layers.append(
+                    conv(
+                        in_channel,
+                        self.feat_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+                in_channel = self.feat_channels
+            # SSD-Lite head: depthwise 3x3 conv before the 1x1 predictor
+            if self.use_depthwise:
+                cls_layers.append(
+                    ConvModule(
+                        in_channel,
+                        in_channel,
+                        3,
+                        padding=1,
+                        groups=in_channel,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+                reg_layers.append(
+                    ConvModule(
+                        in_channel,
+                        in_channel,
+                        3,
+                        padding=1,
+                        groups=in_channel,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+            cls_layers.append(
+                nn.Conv2d(
+                    in_channel,
+                    num_base_priors * self.cls_out_channels,
+                    kernel_size=1 if self.use_depthwise else 3,
+                    padding=0 if self.use_depthwise else 1))
+            reg_layers.append(
+                nn.Conv2d(
+                    in_channel,
+                    num_base_priors * 4,
+                    kernel_size=1 if self.use_depthwise else 3,
+                    padding=0 if self.use_depthwise else 1))
+            self.cls_convs.append(nn.Sequential(*cls_layers))
+            self.reg_convs.append(nn.Sequential(*reg_layers))
+
+    def forward(self, feats):
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple:
+                cls_scores (list[Tensor]): Classification scores for all scale
+                    levels, each is a 4D-tensor, the channels number is
+                    num_base_priors * cls_out_channels.
+                bbox_preds (list[Tensor]): Box energies / deltas for all scale
+                    levels, each is a 4D-tensor, the channels number is
+                    num_base_priors * 4.
+        """
+        cls_scores = []
+        bbox_preds = []
+        for feat, reg_conv, cls_conv in zip(feats, self.reg_convs,
+                                            self.cls_convs):
+            cls_scores.append(cls_conv(feat))
+            bbox_preds.append(reg_conv(feat))
+        return cls_scores, bbox_preds
+
+    def loss_single(self, cls_score, bbox_pred, anchor, labels, label_weights,
+                    bbox_targets, bbox_weights, num_total_samples):
+        """Compute loss of a single image.
+
+        Args:
+            cls_score (Tensor): Box scores for each image.
+                Has shape (num_total_anchors, cls_out_channels).
+            bbox_pred (Tensor): Box energies / deltas of the image, with
+                shape (num_total_anchors, 4).
+            anchor (Tensor): Box reference for each scale level with shape
+                (num_total_anchors, 4).
+            labels (Tensor): Labels of each anchor with shape
+                (num_total_anchors,).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (num_total_anchors,)
+            bbox_targets (Tensor): BBox regression targets of each anchor
+                with shape (num_total_anchors, 4).
+            bbox_weights (Tensor): BBox regression loss weights of each anchor
+                with shape (num_total_anchors, 4).
+            num_total_samples (int): If sampling, num total samples equal to
+                the number of total anchors; Otherwise, it is the number of
+                positive anchors.
+
+        Returns:
+            tuple[Tensor, Tensor]: `loss_cls[None]` and `loss_bbox`.
+        """
+
+        loss_cls_all = F.cross_entropy(
+            cls_score, labels, reduction='none') * label_weights
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        pos_inds = ((labels >= 0) & (labels < self.num_classes)).nonzero(
+            as_tuple=False).reshape(-1)
+        neg_inds = (labels == self.num_classes).nonzero(
+            as_tuple=False).view(-1)
+
+        num_pos_samples = pos_inds.size(0)
+        num_neg_samples = self.train_cfg.neg_pos_ratio * num_pos_samples
+        if num_neg_samples > neg_inds.size(0):
+            num_neg_samples = neg_inds.size(0)
+        topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples)
+        loss_cls_pos = loss_cls_all[pos_inds].sum()
+        loss_cls_neg = topk_loss_cls_neg.sum()
+        loss_cls = (loss_cls_pos + loss_cls_neg) / num_total_samples
+
+        if self.reg_decoded_bbox:
+            # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
+            # is applied directly on the decoded bounding boxes, it
+            # decodes the already encoded coordinates to absolute format.
+            bbox_pred = self.bbox_coder.decode(anchor, bbox_pred)
+
+        loss_bbox = smooth_l1_loss(
+            bbox_pred,
+            bbox_targets,
+            bbox_weights,
+            beta=self.train_cfg.smoothl1_beta,
+            avg_factor=num_total_samples)
+        return loss_cls[None], loss_bbox
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            gt_bboxes (list[Tensor]): each item is the ground-truth boxes of
+                one image in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss.
+
+        Returns:
+            dict[str, Tensor] | None: Loss components, or None if no targets.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=1,
+            unmap_outputs=True)
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+
+        num_images = len(img_metas)
+        all_cls_scores = torch.cat([
+            s.permute(0, 2, 3, 1).reshape(
+                num_images, -1, self.cls_out_channels) for s in cls_scores
+        ], 1)
+        all_labels = torch.cat(labels_list, -1).view(num_images, -1)
+        all_label_weights = torch.cat(label_weights_list,
+                                      -1).view(num_images, -1)
+        all_bbox_preds = torch.cat([
+            b.permute(0, 2, 3, 1).reshape(num_images, -1, 4)
+            for b in bbox_preds
+        ], -2)
+        all_bbox_targets = torch.cat(bbox_targets_list,
+                                     -2).view(num_images, -1, 4)
+        all_bbox_weights = torch.cat(bbox_weights_list,
+                                     -2).view(num_images, -1, 4)
+
+        # concat the anchors of all levels into one tensor per image
+        all_anchors = []
+        for i in range(num_images):
+            all_anchors.append(torch.cat(anchor_list[i]))
+
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_single,
+            all_cls_scores,
+            all_bbox_preds,
+            all_anchors,
+            all_labels,
+            all_label_weights,
+            all_bbox_targets,
+            all_bbox_weights,
+            num_total_samples=num_total_pos)
+        return dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
diff --git a/mmdet/models/dense_heads/tood_head.py b/mmdet/models/dense_heads/tood_head.py
new file mode 100755
index 0000000..c64ebf7
--- /dev/null
+++ b/mmdet/models/dense_heads/tood_head.py
@@ -0,0 +1,778 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, Scale, bias_init_with_prob, normal_init
+from mmcv.ops import deform_conv2d
+from mmcv.runner import force_fp32
+
+from mmdet.core import (anchor_inside_flags, build_assigner, distance2bbox,
+                        images_to_levels, multi_apply, reduce_mean, unmap)
+from mmdet.core.utils import filter_scores_and_topk
+from mmdet.models.utils import sigmoid_geometric_mean
+from ..builder import HEADS, build_loss
+from .atss_head import ATSSHead
+
+
+class TaskDecomposition(nn.Module):
+    """Layer-attention task decomposition used by the TOOD predictor.
+
+    Args:
+        feat_channels (int): Number of feature channels in TOOD head.
+        stacked_convs (int): Number of conv layers in TOOD head.
+        la_down_rate (int): Downsample rate of layer attention.
+        conv_cfg (dict): Config dict for convolution layer.
+        norm_cfg (dict): Config dict for normalization layer.
+    """
+
+    def __init__(self,
+                 feat_channels,
+                 stacked_convs,
+                 la_down_rate=8,
+                 conv_cfg=None,
+                 norm_cfg=None):
+        super().__init__()
+        self.feat_channels = feat_channels
+        self.stacked_convs = stacked_convs
+        self.in_channels = feat_channels * stacked_convs
+        self.norm_cfg = norm_cfg
+
+        # squeeze -> ReLU -> one sigmoid weight per stacked conv layer
+        squeezed = self.in_channels // la_down_rate
+        self.layer_attention = nn.Sequential(
+            nn.Conv2d(self.in_channels, squeezed, 1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(squeezed, self.stacked_convs, 1, padding=0),
+            nn.Sigmoid())
+
+        # 1x1 conv reducing the stacked channels back to feat_channels
+        self.reduction_conv = ConvModule(
+            self.in_channels, self.feat_channels, 1,
+            stride=1, padding=0,
+            conv_cfg=conv_cfg, norm_cfg=norm_cfg,
+            bias=norm_cfg is None)
+
+    def init_weights(self):
+        """Normal-init the attention convs and the reduction conv."""
+        attention_convs = [
+            m for m in self.layer_attention.modules()
+            if isinstance(m, nn.Conv2d)
+        ]
+        for conv in attention_convs:
+            normal_init(conv, std=0.001)
+        normal_init(self.reduction_conv.conv, std=0.01)
+
+    def forward(self, feat, avg_feat=None):
+        """Re-weight the stacked features per layer and reduce channels."""
+        batch, _, height, width = feat.shape
+        if avg_feat is None:
+            avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
+        layer_weight = self.layer_attention(avg_feat)
+
+        # Fold the layer attention weight into the conv weight first and only
+        # then apply the resulting weight to the feature map, in order to
+        # save memory and FLOPs.
+        kernel = self.reduction_conv.conv.weight.reshape(
+            1, self.feat_channels, self.stacked_convs, self.feat_channels)
+        attention = layer_weight.reshape(batch, 1, self.stacked_convs, 1)
+        conv_weight = (attention * kernel).reshape(
+            batch, self.feat_channels, self.in_channels)
+        flat_feat = feat.reshape(batch, self.in_channels, height * width)
+        out = torch.bmm(conv_weight, flat_feat)
+        out = out.reshape(batch, self.feat_channels, height, width)
+
+        if self.norm_cfg is not None:
+            out = self.reduction_conv.norm(out)
+        return self.reduction_conv.activate(out)
+
+
+@HEADS.register_module()
+class TOODHead(ATSSHead):
+    """TOODHead used in `TOOD: Task-aligned One-stage Object Detection.
+
+    <https://arxiv.org/abs/2108.07755>`_.
+
+    TOOD uses Task-aligned head (T-head) and is optimized by Task Alignment
+    Learning (TAL).
+
+    Args:
+        num_dcn (int): Number of deformable convolution in the head.
+            Default: 0.
+        anchor_type (str): If set to `anchor_free`, the head will use centers
+            to regress bboxes. If set to `anchor_based`, the head will
+            regress bboxes based on anchors. Default: `anchor_free`.
+        initial_loss_cls (dict): Config of initial loss.
+
+    Example:
+        >>> self = TOODHead(11, 7)
+        >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]]
+        >>> cls_score, bbox_pred = self.forward(feats)
+        >>> assert len(cls_score) == len(self.scales)
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 num_dcn=0,
+                 anchor_type='anchor_free',
+                 initial_loss_cls=dict(
+                     type='FocalLoss',
+                     use_sigmoid=True,
+                     activated=True,
+                     gamma=2.0,
+                     alpha=0.25,
+                     loss_weight=1.0),
+                 **kwargs):
+        assert anchor_type in ['anchor_free', 'anchor_based']
+        self.epoch = 0  # current epoch; kept up to date by SetEpochInfoHook
+        self.anchor_type = anchor_type
+        self.num_dcn = num_dcn
+        super().__init__(num_classes, in_channels, **kwargs)
+        if not self.train_cfg:
+            return
+        train_cfg = self.train_cfg
+        self.initial_epoch = train_cfg.initial_epoch
+        self.initial_assigner = build_assigner(train_cfg.initial_assigner)
+        self.initial_loss_cls = build_loss(initial_loss_cls)
+        self.assigner = self.initial_assigner
+        self.alignment_assigner = build_assigner(train_cfg.assigner)
+        self.alpha = train_cfg.alpha
+        self.beta = train_cfg.beta
+
+    def _init_layers(self):
+        """Initialize layers of the head."""
+        self.relu = nn.ReLU(inplace=True)
+
+        # shared tower; its first `num_dcn` layers use DCNv2
+        self.inter_convs = nn.ModuleList()
+        for layer_idx in range(self.stacked_convs):
+            conv_cfg = self.conv_cfg
+            if layer_idx < self.num_dcn:
+                conv_cfg = dict(type='DCNv2', deform_groups=4)
+            self.inter_convs.append(
+                ConvModule(
+                    self.feat_channels if layer_idx else self.in_channels,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=self.norm_cfg))
+
+        # the two task branches are built with identical arguments
+        decomp_args = (self.feat_channels, self.stacked_convs,
+                       self.stacked_convs * 8, self.conv_cfg, self.norm_cfg)
+        self.cls_decomp = TaskDecomposition(*decomp_args)
+        self.reg_decomp = TaskDecomposition(*decomp_args)
+
+        self.tood_cls = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * self.cls_out_channels,
+            3,
+            padding=1)
+        self.tood_reg = nn.Conv2d(
+            self.feat_channels, self.num_base_priors * 4, 3, padding=1)
+
+        # alignment branches: input channels cover all stacked tower outputs
+        stacked_channels = self.feat_channels * self.stacked_convs
+        hidden_channels = self.feat_channels // 4
+        self.cls_prob_module = nn.Sequential(
+            nn.Conv2d(stacked_channels, hidden_channels, 1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(hidden_channels, 1, 3, padding=1))
+        self.reg_offset_module = nn.Sequential(
+            nn.Conv2d(stacked_channels, hidden_channels, 1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(hidden_channels, 4 * 2, 3, padding=1))
+
+        self.scales = nn.ModuleList(
+            [Scale(1.0) for _ in self.prior_generator.strides])
+
+    def init_weights(self):
+        """Initialize weights of the head."""
+        cls_bias = bias_init_with_prob(0.01)
+        for tower_conv in self.inter_convs:
+            normal_init(tower_conv.conv, std=0.01)
+        # (branch, std) pairs; only the Conv2d children are re-initialised
+        for branch, std in ((self.cls_prob_module, 0.01),
+                            (self.reg_offset_module, 0.001)):
+            for layer in branch:
+                if isinstance(layer, nn.Conv2d):
+                    normal_init(layer, std=std)
+        # the last cls-prob conv is initialised again, now with `cls_bias`
+        normal_init(self.cls_prob_module[-1], std=0.01, bias=cls_bias)
+        self.cls_decomp.init_weights()
+        self.reg_decomp.init_weights()
+
+        normal_init(self.tood_cls, std=0.01, bias=cls_bias)
+        normal_init(self.tood_reg, std=0.01)
+
+    def forward(self, feats):
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: ``(cls_scores, bbox_preds)``, one entry per scale level.
+                cls_scores (tuple[Tensor]): Classification scores, each is
+                    a 4D-tensor, the channels number is
+                    num_anchors * num_classes.
+                bbox_preds (tuple[Tensor]): Decoded boxes divided by the
+                    level stride, each is a 4D-tensor with
+                    num_anchors * 4 channels in [tl_x, tl_y, br_x, br_y].
+        """
+        per_level = zip(feats, self.scales, self.prior_generator.strides)
+        cls_scores, bbox_preds = [], []
+        for level, (x, scale, stride) in enumerate(per_level):
+            batch, _, height, width = x.shape
+            priors = self.prior_generator.single_level_grid_priors(
+                (height, width), level, device=x.device)
+            anchor = torch.cat([priors] * batch)
+            # shared tower: keep the output of every conv layer
+            inter_feats = []
+            for inter_conv in self.inter_convs:
+                x = inter_conv(x)
+                inter_feats.append(x)
+            feat = torch.cat(inter_feats, 1)
+
+            # task decomposition, sharing one globally pooled descriptor
+            avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
+            cls_feat = self.cls_decomp(feat, avg_feat)
+            reg_feat = self.reg_decomp(feat, avg_feat)
+
+            # cls prediction and alignment
+            cls_score = sigmoid_geometric_mean(
+                self.tood_cls(cls_feat), self.cls_prob_module(feat))
+
+            # reg prediction: coarse boxes in units of the level stride
+            if self.anchor_type == 'anchor_free':
+                reg_dist = scale(self.tood_reg(reg_feat).exp()).float()
+                reg_dist = reg_dist.permute(0, 2, 3, 1).reshape(-1, 4)
+                centers = self.anchor_center(anchor) / stride[0]
+                reg_bbox = distance2bbox(centers, reg_dist)
+                reg_bbox = reg_bbox.reshape(batch, height, width, 4)
+                reg_bbox = reg_bbox.permute(0, 3, 1, 2)  # (b, c, h, w)
+            elif self.anchor_type == 'anchor_based':
+                reg_dist = scale(self.tood_reg(reg_feat)).float()
+                reg_dist = reg_dist.permute(0, 2, 3, 1).reshape(-1, 4)
+                reg_bbox = self.bbox_coder.decode(anchor, reg_dist)
+                reg_bbox = reg_bbox.reshape(batch, height, width, 4)
+                reg_bbox = reg_bbox.permute(0, 3, 1, 2) / stride[0]
+            else:
+                raise NotImplementedError(
+                    f'Unknown anchor type: {self.anchor_type}.'
+                    f'Please use `anchor_free` or `anchor_based`.')
+            # reg alignment: resample the coarse box map at the offsets
+            reg_offset = self.reg_offset_module(feat)
+            bbox_pred = self.deform_sampling(reg_bbox.contiguous(),
+                                             reg_offset.contiguous())
+
+            # Sampling can produce invalid boxes (left-top to the right of or
+            # below right-bottom), which would make GIoULoss negative; keep
+            # the coarse box at those positions instead.
+            invalid = (bbox_pred[:, [0]] > bbox_pred[:, [2]]) | \
+                      (bbox_pred[:, [1]] > bbox_pred[:, [3]])
+            invalid = invalid.expand_as(bbox_pred)
+            bbox_pred = torch.where(invalid, reg_bbox, bbox_pred)
+
+            cls_scores.append(cls_score)
+            bbox_preds.append(bbox_pred)
+        return tuple(cls_scores), tuple(bbox_preds)
+
+    def deform_sampling(self, feat, offset):
+        """Sample ``feat`` at the locations given by ``offset``.
+
+        Args:
+            feat (Tensor): Feature map, unpacked as (b, c, h, w).
+            offset (Tensor): Spatial offset for feature sampling
+        """
+        _, num_ch, _, _ = feat.shape
+        # deformable conv with an all-ones (c, 1, 1, 1) kernel: it is an
+        # equivalent implementation of bilinear interpolation
+        ones = feat.new_ones(num_ch, 1, 1, 1)
+        return deform_conv2d(feat, offset, ones, 1, 0, 1, num_ch, num_ch)
+
+    def anchor_center(self, anchors):
+        """Return the centre point of each anchor box.
+
+        Args:
+            anchors (Tensor): Anchor list with shape (N, 4), "xyxy" format.
+
+        Returns:
+            Tensor: Anchor centers with shape (N, 2), "xy" format.
+        """
+        centers = [(anchors[:, hi] + anchors[:, lo]) / 2
+                   for lo, hi in ((0, 2), (1, 3))]
+        return torch.stack(centers, dim=-1)
+
+    def loss_single(self, anchors, cls_score, bbox_pred, labels, label_weights,
+                    bbox_targets, alignment_metrics, stride):
+        """Compute loss of a single scale level.
+
+        Args:
+            anchors (Tensor): Box reference for each scale level with shape
+                (N, num_total_anchors, 4).
+            cls_score (Tensor): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W).
+            bbox_pred (Tensor): Decoded bboxes for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            labels (Tensor): Labels of each anchors with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors).
+            bbox_targets (Tensor): BBox regression targets of each anchor with
+                shape (N, num_total_anchors, 4).
+            alignment_metrics (Tensor): Alignment metrics with shape
+                (N, num_total_anchors).
+            stride (tuple[int]): Downsample stride of the feature map.
+
+        Returns:
+            tuple[Tensor]: loss_cls, loss_bbox, the summed alignment metrics
+                and the summed positive bbox weights.
+        """
+        assert stride[0] == stride[1], 'h stride is not equal to w stride!'
+        initial_stage = self.epoch < self.initial_epoch
+
+        # flatten every input to one row per prior
+        anchors = anchors.reshape(-1, 4)
+        cls_score = cls_score.permute(0, 2, 3, 1)
+        cls_score = cls_score.reshape(-1, self.cls_out_channels).contiguous()
+        bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
+        bbox_targets = bbox_targets.reshape(-1, 4)
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+        alignment_metrics = alignment_metrics.reshape(-1)
+
+        # classification: plain labels in the initial stage, afterwards the
+        # labels paired with their alignment metrics
+        if initial_stage:
+            cls_loss_func, targets = self.initial_loss_cls, labels
+        else:
+            cls_loss_func = self.loss_cls
+            targets = (labels, alignment_metrics)
+        loss_cls = cls_loss_func(
+            cls_score, targets, label_weights, avg_factor=1.0)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        is_fg = (labels >= 0) & (labels < self.num_classes)
+        pos_inds = is_fg.nonzero().squeeze(1)
+        if len(pos_inds) == 0:
+            # no positives: a zero that is still connected to bbox_pred
+            loss_bbox = bbox_pred.sum() * 0
+            pos_bbox_weight = bbox_targets.new_tensor(0.)
+        else:
+            pos_bbox_targets = bbox_targets[pos_inds]
+            # regression weight: centerness first, alignment metric later
+            if initial_stage:
+                pos_bbox_weight = self.centerness_target(
+                    anchors[pos_inds], pos_bbox_targets)
+            else:
+                pos_bbox_weight = alignment_metrics[pos_inds]
+            # targets are divided by the stride; predictions are used as is
+            loss_bbox = self.loss_bbox(
+                bbox_pred[pos_inds],
+                pos_bbox_targets / stride[0],
+                weight=pos_bbox_weight,
+                avg_factor=1.0)
+        metric_sum = alignment_metrics.sum()
+        return loss_cls, loss_bbox, metric_sum, pos_bbox_weight.sum()
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Decoded box for each scale
+                level with shape (N, num_anchors * 4, H, W) in
+                [tl_x, tl_y, br_x, br_y] format.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (list[Tensor] | None): specify which bounding
+                boxes can be ignored when computing the loss.
+
+        Returns:
+            dict[str, list[Tensor]] | None: Per-level ``loss_cls`` and
+                ``loss_bbox``, or None when ``get_targets`` reports that
+                some image has no valid anchors.
+        """
+        num_imgs = len(img_metas)
+        strides = self.prior_generator.strides
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+
+        # bbox_preds are multiplied by their level stride before assignment
+        flatten_cls_scores = torch.cat([
+            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
+                                                  self.cls_out_channels)
+            for cls_score in cls_scores
+        ], 1)
+        flatten_bbox_preds = torch.cat([
+            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) * stride[0]
+            for bbox_pred, stride in zip(bbox_preds, strides)
+        ], 1)
+
+        cls_reg_targets = self.get_targets(
+            flatten_cls_scores,
+            flatten_bbox_preds,
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels)
+        # `get_targets` returns None when an image has no valid anchors;
+        # return None as well instead of failing on the unpacking below.
+        if cls_reg_targets is None:
+            return None
+        (anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         alignment_metrics_list) = cls_reg_targets
+
+        losses_cls, losses_bbox, cls_avg_factors, bbox_avg_factors = \
+            multi_apply(self.loss_single, anchor_list, cls_scores, bbox_preds,
+                        labels_list, label_weights_list, bbox_targets_list,
+                        alignment_metrics_list, strides)
+
+        # normalise by the reduced factor sums, clamped to at least 1
+        cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item()
+        losses_cls = [loss / cls_avg_factor for loss in losses_cls]
+        bbox_avg_factor = reduce_mean(
+            sum(bbox_avg_factors)).clamp_(min=1).item()
+        losses_bbox = [loss / bbox_avg_factor for loss in losses_bbox]
+        return dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
+
+    def _get_bboxes_single(self,
+                           cls_score_list,
+                           bbox_pred_list,
+                           score_factor_list,
+                           mlvl_priors,
+                           img_meta,
+                           cfg,
+                           rescale=False,
+                           with_nms=True,
+                           **kwargs):
+        """Transform outputs of a single image into bbox predictions.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box predictions from all scale
+                levels of a single image, each item has shape
+                (num_priors * 4, H, W). They are multiplied by the level
+                stride here and then used directly as boxes.
+            score_factor_list (list[Tensor]): Score factor from all scale
+                levels of a single image, each item has shape
+                (num_priors * 1, H, W). Not read by this method.
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid. In all
+                anchor-based methods, it has shape (num_priors, 4). In
+                all anchor-free methods, it has shape (num_priors, 2)
+                when `with_stride=True`, otherwise it still has shape
+                (num_priors, 4).
+            img_meta (dict): Image meta info.
+            cfg (mmcv.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+
+        Returns:
+            tuple[Tensor]: Results of detected bboxes and labels. If with_nms
+                is False and mlvl_score_factor is None, return mlvl_bboxes and
+                mlvl_scores, else return mlvl_bboxes, mlvl_scores and
+                mlvl_score_factor. Usually with_nms is False is used for aug
+                test. If with_nms is True, then return the following format
+
+                - det_bboxes (Tensor): Predicted bboxes with shape \
+                    [num_bboxes, 5], where the first 4 columns are bounding \
+                    box positions (tl_x, tl_y, br_x, br_y) and the 5-th \
+                    column are scores between 0 and 1.
+                - det_labels (Tensor): Predicted labels of the corresponding \
+                    box with shape [num_bboxes].
+        """
+
+        if cfg is None:
+            cfg = self.test_cfg
+        nms_pre = cfg.get('nms_pre', -1)
+        level_strides = self.prior_generator.strides
+
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_labels = []
+        for cls_score, bbox_pred, priors, stride in zip(
+                cls_score_list, bbox_pred_list, mlvl_priors, level_strides):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+
+            # channel-first maps -> one row per prior
+            scores = cls_score.permute(1, 2, 0)
+            scores = scores.reshape(-1, self.cls_out_channels)
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) * stride[0]
+
+            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
+            # this operation keeps fewer bboxes under the same `nms_pre`.
+            # There is no difference in performance for most models. If you
+            # find a slight drop in performance, you can set a larger
+            # `nms_pre` than before.
+            candidates = dict(bbox_pred=bbox_pred, priors=priors)
+            scores, labels, _, kept = filter_scores_and_topk(
+                scores, cfg.score_thr, nms_pre, candidates)
+
+            mlvl_bboxes.append(kept['bbox_pred'])
+            mlvl_scores.append(scores)
+            mlvl_labels.append(labels)
+
+        return self._bbox_post_process(
+            mlvl_scores, mlvl_labels, mlvl_bboxes,
+            img_meta['scale_factor'], cfg, rescale, with_nms, None,
+            **kwargs)
+
+    def get_targets(self,
+                    cls_scores,
+                    bbox_preds,
+                    anchor_list,
+                    valid_flag_list,
+                    gt_bboxes_list,
+                    img_metas,
+                    gt_bboxes_ignore_list=None,
+                    gt_labels_list=None,
+                    label_channels=1,
+                    unmap_outputs=True):
+        """Compute regression and classification targets for anchors in
+        multiple images.
+
+        Args:
+            cls_scores (Tensor): Classification predictions of images,
+                a 3D-Tensor with shape [num_imgs, num_priors, num_classes].
+            bbox_preds (Tensor): Decoded bboxes predictions of images,
+                a 3D-Tensor with shape [num_imgs, num_priors, 4] in [tl_x,
+                tl_y, br_x, br_y] format.
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, 4).
+            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
+                each image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, )
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+            img_metas (list[dict]): Meta info of each image.
+            gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be
+                ignored.
+            gt_labels_list (list[Tensor]): Ground truth labels of each box.
+            label_channels (int): Channel of label.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple | None: a tuple containing learning targets, or None when
+                some image has no valid anchors.
+
+                - anchors_list (list[list[Tensor]]): Anchors of each level.
+                - labels_list (list[Tensor]): Labels of each level.
+                - label_weights_list (list[Tensor]): Label weights of each
+                  level.
+                - bbox_targets_list (list[Tensor]): BBox targets of each level.
+                - norm_alignment_metrics_list (list[Tensor]): Normalized
+                  alignment metrics of each level.
+        """
+        num_imgs = len(img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        num_level_anchors_list = [num_level_anchors] * num_imgs
+
+        # concat all level anchors and flags to a single tensor (in place)
+        for i in range(num_imgs):
+            assert len(anchor_list[i]) == len(valid_flag_list[i])
+            anchor_list[i] = torch.cat(anchor_list[i])
+            valid_flag_list[i] = torch.cat(valid_flag_list[i])
+
+        # compute targets for each image
+        if gt_bboxes_ignore_list is None:
+            gt_bboxes_ignore_list = [None for _ in range(num_imgs)]
+        if gt_labels_list is None:
+            gt_labels_list = [None for _ in range(num_imgs)]
+
+        if self.epoch < self.initial_epoch:
+            (all_anchors, all_labels, all_label_weights, all_bbox_targets,
+             all_bbox_weights, pos_inds_list, neg_inds_list) = multi_apply(
+                 super()._get_target_single,
+                 anchor_list,
+                 valid_flag_list,
+                 num_level_anchors_list,
+                 gt_bboxes_list,
+                 gt_bboxes_ignore_list,
+                 gt_labels_list,
+                 img_metas,
+                 label_channels=label_channels,
+                 unmap_outputs=unmap_outputs)
+            all_assign_metrics = None  # filled in after the None check
+        else:
+            (all_anchors, all_labels, all_label_weights, all_bbox_targets,
+             all_assign_metrics) = multi_apply(
+                 self._get_target_single,
+                 cls_scores,
+                 bbox_preds,
+                 anchor_list,
+                 valid_flag_list,
+                 gt_bboxes_list,
+                 gt_bboxes_ignore_list,
+                 gt_labels_list,
+                 img_metas,
+                 label_channels=label_channels,
+                 unmap_outputs=unmap_outputs)
+        # no valid anchors
+        if any(labels is None for labels in all_labels):
+            return None
+        if all_assign_metrics is None:
+            # Initial stage: the first bbox-weight channel is the metric.
+            # Done after the check above, as a None weight is not indexable.
+            all_assign_metrics = [w[..., 0] for w in all_bbox_weights]
+
+        # split targets to a list w.r.t. multiple levels
+        anchors_list = images_to_levels(all_anchors, num_level_anchors)
+        labels_list = images_to_levels(all_labels, num_level_anchors)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_anchors)
+        bbox_targets_list = images_to_levels(all_bbox_targets,
+                                             num_level_anchors)
+        norm_alignment_metrics_list = images_to_levels(all_assign_metrics,
+                                                       num_level_anchors)
+        return (anchors_list, labels_list, label_weights_list,
+                bbox_targets_list, norm_alignment_metrics_list)
+
+    def _get_target_single(self,
+                           cls_scores,
+                           bbox_preds,
+                           flat_anchors,
+                           valid_flags,
+                           gt_bboxes,
+                           gt_bboxes_ignore,
+                           gt_labels,
+                           img_meta,
+                           label_channels=1,
+                           unmap_outputs=True):
+        """Compute regression, classification targets for anchors in a single
+        image.
+
+        Args:
+            cls_scores (Tensor): Box scores of the image, one row per anchor.
+            bbox_preds (Tensor): Box predictions of the image, one per anchor.
+            flat_anchors (Tensor): Multi-level anchors of the image, which are
+                concatenated into a single tensor of shape (num_anchors ,4)
+            valid_flags (Tensor): Multi level valid flags of the image,
+                which are concatenated into a single tensor of
+                    shape (num_anchors,).
+            gt_bboxes (Tensor): Ground truth bboxes of the image,
+                shape (num_gts, 4).
+            gt_bboxes_ignore (Tensor): Ground truth bboxes to be
+                ignored, shape (num_ignored_gts, 4).
+            gt_labels (Tensor): Ground truth labels of each box,
+                shape (num_gts,).
+            img_meta (dict): Meta info of the image.
+            label_channels (int): Channel of label.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple: Five entries, all None when no anchor has its inside flag.
+                anchors (Tensor): All N anchors of the image, shape (N, 4).
+                labels (Tensor): Labels of all anchors in the image with shape
+                    (N,).
+                label_weights (Tensor): Label weights of all anchor in the
+                    image with shape (N,).
+                bbox_targets (Tensor): BBox targets of all anchors in the
+                    image with shape (N, 4).
+                norm_alignment_metrics (Tensor): Normalized alignment metrics
+                    of all priors in the image with shape (N,).
+        """
+        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                           img_meta['img_shape'][:2],
+                                           self.train_cfg.allowed_border)
+        if not inside_flags.any():
+            return (None, ) * 5  # one None per returned field
+        # assign gt and sample anchors
+        anchors = flat_anchors[inside_flags, :]
+        assign_result = self.alignment_assigner.assign(
+            cls_scores[inside_flags, :], bbox_preds[inside_flags, :], anchors,
+            gt_bboxes, gt_bboxes_ignore, gt_labels, self.alpha, self.beta)
+        assign_ious = assign_result.max_overlaps
+        assign_metrics = assign_result.assign_metrics
+
+        sampling_result = self.sampler.sample(assign_result, anchors,
+                                              gt_bboxes)
+
+        num_valid_anchors = anchors.shape[0]
+        bbox_targets = torch.zeros_like(anchors)
+        labels = anchors.new_full((num_valid_anchors, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
+        norm_alignment_metrics = anchors.new_zeros(
+            num_valid_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            # point-based: the target is the sampled `pos_gt_bboxes` as is
+            pos_bbox_targets = sampling_result.pos_gt_bboxes
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+
+            if gt_labels is None:
+                # Only rpn gives gt_labels as None
+                # Foreground is the first class since v2.5.0
+                labels[pos_inds] = 0
+            else:
+                labels[pos_inds] = gt_labels[
+                    sampling_result.pos_assigned_gt_inds]
+            if self.train_cfg.pos_weight <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg.pos_weight
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+        # per GT: scale the metrics so their max becomes that GT's max IoU
+        class_assigned_gt_inds = torch.unique(
+            sampling_result.pos_assigned_gt_inds)
+        for gt_inds in class_assigned_gt_inds:
+            gt_class_inds = pos_inds[sampling_result.pos_assigned_gt_inds ==
+                                     gt_inds]
+            pos_alignment_metrics = assign_metrics[gt_class_inds]
+            pos_ious = assign_ious[gt_class_inds]
+            pos_norm_alignment_metrics = pos_alignment_metrics / (
+                pos_alignment_metrics.max() + 10e-8) * pos_ious.max()
+            norm_alignment_metrics[gt_class_inds] = pos_norm_alignment_metrics
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_anchors.size(0)
+            anchors = unmap(anchors, num_total_anchors, inside_flags)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags, fill=self.num_classes)
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+            bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
+            norm_alignment_metrics = unmap(norm_alignment_metrics,
+                                           num_total_anchors, inside_flags)
+        return (anchors, labels, label_weights, bbox_targets,
+                norm_alignment_metrics)
diff --git a/mmdet/models/dense_heads/vfnet_head.py b/mmdet/models/dense_heads/vfnet_head.py
new file mode 100755
index 0000000..ba285e2
--- /dev/null
+++ b/mmdet/models/dense_heads/vfnet_head.py
@@ -0,0 +1,740 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, Scale
+from mmcv.ops import DeformConv2d
+from mmcv.runner import force_fp32
+
+from mmdet.core import (MlvlPointGenerator, bbox_overlaps, build_assigner,
+                        build_prior_generator, build_sampler, multi_apply,
+                        reduce_mean)
+from ..builder import HEADS, build_loss
+from .atss_head import ATSSHead
+from .fcos_head import FCOSHead
+
+INF = 1e8
+
+
+@HEADS.register_module()
+class VFNetHead(ATSSHead, FCOSHead):
+    """Head of `VarifocalNet (VFNet): An IoU-aware Dense Object
+    Detector <https://arxiv.org/abs/2008.13367>`_.
+
+    The VFNet predicts IoU-aware classification scores which mix the
+    object presence confidence and object localization accuracy as the
+    detection score. It is built on the FCOS architecture and uses ATSS
+    for defining positive/negative training examples. The VFNet is trained
+    with Varifocal Loss and employs star-shaped deformable convolution to
+    extract features for a bbox.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        regress_ranges (tuple[tuple[int, int]]): Regress range of multiple
+            level points.
+        center_sampling (bool): If true, use center sampling. Default: False.
+        center_sample_radius (float): Radius of center sampling. Default: 1.5.
+        sync_num_pos (bool): If true, synchronize the number of positive
+            examples across GPUs. Default: True
+        gradient_mul (float): The multiplier to gradients from bbox refinement
+            and recognition. Default: 0.1.
+        bbox_norm_type (str): The bbox normalization type, 'reg_denom' or
+            'stride'. Default: reg_denom
+        loss_cls_fl (dict): Config of focal loss.
+        use_vfl (bool): If true, use varifocal loss for training.
+            Default: True.
+        loss_cls (dict): Config of varifocal loss.
+        loss_bbox (dict): Config of localization loss, GIoU Loss.
+        loss_bbox_refine (dict): Config of bbox refinement loss, GIoU Loss.
+        norm_cfg (dict): dictionary to construct and config norm layer.
+            Default: norm_cfg=dict(type='GN', num_groups=32,
+            requires_grad=True).
+        use_atss (bool): If true, use ATSS to define positive/negative
+            examples. Default: True.
+        anchor_generator (dict): Config of anchor generator for ATSS.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+
+    Example:
+        >>> self = VFNetHead(11, 7)
+        >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]]
+        >>> cls_score, bbox_pred, bbox_pred_refine = self.forward(feats)
+        >>> assert len(cls_score) == len(self.scales)
+    """  # noqa: E501
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 512),
+                                 (512, INF)),
+                 center_sampling=False,
+                 center_sample_radius=1.5,
+                 sync_num_pos=True,
+                 gradient_mul=0.1,
+                 bbox_norm_type='reg_denom',
+                 loss_cls_fl=dict(
+                     type='FocalLoss',
+                     use_sigmoid=True,
+                     gamma=2.0,
+                     alpha=0.25,
+                     loss_weight=1.0),
+                 use_vfl=True,
+                 loss_cls=dict(
+                     type='VarifocalLoss',
+                     use_sigmoid=True,
+                     alpha=0.75,
+                     gamma=2.0,
+                     iou_weighted=True,
+                     loss_weight=1.0),
+                 loss_bbox=dict(type='GIoULoss', loss_weight=1.5),
+                 loss_bbox_refine=dict(type='GIoULoss', loss_weight=2.0),
+                 norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+                 use_atss=True,
+                 reg_decoded_bbox=True,
+                 anchor_generator=dict(
+                     type='AnchorGenerator',
+                     ratios=[1.0],
+                     octave_base_scale=8,
+                     scales_per_octave=1,
+                     center_offset=0.0,
+                     strides=[8, 16, 32, 64, 128]),
+                 init_cfg=dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='vfnet_cls',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs):
+        # dcn base offsets, adapted from reppoints_head.py
+        self.num_dconv_points = 9
+        self.dcn_kernel = int(np.sqrt(self.num_dconv_points))
+        self.dcn_pad = int((self.dcn_kernel - 1) / 2)
+        dcn_base = np.arange(-self.dcn_pad,
+                             self.dcn_pad + 1).astype(np.float64)
+        dcn_base_y = np.repeat(dcn_base, self.dcn_kernel)
+        dcn_base_x = np.tile(dcn_base, self.dcn_kernel)
+        dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape(
+            (-1))
+        self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1)
+
+        super(FCOSHead, self).__init__(
+            num_classes,
+            in_channels,
+            norm_cfg=norm_cfg,
+            init_cfg=init_cfg,
+            **kwargs)
+        self.regress_ranges = regress_ranges
+        self.reg_denoms = [
+            regress_range[-1] for regress_range in regress_ranges
+        ]
+        self.reg_denoms[-1] = self.reg_denoms[-2] * 2
+        self.center_sampling = center_sampling
+        self.center_sample_radius = center_sample_radius
+        self.sync_num_pos = sync_num_pos
+        self.bbox_norm_type = bbox_norm_type
+        self.gradient_mul = gradient_mul
+        self.use_vfl = use_vfl
+        if self.use_vfl:
+            self.loss_cls = build_loss(loss_cls)
+        else:
+            self.loss_cls = build_loss(loss_cls_fl)
+        self.loss_bbox = build_loss(loss_bbox)
+        self.loss_bbox_refine = build_loss(loss_bbox_refine)
+
+        # for getting ATSS targets
+        self.use_atss = use_atss
+        self.reg_decoded_bbox = reg_decoded_bbox
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+
+        self.anchor_center_offset = anchor_generator['center_offset']
+
+        self.num_base_priors = self.prior_generator.num_base_priors[0]
+
+        self.sampling = False
+        if self.train_cfg:
+            self.assigner = build_assigner(self.train_cfg.assigner)
+            sampler_cfg = dict(type='PseudoSampler')
+            self.sampler = build_sampler(sampler_cfg, context=self)
+        # only used in `get_atss_targets` when `use_atss` is True
+        self.atss_prior_generator = build_prior_generator(anchor_generator)
+
+        self.fcos_prior_generator = MlvlPointGenerator(
+            anchor_generator['strides'],
+            self.anchor_center_offset if self.use_atss else 0.5)
+
+        # In order to reuse the `get_bboxes` in `BaseDenseHead`.
+        # Only used in testing phase.
+        self.prior_generator = self.fcos_prior_generator
+
+    @property
+    def num_anchors(self):
+        """
+        Returns:
+            int: Number of anchors on each point of feature map.
+        """
+        warnings.warn('DeprecationWarning: `num_anchors` is deprecated, '
+                      'please use "num_base_priors" instead')
+        return self.num_base_priors
+
+    @property
+    def anchor_generator(self):
+        warnings.warn('DeprecationWarning: anchor_generator is deprecated, '
+                      'please use "atss_prior_generator" instead')
+        return self.atss_prior_generator  # matches the name in the warning
+
+    def _init_layers(self):
+        """Initialize layers of the head."""
+        super(FCOSHead, self)._init_cls_convs()
+        super(FCOSHead, self)._init_reg_convs()
+        self.relu = nn.ReLU(inplace=True)
+        self.vfnet_reg_conv = ConvModule(
+            self.feat_channels,
+            self.feat_channels,
+            3,
+            stride=1,
+            padding=1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            bias=self.conv_bias)
+        self.vfnet_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
+        self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])
+
+        self.vfnet_reg_refine_dconv = DeformConv2d(
+            self.feat_channels,
+            self.feat_channels,
+            self.dcn_kernel,
+            1,
+            padding=self.dcn_pad)
+        self.vfnet_reg_refine = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
+        self.scales_refine = nn.ModuleList([Scale(1.0) for _ in self.strides])
+
+        self.vfnet_cls_dconv = DeformConv2d(
+            self.feat_channels,
+            self.feat_channels,
+            self.dcn_kernel,
+            1,
+            padding=self.dcn_pad)
+        self.vfnet_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+
+    def forward(self, feats):
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple:
+                cls_scores (list[Tensor]): Box iou-aware scores for each scale
+                    level, each is a 4D-tensor, the channel number is
+                    num_points * num_classes.
+                bbox_preds (list[Tensor]): Box offsets for each
+                    scale level, each is a 4D-tensor, the channel number is
+                    num_points * 4.
+                bbox_preds_refine (list[Tensor]): Refined Box offsets for
+                    each scale level, each is a 4D-tensor, the channel
+                    number is num_points * 4.
+        """
+        return multi_apply(self.forward_single, feats, self.scales,
+                           self.scales_refine, self.strides, self.reg_denoms)
+
+    def forward_single(self, x, scale, scale_refine, stride, reg_denom):
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+            scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+            scale_refine (:obj:`mmcv.cnn.Scale`): Learnable scale module to
+                resize the refined bbox prediction.
+            stride (int): The corresponding stride for feature maps,
+                used to normalize the bbox prediction when
+                bbox_norm_type = 'stride'.
+            reg_denom (int): The corresponding regression range for feature
+                maps, only used to normalize the bbox prediction when
+                bbox_norm_type = 'reg_denom'.
+
+        Returns:
+            tuple: iou-aware cls scores for each box, bbox predictions and
+                refined bbox predictions of input feature maps.
+        """
+        cls_feat = x
+        reg_feat = x
+
+        for cls_layer in self.cls_convs:
+            cls_feat = cls_layer(cls_feat)
+
+        for reg_layer in self.reg_convs:
+            reg_feat = reg_layer(reg_feat)
+
+        # predict the bbox_pred of different level
+        reg_feat_init = self.vfnet_reg_conv(reg_feat)
+        if self.bbox_norm_type == 'reg_denom':
+            bbox_pred = scale(
+                self.vfnet_reg(reg_feat_init)).float().exp() * reg_denom
+        elif self.bbox_norm_type == 'stride':
+            bbox_pred = scale(
+                self.vfnet_reg(reg_feat_init)).float().exp() * stride
+        else:
+            raise NotImplementedError
+
+        # compute star deformable convolution offsets
+        # converting dcn_offset to reg_feat.dtype thus VFNet can be
+        # trained with FP16
+        dcn_offset = self.star_dcn_offset(bbox_pred, self.gradient_mul,
+                                          stride).to(reg_feat.dtype)
+
+        # refine the bbox_pred
+        reg_feat = self.relu(self.vfnet_reg_refine_dconv(reg_feat, dcn_offset))
+        bbox_pred_refine = scale_refine(
+            self.vfnet_reg_refine(reg_feat)).float().exp()
+        bbox_pred_refine = bbox_pred_refine * bbox_pred.detach()
+
+        # predict the iou-aware cls score
+        cls_feat = self.relu(self.vfnet_cls_dconv(cls_feat, dcn_offset))
+        cls_score = self.vfnet_cls(cls_feat)
+
+        if self.training:
+            return cls_score, bbox_pred, bbox_pred_refine
+        else:
+            return cls_score, bbox_pred_refine
+
+    def star_dcn_offset(self, bbox_pred, gradient_mul, stride):
+        """Compute the star deformable conv offsets.
+
+        Args:
+            bbox_pred (Tensor): Predicted bbox distance offsets (l, t, r, b).
+            gradient_mul (float): Gradient multiplier.
+            stride (int): The corresponding stride for feature maps,
+                used to project the bbox onto the feature map.
+
+        Returns:
+            dcn_offsets (Tensor): The offsets for deformable convolution.
+        """
+        dcn_base_offset = self.dcn_base_offset.type_as(bbox_pred)
+        bbox_pred_grad_mul = (1 - gradient_mul) * bbox_pred.detach() + \
+            gradient_mul * bbox_pred
+        # map to the feature map scale
+        bbox_pred_grad_mul = bbox_pred_grad_mul / stride
+        N, C, H, W = bbox_pred.size()
+
+        x1 = bbox_pred_grad_mul[:, 0, :, :]
+        y1 = bbox_pred_grad_mul[:, 1, :, :]
+        x2 = bbox_pred_grad_mul[:, 2, :, :]
+        y2 = bbox_pred_grad_mul[:, 3, :, :]
+        bbox_pred_grad_mul_offset = bbox_pred.new_zeros(
+            N, 2 * self.num_dconv_points, H, W)
+        bbox_pred_grad_mul_offset[:, 0, :, :] = -1.0 * y1  # -y1
+        bbox_pred_grad_mul_offset[:, 1, :, :] = -1.0 * x1  # -x1
+        bbox_pred_grad_mul_offset[:, 2, :, :] = -1.0 * y1  # -y1
+        bbox_pred_grad_mul_offset[:, 4, :, :] = -1.0 * y1  # -y1
+        bbox_pred_grad_mul_offset[:, 5, :, :] = x2  # x2
+        bbox_pred_grad_mul_offset[:, 7, :, :] = -1.0 * x1  # -x1
+        bbox_pred_grad_mul_offset[:, 11, :, :] = x2  # x2
+        bbox_pred_grad_mul_offset[:, 12, :, :] = y2  # y2
+        bbox_pred_grad_mul_offset[:, 13, :, :] = -1.0 * x1  # -x1
+        bbox_pred_grad_mul_offset[:, 14, :, :] = y2  # y2
+        bbox_pred_grad_mul_offset[:, 16, :, :] = y2  # y2
+        bbox_pred_grad_mul_offset[:, 17, :, :] = x2  # x2
+        dcn_offset = bbox_pred_grad_mul_offset - dcn_base_offset
+
+        return dcn_offset
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'bbox_preds_refine'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             bbox_preds_refine,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute loss of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box iou-aware scores for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_points * num_classes.
+            bbox_preds (list[Tensor]): Box offsets for each
+                scale level, each is a 4D-tensor, the channel number is
+                num_points * 4.
+            bbox_preds_refine (list[Tensor]): Refined Box offsets for
+                each scale level, each is a 4D-tensor, the channel
+                number is num_points * 4.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss.
+                Default: None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert len(cls_scores) == len(bbox_preds) == len(bbox_preds_refine)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        all_level_points = self.fcos_prior_generator.grid_priors(
+            featmap_sizes, bbox_preds[0].dtype, bbox_preds[0].device)
+        labels, label_weights, bbox_targets, bbox_weights = self.get_targets(
+            cls_scores, all_level_points, gt_bboxes, gt_labels, img_metas,
+            gt_bboxes_ignore)
+
+        num_imgs = cls_scores[0].size(0)
+        # flatten cls_scores, bbox_preds and bbox_preds_refine
+        flatten_cls_scores = [
+            cls_score.permute(0, 2, 3,
+                              1).reshape(-1,
+                                         self.cls_out_channels).contiguous()
+            for cls_score in cls_scores
+        ]
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4).contiguous()
+            for bbox_pred in bbox_preds
+        ]
+        flatten_bbox_preds_refine = [
+            bbox_pred_refine.permute(0, 2, 3, 1).reshape(-1, 4).contiguous()
+            for bbox_pred_refine in bbox_preds_refine
+        ]
+        flatten_cls_scores = torch.cat(flatten_cls_scores)
+        flatten_bbox_preds = torch.cat(flatten_bbox_preds)
+        flatten_bbox_preds_refine = torch.cat(flatten_bbox_preds_refine)
+        flatten_labels = torch.cat(labels)
+        flatten_bbox_targets = torch.cat(bbox_targets)
+        # repeat points to align with bbox_preds
+        flatten_points = torch.cat(
+            [points.repeat(num_imgs, 1) for points in all_level_points])
+
+        # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = torch.where(
+            ((flatten_labels >= 0) & (flatten_labels < bg_class_ind)) > 0)[0]
+        num_pos = len(pos_inds)
+
+        pos_bbox_preds = flatten_bbox_preds[pos_inds]
+        pos_bbox_preds_refine = flatten_bbox_preds_refine[pos_inds]
+        pos_labels = flatten_labels[pos_inds]
+
+        # sync num_pos across all gpus
+        if self.sync_num_pos:
+            num_pos_avg_per_gpu = reduce_mean(
+                pos_inds.new_tensor(num_pos).float()).item()
+            num_pos_avg_per_gpu = max(num_pos_avg_per_gpu, 1.0)
+        else:
+            num_pos_avg_per_gpu = num_pos
+
+        pos_bbox_targets = flatten_bbox_targets[pos_inds]
+        pos_points = flatten_points[pos_inds]
+
+        pos_decoded_bbox_preds = self.bbox_coder.decode(
+            pos_points, pos_bbox_preds)
+        pos_decoded_target_preds = self.bbox_coder.decode(
+            pos_points, pos_bbox_targets)
+        iou_targets_ini = bbox_overlaps(
+            pos_decoded_bbox_preds,
+            pos_decoded_target_preds.detach(),
+            is_aligned=True).clamp(min=1e-6)
+        bbox_weights_ini = iou_targets_ini.clone().detach()
+        bbox_avg_factor_ini = reduce_mean(
+            bbox_weights_ini.sum()).clamp_(min=1).item()
+
+        pos_decoded_bbox_preds_refine = \
+            self.bbox_coder.decode(pos_points, pos_bbox_preds_refine)
+        iou_targets_rf = bbox_overlaps(
+            pos_decoded_bbox_preds_refine,
+            pos_decoded_target_preds.detach(),
+            is_aligned=True).clamp(min=1e-6)
+        bbox_weights_rf = iou_targets_rf.clone().detach()
+        bbox_avg_factor_rf = reduce_mean(
+            bbox_weights_rf.sum()).clamp_(min=1).item()
+
+        if num_pos > 0:
+            loss_bbox = self.loss_bbox(
+                pos_decoded_bbox_preds,
+                pos_decoded_target_preds.detach(),
+                weight=bbox_weights_ini,
+                avg_factor=bbox_avg_factor_ini)
+
+            loss_bbox_refine = self.loss_bbox_refine(
+                pos_decoded_bbox_preds_refine,
+                pos_decoded_target_preds.detach(),
+                weight=bbox_weights_rf,
+                avg_factor=bbox_avg_factor_rf)
+
+            # build IoU-aware cls_score targets
+            if self.use_vfl:
+                pos_ious = iou_targets_rf.clone().detach()
+                cls_iou_targets = torch.zeros_like(flatten_cls_scores)
+                cls_iou_targets[pos_inds, pos_labels] = pos_ious
+        else:
+            loss_bbox = pos_bbox_preds.sum() * 0
+            loss_bbox_refine = pos_bbox_preds_refine.sum() * 0
+            if self.use_vfl:
+                cls_iou_targets = torch.zeros_like(flatten_cls_scores)
+
+        if self.use_vfl:
+            loss_cls = self.loss_cls(
+                flatten_cls_scores,
+                cls_iou_targets,
+                avg_factor=num_pos_avg_per_gpu)
+        else:
+            loss_cls = self.loss_cls(
+                flatten_cls_scores,
+                flatten_labels,
+                weight=label_weights,
+                avg_factor=num_pos_avg_per_gpu)
+
+        return dict(
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            loss_bbox_rf=loss_bbox_refine)
+
+    def get_targets(self, cls_scores, mlvl_points, gt_bboxes, gt_labels,
+                    img_metas, gt_bboxes_ignore):
+        """A wrapper for computing ATSS and FCOS targets for points in multiple
+        images.
+
+        Args:
+            cls_scores (list[Tensor]): Box iou-aware scores for each scale
+                level with shape (N, num_points * num_classes, H, W).
+            mlvl_points (list[Tensor]): Points of each fpn level, each has
+                shape (num_points, 2).
+            gt_bboxes (list[Tensor]): Ground truth bboxes of each image,
+                each has shape (num_gt, 4).
+            gt_labels (list[Tensor]): Ground truth labels of each box,
+                each has shape (num_gt,).
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (None | Tensor): Ground truth bboxes to be
+                ignored, shape (num_ignored_gts, 4).
+
+        Returns:
+            tuple:
+                labels_list (list[Tensor]): Labels of each level.
+                label_weights (Tensor/None): Label weights of all levels.
+                bbox_targets_list (list[Tensor]): Regression targets of each
+                    level, (l, t, r, b).
+                bbox_weights (Tensor/None): Bbox weights of all levels.
+        """
+        if self.use_atss:
+            return self.get_atss_targets(cls_scores, mlvl_points, gt_bboxes,
+                                         gt_labels, img_metas,
+                                         gt_bboxes_ignore)
+        else:
+            self.norm_on_bbox = False
+            return self.get_fcos_targets(mlvl_points, gt_bboxes, gt_labels)
+
+    def _get_target_single(self, *args, **kwargs):
+        """Avoid ambiguity in multiple inheritance."""
+        if self.use_atss:
+            return ATSSHead._get_target_single(self, *args, **kwargs)
+        else:
+            return FCOSHead._get_target_single(self, *args, **kwargs)
+
+    def get_fcos_targets(self, points, gt_bboxes_list, gt_labels_list):
+        """Compute FCOS regression and classification targets for points in
+        multiple images.
+
+        Args:
+            points (list[Tensor]): Points of each fpn level, each has shape
+                (num_points, 2).
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
+                each has shape (num_gt, 4).
+            gt_labels_list (list[Tensor]): Ground truth labels of each box,
+                each has shape (num_gt,).
+
+        Returns:
+            tuple:
+                labels (list[Tensor]): Labels of each level.
+                label_weights: None, to be compatible with ATSS targets.
+                bbox_targets (list[Tensor]): BBox targets of each level.
+                bbox_weights: None, to be compatible with ATSS targets.
+        """
+        labels, bbox_targets = FCOSHead.get_targets(self, points,
+                                                    gt_bboxes_list,
+                                                    gt_labels_list)
+        label_weights = None
+        bbox_weights = None
+        return labels, label_weights, bbox_targets, bbox_weights
+
+    def get_anchors(self, featmap_sizes, img_metas, device='cuda'):
+        """Get anchors according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            img_metas (list[dict]): Image meta info.
+            device (torch.device | str): Device for returned tensors
+
+        Returns:
+            tuple:
+                anchor_list (list[Tensor]): Anchors of each image.
+                valid_flag_list (list[Tensor]): Valid flags of each image.
+        """
+        num_imgs = len(img_metas)
+
+        # since feature map sizes of all images are the same, we compute
+        # anchors only once
+        multi_level_anchors = self.atss_prior_generator.grid_priors(
+            featmap_sizes, device=device)
+        anchor_list = [multi_level_anchors for _ in range(num_imgs)]
+
+        # for each image, we compute valid flags of multi level anchors
+        valid_flag_list = []
+        for img_id, img_meta in enumerate(img_metas):
+            multi_level_flags = self.atss_prior_generator.valid_flags(
+                featmap_sizes, img_meta['pad_shape'], device=device)
+            valid_flag_list.append(multi_level_flags)
+
+        return anchor_list, valid_flag_list
+
+    def get_atss_targets(self,
+                         cls_scores,
+                         mlvl_points,
+                         gt_bboxes,
+                         gt_labels,
+                         img_metas,
+                         gt_bboxes_ignore=None):
+        """A wrapper for computing ATSS targets for points in multiple images.
+
+        Args:
+            cls_scores (list[Tensor]): Box iou-aware scores for each scale
+                level with shape (N, num_points * num_classes, H, W).
+            mlvl_points (list[Tensor]): Points of each fpn level, each has
+                shape (num_points, 2).
+            gt_bboxes (list[Tensor]): Ground truth bboxes of each image,
+                each has shape (num_gt, 4).
+            gt_labels (list[Tensor]): Ground truth labels of each box,
+                each has shape (num_gt,).
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (None | Tensor): Ground truth bboxes to be
+                ignored, shape (num_ignored_gts, 4). Default: None.
+
+        Returns:
+            tuple:
+                labels_list (list[Tensor]): Labels of each level.
+                label_weights (Tensor): Label weights of all levels.
+                bbox_targets_list (list[Tensor]): Regression targets of each
+                    level, (l, t, r, b).
+                bbox_weights (Tensor): Bbox weights of all levels.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(
+            featmap_sizes
+        ) == self.atss_prior_generator.num_levels == \
+            self.fcos_prior_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+
+        cls_reg_targets = ATSSHead.get_targets(
+            self,
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels,
+            unmap_outputs=True)
+        if cls_reg_targets is None:
+            return None
+
+        (anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets
+
+        bbox_targets_list = [
+            bbox_targets.reshape(-1, 4) for bbox_targets in bbox_targets_list
+        ]
+
+        num_imgs = len(img_metas)
+        # transform bbox_targets (x1, y1, x2, y2) into (l, t, r, b) format
+        bbox_targets_list = self.transform_bbox_targets(
+            bbox_targets_list, mlvl_points, num_imgs)
+
+        labels_list = [labels.reshape(-1) for labels in labels_list]
+        label_weights_list = [
+            label_weights.reshape(-1) for label_weights in label_weights_list
+        ]
+        bbox_weights_list = [
+            bbox_weights.reshape(-1) for bbox_weights in bbox_weights_list
+        ]
+        label_weights = torch.cat(label_weights_list)
+        bbox_weights = torch.cat(bbox_weights_list)
+        return labels_list, label_weights, bbox_targets_list, bbox_weights
+
+    def transform_bbox_targets(self, decoded_bboxes, mlvl_points, num_imgs):
+        """Transform bbox_targets (x1, y1, x2, y2) into (l, t, r, b) format.
+
+        Args:
+            decoded_bboxes (list[Tensor]): Regression targets of each level,
+                in the form of (x1, y1, x2, y2).
+            mlvl_points (list[Tensor]): Points of each fpn level, each has
+                shape (num_points, 2).
+            num_imgs (int): the number of images in a batch.
+
+        Returns:
+            bbox_targets (list[Tensor]): Regression targets of each level in
+                the form of (l, t, r, b).
+        """
+        # TODO: Re-implemented in Class PointCoder
+        assert len(decoded_bboxes) == len(mlvl_points)
+        num_levels = len(decoded_bboxes)
+        mlvl_points = [points.repeat(num_imgs, 1) for points in mlvl_points]
+        bbox_targets = []
+        for i in range(num_levels):
+            bbox_target = self.bbox_coder.encode(mlvl_points[i],
+                                                 decoded_bboxes[i])
+            bbox_targets.append(bbox_target)
+
+        return bbox_targets
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        """Override the method in the parent class to avoid changing the
+        parameters' names."""
+        pass
+
+    def _get_points_single(self,
+                           featmap_size,
+                           stride,
+                           dtype,
+                           device,
+                           flatten=False):
+        """Get points according to feature map size.
+
+        This function will be deprecated soon.
+        """
+
+        warnings.warn(
+            '`_get_points_single` in `VFNetHead` will be '
+            'deprecated soon, we support a multi level point generator now '
+            'you can get points of a single level feature map '
+            'with `self.fcos_prior_generator.single_level_grid_priors` ')
+
+        h, w = featmap_size
+        x_range = torch.arange(
+            0, w * stride, stride, dtype=dtype, device=device)
+        y_range = torch.arange(
+            0, h * stride, stride, dtype=dtype, device=device)
+        y, x = torch.meshgrid(y_range, x_range)
+        # to be compatible with anchor points in ATSS
+        if self.use_atss:
+            points = torch.stack(
+                (x.reshape(-1), y.reshape(-1)), dim=-1) + \
+                     stride * self.anchor_center_offset
+        else:
+            points = torch.stack(
+                (x.reshape(-1), y.reshape(-1)), dim=-1) + stride // 2
+        return points
diff --git a/mmdet/models/dense_heads/yolact_head.py b/mmdet/models/dense_heads/yolact_head.py
new file mode 100755
index 0000000..8f89a27
--- /dev/null
+++ b/mmdet/models/dense_heads/yolact_head.py
@@ -0,0 +1,1018 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule, ModuleList, force_fp32
+
+from mmdet.core import build_sampler, fast_nms, images_to_levels, multi_apply
+from mmdet.core.utils import select_single_mlvl
+from ..builder import HEADS, build_loss
+from .anchor_head import AnchorHead
+
+
+@HEADS.register_module()
+class YOLACTHead(AnchorHead):
+    """YOLACT box head used in https://arxiv.org/abs/1904.02689.
+
+    Note that YOLACT head is a light version of RetinaNet head.
+    Four differences are described as follows:
+
+    1. YOLACT box head has three-times fewer anchors.
+    2. YOLACT box head shares the convs for box and cls branches.
+    3. YOLACT box head uses OHEM instead of Focal loss.
+    4. YOLACT box head predicts a set of mask coefficients for each box.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        anchor_generator (dict): Config dict for anchor generator
+        loss_cls (dict): Config of classification loss.
+        loss_bbox (dict): Config of localization loss.
+        num_head_convs (int): Number of the conv layers shared by
+            box and cls branches.
+        num_protos (int): Number of the mask coefficients.
+        use_ohem (bool): If true, ``loss_single_OHEM`` will be used for
+            cls loss calculation. If false, ``loss_single`` will be used.
+        conv_cfg (dict): Dictionary to construct and config conv layer.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 anchor_generator=dict(
+                     type='AnchorGenerator',
+                     octave_base_scale=3,
+                     scales_per_octave=1,
+                     ratios=[0.5, 1.0, 2.0],
+                     strides=[8, 16, 32, 64, 128]),
+                 loss_cls=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=False,
+                     reduction='none',
+                     loss_weight=1.0),
+                 loss_bbox=dict(
+                     type='SmoothL1Loss', beta=1.0, loss_weight=1.5),
+                 num_head_convs=1,
+                 num_protos=32,
+                 use_ohem=True,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 init_cfg=dict(
+                     type='Xavier',
+                     distribution='uniform',
+                     bias=0,
+                     layer='Conv2d'),
+                 **kwargs):
+        self.num_head_convs = num_head_convs  # read by _init_layers
+        self.num_protos = num_protos  # read by _init_layers
+        self.use_ohem = use_ohem  # selects the OHEM branch in loss
+        self.conv_cfg = conv_cfg  # forwarded to ConvModule
+        self.norm_cfg = norm_cfg  # forwarded to ConvModule
+        super(YOLACTHead, self).__init__(  # presumably calls _init_layers
+            num_classes,
+            in_channels,
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            anchor_generator=anchor_generator,
+            init_cfg=init_cfg,
+            **kwargs)
+        if self.use_ohem:  # OHEM only: builds self.sampler, sampling = False
+            sampler_cfg = dict(type='PseudoSampler')
+            self.sampler = build_sampler(sampler_cfg, context=self)
+            self.sampling = False
+
+    def _init_layers(self):
+        """Build the layers of the head.
+
+        A stack of shared 3x3 convs feeds three 3x3 prediction convs.
+        """
+        self.relu = nn.ReLU(inplace=True)
+        self.head_convs = ModuleList()
+        in_chn = self.in_channels
+        for _ in range(self.num_head_convs):
+            shared_conv = ConvModule(
+                in_chn,
+                self.feat_channels,
+                3,
+                stride=1,
+                padding=1,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg)
+            self.head_convs.append(shared_conv)
+            # every conv after the first consumes ``feat_channels``
+            in_chn = self.feat_channels
+        # prediction convs: output channels scale with the base priors
+        num_priors, feat_chn = self.num_base_priors, self.feat_channels
+        self.conv_cls = nn.Conv2d(
+            feat_chn, num_priors * self.cls_out_channels, 3, padding=1)
+        self.conv_reg = nn.Conv2d(feat_chn, num_priors * 4, 3, padding=1)
+        self.conv_coeff = nn.Conv2d(
+            feat_chn, num_priors * self.num_protos, 3, padding=1)
+
+    def forward_single(self, x):
+        """Run the head on the feature map of one scale level.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+
+        Returns:
+            tuple:
+                cls_score (Tensor): Cls scores for a single scale level, \
+                    the channels number is num_anchors * num_classes.
+                bbox_pred (Tensor): Box energies / deltas for a single \
+                    scale level, the channels number is num_anchors * 4.
+                coeff_pred (Tensor): Mask coefficients after tanh, the \
+                    channels number is num_anchors * num_protos.
+        """
+        feat = x
+        for shared_conv in self.head_convs:
+            feat = shared_conv(feat)
+        # all three predictors read the same shared feature
+        cls_score, bbox_pred = self.conv_cls(feat), self.conv_reg(feat)
+        return cls_score, bbox_pred, torch.tanh(self.conv_coeff(feat))
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """A combination of the func:``AnchorHead.loss`` and
+        func:``SSDHead.loss``.
+
+        When ``self.use_ohem == True``, it functions like ``SSDHead.loss``,
+        otherwise, it follows ``AnchorHead.loss``. Besides, it additionally
+        returns ``sampling_results``.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
+                boxes can be ignored when computing the loss. Default: None
+
+        Returns:
+            tuple (or None when ``get_targets`` returns None):
+                dict[str, Tensor]: A dictionary of loss components.
+                List[:obj:``SamplingResult``]: Sampler results for each image.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels,
+            unmap_outputs=not self.use_ohem,
+            return_sampling_results=True)
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg, sampling_results) = cls_reg_targets
+
+        if self.use_ohem:
+            num_images = len(img_metas)
+            all_cls_scores = torch.cat([
+                s.permute(0, 2, 3, 1).reshape(
+                    num_images, -1, self.cls_out_channels) for s in cls_scores
+            ], 1)
+            all_labels = torch.cat(labels_list, -1).view(num_images, -1)
+            all_label_weights = torch.cat(label_weights_list,
+                                          -1).view(num_images, -1)
+            all_bbox_preds = torch.cat([
+                b.permute(0, 2, 3, 1).reshape(num_images, -1, 4)
+                for b in bbox_preds
+            ], -2)
+            all_bbox_targets = torch.cat(bbox_targets_list,
+                                         -2).view(num_images, -1, 4)
+            all_bbox_weights = torch.cat(bbox_weights_list,
+                                         -2).view(num_images, -1, 4)
+
+            # concat all level anchors to a single tensor
+            all_anchors = []
+            for i in range(num_images):
+                all_anchors.append(torch.cat(anchor_list[i]))
+
+            # check NaN and Inf
+            assert torch.isfinite(all_cls_scores).all().item(), \
+                'classification scores become infinite or NaN!'
+            assert torch.isfinite(all_bbox_preds).all().item(), \
+                'bbox predictions become infinite or NaN!'
+
+            losses_cls, losses_bbox = multi_apply(
+                self.loss_single_OHEM,
+                all_cls_scores,
+                all_bbox_preds,
+                all_anchors,
+                all_labels,
+                all_label_weights,
+                all_bbox_targets,
+                all_bbox_weights,
+                num_total_samples=num_total_pos)
+        else:
+            num_total_samples = (
+                num_total_pos +
+                num_total_neg if self.sampling else num_total_pos)
+
+            # anchor number of multi levels
+            num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+            # concat all level anchors and flags to a single tensor
+            concat_anchor_list = []
+            for i in range(len(anchor_list)):
+                concat_anchor_list.append(torch.cat(anchor_list[i]))
+            all_anchor_list = images_to_levels(concat_anchor_list,
+                                               num_level_anchors)
+            losses_cls, losses_bbox = multi_apply(
+                self.loss_single,
+                cls_scores,
+                bbox_preds,
+                all_anchor_list,
+                labels_list,
+                label_weights_list,
+                bbox_targets_list,
+                bbox_weights_list,
+                num_total_samples=num_total_samples)
+
+        return dict(
+            loss_cls=losses_cls, loss_bbox=losses_bbox), sampling_results
+
+    def loss_single_OHEM(self, cls_score, bbox_pred, anchors, labels,
+                         label_weights, bbox_targets, bbox_weights,
+                         num_total_samples):
+        """OHEM loss of a single image, see func:``SSDHead.loss``."""
+        loss_cls_all = self.loss_cls(cls_score, labels, label_weights)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        pos_inds = ((labels >= 0) & (labels < self.num_classes)).nonzero(
+            as_tuple=False).reshape(-1)
+        neg_inds = (labels == self.num_classes).nonzero(
+            as_tuple=False).view(-1)
+
+        num_pos_samples = pos_inds.size(0)
+        # keep every negative when there is no positive; otherwise keep at
+        # most neg_pos_ratio negatives per positive (int: topk needs int k)
+        num_neg_samples = neg_inds.size(0)
+        if num_pos_samples > 0:
+            num_neg_samples = min(num_neg_samples, int(
+                self.train_cfg.neg_pos_ratio * num_pos_samples))
+        topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples)
+        loss_cls_pos = loss_cls_all[pos_inds].sum()
+        loss_cls_neg = topk_loss_cls_neg.sum()
+        loss_cls = (loss_cls_pos + loss_cls_neg) / num_total_samples
+        if self.reg_decoded_bbox:
+            # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
+            # is applied directly on the decoded bounding boxes, it
+            # decodes the already encoded coordinates to absolute format.
+            bbox_pred = self.bbox_coder.decode(anchors, bbox_pred)
+        loss_bbox = self.loss_bbox(
+            bbox_pred,
+            bbox_targets,
+            bbox_weights,
+            avg_factor=num_total_samples)
+        return loss_cls[None], loss_bbox
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'coeff_preds'))
+    def get_bboxes(self,
+                   cls_scores,
+                   bbox_preds,
+                   coeff_preds,
+                   img_metas,
+                   cfg=None,
+                   rescale=False):
+        """Similar to func:``AnchorHead.get_bboxes``, but additionally
+        processes coeff_preds.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                with shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            coeff_preds (list[Tensor]): Mask coefficients for each scale
+                level with shape (N, num_anchors * num_protos, H, W)
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            cfg (mmcv.Config | None): Test / postprocessing configuration,
+                if None, test_cfg would be used
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+
+        Returns:
+            tuple[list[Tensor], list[Tensor], list[Tensor]]: Three lists
+                with one entry per image. The first holds (n, 5) tensors,
+                where the first 4 columns are bounding box positions
+                (tl_x, tl_y, br_x, br_y) and the 5-th column is a score
+                between 0 and 1. The second holds (n,) tensors where each
+                item is the predicted class label of the corresponding box.
+                The third holds (n, num_protos) tensors where each item
+                is the predicted mask coefficients of instance inside the
+                corresponding box.
+        """
+        assert len(cls_scores) == len(bbox_preds)
+        num_levels = len(cls_scores)
+
+        device = cls_scores[0].device
+        featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
+        mlvl_anchors = self.prior_generator.grid_priors(
+            featmap_sizes, device=device)
+
+        det_bboxes = []
+        det_labels = []
+        det_coeffs = []
+        for img_id in range(len(img_metas)):
+            cls_score_list = select_single_mlvl(cls_scores, img_id)
+            bbox_pred_list = select_single_mlvl(bbox_preds, img_id)
+            coeff_pred_list = select_single_mlvl(coeff_preds, img_id)
+            img_shape = img_metas[img_id]['img_shape']
+            scale_factor = img_metas[img_id]['scale_factor']
+            bbox_res = self._get_bboxes_single(cls_score_list, bbox_pred_list,
+                                               coeff_pred_list, mlvl_anchors,
+                                               img_shape, scale_factor, cfg,
+                                               rescale)
+            det_bboxes.append(bbox_res[0])
+            det_labels.append(bbox_res[1])
+            det_coeffs.append(bbox_res[2])
+        return det_bboxes, det_labels, det_coeffs
+
+    def _get_bboxes_single(self,
+                           cls_score_list,
+                           bbox_pred_list,
+                           coeff_preds_list,
+                           mlvl_anchors,
+                           img_shape,
+                           scale_factor,
+                           cfg,
+                           rescale=False):
+        """Similar to func:``AnchorHead._get_bboxes_single``, but additionally
+        processes coeff_preds_list and uses fast NMS instead of traditional
+        NMS.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores of each scale level,
+                each with shape (num_anchors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas of each
+                scale level with shape (num_anchors * 4, H, W).
+            coeff_preds_list (list[Tensor]): Mask coefficients of each
+                scale level with shape (num_anchors * num_protos, H, W).
+            mlvl_anchors (list[Tensor]): Box reference of each scale level
+                with shape (num_total_anchors, 4).
+            img_shape (tuple[int]): Shape of the input image,
+                (height, width, 3).
+            scale_factor (ndarray): Scale factor of the image arranged as
+                (w_scale, h_scale, w_scale, h_scale).
+            cfg (mmcv.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+
+        Returns:
+            tuple[Tensor, Tensor, Tensor]: The first item is an (n, 5) tensor,
+                where the first 4 columns are bounding box positions
+                (tl_x, tl_y, br_x, br_y) and the 5-th column is a score between
+                0 and 1. The second item is an (n,) tensor where each item is
+                the predicted class label of the corresponding box. The third
+                item is an (n, num_protos) tensor where each item is the
+                predicted mask coefficients of instance inside the
+                corresponding box.
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_anchors)
+        nms_pre = cfg.get('nms_pre', -1)
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_coeffs = []
+        for cls_score, bbox_pred, coeff_pred, anchors in \
+                zip(cls_score_list, bbox_pred_list,
+                    coeff_preds_list, mlvl_anchors):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                scores = cls_score.softmax(-1)
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+            coeff_pred = coeff_pred.permute(1, 2,
+                                            0).reshape(-1, self.num_protos)
+
+            if 0 < nms_pre < scores.shape[0]:
+                # Get maximum scores for foreground classes.
+                if self.use_sigmoid_cls:
+                    max_scores, _ = scores.max(dim=1)
+                else:
+                    # remind that we set FG labels to [0, num_class-1]
+                    # since mmdet v2.0
+                    # BG cat_id: num_class
+                    max_scores, _ = scores[:, :-1].max(dim=1)
+                _, topk_inds = max_scores.topk(nms_pre)
+                anchors = anchors[topk_inds, :]
+                bbox_pred = bbox_pred[topk_inds, :]
+                scores = scores[topk_inds, :]
+                coeff_pred = coeff_pred[topk_inds, :]
+            bboxes = self.bbox_coder.decode(
+                anchors, bbox_pred, max_shape=img_shape)
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_coeffs.append(coeff_pred)
+        mlvl_bboxes = torch.cat(mlvl_bboxes)
+        if rescale:
+            mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
+        mlvl_scores = torch.cat(mlvl_scores)
+        mlvl_coeffs = torch.cat(mlvl_coeffs)
+        if self.use_sigmoid_cls:
+            # Add a dummy background class to the backend when using sigmoid
+            # remind that we set FG labels to [0, num_class-1] since mmdet v2.0
+            # BG cat_id: num_class
+            padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
+            mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
+        det_bboxes, det_labels, det_coeffs = fast_nms(mlvl_bboxes, mlvl_scores,
+                                                      mlvl_coeffs,
+                                                      cfg.score_thr,
+                                                      cfg.iou_thr, cfg.top_k,
+                                                      cfg.max_per_img)
+        return det_bboxes, det_labels, det_coeffs
+
+
+@HEADS.register_module()
+class YOLACTSegmHead(BaseModule):
+    """YOLACT segmentation head used in https://arxiv.org/abs/1904.02689.
+
+    Apply a semantic segmentation loss on feature space using layers that are
+    only evaluated during training to increase performance with no speed
+    penalty.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        num_classes (int): Number of categories excluding the background
+            category.
+        loss_segm (dict): Config of semantic segmentation loss.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels=256,
+                 loss_segm=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 init_cfg=dict(
+                     type='Xavier',
+                     distribution='uniform',
+                     override=dict(name='segm_conv'))):
+        super().__init__(init_cfg)
+        self.num_classes, self.in_channels = num_classes, in_channels
+        self.fp16_enabled = False
+        self.loss_segm = build_loss(loss_segm)
+        # builds ``segm_conv`` from the channel counts stored above
+        self._init_layers()
+
+    def _init_layers(self):
+        """Create the segmentation prediction layer."""
+        # a 1x1 conv mapping in_channels -> num_classes score maps
+        self.segm_conv = nn.Conv2d(self.in_channels, self.num_classes, 1)
+
+    def forward(self, x):
+        """Predict the semantic segmentation map from an upstream feature.
+
+        Args:
+            x (Tensor): A 4D feature tensor from the upstream network.
+
+        Returns:
+            Tensor: Predicted semantic segmentation map with shape
+                (N, num_classes, H, W).
+        """
+        segm_pred = self.segm_conv(x)
+        return segm_pred
+
+    @force_fp32(apply_to=('segm_pred', ))
+    def loss(self, segm_pred, gt_masks, gt_labels):
+        """Compute loss of the head.
+
+        Args:
+            segm_pred (Tensor): Predicted semantic segmentation map
+                with shape (N, num_classes, H, W).
+            gt_masks (list[Tensor]): Ground truth masks for each image with
+                the same shape of the input image.
+            gt_labels (list[Tensor]): Class indices corresponding to each box.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        loss_segm = []
+        num_imgs, num_classes, mask_h, mask_w = segm_pred.size()
+        for idx in range(num_imgs):
+            cur_segm_pred = segm_pred[idx]
+            cur_gt_masks = gt_masks[idx].float()
+            cur_gt_labels = gt_labels[idx]
+            segm_targets = self.get_targets(cur_segm_pred, cur_gt_masks,
+                                            cur_gt_labels)
+            if segm_targets is None:  # get_targets found no gt masks
+                loss = self.loss_segm(cur_segm_pred,
+                                      torch.zeros_like(cur_segm_pred),
+                                      torch.zeros_like(cur_segm_pred))
+            else:
+                loss = self.loss_segm(
+                    cur_segm_pred,
+                    segm_targets,
+                    avg_factor=num_imgs * mask_h * mask_w)
+            loss_segm.append(loss)
+        return dict(loss_segm=loss_segm)
+
+    def get_targets(self, segm_pred, gt_masks, gt_labels):
+        """Compute semantic segmentation targets for each image.
+
+        Args:
+            segm_pred (Tensor): Predicted semantic segmentation map
+                with shape (num_classes, H, W).
+            gt_masks (Tensor): Ground truth masks of one image; resized
+                here to the (H, W) of ``segm_pred``.
+            gt_labels (Tensor): Class index of each mask; channel
+                ``label - 1`` is written (NOTE(review): label 0 -> -1).
+
+        Returns:
+            Tensor | None: Targets shaped like ``segm_pred``; None if no mask.
+        """
+        if gt_masks.size(0) == 0:
+            return None
+        num_classes, mask_h, mask_w = segm_pred.size()
+        with torch.no_grad():
+            downsampled_masks = F.interpolate(
+                gt_masks.unsqueeze(0), (mask_h, mask_w),
+                mode='bilinear',
+                align_corners=False).squeeze(0)
+            downsampled_masks = downsampled_masks.gt(0.5).float()
+            segm_targets = torch.zeros_like(segm_pred, requires_grad=False)
+            for obj_idx in range(downsampled_masks.size(0)):
+                segm_targets[gt_labels[obj_idx] - 1] = torch.max(
+                    segm_targets[gt_labels[obj_idx] - 1],
+                    downsampled_masks[obj_idx])
+            return segm_targets
+
+    def simple_test(self, feats, img_metas, rescale=False):
+        """Unsupported: this head is only evaluated during training."""
+        msg = ('simple_test of YOLACTSegmHead is not implemented '
+               'because this head is only evaluated during training')
+        raise NotImplementedError(msg)
+
+
+@HEADS.register_module()
+class YOLACTProtonet(BaseModule):
+    """YOLACT mask head used in https://arxiv.org/abs/1904.02689.
+
+    This head outputs the mask prototypes for YOLACT.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        proto_channels (tuple[int]): Output channels of protonet convs.
+        proto_kernel_sizes (tuple[int]): Kernel sizes of protonet convs.
+        include_last_relu (Bool): If keep the last relu of protonet.
+        num_protos (int): Number of prototypes.
+        num_classes (int): Number of categories excluding the background
+            category.
+        loss_mask_weight (float): Reweight the mask loss by this factor.
+        max_masks_to_train (int): Maximum number of masks to train for
+            each image.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels=256,
+                 proto_channels=(256, 256, 256, None, 256, 32),
+                 proto_kernel_sizes=(3, 3, 3, -2, 3, 1),
+                 include_last_relu=True,
+                 num_protos=32,
+                 loss_mask_weight=1.0,
+                 max_masks_to_train=100,
+                 init_cfg=dict(
+                     type='Xavier',
+                     distribution='uniform',
+                     override=dict(name='protonet'))):
+        super().__init__(init_cfg)
+        self.num_classes = num_classes
+        self.num_protos = num_protos
+        self.loss_mask_weight = loss_mask_weight
+        self.max_masks_to_train = max_masks_to_train
+        self.fp16_enabled = False
+        # architecture settings, all consumed by ``_init_layers`` below
+        self.in_channels = in_channels
+        self.proto_channels = proto_channels
+        self.proto_kernel_sizes = proto_kernel_sizes
+        self.include_last_relu = include_last_relu
+        self.protonet = self._init_layers()
+
+    def _init_layers(self):
+        """Build the protonet as an ``nn.Sequential`` from the paired
+        ``proto_channels`` / ``proto_kernel_sizes`` settings."""
+        # Possible patterns:
+        # ( 256, 3) -> conv
+        # ( 256,-2) -> deconv
+        # (None,-2) -> bilinear interpolate
+        in_channels = self.in_channels
+        protonets = ModuleList()
+        for num_channels, kernel_size in zip(self.proto_channels,
+                                             self.proto_kernel_sizes):
+            if kernel_size > 0:
+                layer = nn.Conv2d(
+                    in_channels,
+                    num_channels,
+                    kernel_size,
+                    padding=kernel_size // 2)
+            else:
+                if num_channels is None:
+                    layer = InterpolateModule(
+                        scale_factor=-kernel_size,
+                        mode='bilinear',
+                        align_corners=False)
+                else:
+                    layer = nn.ConvTranspose2d(
+                        in_channels,
+                        num_channels,
+                        -kernel_size,
+                        padding=kernel_size // 2)  # NOTE(review): <= 0 here
+            protonets.append(layer)
+            protonets.append(nn.ReLU(inplace=True))  # ReLU after each layer
+            in_channels = num_channels if num_channels is not None \
+                else in_channels
+        if not self.include_last_relu:  # drop the trailing ReLU
+            protonets = protonets[:-1]
+        return nn.Sequential(*protonets)
+
+    def forward_dummy(self, x):
+        """Run only the protonet on ``x`` and return its output."""
+        return self.protonet(x)
+
+    def forward(self, x, coeff_pred, bboxes, img_meta, sampling_results=None):
+        """Forward feature from the upstream network to get prototypes and
+        linearly combine the prototypes, using masks coefficients, into
+        instance masks. Finally, crop the instance masks with given bboxes.
+
+        Args:
+            x (Tensor): Feature from the upstream network, which is
+                a 4D-tensor.
+            coeff_pred (list[Tensor]): Mask coefficients for each scale
+                level with shape (N, num_anchors * num_protos, H, W).
+            bboxes (list[Tensor]): Boxes used for cropping, one 2-D tensor
+                per image in absolute (x1, y1, x2, y2) pixels. During
+                training, they are ground truth boxes. During testing,
+                they are predicted boxes. They are never modified here.
+            img_meta (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            sampling_results (List[:obj:``SamplingResult``]): Sampler results
+                for each image.
+
+        Returns:
+            list[Tensor]: Predicted instance segmentation masks.
+        """
+        prototypes = self.protonet(x)
+        prototypes = prototypes.permute(0, 2, 3, 1).contiguous()
+
+        num_imgs = x.size(0)
+
+        # The reason for not using self.training is that
+        # val workflow will have a dimension mismatch error.
+        # Note that this writing method is very tricky.
+        # Fix https://github.com/open-mmlab/mmdetection/issues/5978
+        is_train_or_val_workflow = (coeff_pred[0].dim() == 4)
+
+        # Train or val workflow
+        if is_train_or_val_workflow:
+            coeff_pred_list = []
+            for coeff_pred_per_level in coeff_pred:
+                coeff_pred_per_level = \
+                    coeff_pred_per_level.permute(
+                        0, 2, 3, 1).reshape(num_imgs, -1, self.num_protos)
+                coeff_pred_list.append(coeff_pred_per_level)
+            coeff_pred = torch.cat(coeff_pred_list, dim=1)
+
+        mask_pred_list = []
+        for idx in range(num_imgs):
+            cur_prototypes = prototypes[idx]
+            cur_coeff_pred = coeff_pred[idx]
+            cur_bboxes = bboxes[idx]
+            cur_img_meta = img_meta[idx]
+
+            # Testing state (clone: boxes are normalized in place below)
+            if not is_train_or_val_workflow:
+                bboxes_for_cropping = cur_bboxes.clone()
+            else:
+                cur_sampling_results = sampling_results[idx]
+                pos_assigned_gt_inds = \
+                    cur_sampling_results.pos_assigned_gt_inds
+                bboxes_for_cropping = cur_bboxes[pos_assigned_gt_inds].clone()
+                pos_inds = cur_sampling_results.pos_inds
+                cur_coeff_pred = cur_coeff_pred[pos_inds]
+
+            # Linearly combine the prototypes with the mask coefficients
+            mask_pred = cur_prototypes @ cur_coeff_pred.t()
+            mask_pred = torch.sigmoid(mask_pred)
+
+            h, w = cur_img_meta['img_shape'][:2]
+            bboxes_for_cropping[:, 0] /= w
+            bboxes_for_cropping[:, 1] /= h
+            bboxes_for_cropping[:, 2] /= w
+            bboxes_for_cropping[:, 3] /= h
+
+            mask_pred = self.crop(mask_pred, bboxes_for_cropping)
+            mask_pred = mask_pred.permute(2, 0, 1).contiguous()
+            mask_pred_list.append(mask_pred)
+        return mask_pred_list
+
+    @force_fp32(apply_to=('mask_pred', ))
+    def loss(self, mask_pred, gt_masks, gt_bboxes, img_meta, sampling_results):
+        """Compute the mask loss of the head.
+
+        Args:
+            mask_pred (list[Tensor]): Mask predictions of each image,
+                indexed by positive sample along dim 0.
+            gt_masks (list[Tensor]): Ground truth masks for each image with
+                the same shape of the input image.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            img_meta (list[dict]): Meta information of each image; only
+                ``img_shape`` is read here.
+            sampling_results (List[:obj:``SamplingResult``]): Sampler results
+                for each image; ``pos_assigned_gt_inds`` is read here.
+
+        Returns:
+            dict[str, list[Tensor]]: ``loss_mask`` with one loss per image.
+        """
+        loss_mask = []
+        num_imgs = len(mask_pred)
+        total_pos = 0
+        for idx in range(num_imgs):
+            cur_mask_pred = mask_pred[idx]
+            cur_gt_masks = gt_masks[idx].float()
+            cur_gt_bboxes = gt_bboxes[idx]
+            cur_img_meta = img_meta[idx]
+            cur_sampling_results = sampling_results[idx]
+
+            pos_assigned_gt_inds = cur_sampling_results.pos_assigned_gt_inds
+            num_pos = pos_assigned_gt_inds.size(0)
+            # Since we're producing (near) full image masks,
+            # it'd take too much vram to backprop on every single mask.
+            # Thus we select only a random subset of the positives.
+            if num_pos > self.max_masks_to_train:
+                perm = torch.randperm(num_pos)
+                select = perm[:self.max_masks_to_train]
+                cur_mask_pred = cur_mask_pred[select]
+                pos_assigned_gt_inds = pos_assigned_gt_inds[select]
+                num_pos = self.max_masks_to_train
+            total_pos += num_pos
+
+            gt_bboxes_for_reweight = cur_gt_bboxes[pos_assigned_gt_inds]
+
+            mask_targets = self.get_targets(cur_mask_pred, cur_gt_masks,
+                                            pos_assigned_gt_inds)
+            if num_pos == 0:
+                loss = cur_mask_pred.sum() * 0.  # zero, derived from the pred
+            elif mask_targets is None:
+                loss = F.binary_cross_entropy(cur_mask_pred,
+                                              torch.zeros_like(cur_mask_pred),
+                                              torch.zeros_like(cur_mask_pred))
+            else:
+                cur_mask_pred = torch.clamp(cur_mask_pred, 0, 1)
+                loss = F.binary_cross_entropy(
+                    cur_mask_pred, mask_targets,
+                    reduction='none') * self.loss_mask_weight
+                # Reweight each mask by the inverse of its relative box area.
+                h, w = cur_img_meta['img_shape'][:2]
+                gt_bboxes_width = (gt_bboxes_for_reweight[:, 2] -
+                                   gt_bboxes_for_reweight[:, 0]) / w
+                gt_bboxes_height = (gt_bboxes_for_reweight[:, 3] -
+                                    gt_bboxes_for_reweight[:, 1]) / h
+                loss = loss.mean(dim=(1,
+                                      2)) / gt_bboxes_width / gt_bboxes_height
+                loss = torch.sum(loss)
+            loss_mask.append(loss)
+        # Normalize by the number of positives kept over the whole batch.
+        if total_pos == 0:
+            total_pos += 1  # avoid nan
+        loss_mask = [x / total_pos for x in loss_mask]
+
+        return dict(loss_mask=loss_mask)
+
+    def get_targets(self, mask_pred, gt_masks, pos_assigned_gt_inds):
+        """Build the per-instance mask targets of a single image.
+
+        Args:
+            mask_pred (Tensor): Mask predictions; only the last two
+                dimensions (H, W) are read to size the targets.
+            gt_masks (Tensor): Ground truth masks of the image, indexed by
+                GT instance along dim 0.
+            pos_assigned_gt_inds (Tensor): GT indices of the corresponding
+                positive samples.
+        Returns:
+            Tensor | None: Binary targets with shape (num_pos, H, W), or
+                None when the image has no ground truth mask.
+        """
+        num_gts = gt_masks.size(0)
+        if num_gts == 0:
+            return None
+        target_size = tuple(mask_pred.shape[-2:])
+        # Resize all GT masks at once by treating instances as channels.
+        resized = F.interpolate(
+            gt_masks[None], target_size, mode='bilinear', align_corners=False)
+        # Re-binarize the bilinearly interpolated masks.
+        binary = (resized[0] > 0.5).float()
+        return binary[pos_assigned_gt_inds]
+
+    def get_seg_masks(self, mask_pred, label_pred, img_meta, rescale):
+        """Upsample, threshold and group the mask predictions of one image.
+
+        Args:
+            mask_pred (Tensor): shape (N, H, W).
+            label_pred (Tensor): shape (N, ).
+            img_meta (dict): Meta information of the image; ``ori_shape``
+                and ``scale_factor`` are read.
+            rescale (bool): If True, masks are resized to ``ori_shape``;
+                otherwise to ``ori_shape`` multiplied by ``scale_factor``.
+        Returns:
+            list[list[ndarray]]: uint8 masks grouped by predicted class.
+        """
+        src_h, src_w = img_meta['ori_shape'][:2]
+        factor = img_meta['scale_factor']
+        if not rescale:
+            # Index 0 of scale_factor scales the width, index 1 the height.
+            src_h = np.round(src_h * factor[1]).astype(np.int32)
+            src_w = np.round(src_w * factor[0]).astype(np.int32)
+        out_size = (src_h, src_w)
+
+        segms_per_class = [[] for _ in range(self.num_classes)]
+        num_masks = mask_pred.size(0)
+        if num_masks == 0:
+            return segms_per_class
+
+        resized = F.interpolate(
+            mask_pred[None], out_size, mode='bilinear', align_corners=False)
+        # Binarize at 0.5 and hand the masks over to numpy.
+        binary_masks = (resized[0] > 0.5).cpu().numpy().astype(np.uint8)
+
+        for mask, label in zip(binary_masks, label_pred):
+            segms_per_class[label].append(mask)
+        return segms_per_class
+
+    def crop(self, masks, boxes, padding=1):
+        """Zero out every mask value lying outside its (padded) box.
+
+        Args:
+            masks (Tensor): shape [H, W, N].
+            boxes (Tensor): bbox coords in relative point form with
+                shape [N, 4].
+            padding (int): Forwarded to ``sanitize_coordinates`` for both
+                axes. Default: 1.
+
+        Return:
+            Tensor: The cropped masks, shape [H, W, N].
+        """
+        height, width, _ = masks.size()
+        left, right = self.sanitize_coordinates(
+            boxes[:, 0], boxes[:, 2], width, padding, cast=False)
+        top, bottom = self.sanitize_coordinates(
+            boxes[:, 1], boxes[:, 3], height, padding, cast=False)
+
+        # Pixel coordinate grids, broadcastable against [H, W, N].
+        grid_kwargs = dict(device=masks.device, dtype=left.dtype)
+        xs = torch.arange(width, **grid_kwargs).view(1, -1, 1)
+        ys = torch.arange(height, **grid_kwargs).view(-1, 1, 1)
+
+        # Per-box bounds laid out along the last (instance) dimension.
+        left, right = left.view(1, 1, -1), right.view(1, 1, -1)
+        top, bottom = top.view(1, 1, -1), bottom.view(1, 1, -1)
+
+        # A pixel is kept when left <= x < right and top <= y < bottom.
+        inside_x = (xs >= left) & (xs < right)
+        inside_y = (ys >= top) & (ys < bottom)
+
+        return masks * (inside_x & inside_y).float()
+
+    def sanitize_coordinates(self, x1, x2, img_size, padding=0, cast=True):
+        """Sanitizes the input coordinates so that x1 <= x2, x1 >= 0 and
+        x2 <= img_size. Also converts from relative to absolute coordinates
+        and optionally casts the results to long tensors. The inputs are
+        left untouched; new tensors are returned.
+
+        Args:
+            x1 (Tensor): Relative coordinates, shape (N, ).
+            x2 (Tensor): Relative coordinates, shape (N, ).
+            img_size (int): Size of the input image along this axis.
+            padding (int): The interval is widened by this many pixels on
+                each side before being clipped to [0, img_size].
+            cast (bool): If cast is false, the result won't be cast to longs.
+
+        Returns:
+            tuple:
+                x1 (Tensor): Sanitized lower bound, shape (N, ).
+                x2 (Tensor): Sanitized upper bound, shape (N, ).
+        """
+        x1 = x1 * img_size
+        x2 = x2 * img_size
+        if cast:
+            x1 = x1.long()
+            x2 = x2.long()
+        # Order both bounds from the same pair of values: overwriting x1
+        # first and then taking max(x1, x2) collapses a swapped pair.
+        x1, x2 = torch.min(x1, x2), torch.max(x1, x2)
+        x1 = torch.clamp(x1 - padding, min=0)
+        x2 = torch.clamp(x2 + padding, max=img_size)
+        return x1, x2
+
+    def simple_test(self,
+                    feats,
+                    det_bboxes,
+                    det_labels,
+                    det_coeffs,
+                    img_metas,
+                    rescale=False):
+        """Predict instance masks for a batch without test-time augmentation.
+
+        Args:
+            feats (tuple[torch.Tensor]): Multi-level features from the
+                upstream network; only ``feats[0]`` is used here.
+            det_bboxes (list[Tensor]): Detected boxes of each image. Each
+                element is an (n, 5) tensor laid out as
+                (tl_x, tl_y, br_x, br_y, score).
+            det_labels (list[Tensor]): Detected labels of each image. Each
+                element is an (n, ) tensor holding the class label of the
+                corresponding box.
+            det_coeffs (list[Tensor]): Mask coefficients of each image. Each
+                element is an (n, m) tensor, m is the vector length.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool, optional): Whether ``det_bboxes`` were rescaled
+                to the original image size. Defaults to False.
+
+        Returns:
+            list[list[list]]: Mask results. The outer list is indexed by
+                image, the middle list by class, and the inner list holds
+                the masks predicted for that class (empty lists when the
+                image has no detection).
+        """
+        num_imgs = len(img_metas)
+        scale_factors = tuple(meta['scale_factor'] for meta in img_metas)
+        if all(boxes.shape[0] == 0 for boxes in det_bboxes):
+            # Nothing detected in the whole batch: skip the mask branch.
+            return [[[] for _ in range(self.num_classes)]
+                    for _ in range(num_imgs)]
+
+        # if det_bboxes is rescaled to the original image size, we need to
+        # rescale it back to the testing scale to obtain RoIs.
+        if rescale and not isinstance(scale_factors[0], float):
+            scale_factors = [
+                torch.from_numpy(factor).to(det_bboxes[0].device)
+                for factor in scale_factors
+            ]
+        rois = []
+        for img_id, boxes in enumerate(det_bboxes):
+            coords = boxes[:, :4]
+            rois.append(coords * scale_factors[img_id] if rescale else coords)
+        mask_preds = self.forward(feats[0], det_coeffs, rois, img_metas)
+
+        # apply mask post-processing to each image individually
+        segm_results = []
+        for img_id in range(num_imgs):
+            if det_bboxes[img_id].shape[0] == 0:
+                segm_results.append([[] for _ in range(self.num_classes)])
+                continue
+            segm_results.append(
+                self.get_seg_masks(mask_preds[img_id], det_labels[img_id],
+                                   img_metas[img_id], rescale))
+        return segm_results
+
+
+class InterpolateModule(BaseModule):
+    """Module wrapper around ``F.interpolate``.
+
+    The positional and keyword arguments given at construction time are
+    stored and passed to ``F.interpolate`` on every forward call.
+    """
+
+    def __init__(self, *args, init_cfg=None, **kwargs):
+        super().__init__(init_cfg)
+        # Stored as given; replayed unchanged in ``forward``.
+        self.args, self.kwargs = args, kwargs
+
+    def forward(self, x):
+        """Interpolate ``x`` using the stored arguments."""
+        return F.interpolate(x, *self.args, **self.kwargs)
diff --git a/mmdet/models/dense_heads/yolo_head.py b/mmdet/models/dense_heads/yolo_head.py
new file mode 100755
index 0000000..b446cb7
--- /dev/null
+++ b/mmdet/models/dense_heads/yolo_head.py
@@ -0,0 +1,621 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Copyright (c) 2019 Western Digital Corporation or its affiliates.
+
+import warnings
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import (ConvModule, bias_init_with_prob, constant_init, is_norm,
+                      normal_init)
+from mmcv.runner import force_fp32
+
+from mmdet.core import (build_assigner, build_bbox_coder,
+                        build_prior_generator, build_sampler, images_to_levels,
+                        multi_apply, multiclass_nms)
+from ..builder import HEADS, build_loss
+from .base_dense_head import BaseDenseHead
+from .dense_test_mixins import BBoxTestMixin
+
+
+@HEADS.register_module()
+class YOLOV3Head(BaseDenseHead, BBoxTestMixin):
+    """YOLOV3Head Paper link: https://arxiv.org/abs/1804.02767.
+
+    Args:
+        num_classes (int): The number of object classes (w/o background)
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (List[int]): The number of output channels per scale
+            before the final 1x1 layer. Default: (1024, 512, 256).
+        anchor_generator (dict): Config dict for anchor generator
+        bbox_coder (dict): Config of bounding box coder.
+        featmap_strides (List[int]): The stride of each scale.
+            Should be in descending order. Default: (32, 16, 8).
+        one_hot_smoother (float): Set a non-zero value to enable label-smooth
+            Default: 0.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: dict(type='BN', requires_grad=True)
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='LeakyReLU', negative_slope=0.1).
+        loss_cls (dict): Config of classification loss.
+        loss_conf (dict): Config of confidence loss.
+        loss_xy (dict): Config of xy coordinate loss.
+        loss_wh (dict): Config of wh coordinate loss.
+        train_cfg (dict): Training config of YOLOV3 head. Default: None.
+        test_cfg (dict): Testing config of YOLOV3 head. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 out_channels=(1024, 512, 256),
+                 anchor_generator=dict(
+                     type='YOLOAnchorGenerator',
+                     base_sizes=[[(116, 90), (156, 198), (373, 326)],
+                                 [(30, 61), (62, 45), (59, 119)],
+                                 [(10, 13), (16, 30), (33, 23)]],
+                     strides=[32, 16, 8]),
+                 bbox_coder=dict(type='YOLOBBoxCoder'),
+                 featmap_strides=[32, 16, 8],
+                 one_hot_smoother=0.,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
+                 loss_cls=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 loss_conf=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 loss_xy=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 loss_wh=dict(type='MSELoss', loss_weight=1.0),
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=dict(
+                     type='Normal', std=0.01,
+                     override=dict(name='convs_pred'))):
+        super(YOLOV3Head, self).__init__(init_cfg)
+        # Check params: the per-level sequences must have the same length.
+        assert (len(in_channels) == len(out_channels) == len(featmap_strides))
+
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.featmap_strides = featmap_strides
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        if self.train_cfg:
+            self.assigner = build_assigner(self.train_cfg.assigner)
+            if hasattr(self.train_cfg, 'sampler'):
+                sampler_cfg = self.train_cfg.sampler
+            else:
+                sampler_cfg = dict(type='PseudoSampler')  # default sampler
+            self.sampler = build_sampler(sampler_cfg, context=self)
+        self.fp16_enabled = False
+
+        self.one_hot_smoother = one_hot_smoother
+
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+
+        self.bbox_coder = build_bbox_coder(bbox_coder)
+
+        self.prior_generator = build_prior_generator(anchor_generator)
+
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_conf = build_loss(loss_conf)
+        self.loss_xy = build_loss(loss_xy)
+        self.loss_wh = build_loss(loss_wh)
+        # Prior count is read from the first level of the generator only.
+        self.num_base_priors = self.prior_generator.num_base_priors[0]
+        assert len(
+            self.prior_generator.num_base_priors) == len(featmap_strides)
+        self._init_layers()
+
+    @property
+    def anchor_generator(self):
+        """Deprecated alias of ``prior_generator``; warns on each access."""
+        warnings.warn('DeprecationWarning: `anchor_generator` is deprecated, '
+                      'please use "prior_generator" instead')
+        return self.prior_generator
+
+    @property
+    def num_anchors(self):
+        """int: Deprecated alias of ``num_base_priors``, the number of
+        anchors on each point of feature map; every access emits a
+        warning.
+        """
+        warnings.warn('DeprecationWarning: `num_anchors` is deprecated, '
+                      'please use "num_base_priors" instead')
+        return self.num_base_priors
+
+    @property
+    def num_levels(self):
+        return len(self.featmap_strides)  # one level per featmap stride
+
+    @property
+    def num_attrib(self):
+        """int: Number of attributes predicted per prior in pred_map:
+        bbox (4) + objectness (1) + num_classes."""
+
+        return 5 + self.num_classes
+
+    def _init_layers(self):
+        """Build the 3x3 bridge ConvModule and the 1x1 prediction conv of
+        every level."""
+        bridges, preds = [], []
+        for level in range(self.num_levels):
+            mid_channels = self.out_channels[level]
+            bridges.append(
+                ConvModule(
+                    self.in_channels[level], mid_channels, 3, padding=1,
+                    conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg))
+            # One group of ``num_attrib`` output channels per base prior.
+            preds.append(
+                nn.Conv2d(mid_channels,
+                          self.num_base_priors * self.num_attrib, 1))
+        self.convs_bridge = nn.ModuleList(bridges)
+        self.convs_pred = nn.ModuleList(preds)
+
+    def init_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                normal_init(m, mean=0, std=0.01)
+            if is_norm(m):
+                constant_init(m, 1)
+        # Use prior in model initialization to improve stability. The bias
+        # is regrouped per prior: column 4 is objectness, 5: the classes.
+        for conv_pred, stride in zip(self.convs_pred, self.featmap_strides):
+            bias = conv_pred.bias.reshape(self.num_base_priors, -1)
+            # init objectness with prior of 8 objects per feature map
+            # refer to https://github.com/ultralytics/yolov3
+            nn.init.constant_(bias.data[:, 4],
+                              bias_init_with_prob(8 / (608 / stride)**2))
+            nn.init.constant_(bias.data[:, 5:], bias_init_with_prob(0.01))
+
+    def forward(self, feats):
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple[tuple[Tensor]]: A 1-tuple whose only item is the tuple of
+                multi-level prediction maps, one per level, each being the
+                output of that level's ``convs_pred`` layer.
+        """
+
+        assert len(feats) == self.num_levels
+        pred_maps = tuple(
+            self.convs_pred[lvl](self.convs_bridge[lvl](feats[lvl]))
+            for lvl in range(self.num_levels))
+
+        # Keep the extra wrapping: the maps are returned as the single
+        # element of an outer tuple.
+        return (pred_maps, )
+
+    @force_fp32(apply_to=('pred_maps', ))
+    def get_bboxes(self,
+                   pred_maps,
+                   img_metas,
+                   cfg=None,
+                   rescale=False,
+                   with_nms=True):
+        """Transform network output for a batch into bbox predictions. It has
+        been accelerated since PR #5991.
+
+        Args:
+            pred_maps (list[Tensor]): Raw predictions for a batch of images.
+            img_metas (list[dict]): Meta information of each image;
+                ``scale_factor`` is the only key read here.
+            cfg (mmcv.Config | None): Test / postprocessing configuration,
+                if None, test_cfg would be used. Default: None.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): Only consulted for the empty-input early
+                return; ``multiclass_nms`` is called regardless.
+
+        Returns:
+            list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
+                The first item is an (n, 5) tensor, where 5 represent
+                (tl_x, tl_y, br_x, br_y, score) and the score between 0 and 1.
+                The shape of the second tensor in the tuple is (n,), and
+                each element represents the class label of the corresponding
+                box.
+        """
+        assert len(pred_maps) == self.num_levels
+        cfg = self.test_cfg if cfg is None else cfg
+        scale_factors = np.array(
+            [img_meta['scale_factor'] for img_meta in img_metas])
+
+        num_imgs = len(img_metas)
+        featmap_sizes = [pred_map.shape[-2:] for pred_map in pred_maps]
+
+        mlvl_anchors = self.prior_generator.grid_priors(
+            featmap_sizes, device=pred_maps[0].device)
+        flatten_preds = []
+        flatten_strides = []
+        for pred, stride in zip(pred_maps, self.featmap_strides):
+            pred = pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
+                                                    self.num_attrib)
+            pred[..., :2].sigmoid_()  # in-place sigmoid on the xy channels
+            flatten_preds.append(pred)
+            flatten_strides.append(
+                pred.new_tensor(stride).expand(pred.size(1)))
+
+        flatten_preds = torch.cat(flatten_preds, dim=1)
+        flatten_bbox_preds = flatten_preds[..., :4]
+        flatten_objectness = flatten_preds[..., 4].sigmoid()
+        flatten_cls_scores = flatten_preds[..., 5:].sigmoid()
+        flatten_anchors = torch.cat(mlvl_anchors)
+        flatten_strides = torch.cat(flatten_strides)
+        flatten_bboxes = self.bbox_coder.decode(flatten_anchors,
+                                                flatten_bbox_preds,
+                                                flatten_strides.unsqueeze(-1))
+
+        if with_nms and (flatten_objectness.size(0) == 0):
+            return torch.zeros((0, 5)), torch.zeros((0, ))
+
+        if rescale:
+            flatten_bboxes /= flatten_bboxes.new_tensor(
+                scale_factors).unsqueeze(1)
+        # NOTE(review): zero column presumably is the background class slot.
+        padding = flatten_bboxes.new_zeros(num_imgs, flatten_bboxes.shape[1],
+                                           1)
+        flatten_cls_scores = torch.cat([flatten_cls_scores, padding], dim=-1)
+
+        det_results = []
+        for (bboxes, scores, objectness) in zip(flatten_bboxes,
+                                                flatten_cls_scores,
+                                                flatten_objectness):
+            # Filtering out all predictions with conf < conf_thr
+            conf_thr = cfg.get('conf_thr', -1)
+            if conf_thr > 0:
+                conf_inds = objectness >= conf_thr
+                bboxes = bboxes[conf_inds, :]
+                scores = scores[conf_inds, :]
+                objectness = objectness[conf_inds]
+
+            det_bboxes, det_labels = multiclass_nms(
+                bboxes,
+                scores,
+                cfg.score_thr,
+                cfg.nms,
+                cfg.max_per_img,
+                score_factors=objectness)
+            det_results.append(tuple([det_bboxes, det_labels]))
+        return det_results
+
+    @force_fp32(apply_to=('pred_maps', ))
+    def loss(self,
+             pred_maps,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute the classification, confidence and box losses.
+
+        Args:
+            pred_maps (list[Tensor]): Prediction map for each scale level,
+                shape (N, num_anchors * num_attrib, H, W)
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            img_metas (list[dict]): Meta information of each image; only
+                its length (the batch size) is used here.
+            gt_bboxes_ignore (None | list[Tensor]): Accepted for interface
+                compatibility; not used by this method.
+
+        Returns:
+            dict[str, list[Tensor]]: Per-level loss components.
+        """
+        num_imgs = len(img_metas)
+        device = pred_maps[0][0].device
+        prior_generator = self.prior_generator
+        featmap_sizes = [
+            pred_maps[lvl].shape[-2:] for lvl in range(self.num_levels)
+        ]
+        # Anchors depend only on the feature map sizes, so the same list
+        # object is shared by every image.
+        shared_anchors = prior_generator.grid_priors(
+            featmap_sizes, device=device)
+        anchor_list = [shared_anchors] * num_imgs
+        responsible_flag_list = [
+            prior_generator.responsible_flags(featmap_sizes,
+                                              gt_bboxes[img_id], device)
+            for img_id in range(num_imgs)
+        ]
+
+        target_maps_list, neg_maps_list = self.get_targets(
+            anchor_list, responsible_flag_list, gt_bboxes, gt_labels)
+        per_level_losses = multi_apply(self.loss_single, pred_maps,
+                                       target_maps_list, neg_maps_list)
+        losses_cls, losses_conf, losses_xy, losses_wh = per_level_losses
+
+        return dict(
+            loss_cls=losses_cls,
+            loss_conf=losses_conf,
+            loss_xy=losses_xy,
+            loss_wh=losses_wh)
+
+    def loss_single(self, pred_map, target_map, neg_map):
+        """Compute the four loss terms of a single scale level.
+
+        Args:
+            pred_map (Tensor): Raw predictions for a single level, a 4D
+                tensor whose channels are regrouped here into
+                (batch, num_priors, num_attrib).
+            target_map (Tensor): The Ground-Truth target for a single level;
+                its last dimension is laid out as xy (0:2), wh (2:4),
+                confidence (4) and class labels (5:).
+            neg_map (Tensor): The negative masks for a single level.
+
+        Returns:
+            tuple:
+                loss_cls (Tensor): Classification loss.
+                loss_conf (Tensor): Confidence loss.
+                loss_xy (Tensor): Regression loss of x, y coordinate.
+                loss_wh (Tensor): Regression loss of w, h coordinate.
+        """
+
+        batch_size = len(pred_map)
+        preds = pred_map.permute(0, 2, 3, 1).reshape(
+            batch_size, -1, self.num_attrib)
+
+        # Column 4 of the target map flags the positive priors.
+        target_conf = target_map[..., 4]
+        conf_weight = neg_map.float() + target_conf
+        if torch.max(conf_weight) > 1.:
+            warnings.warn('There is overlap between pos and neg sample.')
+            conf_weight = conf_weight.clamp(min=0., max=1.)
+        pos_weight = target_conf.unsqueeze(dim=-1)
+
+        # Positives weight the class / xy / wh terms; positives and
+        # negatives together weight the confidence term.
+        loss_cls = self.loss_cls(
+            preds[..., 5:], target_map[..., 5:], weight=pos_weight)
+        loss_conf = self.loss_conf(
+            preds[..., 4], target_conf, weight=conf_weight)
+        loss_xy = self.loss_xy(
+            preds[..., :2], target_map[..., :2], weight=pos_weight)
+        loss_wh = self.loss_wh(
+            preds[..., 2:4], target_map[..., 2:4], weight=pos_weight)
+
+        return loss_cls, loss_conf, loss_xy, loss_wh
+
+    def get_targets(self, anchor_list, responsible_flag_list, gt_bboxes_list,
+                    gt_labels_list):
+        """Build the per-level learning targets of a batch.
+
+        Every image is processed by ``_get_targets_single`` and the
+        per-image results are then regrouped by feature level.
+
+        Args:
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image.
+            responsible_flag_list (list[list[Tensor]]): Multi level
+                responsible flags of each image, organized like
+                ``anchor_list``.
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+            gt_labels_list (list[Tensor]): Ground truth labels of each box.
+
+        Returns:
+            tuple:
+                - target_map_list (list[Tensor]): Target map of each level.
+                - neg_map_list (list[Tensor]): Negative map of each level.
+        """
+        num_imgs = len(anchor_list)
+        # Anchor count of each level, read from the first image.
+        level_sizes = [lvl_anchors.size(0) for lvl_anchors in anchor_list[0]]
+
+        per_img_targets, per_img_negs = multi_apply(
+            self._get_targets_single, anchor_list, responsible_flag_list,
+            gt_bboxes_list, gt_labels_list)
+        assert num_imgs == len(per_img_targets) == len(per_img_negs)
+
+        # Regroup the per-image maps into per-level maps.
+        target_maps_list = images_to_levels(per_img_targets, level_sizes)
+        neg_maps_list = images_to_levels(per_img_negs, level_sizes)
+        return target_maps_list, neg_maps_list
+
+    def _get_targets_single(self, anchors, responsible_flags, gt_bboxes,
+                            gt_labels):
+        """Assign the anchors of one image and encode their targets.
+
+        Args:
+            anchors (list[Tensor]): Multi-level anchors of the image.
+            responsible_flags (list[Tensor]): Multi-level responsible flags of
+                anchors
+            gt_bboxes (Tensor): Ground truth bboxes of single image.
+            gt_labels (Tensor): Ground truth labels of single image.
+
+        Returns:
+            tuple:
+                target_map (Tensor): Prediction target map over all levels,
+                    shape (num_total_anchors, 5+num_classes). Rows of
+                    positive anchors hold the encoded box, an objectness
+                    of 1 and the (optionally smoothed) one-hot label; all
+                    other rows are zero.
+                neg_map (Tensor): uint8 map over all levels, shape
+                    (num_total_anchors,), set to 1 at the sampled
+                    negative anchors.
+        """
+
+        # Per-anchor stride, level by level, aligned with the concatenated
+        # anchors below.
+        anchor_strides = torch.cat([
+            torch.tensor(self.featmap_strides[lvl],
+                         device=gt_bboxes.device).repeat(len(lvl_anchors))
+            for lvl, lvl_anchors in enumerate(anchors)
+        ])
+        concat_anchors = torch.cat(anchors)
+        concat_responsible_flags = torch.cat(responsible_flags)
+        assert len(anchor_strides) == len(concat_anchors) == \
+               len(concat_responsible_flags)
+
+        assign_result = self.assigner.assign(
+            concat_anchors, concat_responsible_flags, gt_bboxes)
+        sampling_result = self.sampler.sample(
+            assign_result, concat_anchors, gt_bboxes)
+        pos_inds = sampling_result.pos_inds
+
+        num_anchors = concat_anchors.size(0)
+        target_map = concat_anchors.new_zeros(num_anchors, self.num_attrib)
+        # Columns 0:4 hold the encoded box, column 4 the objectness flag.
+        target_map[pos_inds, :4] = self.bbox_coder.encode(
+            sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes,
+            anchor_strides[pos_inds])
+        target_map[pos_inds, 4] = 1
+
+        one_hot = F.one_hot(gt_labels, num_classes=self.num_classes).float()
+        smoother = self.one_hot_smoother
+        if smoother != 0:  # label smooth
+            one_hot = one_hot * (1 - smoother) + smoother / self.num_classes
+        target_map[pos_inds, 5:] = one_hot[
+            sampling_result.pos_assigned_gt_inds]
+
+        neg_map = concat_anchors.new_zeros(num_anchors, dtype=torch.uint8)
+        neg_map[sampling_result.neg_inds] = 1
+
+        return target_map, neg_map
+
+    def aug_test(self, feats, img_metas, rescale=False):
+        """Test function with test time augmentation.
+
+        Args:
+            feats (list[Tensor]): The outer list indicates test-time
+                augmentations and the inner Tensor should have a shape
+                NxCxHxW, which contains features for all images in the batch.
+            img_metas (list[list[dict]]): The outer list indicates test-time
+                augs (multiscale, flip, etc.) and the inner list indicates
+                images in a batch. Each dict has image information.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[ndarray]: Bbox results of each class.
+        """
+        return self.aug_test_bboxes(feats, img_metas, rescale=rescale)
+
+    @force_fp32(apply_to=('pred_maps', ))
+    def onnx_export(self, pred_maps, img_metas, with_nms=True):
+        """Decode raw head outputs into boxes and scores for ONNX export.
+
+        Args:
+            pred_maps (list[Tensor]): Raw predictions, one 4D tensor per
+                level; must contain `self.num_levels` entries.
+            img_metas (list[dict]): Not used by this method.
+            with_nms (bool): Whether to append ONNX NMS. Defaults to True.
+
+        Returns:
+            The output of `add_dummy_nms_for_onnx` if `with_nms` is True, else
+            a tuple (bboxes, scores) concatenated over all levels along dim 1.
+        """
+        # Function-scope imports, resolved once rather than once per level.
+        from mmdet.core.export import add_dummy_nms_for_onnx, get_k_for_topk
+        cfg = self.test_cfg
+        pred_maps_list = [pred_map.detach() for pred_map in pred_maps]
+        assert len(pred_maps_list) == self.num_levels
+        device = pred_maps_list[0].device
+        batch_size = pred_maps_list[0].shape[0]
+        featmap_sizes = [
+            pred_maps_list[i].shape[-2:] for i in range(self.num_levels)
+        ]
+        mlvl_anchors = self.prior_generator.grid_priors(
+            featmap_sizes, device=device)
+        # convert to tensor to keep tracing
+        nms_pre_tensor = torch.tensor(
+            cfg.get('nms_pre', -1), device=device, dtype=torch.long)
+
+        multi_lvl_bboxes = []
+        multi_lvl_cls_scores = []
+        multi_lvl_conf_scores = []
+        for i in range(self.num_levels):
+            # get some key info for current scale
+            pred_map = pred_maps_list[i]
+            stride = self.featmap_strides[i]
+            # (b,h, w, num_anchors*num_attrib) ->
+            # (b,h*w*num_anchors, num_attrib)
+            pred_map = pred_map.permute(0, 2, 3, 1).reshape(
+                batch_size, -1, self.num_attrib)
+            # Inplace operation like
+            # ```pred_map[..., :2] = \torch.sigmoid(pred_map[..., :2])```
+            # would create constant tensor when exporting to onnx
+            pred_map_conf = torch.sigmoid(pred_map[..., :2])
+            pred_map_rest = pred_map[..., 2:]
+            pred_map = torch.cat([pred_map_conf, pred_map_rest], dim=-1)
+            pred_map_boxes = pred_map[..., :4]
+            multi_lvl_anchor = mlvl_anchors[i]
+            multi_lvl_anchor = multi_lvl_anchor.expand_as(pred_map_boxes)
+            bbox_pred = self.bbox_coder.decode(multi_lvl_anchor,
+                                               pred_map_boxes, stride)
+            # conf and cls
+            conf_pred = torch.sigmoid(pred_map[..., 4])
+            cls_pred = torch.sigmoid(pred_map[..., 5:]).view(
+                batch_size, -1, self.num_classes)  # Cls pred one-hot.
+
+            # Get top-k prediction
+            nms_pre = get_k_for_topk(nms_pre_tensor, bbox_pred.shape[1])
+            if nms_pre > 0:
+                _, topk_inds = conf_pred.topk(nms_pre)
+                # Build on `device`: adding a CPU index to a CUDA one fails.
+                batch_inds = torch.arange(batch_size, device=device).view(
+                    -1, 1).expand_as(topk_inds).long()
+                # Avoid onnx2tensorrt issue in https://github.com/NVIDIA/TensorRT/issues/1134 # noqa: E501
+                transformed_inds = (
+                    bbox_pred.shape[1] * batch_inds + topk_inds)
+                bbox_pred = bbox_pred.reshape(
+                    -1, 4)[transformed_inds, :].reshape(batch_size, -1, 4)
+                cls_pred = cls_pred.reshape(
+                    -1, self.num_classes)[transformed_inds, :].reshape(
+                        batch_size, -1, self.num_classes)
+                conf_pred = conf_pred.reshape(-1, 1)[transformed_inds].reshape(
+                    batch_size, -1)
+            # Save the result of current scale
+            multi_lvl_bboxes.append(bbox_pred)
+            multi_lvl_cls_scores.append(cls_pred)
+            multi_lvl_conf_scores.append(conf_pred)
+
+        # Merge the results of different scales together
+        batch_mlvl_bboxes = torch.cat(multi_lvl_bboxes, dim=1)
+        batch_mlvl_scores = torch.cat(multi_lvl_cls_scores, dim=1)
+        batch_mlvl_conf_scores = torch.cat(multi_lvl_conf_scores, dim=1)
+
+        # Replace multiclass_nms with ONNX::NonMaxSuppression in deployment
+        conf_thr = cfg.get('conf_thr', -1)
+        score_thr = cfg.get('score_thr', -1)
+        # follow original pipeline of YOLOv3
+        if conf_thr > 0:
+            mask = (batch_mlvl_conf_scores >= conf_thr).float()
+            batch_mlvl_conf_scores *= mask
+        if score_thr > 0:
+            mask = (batch_mlvl_scores > score_thr).float()
+            batch_mlvl_scores *= mask
+        batch_mlvl_conf_scores = batch_mlvl_conf_scores.unsqueeze(2).expand_as(
+            batch_mlvl_scores)
+        batch_mlvl_scores = batch_mlvl_scores * batch_mlvl_conf_scores
+        if with_nms:
+            max_output_boxes_per_class = cfg.nms.get(
+                'max_output_boxes_per_class', 200)
+            iou_threshold = cfg.nms.get('iou_threshold', 0.5)
+            # keep aligned with original pipeline, improve
+            # mAP by 1% for YOLOv3 in ONNX
+            score_threshold = 0
+            nms_pre = cfg.get('deploy_nms_pre', -1)
+            return add_dummy_nms_for_onnx(
+                batch_mlvl_bboxes, batch_mlvl_scores,
+                max_output_boxes_per_class, iou_threshold, score_threshold,
+                nms_pre, cfg.max_per_img)
+        return batch_mlvl_bboxes, batch_mlvl_scores
diff --git a/mmdet/models/dense_heads/yolof_head.py b/mmdet/models/dense_heads/yolof_head.py
new file mode 100755
index 0000000..1063524
--- /dev/null
+++ b/mmdet/models/dense_heads/yolof_head.py
@@ -0,0 +1,416 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import (ConvModule, bias_init_with_prob, constant_init, is_norm,
+                      normal_init)
+from mmcv.runner import force_fp32
+
+from mmdet.core import anchor_inside_flags, multi_apply, reduce_mean, unmap
+from ..builder import HEADS
+from .anchor_head import AnchorHead
+
+INF = 1e8  # upper clamp for the exp() terms in YOLOFHead.forward_single
+
+
+def levels_to_images(mlvl_tensor):
+    """Concat multi-level feature maps by image.
+
+    [feature_level0, feature_level1...] -> [feature_image0, feature_image1...]
+    Convert the shape of each element in mlvl_tensor from (N, C, H, W) to
+    (N, H*W, C), then split the element to N elements with shape (H*W, C), and
+    concat elements in same image of all level along first dimension.
+
+    Args:
+        mlvl_tensor (list[torch.Tensor]): list of Tensor which collect from
+            corresponding level. Each element is of shape (N, C, H, W)
+
+    Returns:
+        list[torch.Tensor]: A list that contains N tensors and each tensor is
+            of shape (num_elements, C)
+    """
+    batch_size = mlvl_tensor[0].size(0)
+    channels = mlvl_tensor[0].size(1)
+    batch_list = [[] for _ in range(batch_size)]
+    for t in mlvl_tensor:
+        # `reshape` (unlike `view`) also accepts non-contiguous level tensors.
+        t = t.permute(0, 2, 3, 1).reshape(batch_size, -1, channels)
+        for img in range(batch_size):
+            batch_list[img].append(t[img])
+    return [torch.cat(item, 0) for item in batch_list]
+
+
+@HEADS.register_module()
+class YOLOFHead(AnchorHead):
+    """YOLOFHead Paper link: https://arxiv.org/abs/2103.09460.
+
+    Args:
+        num_classes (int): The number of object classes (w/o background)
+        in_channels (List[int]): The number of input channels per scale.
+        num_cls_convs (int): The number of convolutions of cls branch.
+           Default 2.
+        num_reg_convs (int): The number of convolutions of reg branch.
+           Default 4.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 num_cls_convs=2,
+                 num_reg_convs=4,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 **kwargs):
+        self.num_cls_convs = num_cls_convs  # read by _init_layers()
+        self.num_reg_convs = num_reg_convs  # read by _init_layers()
+        self.norm_cfg = norm_cfg  # norm config of each subnet ConvModule
+        super(YOLOFHead, self).__init__(num_classes, in_channels, **kwargs)
+
+    def _init_layers(self):
+        """Create the layers of the head.
+
+        Two towers of stacked 3x3 ``ConvModule`` blocks, ``cls_subnet`` and
+        ``bbox_subnet``, plus three 3x3 prediction convs:
+
+        - ``cls_score``: ``num_base_priors * num_classes`` output channels.
+        - ``bbox_pred``: ``num_base_priors * 4`` output channels.
+        - ``object_pred``: ``num_base_priors`` output channels.
+        """
+
+        def make_tower(depth):
+            # `depth` stacked 3x3 ConvModules that keep the channel count.
+            return nn.Sequential(*[
+                ConvModule(
+                    self.in_channels,
+                    self.in_channels,
+                    kernel_size=3,
+                    padding=1,
+                    norm_cfg=self.norm_cfg) for _ in range(depth)
+            ])
+
+        def make_predictor(out_channels):
+            # 3x3, stride-1, padding-1 conv: spatial size is preserved.
+            return nn.Conv2d(
+                self.in_channels,
+                out_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1)
+
+        # Towers for the classification and the regression branch.
+        self.cls_subnet = make_tower(self.num_cls_convs)
+        self.bbox_subnet = make_tower(self.num_reg_convs)
+        # Prediction layers applied on top of the towers.
+        self.cls_score = make_predictor(
+            self.num_base_priors * self.num_classes)
+        self.bbox_pred = make_predictor(self.num_base_priors * 4)
+        self.object_pred = make_predictor(self.num_base_priors)
+
+    def init_weights(self):
+        """Initialise conv and norm layers, then the classification bias."""
+        for module in self.modules():
+            if isinstance(module, nn.Conv2d):
+                normal_init(module, mean=0, std=0.01)
+            if is_norm(module):
+                constant_init(module, 1)
+
+        # Use prior in model initialization to improve stability
+        nn.init.constant_(self.cls_score.bias, bias_init_with_prob(0.01))
+
+    def forward_single(self, feature):
+        """Return (normalized_cls_score, bbox_reg) for one feature map."""
+        cls_logits = self.cls_score(self.cls_subnet(feature))
+        N, _, H, W = cls_logits.shape
+        # Split the channel dim so objectness broadcasts over the classes.
+        cls_logits = cls_logits.view(N, -1, self.num_classes, H, W)
+
+        reg_feat = self.bbox_subnet(feature)
+        bbox_reg = self.bbox_pred(reg_feat)
+        obj_logits = self.object_pred(reg_feat).view(N, -1, 1, H, W)
+
+        # implicit objectness; both exp() terms are clamped at INF
+        log_norm = torch.log(1. + torch.clamp(cls_logits.exp(), max=INF) +
+                             torch.clamp(obj_logits.exp(), max=INF))
+        normalized_cls_score = cls_logits + obj_logits - log_norm
+        return normalized_cls_score.view(N, -1, H, W), bbox_reg
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores of the single scale level,
+                with shape (batch, num_anchors * num_classes, h, w).
+            bbox_preds (list[Tensor]): Box energies / deltas of the single
+                scale level, with shape (batch, num_anchors * 4, h, w).
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
+                boxes can be ignored when computing the loss. Default: None
+
+        Returns:
+            dict[str, Tensor]: Loss components; None if `get_targets` is None.
+        """
+        assert len(cls_scores) == 1
+        assert self.prior_generator.num_levels == 1
+
+        device = cls_scores[0].device
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+
+        # Single output level (asserted above): keep level 0 of each image
+        anchor_list = [anchors[0] for anchors in anchor_list]
+        valid_flag_list = [valid_flags[0] for valid_flags in valid_flag_list]
+
+        cls_scores_list = levels_to_images(cls_scores)
+        bbox_preds_list = levels_to_images(bbox_preds)
+
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+        cls_reg_targets = self.get_targets(
+            cls_scores_list,
+            bbox_preds_list,
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=label_channels)
+        if cls_reg_targets is None:
+            return None
+        (batch_labels, batch_label_weights, num_total_pos, num_total_neg,
+         batch_bbox_weights, batch_pos_predicted_boxes,
+         batch_target_boxes) = cls_reg_targets
+
+        flatten_labels = batch_labels.reshape(-1)
+        batch_label_weights = batch_label_weights.reshape(-1)
+        cls_score = cls_scores[0].permute(0, 2, 3,
+                                          1).reshape(-1, self.cls_out_channels)
+
+        num_total_samples = (num_total_pos +
+                             num_total_neg) if self.sampling else num_total_pos
+        num_total_samples = reduce_mean(
+            cls_score.new_tensor(num_total_samples)).clamp_(1.0).item()
+
+        # classification loss
+        loss_cls = self.loss_cls(
+            cls_score,
+            flatten_labels,
+            batch_label_weights,
+            avg_factor=num_total_samples)
+
+        # regression loss
+        if batch_pos_predicted_boxes.shape[0] == 0:
+            # no pos sample: zero loss built from the (empty) predictions
+            loss_bbox = batch_pos_predicted_boxes.sum() * 0
+        else:
+            loss_bbox = self.loss_bbox(
+                batch_pos_predicted_boxes,
+                batch_target_boxes,
+                batch_bbox_weights.float(),
+                avg_factor=num_total_samples)
+
+        return dict(loss_cls=loss_cls, loss_bbox=loss_bbox)
+
+    def get_targets(self,
+                    cls_scores_list,
+                    bbox_preds_list,
+                    anchor_list,
+                    valid_flag_list,
+                    gt_bboxes_list,
+                    img_metas,
+                    gt_bboxes_ignore_list=None,
+                    gt_labels_list=None,
+                    label_channels=1,
+                    unmap_outputs=True):
+        """Compute regression and classification targets for anchors in
+        multiple images.
+
+        Args:
+            cls_scores_list (list[Tensor]): Classification scores of
+                each image. Each is a 2D tensor of shape
+                (h * w, num_anchors * num_classes). Not used by this method.
+            bbox_preds_list (list[Tensor]): Bbox preds of each image.
+                Each is a 2D tensor of shape (h * w, num_anchors * 4).
+            anchor_list (list[Tensor]): Anchors of each image. Each element
+                is a tensor of shape (h * w * num_anchors, 4).
+            valid_flag_list (list[Tensor]): Valid flags of each image. Each
+               element is a tensor of shape (h * w * num_anchors, )
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+            img_metas (list[dict]): Meta info of each image.
+            gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be
+                ignored.
+            gt_labels_list (list[Tensor]): Ground truth labels of each box.
+            label_channels (int): Channel of label.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple: Learning targets, or None if any image has no labels.
+
+                - batch_labels (Tensor): Labels of all images, a tensor \
+                    of shape (batch, h * w * num_anchors)
+                - batch_label_weights (Tensor): Label weights of all images, \
+                    a tensor of shape (batch, h * w * num_anchors)
+                - num_total_pos (int): Number of positive samples in all \
+                    images.
+                - num_total_neg (int): Number of negative samples in all \
+                    images.
+            additional_returns: This function enables user-defined returns from
+                `self._get_targets_single`. These returns are currently refined
+                to properties at each feature map (i.e. having HxW dimension).
+                Each of them is concatenated over the images along dim 0.
+        """
+        num_imgs = len(img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        # compute targets for each image
+        if gt_bboxes_ignore_list is None:
+            gt_bboxes_ignore_list = [None for _ in range(num_imgs)]
+        if gt_labels_list is None:
+            gt_labels_list = [None for _ in range(num_imgs)]
+        results = multi_apply(
+            self._get_targets_single,
+            bbox_preds_list,
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes_list,
+            gt_bboxes_ignore_list,
+            gt_labels_list,
+            img_metas,
+            label_channels=label_channels,
+            unmap_outputs=unmap_outputs)
+        (all_labels, all_label_weights, pos_inds_list, neg_inds_list,
+         sampling_results_list) = results[:5]
+        rest_results = list(results[5:])  # user-added return values
+        # no valid anchors
+        if any([labels is None for labels in all_labels]):
+            return None
+        # sampled anchors of all images; each image counts as at least 1
+        num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list])
+        num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list])
+
+        batch_labels = torch.stack(all_labels, 0)
+        batch_label_weights = torch.stack(all_label_weights, 0)
+
+        res = (batch_labels, batch_label_weights, num_total_pos, num_total_neg)
+        for i, rests in enumerate(rest_results):  # user-added return values
+            rest_results[i] = torch.cat(rests, 0)
+
+        return res + tuple(rest_results)
+
+    def _get_targets_single(self,
+                            bbox_preds,
+                            flat_anchors,
+                            valid_flags,
+                            gt_bboxes,
+                            gt_bboxes_ignore,
+                            gt_labels,
+                            img_meta,
+                            label_channels=1,
+                            unmap_outputs=True):
+        """Compute regression and classification targets for anchors in a
+        single image.
+
+        Args:
+            bbox_preds (Tensor): Bbox prediction of the image, which
+                shape is (h * w, num_anchors * 4)
+            flat_anchors (Tensor): Anchors of the image, which shape is
+                (h * w * num_anchors ,4)
+            valid_flags (Tensor): Valid flags of the image, which shape is
+                (h * w * num_anchors,).
+            gt_bboxes (Tensor): Ground truth bboxes of the image,
+                shape (num_gts, 4).
+            gt_bboxes_ignore (Tensor): Ground truth bboxes to be
+                ignored, shape (num_ignored_gts, 4).
+            gt_labels (Tensor): Ground truth labels of each box,
+                shape (num_gts,).
+            img_meta (dict): Meta info of the image.
+            label_channels (int): Channel of label.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple: Eight Nones if no anchor is inside the image, otherwise:
+                labels (Tensor): Labels of image, which shape is
+                    (h * w * num_anchors, ).
+                label_weights (Tensor): Label weights of image, which shape is
+                    (h * w * num_anchors, ).
+                pos_inds (Tensor): Pos index of image.
+                neg_inds (Tensor): Neg index of image.
+                sampling_result (obj:`SamplingResult`): Sampling result.
+                pos_bbox_weights (Tensor): The Weight of using to calculate
+                    the bbox branch loss, which shape is (num, ).
+                pos_predicted_boxes (Tensor): boxes predicted value of
+                    using to calculate the bbox branch loss, which shape is
+                    (num, 4).
+                pos_target_boxes (Tensor): boxes target value of
+                    using to calculate the bbox branch loss, which shape is
+                    (num, 4).
+        """
+        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                           img_meta['img_shape'][:2],
+                                           self.train_cfg.allowed_border)
+        if not inside_flags.any():
+            return (None, ) * 8  # one None per regular return value
+        # assign gt and sample anchors
+        anchors = flat_anchors[inside_flags, :]
+        bbox_preds = bbox_preds.reshape(-1, 4)  # one row per anchor
+        bbox_preds = bbox_preds[inside_flags, :]
+
+        # decoded bbox
+        decoder_bbox_preds = self.bbox_coder.decode(anchors, bbox_preds)
+        assign_result = self.assigner.assign(
+            decoder_bbox_preds, anchors, gt_bboxes, gt_bboxes_ignore,
+            None if self.sampling else gt_labels)
+
+        # extra outputs stored on the assign result by the assigner
+        pos_bbox_weights = assign_result.get_extra_property('pos_idx')
+        pos_predicted_boxes = assign_result.get_extra_property(
+            'pos_predicted_boxes')
+        pos_target_boxes = assign_result.get_extra_property('target_boxes')
+
+        sampling_result = self.sampler.sample(assign_result, anchors,
+                                              gt_bboxes)
+        num_valid_anchors = anchors.shape[0]
+        labels = anchors.new_full((num_valid_anchors, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            if gt_labels is None:
+                # Only rpn gives gt_labels as None
+                # Foreground is the first class since v2.5.0
+                labels[pos_inds] = 0
+            else:
+                labels[pos_inds] = gt_labels[
+                    sampling_result.pos_assigned_gt_inds]
+            if self.train_cfg.pos_weight <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg.pos_weight
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_anchors.size(0)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags,
+                fill=self.num_classes)  # fill bg label
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+
+        return (labels, label_weights, pos_inds, neg_inds, sampling_result,
+                pos_bbox_weights, pos_predicted_boxes, pos_target_boxes)
diff --git a/mmdet/models/dense_heads/yolox_head.py b/mmdet/models/dense_heads/yolox_head.py
new file mode 100755
index 0000000..f317e14
--- /dev/null
+++ b/mmdet/models/dense_heads/yolox_head.py
@@ -0,0 +1,493 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule,
+                      bias_init_with_prob)
+from mmcv.ops.nms import batched_nms
+from mmcv.runner import force_fp32
+
+from mmdet.core import (MlvlPointGenerator, bbox_xyxy_to_cxcywh,
+                        build_assigner, build_sampler, multi_apply,
+                        reduce_mean)
+from ..builder import HEADS, build_loss
+from .base_dense_head import BaseDenseHead
+from .dense_test_mixins import BBoxTestMixin
+
+
+@HEADS.register_module()
+class YOLOXHead(BaseDenseHead, BBoxTestMixin):
+    """YOLOXHead head used in `YOLOX <https://arxiv.org/abs/2107.08430>`_.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels in stacking convs.
+            Default: 256
+        stacked_convs (int): Number of stacking convs of the head.
+            Default: 2.
+        strides (tuple): Downsample factor of each feature map.
+        use_depthwise (bool): Whether to use depthwise separable convolution
+            in blocks. Default: False.
+        dcn_on_last_conv (bool): If true, use dcn in the last layer of
+            towers. Default: False.
+        conv_bias (bool | str): If specified as `auto`, it will be decided by
+            the norm_cfg. Bias of conv will be set as True if `norm_cfg` is
+            None, otherwise False. Default: "auto".
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Config dict for normalization layer. Default: BN.
+        act_cfg (dict): Config dict for activation layer. Default: Swish.
+        loss_cls (dict): Config of classification loss.
+        loss_bbox (dict): Config of localization loss.
+        loss_obj (dict): Config of objectness loss.
+        loss_l1 (dict): Config of L1 loss.
+        train_cfg (dict): Training config of anchor head.
+        test_cfg (dict): Testing config of anchor head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 feat_channels=256,
+                 stacked_convs=2,
+                 strides=(8, 16, 32),
+                 use_depthwise=False,
+                 dcn_on_last_conv=False,
+                 conv_bias='auto',
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+                 act_cfg=dict(type='Swish'),
+                 loss_cls=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     reduction='sum',
+                     loss_weight=1.0),
+                 loss_bbox=dict(
+                     type='IoULoss',
+                     mode='square',
+                     eps=1e-16,
+                     reduction='sum',
+                     loss_weight=5.0),
+                 loss_obj=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     reduction='sum',
+                     loss_weight=1.0),
+                 loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=dict(
+                     type='Kaiming',
+                     layer='Conv2d',
+                     a=math.sqrt(5),
+                     distribution='uniform',
+                     mode='fan_in',
+                     nonlinearity='leaky_relu')):
+        """Build losses, prior generator, assigner/sampler and conv layers."""
+        super().__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.cls_out_channels = num_classes  # no extra background channel
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.stacked_convs = stacked_convs
+        self.strides = strides  # immutable default: not shared state
+        self.use_depthwise = use_depthwise
+        self.dcn_on_last_conv = dcn_on_last_conv
+        assert conv_bias == 'auto' or isinstance(conv_bias, bool)
+        self.conv_bias = conv_bias
+        self.use_sigmoid_cls = True
+
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_bbox = build_loss(loss_bbox)
+        self.loss_obj = build_loss(loss_obj)
+
+        self.use_l1 = False  # This flag will be modified by hooks.
+        self.loss_l1 = build_loss(loss_l1)
+
+        self.prior_generator = MlvlPointGenerator(strides, offset=0)
+
+        self.test_cfg = test_cfg
+        self.train_cfg = train_cfg
+
+        self.sampling = False
+        if self.train_cfg:
+            self.assigner = build_assigner(self.train_cfg.assigner)
+            # sampling=False so use PseudoSampler
+            sampler_cfg = dict(type='PseudoSampler')
+            self.sampler = build_sampler(sampler_cfg, context=self)
+
+        self.fp16_enabled = False
+        self._init_layers()
+
+    def _init_layers(self):  # one tower pair + predictor triple per stride
+        self.multi_level_cls_convs = nn.ModuleList()
+        self.multi_level_reg_convs = nn.ModuleList()
+        self.multi_level_conv_cls = nn.ModuleList()
+        self.multi_level_conv_reg = nn.ModuleList()
+        self.multi_level_conv_obj = nn.ModuleList()
+        for _ in self.strides:  # each level gets freshly built modules
+            self.multi_level_cls_convs.append(self._build_stacked_convs())
+            self.multi_level_reg_convs.append(self._build_stacked_convs())
+            conv_cls, conv_reg, conv_obj = self._build_predictor()
+            self.multi_level_conv_cls.append(conv_cls)
+            self.multi_level_conv_reg.append(conv_reg)
+            self.multi_level_conv_obj.append(conv_obj)
+
+    def _build_stacked_convs(self):
+        """Build one tower of stacked 3x3 conv blocks for a single level."""
+        if self.use_depthwise:
+            block_type = DepthwiseSeparableConvModule
+        else:
+            block_type = ConvModule
+        last_idx = self.stacked_convs - 1
+        # The first block maps in_channels -> feat_channels; only the final
+        # block is switched to DCNv2 when dcn_on_last_conv is set.
+        layers = [
+            block_type(
+                self.in_channels if idx == 0 else self.feat_channels,
+                self.feat_channels,
+                3,
+                stride=1,
+                padding=1,
+                conv_cfg=dict(type='DCNv2') if
+                (self.dcn_on_last_conv and idx == last_idx) else self.conv_cfg,
+                norm_cfg=self.norm_cfg,
+                act_cfg=self.act_cfg,
+                bias=self.conv_bias) for idx in range(self.stacked_convs)
+        ]
+        return nn.Sequential(*layers)
+
+    def _build_predictor(self):
+        """Build the 1x1 cls / reg / objectness predictors of one level."""
+        width = self.feat_channels
+        conv_cls = nn.Conv2d(width, self.cls_out_channels, 1)
+        conv_reg, conv_obj = nn.Conv2d(width, 4, 1), nn.Conv2d(width, 1, 1)
+        return conv_cls, conv_reg, conv_obj
+
+    def init_weights(self):
+        """Run the base init, then reset every cls/obj predictor bias."""
+        super(YOLOXHead, self).init_weights()
+        # Use prior in model initialization to improve stability
+        prior_bias = bias_init_with_prob(0.01)
+        for predictor in (*self.multi_level_conv_cls,
+                          *self.multi_level_conv_obj):
+            predictor.bias.data.fill_(prior_bias)
+
+    def forward_single(self, x, cls_convs, reg_convs, conv_cls, conv_reg,
+                       conv_obj):
+        """Run the cls and reg branches of one level on feature map ``x``.
+
+        Returns:
+            tuple: ``(cls_score, bbox_pred, objectness)``; objectness is
+                predicted from the regression-branch features.
+        """
+        cls_score = conv_cls(cls_convs(x))
+        reg_feat = reg_convs(x)
+        bbox_pred, objectness = conv_reg(reg_feat), conv_obj(reg_feat)
+        return cls_score, bbox_pred, objectness
+
+    def forward(self, feats):
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+        Returns:
+            tuple: Output of ``multi_apply`` over ``forward_single``; the
+                head consumes it as (cls_scores, bbox_preds, objectnesses).
+        """
+
+        per_level_modules = (self.multi_level_cls_convs,
+                             self.multi_level_reg_convs,
+                             self.multi_level_conv_cls,
+                             self.multi_level_conv_reg,
+                             self.multi_level_conv_obj)
+        return multi_apply(self.forward_single, feats, *per_level_modules)
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'objectnesses'))
+    def get_bboxes(self,
+                   cls_scores,
+                   bbox_preds,
+                   objectnesses,
+                   img_metas=None,
+                   cfg=None,
+                   rescale=False,
+                   with_nms=True):
+        """Transform network outputs of a batch into bbox results.
+        Args:
+            cls_scores (list[Tensor]): Classification scores for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 4, H, W).
+            objectnesses (list[Tensor], Optional): Score factor for
+                all scale level, each is a 4D-tensor, has shape
+                (batch_size, 1, H, W).
+            img_metas (list[dict], Optional): Image meta info; only needed
+                (for ``scale_factor``) when ``rescale`` is True. Default None.
+            cfg (mmcv.Config, Optional): Test / postprocessing configuration,
+                if None, test_cfg would be used.  Default None.
+            rescale (bool): If True, return boxes in original image space.
+                Default False.
+            with_nms (bool): Unused here; NMS is always applied. Default True.
+        Returns:
+            list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple.
+                The first item is an (n, 5) tensor, where the first 4 columns
+                are bounding box positions (tl_x, tl_y, br_x, br_y) and the
+                5-th column is a score between 0 and 1. The second item is a
+                (n,) tensor where each item is the predicted class label of
+                the corresponding box.
+        """
+        assert len(cls_scores) == len(bbox_preds) == len(objectnesses)
+        cfg = self.test_cfg if cfg is None else cfg
+        # Batch size comes from the predictions when no img_metas are given.
+        num_imgs = (
+            len(img_metas) if img_metas is not None else cls_scores[0].size(0))
+
+        featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]
+        mlvl_priors = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=cls_scores[0].dtype,
+            device=cls_scores[0].device,
+            with_stride=True)
+
+        # flatten cls_scores, bbox_preds and objectness
+        flatten_cls_scores = [
+            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
+                                                  self.cls_out_channels)
+            for cls_score in cls_scores
+        ]
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
+            for bbox_pred in bbox_preds
+        ]
+        flatten_objectness = [
+            objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1)
+            for objectness in objectnesses
+        ]
+
+        flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid()
+        flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
+        flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid()
+        flatten_priors = torch.cat(mlvl_priors)
+
+        flatten_bboxes = self._bbox_decode(flatten_priors, flatten_bbox_preds)
+
+        if rescale:
+            # Scale factors are only read when boxes must be mapped back.
+            scale_factors = np.array(
+                [img_meta['scale_factor'] for img_meta in img_metas])
+            flatten_bboxes[..., :4] /= flatten_bboxes.new_tensor(
+                scale_factors).unsqueeze(1)
+
+        # One (dets, labels) pair per image, in batch order.
+        return [
+            self._bboxes_nms(flatten_cls_scores[img_id],
+                             flatten_bboxes[img_id],
+                             flatten_objectness[img_id], cfg)
+            for img_id in range(num_imgs)
+        ]
+
+    def _bbox_decode(self, priors, bbox_preds):
+        """Decode (dx, dy, log w, log h) predictions into xyxy boxes.
+
+        ``priors[:, :2]`` are the grid points, ``priors[:, 2:]`` the strides.
+        """
+        strides = priors[:, 2:]
+        centers = bbox_preds[..., :2] * strides + priors[:, :2]
+        half_sizes = bbox_preds[..., 2:].exp() * strides / 2
+        # [tl_x, tl_y] followed by [br_x, br_y] along the last dimension
+        top_left, bottom_right = centers - half_sizes, centers + half_sizes
+        return torch.cat([top_left, bottom_right], -1)
+
+    def _bboxes_nms(self, cls_scores, bboxes, score_factor, cfg):
+        """Filter by score_thr and run NMS; returns (dets (n, 5), labels)."""
+        max_scores, labels = torch.max(cls_scores, 1)
+        valid_mask = score_factor * max_scores >= cfg.score_thr
+        bboxes = bboxes[valid_mask]
+        scores = max_scores[valid_mask] * score_factor[valid_mask]
+        labels = labels[valid_mask]
+
+        if labels.numel() == 0:
+            # Keep the (n, 5) box+score layout of the NMS path when empty.
+            return torch.cat([bboxes, scores[:, None]], -1), labels
+        dets, keep = batched_nms(bboxes, scores, labels, cfg.nms)
+        return dets, labels[keep]
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'objectnesses'))
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             objectnesses,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute loss of the head.
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level,
+                each a 4D-tensor with num_priors * num_classes channels.
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level, each a 4D-tensor with num_priors * 4 channels.
+            objectnesses (list[Tensor], Optional): Score factor for each
+                scale level, each of shape (batch_size, 1, H, W).
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (None | list[Tensor]): Unused by this head.
+        Returns:
+            dict[str, Tensor]: ``loss_cls``, ``loss_bbox`` and ``loss_obj``,
+                plus ``loss_l1`` when ``self.use_l1`` is set; each is divided
+                by ``num_total_samples`` (reduced positive count, >= 1).
+        """
+        num_imgs = len(img_metas)
+        featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]
+        mlvl_priors = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=cls_scores[0].dtype,
+            device=cls_scores[0].device,
+            with_stride=True)
+
+        flatten_cls_preds = [
+            cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
+                                                 self.cls_out_channels)
+            for cls_pred in cls_scores
+        ]
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
+            for bbox_pred in bbox_preds
+        ]
+        flatten_objectness = [
+            objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1)
+            for objectness in objectnesses
+        ]
+
+        flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1)
+        flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
+        flatten_objectness = torch.cat(flatten_objectness, dim=1)
+        flatten_priors = torch.cat(mlvl_priors)  # same priors for each image
+        flatten_bboxes = self._bbox_decode(flatten_priors, flatten_bbox_preds)
+
+        (pos_masks, cls_targets, obj_targets, bbox_targets, l1_targets,
+         num_fg_imgs) = multi_apply(
+             self._get_target_single, flatten_cls_preds.detach(),
+             flatten_objectness.detach(),  # targets use detached predictions
+             flatten_priors.unsqueeze(0).repeat(num_imgs, 1, 1),
+             flatten_bboxes.detach(), gt_bboxes, gt_labels)
+
+        # The experimental results show that ‘reduce_mean’ can improve
+        # performance on the COCO dataset.
+        num_pos = torch.tensor(
+            sum(num_fg_imgs),
+            dtype=torch.float,
+            device=flatten_cls_preds.device)
+        num_total_samples = max(reduce_mean(num_pos), 1.0)  # avoid /0
+
+        pos_masks = torch.cat(pos_masks, 0)
+        cls_targets = torch.cat(cls_targets, 0)
+        obj_targets = torch.cat(obj_targets, 0)
+        bbox_targets = torch.cat(bbox_targets, 0)
+        if self.use_l1:
+            l1_targets = torch.cat(l1_targets, 0)
+
+        loss_bbox = self.loss_bbox(
+            flatten_bboxes.view(-1, 4)[pos_masks],  # positives only
+            bbox_targets) / num_total_samples
+        loss_obj = self.loss_obj(flatten_objectness.view(-1, 1),  # all priors
+                                 obj_targets) / num_total_samples
+        loss_cls = self.loss_cls(
+            flatten_cls_preds.view(-1, self.num_classes)[pos_masks],
+            cls_targets) / num_total_samples
+
+        loss_dict = dict(
+            loss_cls=loss_cls, loss_bbox=loss_bbox, loss_obj=loss_obj)
+
+        if self.use_l1:
+            loss_l1 = self.loss_l1(
+                flatten_bbox_preds.view(-1, 4)[pos_masks],  # raw, undecoded
+                l1_targets) / num_total_samples
+            loss_dict.update(loss_l1=loss_l1)
+
+        return loss_dict
+
+    @torch.no_grad()
+    def _get_target_single(self, cls_preds, objectness, priors, decoded_bboxes,
+                           gt_bboxes, gt_labels):
+        """Compute classification, regression, and objectness targets for
+        priors in a single image.
+        Args:
+            cls_preds (Tensor): Class logits, [num_priors, num_classes].
+            objectness (Tensor): Objectness logits, [num_priors].
+            priors (Tensor): All priors of one image, [num_priors, 4] in
+                [cx, cy, stride_w, stride_h] format.
+            decoded_bboxes (Tensor): Decoded predictions, [num_priors, 4] in
+                [tl_x, tl_y, br_x, br_y] format.
+            gt_bboxes (Tensor): Ground truth bboxes, [num_gts, 4] in
+                [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (Tensor): Ground truth labels, [num_gts].
+        Returns:
+            tuple: (foreground_mask, cls_target, obj_target, bbox_target,
+                l1_target, num_pos_per_img); l1_target stays all-zero unless
+                ``self.use_l1`` is set.
+        """
+
+        num_priors = priors.size(0)
+        num_gts = gt_labels.size(0)
+        gt_bboxes = gt_bboxes.to(decoded_bboxes.dtype)
+        # No target
+        if num_gts == 0:
+            cls_target = cls_preds.new_zeros((0, self.num_classes))
+            bbox_target = cls_preds.new_zeros((0, 4))
+            l1_target = cls_preds.new_zeros((0, 4))
+            obj_target = cls_preds.new_zeros((num_priors, 1))
+            foreground_mask = cls_preds.new_zeros(num_priors).bool()
+            return (foreground_mask, cls_target, obj_target, bbox_target,
+                    l1_target, 0)
+
+        # YOLOX uses center priors with 0.5 offset to assign targets,
+        # but use center priors without offset to regress bboxes.
+        offset_priors = torch.cat(
+            [priors[:, :2] + priors[:, 2:] * 0.5, priors[:, 2:]], dim=-1)
+
+        assign_result = self.assigner.assign(
+            cls_preds.sigmoid() * objectness.unsqueeze(1).sigmoid(),
+            offset_priors, decoded_bboxes, gt_bboxes, gt_labels)
+
+        sampling_result = self.sampler.sample(assign_result, priors, gt_bboxes)
+        pos_inds = sampling_result.pos_inds
+        num_pos_per_img = pos_inds.size(0)
+
+        pos_ious = assign_result.max_overlaps[pos_inds]
+        # IOU aware classification score
+        cls_target = F.one_hot(sampling_result.pos_gt_labels,
+                               self.num_classes) * pos_ious.unsqueeze(-1)
+        obj_target = torch.zeros_like(objectness).unsqueeze(-1)
+        obj_target[pos_inds] = 1  # objectness target is 1 on positives only
+        bbox_target = sampling_result.pos_gt_bboxes
+        l1_target = cls_preds.new_zeros((num_pos_per_img, 4))
+        if self.use_l1:
+            l1_target = self._get_l1_target(l1_target, bbox_target,
+                                            priors[pos_inds])
+        foreground_mask = torch.zeros_like(objectness).to(torch.bool)
+        foreground_mask[pos_inds] = 1  # True where a prior is positive
+        return (foreground_mask, cls_target, obj_target, bbox_target,
+                l1_target, num_pos_per_img)
+
+    def _get_l1_target(self, l1_target, gt_bboxes, priors, eps=1e-8):
+        """Write stride-normalized (dx, dy, log w, log h) into l1_target."""
+        centers, sizes = bbox_xyxy_to_cxcywh(gt_bboxes).split(2, dim=-1)
+        l1_target[:, :2] = (centers - priors[:, :2]) / priors[:, 2:]
+        l1_target[:, 2:] = torch.log(sizes / priors[:, 2:] + eps)
+        return l1_target
diff --git a/mmdet/models/detectors/__init__.py b/mmdet/models/detectors/__init__.py
new file mode 100755
index 0000000..a0a89b8
--- /dev/null
+++ b/mmdet/models/detectors/__init__.py
@@ -0,0 +1,58 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .atss import ATSS
+from .autoassign import AutoAssign
+from .base import BaseDetector
+from .cascade_rcnn import CascadeRCNN
+from .centernet import CenterNet
+from .cornernet import CornerNet
+from .ddod import DDOD
+from .deformable_detr import DeformableDETR
+from .detr import DETR
+from .fast_rcnn import FastRCNN
+from .faster_rcnn import FasterRCNN
+from .fcos import FCOS
+from .fovea import FOVEA
+from .fsaf import FSAF
+from .gfl import GFL
+from .grid_rcnn import GridRCNN
+from .htc import HybridTaskCascade
+from .kd_one_stage import KnowledgeDistillationSingleStageDetector
+from .lad import LAD
+from .mask2former import Mask2Former
+from .mask_rcnn import MaskRCNN
+from .mask_scoring_rcnn import MaskScoringRCNN
+from .maskformer import MaskFormer
+from .nasfcos import NASFCOS
+from .paa import PAA
+from .panoptic_fpn import PanopticFPN
+from .panoptic_two_stage_segmentor import TwoStagePanopticSegmentor
+from .point_rend import PointRend
+from .queryinst import QueryInst
+from .reppoints_detector import RepPointsDetector
+from .retinanet import RetinaNet
+from .rpn import RPN
+from .scnet import SCNet
+from .single_stage import SingleStageDetector
+from .solo import SOLO
+from .solov2 import SOLOv2
+from .sparse_rcnn import SparseRCNN
+from .tood import TOOD
+from .trident_faster_rcnn import TridentFasterRCNN
+from .two_stage import TwoStageDetector
+from .vfnet import VFNet
+from .yolact import YOLACT
+from .yolo import YOLOV3
+from .yolof import YOLOF
+from .yolox import YOLOX
+
+__all__ = [
+    'ATSS', 'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN',
+    'KnowledgeDistillationSingleStageDetector', 'FastRCNN', 'FasterRCNN',
+    'MaskRCNN', 'CascadeRCNN', 'HybridTaskCascade', 'RetinaNet', 'FCOS',
+    'GridRCNN', 'MaskScoringRCNN', 'RepPointsDetector', 'FOVEA', 'FSAF',
+    'NASFCOS', 'PointRend', 'GFL', 'CornerNet', 'PAA', 'YOLOV3', 'YOLACT',
+    'VFNet', 'DETR', 'TridentFasterRCNN', 'SparseRCNN', 'SCNet', 'SOLO',
+    'SOLOv2', 'DeformableDETR', 'AutoAssign', 'YOLOF', 'CenterNet', 'YOLOX',
+    'TwoStagePanopticSegmentor', 'PanopticFPN', 'QueryInst', 'LAD', 'TOOD',
+    'MaskFormer', 'DDOD', 'Mask2Former'
+]
diff --git a/mmdet/models/detectors/atss.py b/mmdet/models/detectors/atss.py
new file mode 100755
index 0000000..00f1acd
--- /dev/null
+++ b/mmdet/models/detectors/atss.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class ATSS(SingleStageDetector):
+    """ATSS detector, see `ATSS <https://arxiv.org/abs/1912.02424>`_."""
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 bbox_head,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super().__init__(backbone, neck, bbox_head, train_cfg, test_cfg,
+                         pretrained, init_cfg)
diff --git a/mmdet/models/detectors/autoassign.py b/mmdet/models/detectors/autoassign.py
new file mode 100755
index 0000000..30ab720
--- /dev/null
+++ b/mmdet/models/detectors/autoassign.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class AutoAssign(SingleStageDetector):
+    """Implementation of `AutoAssign: Differentiable Label Assignment for Dense
+    Object Detection <https://arxiv.org/abs/2007.03496>`_.
+
+    All arguments are forwarded unchanged to ``SingleStageDetector``;
+    ``init_cfg`` is accepted for parity with the other single-stage
+    detectors such as ``ATSS``.
+    """
+
+    def __init__(self, backbone, neck, bbox_head, train_cfg=None,
+                 test_cfg=None, pretrained=None, init_cfg=None):
+        super(AutoAssign, self).__init__(backbone, neck, bbox_head, train_cfg,
+                                         test_cfg, pretrained, init_cfg)
diff --git a/mmdet/models/detectors/base.py b/mmdet/models/detectors/base.py
new file mode 100755
index 0000000..f87097b
--- /dev/null
+++ b/mmdet/models/detectors/base.py
@@ -0,0 +1,365 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from collections import OrderedDict
+
+import mmcv
+import numpy as np
+import torch
+import torch.distributed as dist
+from mmcv.runner import BaseModule, auto_fp16
+
+from mmdet.core.visualization import imshow_det_bboxes
+
+
+class BaseDetector(BaseModule, metaclass=ABCMeta):
+    """Base class for detectors."""
+
+    def __init__(self, init_cfg=None):
+        super().__init__(init_cfg)  # ``init_cfg`` goes to BaseModule
+        self.fp16_enabled = False  # fp16 stays off unless enabled later
+
+    @property
+    def with_neck(self):
+        """bool: True when a ``neck`` attribute exists and is not None"""
+        return getattr(self, 'neck', None) is not None
+
+    # TODO: these properties need to be carefully handled
+    # for both single stage & two stage detectors
+    @property
+    def with_shared_head(self):
+        """bool: ``roi_head.with_shared_head``; False if no ``roi_head``"""
+        return hasattr(self, 'roi_head') and self.roi_head.with_shared_head
+
+    @property
+    def with_bbox(self):
+        """bool: True if ``roi_head.with_bbox`` or ``bbox_head`` is set"""
+        via_roi_head = hasattr(self, 'roi_head') and self.roi_head.with_bbox
+        return via_roi_head or getattr(self, 'bbox_head', None) is not None
+
+    @property
+    def with_mask(self):
+        """bool: True if ``roi_head.with_mask`` or ``mask_head`` is set"""
+        via_roi_head = hasattr(self, 'roi_head') and self.roi_head.with_mask
+        return via_roi_head or getattr(self, 'mask_head', None) is not None
+
+    @abstractmethod
+    def extract_feat(self, imgs):
+        """Extract features from images (abstract: subclasses implement it;
+        this base body does nothing and returns None)."""
+
+    def extract_feats(self, imgs):
+        """Run :meth:`extract_feat` on every image of a list.
+
+        Args:
+            imgs (list[torch.Tensor]): Differently augmented versions of
+                the same image. Anything but a ``list`` fails the assert.
+
+        Returns:
+            list: One :meth:`extract_feat` result per entry, in input order.
+        """
+        assert isinstance(imgs, list)
+        return list(map(self.extract_feat, imgs))
+
+    def forward_train(self, imgs, img_metas, **kwargs):
+        """Record the batched input size in every image meta dict.
+
+        Args:
+            imgs (Tensor): of shape (N, C, H, W) encoding input images.
+                Typically these should be mean centered and std scaled.
+            img_metas (list[dict]): List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys, see
+                :class:`mmdet.datasets.pipelines.Collect`.
+            kwargs (keyword arguments): Specific to concrete implementation.
+        """
+        # NOTE: the batched image size is needed by e.g. DETR, which builds
+        # the masks for its transformer head from it.
+        height_width = tuple(imgs[0].shape[-2:])
+        for meta in img_metas:
+            meta['batch_input_shape'] = height_width
+
+    async def async_simple_test(self, img, img_metas, **kwargs):
+        raise NotImplementedError  # no async inference in the base class
+
+    @abstractmethod
+    def simple_test(self, img, img_metas, **kwargs):
+        """Single-augmentation test, called from ``forward_test``."""
+
+    @abstractmethod
+    def aug_test(self, imgs, img_metas, **kwargs):
+        """Test function with test time augmentation (abstract; the base
+        body is empty and returns None)."""
+
+    async def aforward_test(self, *, img, img_metas, **kwargs):
+        """Async test entry point: one augmentation, one image per batch."""
+        for name, value in (('img', img), ('img_metas', img_metas)):
+            if not isinstance(value, list):
+                raise TypeError(
+                    f'{name} must be a list, but got {type(value)}')
+
+        num_augs = len(img)
+        if num_augs != len(img_metas):
+            raise ValueError(f'num of augmentations ({num_augs}) '
+                             f'!= num of image metas ({len(img_metas)})')
+        # TODO: remove the restriction of samples_per_gpu == 1 when prepared
+        assert img[0].size(0) == 1
+
+        if num_augs != 1:
+            raise NotImplementedError
+        return await self.async_simple_test(img[0], img_metas[0], **kwargs)
+
+    def forward_test(self, imgs, img_metas, **kwargs):
+        """Check the test inputs, then call ``simple_test`` or ``aug_test``.
+
+        Args:
+            imgs (List[Tensor]): the outer list indicates test-time
+                augmentations and inner Tensor should have a shape NxCxHxW,
+                which contains all images in the batch.
+            img_metas (List[List[dict]]): the outer list indicates test-time
+                augs (multiscale, flip, etc.) and the inner list indicates
+                images in a batch.
+        """
+        for name, value in (('imgs', imgs), ('img_metas', img_metas)):
+            if not isinstance(value, list):
+                raise TypeError(
+                    f'{name} must be a list, but got {type(value)}')
+
+        num_augs = len(imgs)
+        if num_augs != len(img_metas):
+            raise ValueError(f'num of augmentations ({num_augs}) '
+                             f'!= num of image meta ({len(img_metas)})')
+
+        # NOTE the batched image size information may be useful, e.g.
+        # in DETR, this is needed for the construction of masks, which is
+        # then used for the transformer_head.
+        for aug_img, aug_metas in zip(imgs, img_metas):
+            for meta in aug_metas:
+                meta['batch_input_shape'] = tuple(aug_img.size()[-2:])
+
+        if num_augs != 1:
+            assert imgs[0].size(0) == 1, 'aug test does not support ' \
+                                         'inference with batch size ' \
+                                         f'{imgs[0].size(0)}'
+            # TODO: support test augmentation for predefined proposals
+            assert 'proposals' not in kwargs
+            return self.aug_test(imgs, img_metas, **kwargs)
+
+        # proposals (List[List[Tensor]]): the outer list indicates test-time
+        # augs and the inner list indicates images in a batch; each Tensor
+        # has a shape Px4, where P is the number of proposals. With a single
+        # aug only the first outer entry is passed on.
+        if 'proposals' in kwargs:
+            kwargs['proposals'] = kwargs['proposals'][0]
+        return self.simple_test(imgs[0], img_metas[0], **kwargs)
+
+    @auto_fp16(apply_to=('img', ))
+    def forward(self, img, img_metas, return_loss=True, **kwargs):
+        """Dispatch to :func:`forward_train` or :func:`forward_test`, chosen
+        by ``return_loss``.
+
+        Note this setting will change the expected inputs. When
+        ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor
+        and List[dict]), and when ``return_loss=False``, img and img_meta
+        should be double nested (i.e. List[Tensor], List[List[dict]]), with
+        the outer list indicating test time augmentations. During an ONNX
+        export the call goes to :func:`onnx_export` with the first entries.
+        """
+        if torch.onnx.is_in_onnx_export():
+            assert len(img_metas) == 1
+            return self.onnx_export(img[0], img_metas[0])
+
+        # training and testing are called with the same arguments
+        run = self.forward_train if return_loss else self.forward_test
+        return run(img, img_metas, **kwargs)
+
+    def _parse_losses(self, losses):
+        """Reduce the raw loss dict to one scalar loss plus loggable values.
+
+        Args:
+            losses (dict): Raw output of the network. Every value must be a
+                Tensor or a list of Tensors, otherwise TypeError is raised.
+
+        Returns:
+            tuple[Tensor, dict]: (loss, log_vars). ``loss`` sums the means of
+                all entries whose key contains 'loss'; ``log_vars`` maps each
+                key (plus 'loss') to a number, rank-averaged if distributed.
+        """
+        log_vars = OrderedDict()
+        for loss_name, loss_value in losses.items():
+            if isinstance(loss_value, torch.Tensor):
+                log_vars[loss_name] = loss_value.mean()
+            elif isinstance(loss_value, list):
+                log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
+            else:
+                raise TypeError(
+                    f'{loss_name} is not a tensor or list of tensors')
+
+        loss = sum(_value for _key, _value in log_vars.items()
+                   if 'loss' in _key)
+
+        # If the log_vars has different length, GPUs will wait infinitely
+        if dist.is_available() and dist.is_initialized():
+            log_var_length = torch.tensor(len(log_vars), device=loss.device)
+            dist.all_reduce(log_var_length)
+            message = (f'rank {dist.get_rank()}' +
+                       f' len(log_vars): {len(log_vars)}' + ' keys: ' +
+                       ','.join(log_vars.keys()))
+            assert log_var_length == len(log_vars) * dist.get_world_size(), \
+                'loss log variables are different across GPUs!\n' + message
+
+        log_vars['loss'] = loss
+        for loss_name, loss_value in log_vars.items():
+            # reduce loss when distributed training
+            if dist.is_available() and dist.is_initialized():
+                loss_value = loss_value.data.clone()
+                dist.all_reduce(loss_value.div_(dist.get_world_size()))
+            log_vars[loss_name] = loss_value.item()
+
+        return loss, log_vars
+
+    def train_step(self, data, optimizer):
+        """Run one training iteration and package the result for the runner.
+
+        Only the forward pass and the loss parsing happen here; back
+        propagation and the optimizer update are left to an optimizer hook.
+        More complicated models (e.g. GANs) may instead do the whole update
+        inside this method.
+
+        Args:
+            data (dict): The output of dataloader, unpacked as keyword
+                arguments of the model call. Must contain ``img_metas``.
+            optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
+                runner is passed to ``train_step()``. This argument is unused
+                and reserved.
+
+        Returns:
+            dict: A dict with the 3 keys ``loss``, ``log_vars`` and
+                ``num_samples``.
+
+                - ``loss`` is a tensor for back propagation, which can be a
+                  weighted sum of multiple losses.
+                - ``log_vars`` contains all the variables to be sent to the
+                  logger.
+                - ``num_samples`` is ``len(data['img_metas'])``, i.e. the
+                  batch size (per GPU when the model is DDP), which is used
+                  for averaging the logs.
+        """
+        # ``data`` is expanded into the keyword arguments of the model call;
+        # ``return_loss`` defaults to True in ``forward``.
+        loss, log_vars = self._parse_losses(self(**data))
+        # ``img_metas`` holds one entry per sample in the batch
+        batch_size = len(data['img_metas'])
+
+        return dict(loss=loss, log_vars=log_vars, num_samples=batch_size)
+
+    def val_step(self, data, optimizer=None):
+        """Run one validation iteration.
+
+        Same signature and flow as :func:`train_step`, but every logged
+        variable gets a ``_val`` suffix. Note that the evaluation after
+        training epochs is not implemented with this method, but an
+        evaluation hook.
+
+        Returns:
+            dict: ``loss``, ``log_vars`` (suffixed keys) and ``num_samples``.
+        """
+        loss, log_vars = self._parse_losses(self(**data))
+
+        # the suffix keeps validation values apart from the training ones
+        # that are logged under the plain names
+        suffixed = {name + '_val': value for name, value in log_vars.items()}
+        batch_size = len(data['img_metas'])
+
+        return dict(loss=loss, log_vars=suffixed, num_samples=batch_size)
+
+    def show_result(self,
+                    img,
+                    result,
+                    score_thr=0.3,
+                    bbox_color=(72, 101, 241),
+                    text_color=(72, 101, 241),
+                    mask_color=None,
+                    thickness=2,
+                    font_size=13,
+                    win_name='',
+                    show=False,
+                    wait_time=0,
+                    out_file=None):
+        """Draw `result` over `img`.
+
+        Args:
+            img (str or Tensor): The image to be displayed.
+            result (Tensor or tuple): The results to draw over `img`
+                bbox_result or (bbox_result, segm_result).
+            score_thr (float, optional): Minimum score of bboxes to be shown.
+                Default: 0.3.
+            bbox_color (str or tuple(int) or :obj:`Color`):Color of bbox lines.
+               The tuple of color should be in BGR order.
+               Default: (72, 101, 241)
+            text_color (str or tuple(int) or :obj:`Color`):Color of texts.
+               A tuple is read as BGR. Default: (72, 101, 241)
+            mask_color (None or str or tuple(int) or :obj:`Color`):
+               Color of masks, BGR order for tuples. Default: None
+            thickness (int): Thickness of lines. Default: 2
+            font_size (int): Font size of texts. Default: 13
+            win_name (str): The window name. Default: ''
+            wait_time (float): Value of waitKey param.
+                Default: 0.
+            show (bool): Whether to show the image. Forced to False when
+                `out_file` is given. Default: False.
+            out_file (str or None): The filename to write the image.
+                Default: None.
+
+        Returns:
+            img: The drawn image, only if neither `show` nor `out_file` is set
+        """
+        img = mmcv.imread(img)
+        img = img.copy()
+        if isinstance(result, tuple):
+            bbox_result, segm_result = result
+            if isinstance(segm_result, tuple):
+                segm_result = segm_result[0]  # ms rcnn
+        else:
+            bbox_result, segm_result = result, None
+        bboxes = np.vstack(bbox_result)
+        labels = [
+            np.full(bbox.shape[0], i, dtype=np.int32)
+            for i, bbox in enumerate(bbox_result)
+        ]
+        labels = np.concatenate(labels)
+        # draw segmentation masks
+        segms = None
+        if segm_result is not None and len(labels) > 0:  # non empty
+            segms = mmcv.concat_list(segm_result)
+            if isinstance(segms[0], torch.Tensor):
+                segms = torch.stack(segms, dim=0).detach().cpu().numpy()
+            else:
+                segms = np.stack(segms, axis=0)
+        # if out_file specified, do not show image in window
+        if out_file is not None:
+            show = False
+        # draw bounding boxes
+        img = imshow_det_bboxes(
+            img,
+            bboxes,
+            labels,
+            segms,
+            class_names=self.CLASSES,
+            score_thr=score_thr,
+            bbox_color=bbox_color,
+            text_color=text_color,
+            mask_color=mask_color,
+            thickness=thickness,
+            font_size=font_size,
+            win_name=win_name,
+            show=show,
+            wait_time=wait_time,
+            out_file=out_file)
+
+        if not (show or out_file):
+            return img
+
+    def onnx_export(self, img, img_metas):
+        name = self.__class__.__name__  # concrete detector class
+        raise NotImplementedError(f'{name} does not support ONNX EXPORT')
diff --git a/mmdet/models/detectors/cascade_rcnn.py b/mmdet/models/detectors/cascade_rcnn.py
new file mode 100755
index 0000000..d8c7382
--- /dev/null
+++ b/mmdet/models/detectors/cascade_rcnn.py
@@ -0,0 +1,49 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .two_stage import TwoStageDetector
+
+
+@DETECTORS.register_module()
+class CascadeRCNN(TwoStageDetector):
+    r"""`Cascade R-CNN: Delving into High Quality Object Detection
+    <https://arxiv.org/abs/1906.09756>`_ as a two-stage detector.
+
+    Results may be dicts; :meth:`show_result` then draws the value stored
+    under their ``'ensemble'`` key.
+    """
+
+    def __init__(self, backbone, neck=None, rpn_head=None, roi_head=None,
+                 train_cfg=None, test_cfg=None, pretrained=None,
+                 init_cfg=None):
+        # every argument is handed to the parent class under its own name
+        parent_kwargs = dict(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            pretrained=pretrained,
+            init_cfg=init_cfg)
+        super().__init__(**parent_kwargs)
+
+    def show_result(self, data, result, **kwargs):
+        """Draw results, picking the ``'ensemble'`` entry of dict results.
+
+        Args:
+            data (str or np.ndarray): Image filename or loaded image.
+            result (Tensor or tuple): bbox_result, or (bbox_result,
+                segm_result) when ``with_mask`` is true; either form may
+                carry dicts that hold the output under ``'ensemble'``.
+
+        Returns:
+            Whatever the parent class' ``show_result`` returns.
+        """
+        if not self.with_mask:
+            if isinstance(result, dict):
+                result = result['ensemble']
+        else:
+            bbox_part, segm_part = result
+            if isinstance(bbox_part, dict):
+                result = (bbox_part['ensemble'], segm_part['ensemble'])
+        return super().show_result(data, result, **kwargs)
diff --git a/mmdet/models/detectors/centernet.py b/mmdet/models/detectors/centernet.py
new file mode 100755
index 0000000..e1e3fd3
--- /dev/null
+++ b/mmdet/models/detectors/centernet.py
@@ -0,0 +1,111 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet.core import bbox2result
+from mmdet.models.builder import DETECTORS
+from ...core.utils import flip_tensor
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class CenterNet(SingleStageDetector):
+    """Implementation of CenterNet(Objects as Points)
+
+    <https://arxiv.org/abs/1904.07850>.
+    """
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 bbox_head,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super().__init__(backbone, neck, bbox_head, train_cfg, test_cfg,
+                         pretrained, init_cfg)
+
+    def merge_aug_results(self, aug_results, with_nms):
+        """Concatenate the detections of all augmentations, optionally
+        followed by NMS.
+
+        Args:
+            aug_results (list[list[Tensor]]): Det_bboxes and det_labels of
+                each image. Only the first entry of every result is read.
+            with_nms (bool): If True, do nms before return boxes.
+
+        Returns:
+            tuple: (out_bboxes, out_labels)
+        """
+        # single_result[0] holds the (bboxes, labels) pair that is merged
+        bboxes = torch.cat(
+            [single_result[0][0] for single_result in aug_results],
+            dim=0).contiguous()
+        labels = torch.cat(
+            [single_result[0][1] for single_result in aug_results]
+        ).contiguous()
+
+        if not with_nms:
+            return bboxes, labels
+        out_bboxes, out_labels = self.bbox_head._bboxes_nms(
+            bboxes, labels, self.bbox_head.test_cfg)
+        return out_bboxes, out_labels
+
+    def aug_test(self, imgs, img_metas, rescale=True):
+        """Augment testing of CenterNet. Aug test must have flipped image pair,
+        and unlike CornerNet, it will perform an averaging operation on the
+        feature map instead of detecting bbox.
+
+        Args:
+            imgs (list[Tensor]): Augmented images, consumed two at a time
+                as (index, index + 1) pairs.
+            img_metas (list[list[dict]]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool): If True, return boxes in original image space.
+                Default: True.
+
+        Note:
+            ``imgs`` must include flipped image pairs. NMS is applied to
+            the merged boxes only if ``nms_cfg`` is set in the head's
+            ``test_cfg``.
+
+        Returns:
+            list[list[np.ndarray]]: BBox results of each image and classes.
+                The outer list corresponds to each image. The inner list
+                corresponds to each class.
+        """
+        assert img_metas[0][0]['flip'] + img_metas[1][0]['flip'], (
+            'aug test must have flipped image pair')
+
+        aug_results = []
+        # walk over the pairs (0, 1), (2, 3), ...
+        for ind in range(0, len(imgs) - 1, 2):
+            flip_ind = ind + 1
+            flip_direction = img_metas[flip_ind][0]['flip_direction']
+            x = self.extract_feat(torch.cat([imgs[ind], imgs[flip_ind]]))
+            center_heatmap_preds, wh_preds, offset_preds = self.bbox_head(x)
+            assert len(center_heatmap_preds) == len(wh_preds) == len(
+                offset_preds) == 1
+
+            # Feature map averaging: the slice [0:1] is combined with the
+            # slice [1:2] after the latter went through ``flip_tensor``
+            heatmap, wh = center_heatmap_preds[0], wh_preds[0]
+            center_heatmap_preds[0] = (
+                heatmap[0:1] + flip_tensor(heatmap[1:2], flip_direction)) / 2
+            wh_preds[0] = (wh[0:1] + flip_tensor(wh[1:2], flip_direction)) / 2
+
+            # the offsets are not averaged: only the slice [0:1] is decoded
+            aug_results.append(
+                self.bbox_head.get_bboxes(
+                    center_heatmap_preds,
+                    wh_preds, [offset_preds[0][0:1]],
+                    img_metas[ind],
+                    rescale=rescale,
+                    with_nms=False))
+
+        # NMS happens once, on the merged boxes, and only with an ``nms_cfg``
+        with_nms = self.bbox_head.test_cfg.get('nms_cfg', None) is not None
+        det_bboxes, det_labels = self.merge_aug_results(aug_results, with_nms)
+        return [
+            bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes)
+        ]
diff --git a/mmdet/models/detectors/cornernet.py b/mmdet/models/detectors/cornernet.py
new file mode 100755
index 0000000..ce921cc
--- /dev/null
+++ b/mmdet/models/detectors/cornernet.py
@@ -0,0 +1,97 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet.core import bbox2result, bbox_mapping_back
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class CornerNet(SingleStageDetector):
+    """CornerNet.
+
+    This detector is the implementation of the paper `CornerNet: Detecting
+    Objects as Paired Keypoints <https://arxiv.org/abs/1808.01244>`_ .
+    """
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 bbox_head,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super().__init__(backbone, neck, bbox_head, train_cfg, test_cfg,
+                         pretrained, init_cfg)
+
+    def merge_aug_results(self, aug_results, img_metas):
+        """Map augmented detections back with their metas and merge them.
+
+        Args:
+            aug_results (list[list[Tensor]]): Det_bboxes and det_labels of each
+                image.
+            img_metas (list[list[dict]]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+
+        Returns:
+            tuple: (bboxes, labels)
+        """
+        merged_bboxes, merged_labels = [], []
+        for detections, metas in zip(aug_results, img_metas):
+            meta = metas[0]
+            # 'img_shape' is the shape before padding
+            mapping_args = (meta['img_shape'], meta['scale_factor'],
+                            meta['flip'])
+            det_bboxes, det_labels = detections
+            coords, scores = det_bboxes[:, :4], det_bboxes[:, -1:]
+            coords = bbox_mapping_back(coords, *mapping_args)
+            merged_bboxes.append(torch.cat([coords, scores], dim=-1))
+            merged_labels.append(det_labels)
+
+        bboxes = torch.cat(merged_bboxes, dim=0)
+        labels = torch.cat(merged_labels)
+
+        # NMS is skipped when there is no box at all
+        if bboxes.shape[0] == 0:
+            return bboxes, labels
+        out_bboxes, out_labels = self.bbox_head._bboxes_nms(
+            bboxes, labels, self.bbox_head.test_cfg)
+        return out_bboxes, out_labels
+
+    def aug_test(self, imgs, img_metas, rescale=False):
+        """Augment testing of CornerNet.
+
+        Args:
+            imgs (list[Tensor]): Augmented images.
+            img_metas (list[list[dict]]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False. NOTE: this method never reads the value;
+                ``get_bboxes`` is called with ``False, False``.
+
+        Note:
+            ``imgs`` must include flipped image pairs.
+
+        Returns:
+            list[list[np.ndarray]]: BBox results of each image and classes.
+                The outer list corresponds to each image. The inner list
+                corresponds to each class.
+        """
+        assert img_metas[0][0]['flip'] + img_metas[1][0]['flip'], (
+            'aug test must have flipped image pair')
+
+        aug_results = []
+        # walk over the pairs (0, 1), (2, 3), ...
+        for ind in range(0, len(imgs) - 1, 2):
+            flip_ind = ind + 1
+            x = self.extract_feat(torch.cat([imgs[ind], imgs[flip_ind]]))
+            outs = self.bbox_head(x)
+            # both trailing flags of ``get_bboxes`` are fixed to False here
+            bbox_list = self.bbox_head.get_bboxes(
+                *outs, [img_metas[ind], img_metas[flip_ind]], False, False)
+            aug_results.extend((bbox_list[0], bbox_list[1]))
+
+        # boxes are mapped back and merged over all augmentations
+        bboxes, labels = self.merge_aug_results(aug_results, img_metas)
+        return [bbox2result(bboxes, labels, self.bbox_head.num_classes)]
diff --git a/mmdet/models/detectors/ddod.py b/mmdet/models/detectors/ddod.py
new file mode 100755
index 0000000..2ae0a74
--- /dev/null
+++ b/mmdet/models/detectors/ddod.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class DDOD(SingleStageDetector):
+    """`DDOD <https://arxiv.org/pdf/2107.02963.pdf>`_ single-stage detector.
+
+    Adds no behaviour of its own: the constructor only hands ``backbone``,
+    ``neck``, ``bbox_head``, ``train_cfg``, ``test_cfg``, ``pretrained``
+    and ``init_cfg`` to the parent class, positionally and in that order.
+    """
+
+    def __init__(self, backbone, neck, bbox_head,
+                 train_cfg=None, test_cfg=None,
+                 pretrained=None, init_cfg=None):
+        super().__init__(backbone, neck, bbox_head, train_cfg, test_cfg,
+                         pretrained, init_cfg)
diff --git a/mmdet/models/detectors/deformable_detr.py b/mmdet/models/detectors/deformable_detr.py
new file mode 100755
index 0000000..b1f1642
--- /dev/null
+++ b/mmdet/models/detectors/deformable_detr.py
@@ -0,0 +1,10 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .detr import DETR
+
+
+@DETECTORS.register_module()
+class DeformableDETR(DETR):
+    # NOTE(review): super(DETR, self) skips DETR.__init__ - confirm intended.
+    def __init__(self, *args, **kwargs):
+        super(DETR, self).__init__(*args, **kwargs)
diff --git a/mmdet/models/detectors/detr.py b/mmdet/models/detectors/detr.py
new file mode 100755
index 0000000..06d7691
--- /dev/null
+++ b/mmdet/models/detectors/detr.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class DETR(SingleStageDetector):
+    r"""Implementation of `DETR: End-to-End Object Detection with
+    Transformers <https://arxiv.org/pdf/2005.12872>`_"""
+
+    def __init__(self,
+                 backbone,
+                 bbox_head,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        # the parent's second positional argument is always passed as None
+        super().__init__(backbone, None, bbox_head, train_cfg, test_cfg,
+                         pretrained, init_cfg)
+
+    # over-write `forward_dummy` because:
+    # the forward of bbox_head requires img_metas
+    def forward_dummy(self, img):
+        """Used for computing network flops.
+
+        See `mmdetection/tools/analysis_tools/get_flops.py`
+        """
+        warnings.warn('Warning! MultiheadAttention in DETR does not '
+                      'support flops computation! Do not use the '
+                      'results in your papers!')
+
+        batch_size, _, height, width = img.shape
+        # one minimal meta dict per image, built from the input size
+        dummy_img_metas = [
+            dict(
+                batch_input_shape=(height, width),
+                img_shape=(height, width, 3)) for _ in range(batch_size)
+        ]
+        return self.bbox_head(self.extract_feat(img), dummy_img_metas)
+
+    # over-write `onnx_export` because:
+    # (1) the forward of bbox_head requires img_metas
+    # (2) the different behavior (e.g. construction of `masks`) between
+    # torch and ONNX model, during the forward of bbox_head
+    def onnx_export(self, img, img_metas):
+        """Test function for exporting to ONNX, without test time augmentation.
+
+        Args:
+            img (torch.Tensor): input images.
+            img_metas (list[dict]): List of image information. The first
+                dict gains the key ``img_shape_for_onnx``.
+
+        Returns:
+            tuple[Tensor, Tensor]: dets of shape [N, num_det, 5]
+                and class labels of shape [N, num_det].
+        """
+        # forward of this head requires img_metas
+        outs = self.bbox_head.forward_onnx(self.extract_feat(img), img_metas)
+        # the spatial input size is stored as a tensor in the first meta
+        # dict before the head's ``onnx_export`` is called
+        img_metas[0]['img_shape_for_onnx'] = torch._shape_as_tensor(img)[2:]
+
+        det_bboxes, det_labels = self.bbox_head.onnx_export(*outs, img_metas)
+
+        return det_bboxes, det_labels
diff --git a/mmdet/models/detectors/fast_rcnn.py b/mmdet/models/detectors/fast_rcnn.py
new file mode 100755
index 0000000..7aebe15
--- /dev/null
+++ b/mmdet/models/detectors/fast_rcnn.py
@@ -0,0 +1,55 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .two_stage import TwoStageDetector
+
+
+@DETECTORS.register_module()
+class FastRCNN(TwoStageDetector):
+    """Implementation of `Fast R-CNN <https://arxiv.org/abs/1504.08083>`_"""
+
+    def __init__(self, backbone, roi_head, train_cfg, test_cfg, neck=None,
+                 pretrained=None, init_cfg=None):
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            pretrained=pretrained,
+            init_cfg=init_cfg)
+
+    def forward_test(self, imgs, img_metas, proposals, **kwargs):
+        """Test with precomputed proposals (single augmentation only).
+
+        Args:
+            imgs (List[Tensor]): the outer list indicates test-time
+                augmentations and inner Tensor should have a shape NxCxHxW,
+                which contains all images in the batch.
+            img_metas (List[List[dict]]): the outer list indicates test-time
+                augs (multiscale, flip, etc.) and the inner list indicates
+                images in a batch.
+            proposals (List[List[Tensor]]): the outer list indicates test-time
+                augs (multiscale, flip, etc.) and the inner list indicates
+                images in a batch. The Tensor should have a shape Px4, where
+                P is the number of proposals.
+
+        Raises:
+            TypeError: If ``imgs`` or ``img_metas`` is not a list.
+            ValueError: If ``imgs`` and ``img_metas`` differ in length.
+            NotImplementedError: If more than one augmentation is passed.
+        """
+        for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
+            if not isinstance(var, list):
+                raise TypeError(f'{name} must be a list, but got {type(var)}')
+
+        num_augs = len(imgs)
+        if num_augs != len(img_metas):
+            raise ValueError(f'num of augmentations ({len(imgs)}) '
+                             f'!= num of image meta ({len(img_metas)})')
+
+        if num_augs != 1:
+            # TODO: support test-time augmentation
+            # raise (not assert): asserting the class itself is always true
+            raise NotImplementedError(
+                'test-time augmentation is not supported')
+        return self.simple_test(imgs[0], img_metas[0], proposals[0], **kwargs)
diff --git a/mmdet/models/detectors/faster_rcnn.py b/mmdet/models/detectors/faster_rcnn.py
new file mode 100755
index 0000000..70fb662
--- /dev/null
+++ b/mmdet/models/detectors/faster_rcnn.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .two_stage import TwoStageDetector
+
+
+@DETECTORS.register_module()
+class FasterRCNN(TwoStageDetector):
+    """Implementation of `Faster R-CNN <https://arxiv.org/abs/1506.01497>`_
+
+    A pure pass-through to ``TwoStageDetector``; it adds no behaviour.
+    """
+
+    def __init__(self,
+                 backbone,
+                 rpn_head,
+                 roi_head,
+                 train_cfg,
+                 test_cfg,
+                 neck=None,
+                 pretrained=None,
+                 init_cfg=None):
+        # Every argument is handed to the parent unchanged, by keyword.
+        super().__init__(
+            backbone=backbone, neck=neck,
+            rpn_head=rpn_head, roi_head=roi_head,
+            train_cfg=train_cfg, test_cfg=test_cfg,
+            pretrained=pretrained, init_cfg=init_cfg)
diff --git a/mmdet/models/detectors/fcos.py b/mmdet/models/detectors/fcos.py
new file mode 100755
index 0000000..d985bd0
--- /dev/null
+++ b/mmdet/models/detectors/fcos.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class FCOS(SingleStageDetector):
+    """Implementation of `FCOS <https://arxiv.org/abs/1904.01355>`_
+
+    Adds nothing to ``SingleStageDetector``: the constructor hands its
+    arguments to the parent positionally, in the order they are declared.
+    """
+
+    def __init__(self, backbone, neck, bbox_head, train_cfg=None,
+                 test_cfg=None, pretrained=None, init_cfg=None):
+        # Positional pass-through, same order as the signature above.
+        parent_args = (backbone, neck, bbox_head, train_cfg, test_cfg,
+                       pretrained, init_cfg)
+        super().__init__(*parent_args)
diff --git a/mmdet/models/detectors/fovea.py b/mmdet/models/detectors/fovea.py
new file mode 100755
index 0000000..6fd908c
--- /dev/null
+++ b/mmdet/models/detectors/fovea.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class FOVEA(SingleStageDetector):
+    """Implementation of `FoveaBox <https://arxiv.org/abs/1904.03797>`_
+
+    Adds nothing to ``SingleStageDetector``: the constructor hands its
+    arguments to the parent positionally, in the order they are declared.
+    """
+
+    def __init__(self, backbone, neck, bbox_head, train_cfg=None,
+                 test_cfg=None, pretrained=None, init_cfg=None):
+        # Positional pass-through, same order as the signature above.
+        parent_args = (backbone, neck, bbox_head, train_cfg, test_cfg,
+                       pretrained, init_cfg)
+        super().__init__(*parent_args)
diff --git a/mmdet/models/detectors/fsaf.py b/mmdet/models/detectors/fsaf.py
new file mode 100755
index 0000000..81ed1bd
--- /dev/null
+++ b/mmdet/models/detectors/fsaf.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class FSAF(SingleStageDetector):
+    """Implementation of `FSAF <https://arxiv.org/abs/1903.00621>`_
+
+    Adds nothing to ``SingleStageDetector``: the constructor hands its
+    arguments to the parent positionally, in the order they are declared.
+    """
+
+    def __init__(self, backbone, neck, bbox_head, train_cfg=None,
+                 test_cfg=None, pretrained=None, init_cfg=None):
+        # Positional pass-through, same order as the signature above.
+        parent_args = (backbone, neck, bbox_head, train_cfg, test_cfg,
+                       pretrained, init_cfg)
+        super().__init__(*parent_args)
diff --git a/mmdet/models/detectors/gfl.py b/mmdet/models/detectors/gfl.py
new file mode 100755
index 0000000..4628e2e
--- /dev/null
+++ b/mmdet/models/detectors/gfl.py
@@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class GFL(SingleStageDetector):
+    """GFL detector; a plain ``SingleStageDetector`` pass-through.
+
+    NOTE(review): no paper is cited here, unlike the sibling detectors.
+    """
+
+    def __init__(self, backbone, neck, bbox_head, train_cfg=None,
+                 test_cfg=None, pretrained=None, init_cfg=None):
+        # Positional pass-through, same order as the signature above.
+        parent_args = (backbone, neck, bbox_head, train_cfg, test_cfg,
+                       pretrained, init_cfg)
+        super().__init__(*parent_args)
diff --git a/mmdet/models/detectors/grid_rcnn.py b/mmdet/models/detectors/grid_rcnn.py
new file mode 100755
index 0000000..bba7873
--- /dev/null
+++ b/mmdet/models/detectors/grid_rcnn.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .two_stage import TwoStageDetector
+
+
+@DETECTORS.register_module()
+class GridRCNN(TwoStageDetector):
+    """Grid R-CNN.
+
+    This detector is the implementation of:
+    - Grid R-CNN (https://arxiv.org/abs/1811.12030)
+    - Grid R-CNN Plus: Faster and Better (https://arxiv.org/abs/1906.05688)
+
+    The class adds no behaviour of its own: its constructor passes every
+    argument through, by keyword, to ``TwoStageDetector``.
+    """
+
+    def __init__(self,
+                 backbone,
+                 rpn_head,
+                 roi_head,
+                 train_cfg,
+                 test_cfg,
+                 neck=None,
+                 pretrained=None,
+                 init_cfg=None):
+        # Every argument is handed to the parent unchanged, by keyword.
+        super().__init__(
+            backbone=backbone, neck=neck,
+            rpn_head=rpn_head, roi_head=roi_head,
+            train_cfg=train_cfg, test_cfg=test_cfg,
+            pretrained=pretrained, init_cfg=init_cfg)
diff --git a/mmdet/models/detectors/htc.py b/mmdet/models/detectors/htc.py
new file mode 100755
index 0000000..f7c9533
--- /dev/null
+++ b/mmdet/models/detectors/htc.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .cascade_rcnn import CascadeRCNN
+
+
+@DETECTORS.register_module()
+class HybridTaskCascade(CascadeRCNN):
+    """Implementation of `HTC <https://arxiv.org/abs/1901.07518>`_"""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    @property
+    def with_semantic(self):
+        """bool: the ``with_semantic`` flag reported by ``self.roi_head``."""
+        return self.roi_head.with_semantic
diff --git a/mmdet/models/detectors/kd_one_stage.py b/mmdet/models/detectors/kd_one_stage.py
new file mode 100755
index 0000000..fb66b51
--- /dev/null
+++ b/mmdet/models/detectors/kd_one_stage.py
@@ -0,0 +1,103 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from pathlib import Path
+
+import mmcv
+import torch
+from mmcv.runner import load_checkpoint
+
+from .. import build_detector
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class KnowledgeDistillationSingleStageDetector(SingleStageDetector):
+    r"""Implementation of `Distilling the Knowledge in a Neural Network.
+    <https://arxiv.org/abs/1503.02531>`_.
+
+    Args:
+        teacher_config (str | dict): Config file path
+            or the config object of teacher model.
+        teacher_ckpt (str, optional): Checkpoint path of teacher model.
+            If left as None, the model will not load any weights.
+        eval_teacher (bool): If True (default), ``train`` always puts the
+            teacher in eval mode; otherwise it follows the student's mode.
+    """
+
+    def __init__(self, backbone, neck, bbox_head, teacher_config,
+                 teacher_ckpt=None, eval_teacher=True, train_cfg=None,
+                 test_cfg=None, pretrained=None):
+        super().__init__(backbone, neck, bbox_head, train_cfg, test_cfg,
+                         pretrained)
+        self.eval_teacher = eval_teacher
+        # Build teacher model; a path is first loaded into a config object.
+        if isinstance(teacher_config, (str, Path)):
+            teacher_config = mmcv.Config.fromfile(teacher_config)
+        self.teacher_model = build_detector(teacher_config['model'])
+        if teacher_ckpt is not None:
+            load_checkpoint(
+                self.teacher_model, teacher_ckpt, map_location='cpu')
+
+    def forward_train(self, img, img_metas, gt_bboxes, gt_labels,
+                      gt_bboxes_ignore=None):
+        """Compute the student losses using the teacher's head outputs.
+
+        Args:
+            img (Tensor): Input images of shape (N, C, H, W).
+                Typically these should be mean centered and std scaled.
+            img_metas (list[dict]): A List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                :class:`mmdet.datasets.pipelines.Collect`.
+            gt_bboxes (list[Tensor]): Each item holds the truth boxes of one
+                image in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box
+            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
+                boxes can be ignored when computing the loss.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        x = self.extract_feat(img)
+        # Teacher outputs are computed without building an autograd graph.
+        with torch.no_grad():
+            teacher_x = self.teacher_model.extract_feat(img)
+            out_teacher = self.teacher_model.bbox_head(teacher_x)
+        losses = self.bbox_head.forward_train(x, out_teacher, img_metas,
+                                              gt_bboxes, gt_labels,
+                                              gt_bboxes_ignore)
+        return losses
+
+    def cuda(self, device=None):
+        """Since teacher_model is registered as a plain object, it is necessary
+        to put the teacher model to cuda when calling cuda function."""
+        self.teacher_model.cuda(device=device)
+        return super().cuda(device=device)
+
+    def train(self, mode=True):
+        """Set the train mode of the student and of the teacher.
+
+        The teacher follows ``mode`` unless ``eval_teacher`` is set, in which
+        case it is always switched to eval mode.
+
+        Returns:
+            The parent's return value, keeping ``nn.Module``'s convention
+            of returning ``self`` (``eval()`` and chained calls rely on it).
+        """
+        teacher_mode = False if self.eval_teacher else mode
+        self.teacher_model.train(teacher_mode)
+        return super().train(mode)
+
+    def __setattr__(self, name, value):
+        """Set attribute, i.e. self.name = value
+
+        This overriding prevents the teacher model from being registered as a
+        nn.Module. The teacher module is registered as a plain object, so that
+        the teacher parameters will not show up when calling
+        ``self.parameters``, ``self.modules``, ``self.children`` methods.
+        """
+        if name == 'teacher_model':
+            object.__setattr__(self, name, value)
+        else:
+            super().__setattr__(name, value)
diff --git a/mmdet/models/detectors/lad.py b/mmdet/models/detectors/lad.py
new file mode 100755
index 0000000..c6cc1e0
--- /dev/null
+++ b/mmdet/models/detectors/lad.py
@@ -0,0 +1,92 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.runner import load_checkpoint
+
+from ..builder import DETECTORS, build_backbone, build_head, build_neck
+from .kd_one_stage import KnowledgeDistillationSingleStageDetector
+
+
+@DETECTORS.register_module()
+class LAD(KnowledgeDistillationSingleStageDetector):
+    """Implementation of `LAD <https://arxiv.org/pdf/2108.10520.pdf>`_.
+
+    Args:
+        teacher_backbone, teacher_neck, teacher_bbox_head (dict): Configs
+            the teacher is assembled from; ``teacher_neck`` may be None.
+        teacher_ckpt (str | None): Checkpoint loaded into the teacher;
+            nothing is loaded when None.
+        eval_teacher (bool): Stored on the instance as ``eval_teacher``.
+    """
+
+    def __init__(self, backbone, neck, bbox_head, teacher_backbone,
+                 teacher_neck, teacher_bbox_head, teacher_ckpt,
+                 eval_teacher=True, train_cfg=None, test_cfg=None,
+                 pretrained=None):
+        # Skip KnowledgeDistillationSingleStageDetector.__init__ on purpose
+        # and run the next ``__init__`` in the MRO: the teacher is built
+        # below from separate component configs.
+        super(KnowledgeDistillationSingleStageDetector,
+              self).__init__(backbone, neck, bbox_head, train_cfg, test_cfg,
+                             pretrained)
+        self.eval_teacher = eval_teacher
+        self.teacher_model = nn.Module()
+        self.teacher_model.backbone = build_backbone(teacher_backbone)
+        if teacher_neck is not None:
+            self.teacher_model.neck = build_neck(teacher_neck)
+        # Update a shallow copy so the caller's config is left untouched.
+        teacher_bbox_head = teacher_bbox_head.copy()
+        teacher_bbox_head.update(train_cfg=train_cfg, test_cfg=test_cfg)
+        self.teacher_model.bbox_head = build_head(teacher_bbox_head)
+        if teacher_ckpt is not None:
+            load_checkpoint(
+                self.teacher_model, teacher_ckpt, map_location='cpu')
+
+    @property
+    def with_teacher_neck(self):
+        """bool: whether the detector has a teacher_neck"""
+        return getattr(self.teacher_model, 'neck', None) is not None
+
+    def extract_teacher_feat(self, img):
+        """Directly extract teacher features from the backbone+neck."""
+        x = self.teacher_model.backbone(img)
+        if self.with_teacher_neck:
+            x = self.teacher_model.neck(x)
+        return x
+
+    def forward_train(self, img, img_metas, gt_bboxes, gt_labels,
+                      gt_bboxes_ignore=None):
+        """Train the student on the label assignment made by the teacher.
+
+        Args:
+            img (Tensor): Input images of shape (N, C, H, W).
+                Typically these should be mean centered and std scaled.
+            img_metas (list[dict]): A List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                :class:`mmdet.datasets.pipelines.Collect`.
+            gt_bboxes (list[Tensor]): Each item holds the truth boxes of one
+                image in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box
+            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
+                boxes can be ignored when computing the loss.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        # get label assignment from the teacher
+        with torch.no_grad():
+            x_teacher = self.extract_teacher_feat(img)
+            outs_teacher = self.teacher_model.bbox_head(x_teacher)
+            label_assignment_results = \
+                self.teacher_model.bbox_head.get_label_assignment(
+                    *outs_teacher, gt_bboxes, gt_labels, img_metas,
+                    gt_bboxes_ignore)
+
+        # the student use the label assignment from the teacher to learn
+        x = self.extract_feat(img)
+        losses = self.bbox_head.forward_train(x, label_assignment_results,
+                                              img_metas, gt_bboxes, gt_labels,
+                                              gt_bboxes_ignore)
+        return losses
diff --git a/mmdet/models/detectors/mask2former.py b/mmdet/models/detectors/mask2former.py
new file mode 100755
index 0000000..b9ad2ed
--- /dev/null
+++ b/mmdet/models/detectors/mask2former.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .maskformer import MaskFormer
+
+
+@DETECTORS.register_module()
+class Mask2Former(MaskFormer):
+    r"""Implementation of `Masked-attention Mask
+    Transformer for Universal Image Segmentation
+    <https://arxiv.org/pdf/2112.01527>`_.
+
+    The constructor only forwards its arguments to ``MaskFormer``.
+    """
+
+    def __init__(self,
+                 backbone,
+                 neck=None,
+                 panoptic_head=None,
+                 panoptic_fusion_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None):
+        # ``backbone`` goes positionally, everything else by keyword.
+        super().__init__(
+            backbone, neck=neck, panoptic_head=panoptic_head,
+            panoptic_fusion_head=panoptic_fusion_head, train_cfg=train_cfg,
+            test_cfg=test_cfg, init_cfg=init_cfg)
diff --git a/mmdet/models/detectors/mask_rcnn.py b/mmdet/models/detectors/mask_rcnn.py
new file mode 100755
index 0000000..c68489f
--- /dev/null
+++ b/mmdet/models/detectors/mask_rcnn.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .two_stage import TwoStageDetector
+
+
+@DETECTORS.register_module()
+class MaskRCNN(TwoStageDetector):
+    """Implementation of `Mask R-CNN <https://arxiv.org/abs/1703.06870>`_
+
+    A pure pass-through to ``TwoStageDetector``; it adds no behaviour.
+    """
+
+    def __init__(self,
+                 backbone,
+                 rpn_head,
+                 roi_head,
+                 train_cfg,
+                 test_cfg,
+                 neck=None,
+                 pretrained=None,
+                 init_cfg=None):
+        # Every argument is handed to the parent unchanged, by keyword.
+        super().__init__(
+            backbone=backbone, neck=neck,
+            rpn_head=rpn_head, roi_head=roi_head,
+            train_cfg=train_cfg, test_cfg=test_cfg,
+            pretrained=pretrained, init_cfg=init_cfg)
diff --git a/mmdet/models/detectors/mask_scoring_rcnn.py b/mmdet/models/detectors/mask_scoring_rcnn.py
new file mode 100755
index 0000000..5f55656
--- /dev/null
+++ b/mmdet/models/detectors/mask_scoring_rcnn.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .two_stage import TwoStageDetector
+
+
+@DETECTORS.register_module()
+class MaskScoringRCNN(TwoStageDetector):
+    """Mask Scoring RCNN.
+
+    https://arxiv.org/abs/1903.00241
+
+    The class adds no behaviour of its own: its constructor passes every
+    argument through, by keyword, to ``TwoStageDetector``.
+    """
+
+    def __init__(self,
+                 backbone,
+                 rpn_head,
+                 roi_head,
+                 train_cfg,
+                 test_cfg,
+                 neck=None,
+                 pretrained=None,
+                 init_cfg=None):
+        # Every argument is handed to the parent unchanged, by keyword.
+        super().__init__(
+            backbone=backbone, neck=neck,
+            rpn_head=rpn_head, roi_head=roi_head,
+            train_cfg=train_cfg, test_cfg=test_cfg,
+            pretrained=pretrained, init_cfg=init_cfg)
diff --git a/mmdet/models/detectors/maskformer.py b/mmdet/models/detectors/maskformer.py
new file mode 100755
index 0000000..3d251ad
--- /dev/null
+++ b/mmdet/models/detectors/maskformer.py
@@ -0,0 +1,258 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import mmcv
+import numpy as np
+
+from mmdet.core import INSTANCE_OFFSET, bbox2result
+from mmdet.core.visualization import imshow_det_bboxes
+from ..builder import DETECTORS, build_backbone, build_head, build_neck
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class MaskFormer(SingleStageDetector):
+    r"""Implementation of `Per-Pixel Classification is
+    NOT All You Need for Semantic Segmentation
+    <https://arxiv.org/pdf/2107.06278>`_."""
+
+    def __init__(self,
+                 backbone,
+                 neck=None,
+                 panoptic_head=None,
+                 panoptic_fusion_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None):
+        # Skip SingleStageDetector.__init__: the components are built here.
+        super(SingleStageDetector, self).__init__(init_cfg=init_cfg)
+        self.backbone = build_backbone(backbone)
+        if neck is not None:
+            self.neck = build_neck(neck)
+
+        # Head configs are deep-copied so the caller's dicts stay unchanged.
+        panoptic_head_ = copy.deepcopy(panoptic_head)
+        panoptic_head_.update(train_cfg=train_cfg, test_cfg=test_cfg)
+        self.panoptic_head = build_head(panoptic_head_)
+
+        panoptic_fusion_head_ = copy.deepcopy(panoptic_fusion_head)
+        panoptic_fusion_head_.update(test_cfg=test_cfg)
+        self.panoptic_fusion_head = build_head(panoptic_fusion_head_)
+
+        self.num_things_classes = self.panoptic_head.num_things_classes
+        self.num_stuff_classes = self.panoptic_head.num_stuff_classes
+        self.num_classes = self.panoptic_head.num_classes
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        # BaseDetector.show_result default for instance segmentation
+        if self.num_stuff_classes > 0:
+            self.show_result = self._show_pan_result
+
+    def forward_dummy(self, img, img_metas):
+        """Used for computing network flops. See
+        `mmdetection/tools/analysis_tools/get_flops.py`
+
+        Args:
+            img (Tensor): of shape (N, C, H, W) encoding input images.
+                Typically these should be mean centered and std scaled.
+            img_metas (list[Dict]): list of image info dicts; for the keys
+                see `mmdet/datasets/pipelines/formatting.py:Collect`.
+
+        Returns:
+            The raw output of ``panoptic_head`` for the extracted features.
+        """
+        # add batch_input_shape in img_metas (as in forward_train)
+        super(SingleStageDetector, self).forward_train(img, img_metas)
+        x = self.extract_feat(img)
+        return self.panoptic_head(x, img_metas)
+
+    def forward_train(self,
+                      img,
+                      img_metas,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_masks,
+                      gt_semantic_seg=None,
+                      gt_bboxes_ignore=None,
+                      **kargs):
+        """Compute the training losses of the panoptic head.
+
+        Args:
+            img (Tensor): of shape (N, C, H, W) encoding input images.
+                Typically these should be mean centered and std scaled.
+            img_metas (list[Dict]): list of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                `mmdet/datasets/pipelines/formatting.py:Collect`.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box.
+            gt_masks (list[BitmapMasks]): true segmentation masks for each box
+                used if the architecture supports a segmentation task.
+            gt_semantic_seg (list[tensor]): semantic segmentation mask for
+                images for panoptic segmentation.
+                Defaults to None for instance segmentation.
+            gt_bboxes_ignore (list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss.
+                Defaults to None.
+            **kargs: accepted but not used by this method.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        # add batch_input_shape in img_metas
+        super(SingleStageDetector, self).forward_train(img, img_metas)
+        x = self.extract_feat(img)
+        losses = self.panoptic_head.forward_train(
+            x, img_metas, gt_bboxes, gt_labels, gt_masks, gt_semantic_seg,
+            gt_bboxes_ignore)
+        return losses
+
+    def simple_test(self, imgs, img_metas, **kwargs):
+        """Test without augmentation.
+
+        Args:
+            imgs (Tensor): A batch of images.
+            img_metas (list[dict]): List of image information.
+
+        Returns:
+            list[dict[str, np.array | tuple[list]] | tuple[list]]:
+                Semantic segmentation results and panoptic segmentation \
+                results of each image for panoptic segmentation, or formatted \
+                bbox and mask results of each image for instance segmentation.
+
+            .. code-block:: none
+
+                [
+                    # panoptic segmentation
+                    {
+                        'pan_results': np.array, # shape = [h, w]
+                        'ins_results': tuple[list],
+                        # semantic segmentation results are not supported yet
+                        'sem_results': np.array
+                    },
+                    ...
+                ]
+
+            or
+
+            .. code-block:: none
+
+                [
+                    # instance segmentation
+                    (
+                        bboxes, # list[np.array]
+                        masks # list[list[np.array]]
+                    ),
+                    ...
+                ]
+        """
+        feats = self.extract_feat(imgs)
+        mask_cls_results, mask_pred_results = self.panoptic_head.simple_test(
+            feats, img_metas, **kwargs)
+        results = self.panoptic_fusion_head.simple_test(
+            mask_cls_results, mask_pred_results, img_metas, **kwargs)
+        for result in results:
+            if 'pan_results' in result:
+                pan_results = result['pan_results']
+                result['pan_results'] = pan_results.detach().cpu().numpy()
+
+            if 'ins_results' in result:
+                labels_per_image, bboxes, mask_pred_binary = result[
+                    'ins_results']
+                bbox_results = bbox2result(bboxes, labels_per_image,
+                                           self.num_things_classes)
+                mask_results = [[] for _ in range(self.num_things_classes)]
+                for j, label in enumerate(labels_per_image):
+                    mask = mask_pred_binary[j].detach().cpu().numpy()
+                    mask_results[label].append(mask)
+                result['ins_results'] = bbox_results, mask_results
+
+            assert 'sem_results' not in result, 'semantic segmentation '\
+                'results are not supported yet.'
+
+        if self.num_stuff_classes == 0:
+            results = [res['ins_results'] for res in results]
+
+        return results
+
+    def aug_test(self, imgs, img_metas, **kwargs):
+        raise NotImplementedError
+
+    def onnx_export(self, img, img_metas):
+        raise NotImplementedError
+
+    def _show_pan_result(self,
+                         img,
+                         result,
+                         score_thr=0.3,
+                         bbox_color=(72, 101, 241),
+                         text_color=(72, 101, 241),
+                         mask_color=None,
+                         thickness=2,
+                         font_size=13,
+                         win_name='',
+                         show=False,
+                         wait_time=0,
+                         out_file=None):
+        """Draw `panoptic result` over `img`.
+
+        Args:
+            img (str or Tensor): The image to be displayed.
+            result (dict): The results; only ``result['pan_results']`` is
+                read.
+            score_thr (float, optional): Minimum score of bboxes to be shown.
+                Default: 0.3. Currently not used by this method.
+            bbox_color (str or tuple(int) or :obj:`Color`):Color of bbox lines.
+               Tuple colors are in BGR order. Default: (72, 101, 241).
+            text_color (str or tuple(int) or :obj:`Color`):Color of texts.
+               Tuple colors are in BGR order. Default: (72, 101, 241).
+            mask_color (None or str or tuple(int) or :obj:`Color`):
+               Color of masks. The tuple of color should be in BGR order.
+               Default: None.
+            thickness (int): Thickness of lines. Default: 2.
+            font_size (int): Font size of texts. Default: 13.
+            win_name (str): The window name. Default: ''.
+            wait_time (float): Value of waitKey param. Default: 0.
+            show (bool): Whether to show the image.
+                Default: False.
+            out_file (str or None): The filename to write the image.
+                Default: None.
+
+        Returns:
+            The drawn image, only when neither `show` nor `out_file` is
+            set; otherwise nothing is returned.
+        """
+        img = mmcv.imread(img).copy()
+        pan_results = result['pan_results']
+        # keep objects ahead
+        ids = np.unique(pan_results)[::-1]
+        legal_indices = ids != self.num_classes  # for VOID label
+        ids = ids[legal_indices]
+        labels = np.array([seg_id % INSTANCE_OFFSET for seg_id in ids],
+                          dtype=np.int64)
+        segms = (pan_results[None] == ids[:, None, None])
+
+        # if out_file specified, do not show image in window
+        if out_file is not None:
+            show = False
+        # draw the segments and their labels
+        img = imshow_det_bboxes(
+            img,
+            segms=segms,
+            labels=labels,
+            class_names=self.CLASSES,
+            bbox_color=bbox_color,
+            text_color=text_color,
+            mask_color=mask_color,
+            thickness=thickness,
+            font_size=font_size,
+            win_name=win_name,
+            show=show,
+            wait_time=wait_time,
+            out_file=out_file)
+
+        if not (show or out_file):
+            return img
diff --git a/mmdet/models/detectors/nasfcos.py b/mmdet/models/detectors/nasfcos.py
new file mode 100755
index 0000000..a34c228
--- /dev/null
+++ b/mmdet/models/detectors/nasfcos.py
@@ -0,0 +1,22 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class NASFCOS(SingleStageDetector):
+    """NAS-FCOS: Fast Neural Architecture Search for Object Detection.
+
+    https://arxiv.org/abs/1906.04423
+
+    Adds nothing to ``SingleStageDetector``: the constructor hands its
+    arguments to the parent positionally, in the order they are
+    declared.
+    """
+
+    def __init__(self, backbone, neck, bbox_head, train_cfg=None,
+                 test_cfg=None, pretrained=None, init_cfg=None):
+        # Positional pass-through, same order as the signature above.
+        parent_args = (backbone, neck, bbox_head, train_cfg, test_cfg,
+                       pretrained, init_cfg)
+        super().__init__(*parent_args)
diff --git a/mmdet/models/detectors/paa.py b/mmdet/models/detectors/paa.py
new file mode 100755
index 0000000..f5cb837
--- /dev/null
+++ b/mmdet/models/detectors/paa.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class PAA(SingleStageDetector):
+    """Implementation of `PAA <https://arxiv.org/pdf/2007.08103.pdf>`_.
+
+    Adds nothing to ``SingleStageDetector``: the constructor hands its
+    arguments to the parent positionally, in the order they are declared.
+    """
+
+    def __init__(self, backbone, neck, bbox_head, train_cfg=None,
+                 test_cfg=None, pretrained=None, init_cfg=None):
+        # Positional pass-through, same order as the signature above.
+        parent_args = (backbone, neck, bbox_head, train_cfg, test_cfg,
+                       pretrained, init_cfg)
+        super().__init__(*parent_args)
diff --git a/mmdet/models/detectors/panoptic_fpn.py b/mmdet/models/detectors/panoptic_fpn.py
new file mode 100755
index 0000000..f8ac751
--- /dev/null
+++ b/mmdet/models/detectors/panoptic_fpn.py
@@ -0,0 +1,34 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .panoptic_two_stage_segmentor import TwoStagePanopticSegmentor
+
+
+@DETECTORS.register_module()
+class PanopticFPN(TwoStagePanopticSegmentor):
+    r"""Implementation of `Panoptic feature pyramid
+    networks <https://arxiv.org/pdf/1901.02446>`_
+
+    A pure pass-through: the constructor forwards every argument, by
+    keyword, to ``TwoStagePanopticSegmentor``.
+    """
+
+    def __init__(
+            self,
+            backbone,
+            neck=None,
+            rpn_head=None,
+            roi_head=None,
+            train_cfg=None,
+            test_cfg=None,
+            pretrained=None,
+            init_cfg=None,
+            # for panoptic segmentation
+            semantic_head=None,
+            panoptic_fusion_head=None):
+        # Every argument is handed to the parent unchanged, by keyword.
+        super().__init__(
+            backbone=backbone, neck=neck, rpn_head=rpn_head,
+            roi_head=roi_head, train_cfg=train_cfg, test_cfg=test_cfg,
+            pretrained=pretrained, init_cfg=init_cfg,
+            semantic_head=semantic_head,
+            panoptic_fusion_head=panoptic_fusion_head)
diff --git a/mmdet/models/detectors/panoptic_two_stage_segmentor.py b/mmdet/models/detectors/panoptic_two_stage_segmentor.py
new file mode 100755
index 0000000..5ad49ba
--- /dev/null
+++ b/mmdet/models/detectors/panoptic_two_stage_segmentor.py
@@ -0,0 +1,279 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import torch
+
+from mmdet.core import INSTANCE_OFFSET, bbox2roi, multiclass_nms
+from mmdet.core.visualization import imshow_det_bboxes
+from ..builder import DETECTORS, build_head
+from ..roi_heads.mask_heads.fcn_mask_head import _do_paste_mask
+from .two_stage import TwoStageDetector
+
+
+@DETECTORS.register_module()
+class TwoStagePanopticSegmentor(TwoStageDetector):
+    """Base class of Two-stage Panoptic Segmentor.
+
+    As well as the components in TwoStageDetector, Panoptic Segmentor has extra
+    semantic_head and panoptic_fusion_head.
+    """
+
+    def __init__(
+            self,
+            backbone,
+            neck=None,
+            rpn_head=None,
+            roi_head=None,
+            train_cfg=None,
+            test_cfg=None,
+            pretrained=None,
+            init_cfg=None,
+            # for panoptic segmentation
+            semantic_head=None,
+            panoptic_fusion_head=None):
+        """Build the base detector, then the optional panoptic heads."""
+        super().__init__(backbone, neck, rpn_head, roi_head, train_cfg,
+                         test_cfg, pretrained, init_cfg)
+        if semantic_head is not None:
+            self.semantic_head = build_head(semantic_head)
+        if panoptic_fusion_head is None:
+            return
+        fusion_test_cfg = None if test_cfg is None else test_cfg.panoptic
+        # Copy first so the caller's config is left untouched.
+        fusion_cfg = panoptic_fusion_head.deepcopy()
+        fusion_cfg.update(test_cfg=fusion_test_cfg)
+        fusion_head = build_head(fusion_cfg)
+        self.panoptic_fusion_head = fusion_head
+        self.num_things_classes = fusion_head.num_things_classes
+        self.num_stuff_classes = fusion_head.num_stuff_classes
+        self.num_classes = fusion_head.num_classes
+
+    @property
+    def with_semantic_head(self):
+        """bool: whether ``semantic_head`` exists and is not None."""
+        return getattr(self, 'semantic_head', None) is not None
+
+    @property
+    def with_panoptic_fusion_head(self):
+        """bool: whether ``panoptic_fusion_head`` exists and is not None."""
+        return getattr(self, 'panoptic_fusion_head', None) is not None
+
+    def forward_dummy(self, img):
+        """Dummy forward for FLOPs counting; always raises here.
+
+        See `mmdetection/tools/get_flops.py`
+        """
+        raise NotImplementedError(
+            f'`forward_dummy` is not implemented in {self.__class__.__name__}')
+
+    def forward_train(self,
+                      img,
+                      img_metas,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None,
+                      gt_masks=None,
+                      gt_semantic_seg=None,
+                      proposals=None,
+                      **kwargs):
+        """Run one training forward pass and collect every loss term.
+
+        Proposals come from the RPN head when ``self.with_rpn`` is true,
+        otherwise from the ``proposals`` argument.
+
+        Returns:
+            dict: RPN (if any), RoI-head and semantic-head losses merged.
+        """
+        feats = self.extract_feat(img)
+        losses = {}
+        proposal_list = proposals
+        if self.with_rpn:
+            # use the test-time RPN config when 'rpn_proposal' is not set
+            proposal_cfg = self.train_cfg.get('rpn_proposal',
+                                              self.test_cfg.rpn)
+            rpn_losses, proposal_list = self.rpn_head.forward_train(
+                feats, img_metas, gt_bboxes, gt_labels=None,
+                gt_bboxes_ignore=gt_bboxes_ignore, proposal_cfg=proposal_cfg)
+            losses.update(rpn_losses)
+
+        roi_losses = self.roi_head.forward_train(
+            feats, img_metas, proposal_list, gt_bboxes, gt_labels,
+            gt_bboxes_ignore, gt_masks, **kwargs)
+        losses.update(roi_losses)
+
+        semantic_loss = self.semantic_head.forward_train(feats, gt_semantic_seg)
+        losses.update(semantic_loss)
+        return losses
+
+    def simple_test_mask(self,
+                         x,
+                         img_metas,
+                         det_bboxes,
+                         det_labels,
+                         rescale=False):
+        """Simple test for mask head without augmentation.
+
+        Returns:
+            dict: The mask-head results with an added ``masks`` list holding
+                one tensor per image, sized to ``ori_shape`` when ``rescale``
+                is set and to ``pad_shape`` otherwise.
+        """
+        shape_key = 'ori_shape' if rescale else 'pad_shape'
+        img_shapes = tuple(meta[shape_key] for meta in img_metas)
+        scale_factors = tuple(meta['scale_factor'] for meta in img_metas)
+
+        if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes):
+            # No detections in any image: return empty placeholders. The class
+            # dimension follows the bbox head instead of a hard-coded 80.
+            num_classes = self.roi_head.bbox_head.num_classes
+            masks = [
+                det_bboxes[0].new_zeros((0, num_classes) + img_shape[:2])
+                for img_shape in img_shapes
+            ]
+            mask_pred = det_bboxes[0].new_zeros((0, num_classes, 28, 28))
+            return dict(masks=masks, mask_pred=mask_pred, mask_feats=None)
+
+        _bboxes = [det_bbox[:, :4] for det_bbox in det_bboxes]
+        if rescale:
+            if not isinstance(scale_factors[0], float):
+                scale_factors = [
+                    det_bboxes[0].new_tensor(scale_factor)
+                    for scale_factor in scale_factors
+                ]
+            # multiply boxes by each image's scale_factor before RoI pooling
+            _bboxes = [b * s for b, s in zip(_bboxes, scale_factors)]
+
+        mask_rois = bbox2roi(_bboxes)
+        mask_results = self.roi_head._mask_forward(x, mask_rois)
+        # split batch mask prediction back to each image
+        num_mask_roi_per_img = [len(det_bbox) for det_bbox in det_bboxes]
+        mask_preds = mask_results['mask_pred'].split(num_mask_roi_per_img, 0)
+
+        # resize the mask_preds to (K, H, W)
+        masks = []
+        for i, det_bbox in enumerate(det_bboxes):
+            mask_pred = mask_preds[i].sigmoid()
+            # keep, for every box, only the channel of its predicted label
+            box_inds = torch.arange(mask_pred.shape[0])
+            mask_pred = mask_pred[box_inds, det_labels[i]][:, None]
+
+            img_h, img_w, _ = img_shapes[i]
+            mask_pred, _ = _do_paste_mask(
+                mask_pred, det_bbox[:, :4], img_h, img_w, skip_empty=False)
+            masks.append(mask_pred)
+
+        mask_results['masks'] = masks
+
+        return mask_results
+
+    def simple_test(self, img, img_metas, proposals=None, rescale=False):
+        """Test without augmentation and fuse the per-image predictions.
+
+        Returns:
+            list[dict]: one dict per image; ``pan_results`` holds the output
+                of the panoptic fusion head converted to an int numpy array.
+        """
+        feats = self.extract_feat(img)
+        proposal_list = proposals
+        if proposal_list is None:
+            proposal_list = self.rpn_head.simple_test_rpn(feats, img_metas)
+
+        bboxes, scores = self.roi_head.simple_test_bboxes(
+            feats, img_metas, proposal_list, None, rescale=rescale)
+
+        # class-wise predictions
+        pan_cfg = self.test_cfg.panoptic
+        det_bboxes, det_labels = [], []
+        for img_bboxes, img_scores in zip(bboxes, scores):
+            det_bbox, det_label = multiclass_nms(
+                img_bboxes, img_scores, pan_cfg.score_thr, pan_cfg.nms,
+                pan_cfg.max_per_img)
+            det_bboxes.append(det_bbox)
+            det_labels.append(det_label)
+
+        mask_results = self.simple_test_mask(
+            feats, img_metas, det_bboxes, det_labels, rescale=rescale)
+        masks = mask_results['masks']
+
+        seg_preds = self.semantic_head.simple_test(feats, img_metas, rescale)
+
+        results = []
+        for idx, det_bbox in enumerate(det_bboxes):
+            fused = self.panoptic_fusion_head.simple_test(
+                det_bbox, det_labels[idx], masks[idx], seg_preds[idx])
+            results.append(
+                dict(pan_results=fused.int().detach().cpu().numpy()))
+        return results
+
+    def show_result(self,
+                    img,
+                    result,
+                    score_thr=0.3,
+                    bbox_color=(72, 101, 241),
+                    text_color=(72, 101, 241),
+                    mask_color=None,
+                    thickness=2,
+                    font_size=13,
+                    win_name='',
+                    show=False,
+                    wait_time=0,
+                    out_file=None):
+        """Draw the panoptic segments of `result` over `img`.
+
+        Args:
+            img (str or Tensor): The image to be displayed.
+            result (dict): The results; only ``result['pan_results']``
+                is read.
+            score_thr (float, optional): Accepted but not used by this
+                method. Default: 0.3.
+            bbox_color (str or tuple(int) or :obj:`Color`): Color of bbox
+               lines. A tuple should be in BGR order. Default: (72, 101, 241).
+            text_color (str or tuple(int) or :obj:`Color`): Color of texts.
+               A tuple should be in BGR order. Default: (72, 101, 241).
+            mask_color (None or str or tuple(int) or :obj:`Color`):
+               Color of masks. The tuple of color should be in BGR order.
+               Default: None.
+            thickness (int): Thickness of lines. Default: 2.
+            font_size (int): Font size of texts. Default: 13.
+            win_name (str): The window name. Default: ''.
+            wait_time (float): Value of waitKey param. Default: 0.
+            show (bool): Whether to show the image; forced to False when
+                `out_file` is given. Default: False.
+            out_file (str or None): The filename to write the image.
+                Default: None.
+
+        Returns:
+            The drawn image, only when neither `show` nor `out_file` is
+            set; otherwise None.
+        """
+        canvas = mmcv.imread(img)
+        canvas = canvas.copy()
+        pan_results = result['pan_results']
+        # keep objects ahead
+        ids = np.unique(pan_results)[::-1]
+        ids = ids[ids != self.num_classes]  # for VOID label
+        labels = np.array(
+            [pan_id % INSTANCE_OFFSET for pan_id in ids], dtype=np.int64)
+        segms = pan_results[None] == ids[:, None, None]
+
+        # if out_file specified, do not show image in window
+        show = show if out_file is None else False
+        # draw the segments and their labels
+        canvas = imshow_det_bboxes(
+            canvas,
+            segms=segms,
+            labels=labels,
+            class_names=self.CLASSES,
+            bbox_color=bbox_color,
+            text_color=text_color,
+            mask_color=mask_color,
+            thickness=thickness,
+            font_size=font_size,
+            win_name=win_name,
+            show=show,
+            wait_time=wait_time,
+            out_file=out_file)
+
+        if show or out_file:
+            return None
+        return canvas
diff --git a/mmdet/models/detectors/point_rend.py b/mmdet/models/detectors/point_rend.py
new file mode 100755
index 0000000..90eb4d4
--- /dev/null
+++ b/mmdet/models/detectors/point_rend.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .two_stage import TwoStageDetector
+
+
+@DETECTORS.register_module()
+class PointRend(TwoStageDetector):
+    """PointRend: Image Segmentation as Rendering
+
+    A two-stage detector implementing
+    `PointRend <https://arxiv.org/abs/1912.08193>`_.
+
+    """
+
+    def __init__(self,
+                 backbone,
+                 rpn_head,
+                 roi_head,
+                 train_cfg,
+                 test_cfg,
+                 neck=None,
+                 pretrained=None,
+                 init_cfg=None):
+        # Pure pass-through: arguments are forwarded by keyword.
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg, test_cfg=test_cfg,
+            pretrained=pretrained,
+            init_cfg=init_cfg)
diff --git a/mmdet/models/detectors/queryinst.py b/mmdet/models/detectors/queryinst.py
new file mode 100755
index 0000000..5fc216c
--- /dev/null
+++ b/mmdet/models/detectors/queryinst.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .sparse_rcnn import SparseRCNN
+
+
+@DETECTORS.register_module()
+class QueryInst(SparseRCNN):
+    r"""QueryInst detector, see
+    `Instances as Queries <http://arxiv.org/abs/2105.01928>`_"""
+
+    def __init__(self,
+                 backbone,
+                 rpn_head,
+                 roi_head,
+                 train_cfg,
+                 test_cfg,
+                 neck=None,
+                 pretrained=None,
+                 init_cfg=None):
+        # Pure pass-through: arguments are forwarded by keyword.
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg, test_cfg=test_cfg,
+            pretrained=pretrained,
+            init_cfg=init_cfg)
diff --git a/mmdet/models/detectors/reppoints_detector.py b/mmdet/models/detectors/reppoints_detector.py
new file mode 100755
index 0000000..f1986cd
--- /dev/null
+++ b/mmdet/models/detectors/reppoints_detector.py
@@ -0,0 +1,24 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class RepPointsDetector(SingleStageDetector):
+    """RepPoints: Point Set Representation for Object Detection.
+
+    Single-stage detector implementing the RepPoints detector
+    (https://arxiv.org/pdf/1904.11490).
+    """
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 bbox_head,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super().__init__(
+            backbone, neck, bbox_head, train_cfg, test_cfg, pretrained,
+            init_cfg)
diff --git a/mmdet/models/detectors/retinanet.py b/mmdet/models/detectors/retinanet.py
new file mode 100755
index 0000000..c28545a
--- /dev/null
+++ b/mmdet/models/detectors/retinanet.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class RetinaNet(SingleStageDetector):
+    """Single-stage `RetinaNet <https://arxiv.org/abs/1708.02002>`_ detector."""
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 bbox_head,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super().__init__(backbone, neck, bbox_head, train_cfg, test_cfg,
+                         pretrained, init_cfg)
diff --git a/mmdet/models/detectors/rpn.py b/mmdet/models/detectors/rpn.py
new file mode 100755
index 0000000..707e02b
--- /dev/null
+++ b/mmdet/models/detectors/rpn.py
@@ -0,0 +1,162 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from inspect import signature
+
+import mmcv
+import torch
+from mmcv.image import tensor2imgs
+
+from mmdet.core import bbox_mapping
+from ..builder import DETECTORS, build_backbone, build_head, build_neck
+from .base import BaseDetector
+
+
+@DETECTORS.register_module()
+class RPN(BaseDetector):
+    """Implementation of Region Proposal Network."""
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 rpn_head,
+                 train_cfg,
+                 test_cfg,
+                 pretrained=None,
+                 init_cfg=None):
+        """Build the backbone, the optional neck and the RPN head."""
+        super().__init__(init_cfg)
+        if pretrained:
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            backbone.pretrained = pretrained
+        self.backbone = build_backbone(backbone)
+        self.neck = None if neck is None else build_neck(neck)
+        rpn_head.update(train_cfg=None if train_cfg is None else train_cfg.rpn)
+        rpn_head.update(test_cfg=test_cfg.rpn)
+        self.rpn_head = build_head(rpn_head)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    def extract_feat(self, img):
+        """Compute backbone features and pass them through the neck, if any.
+
+        Args:
+            img (torch.Tensor): Image tensor with shape (n, c, h ,w).
+
+        Returns:
+            list[torch.Tensor]: Multi-level features that may have
+                different resolutions.
+        """
+        feats = self.backbone(img)
+        if not self.with_neck:
+            return feats
+        return self.neck(feats)
+
+    def forward_dummy(self, img):
+        """Dummy forward: return the RPN head outputs for ``img``."""
+        feats = self.extract_feat(img)
+        # call the head directly on the extracted features
+        return self.rpn_head(feats)
+
+    def forward_train(self,
+                      img,
+                      img_metas,
+                      gt_bboxes=None,
+                      gt_bboxes_ignore=None):
+        """Compute the RPN losses for one batch.
+
+        Args:
+            img (Tensor): Input images of shape (N, C, H, W).
+                Typically these should be mean centered and std scaled.
+            img_metas (list[dict]): A List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details see :class:`mmdet.datasets.pipelines.Collect`.
+            gt_bboxes (list[Tensor]): Each item are the truth boxes for each
+                image in [tl_x, tl_y, br_x, br_y] format.
+            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
+                boxes can be ignored when computing the loss.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        rpn_cfg = self.train_cfg.rpn
+        if isinstance(rpn_cfg, dict) and rpn_cfg.get('debug', False):
+            # debug mode: attach the converted input images to the head
+            self.rpn_head.debug_imgs = tensor2imgs(img)
+
+        feats = self.extract_feat(img)
+        return self.rpn_head.forward_train(feats, img_metas, gt_bboxes, None,
+                                           gt_bboxes_ignore)
+
+    def simple_test(self, img, img_metas, rescale=False):
+        """Test function without test time augmentation.
+
+        Args:
+            img (torch.Tensor): Input images.
+            img_metas (list[dict]): List of image information.
+            rescale (bool, optional): Whether to divide the proposal boxes
+                by each image's ``scale_factor``. Defaults to False.
+
+        Returns:
+            list[np.ndarray]: proposals (tensors during ONNX export).
+        """
+        feats = self.extract_feat(img)
+        # get origin input shape to onnx dynamic input shape
+        if torch.onnx.is_in_onnx_export():
+            img_metas[0]['img_shape_for_onnx'] = torch._shape_as_tensor(img)[2:]
+        proposal_list = self.rpn_head.simple_test_rpn(feats, img_metas)
+        if rescale:
+            for proposals, meta in zip(proposal_list, img_metas):
+                # in-place, so the tensors held in proposal_list are updated
+                proposals[:, :4] /= proposals.new_tensor(meta['scale_factor'])
+        if torch.onnx.is_in_onnx_export():
+            return proposal_list
+        return [proposal.cpu().numpy() for proposal in proposal_list]
+
+    def aug_test(self, imgs, img_metas, rescale=False):
+        """Test function with test time augmentation.
+
+        Args:
+            imgs (list[torch.Tensor]): List of multiple images
+            img_metas (list[list[dict]]): Image information; the dicts of
+                ``img_metas[0]`` supply the mapping parameters.
+            rescale (bool, optional): When False, every proposal set is
+                passed through ``bbox_mapping`` with the matching dict of
+                ``img_metas[0]``. Defaults to False.
+
+        Returns:
+            list[np.ndarray]: proposals, one array per entry returned by
+                the RPN head.
+        """
+        feats = self.extract_feats(imgs)
+        proposal_list = self.rpn_head.aug_test_rpn(feats, img_metas)
+        if not rescale:
+            # the boxes are overwritten in place, scores stay untouched
+            for proposals, meta in zip(proposal_list, img_metas[0]):
+                proposals[:, :4] = bbox_mapping(
+                    proposals[:, :4], meta['img_shape'], meta['scale_factor'],
+                    meta['flip'], meta['flip_direction'])
+        return [proposal.cpu().numpy() for proposal in proposal_list]
+
+    def show_result(self, data, result, top_k=20, **kwargs):
+        """Show RPN proposals on the image.
+
+        Args:
+            data (str or np.ndarray): Image filename or loaded image.
+            result (Tensor or tuple): The results to draw over `img`
+                bbox_result or (bbox_result, segm_result).
+            top_k (int): Plot the first k bboxes only
+               if set positive. Default: 20
+            **kwargs: Only keys accepted by ``mmcv.imshow_bboxes`` are
+               forwarded; ``colors`` is always forced to 'green'.
+
+        Returns:
+            np.ndarray: The image with bboxes drawn on it.
+        """
+        kwargs['colors'] = 'green'
+        # drop options that mmcv.imshow_bboxes does not accept
+        accepted = signature(mmcv.imshow_bboxes).parameters
+        kwargs = {k: v for k, v in kwargs.items() if k in accepted}
+        return mmcv.imshow_bboxes(data, result, top_k=top_k, **kwargs)
diff --git a/mmdet/models/detectors/scnet.py b/mmdet/models/detectors/scnet.py
new file mode 100755
index 0000000..a361d81
--- /dev/null
+++ b/mmdet/models/detectors/scnet.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .cascade_rcnn import CascadeRCNN
+
+
+@DETECTORS.register_module()
+class SCNet(CascadeRCNN):
+    """`SCNet <https://arxiv.org/abs/2012.10150>`_ built on CascadeRCNN."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
diff --git a/mmdet/models/detectors/single_stage.py b/mmdet/models/detectors/single_stage.py
new file mode 100755
index 0000000..c375c72
--- /dev/null
+++ b/mmdet/models/detectors/single_stage.py
@@ -0,0 +1,171 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+
+from mmdet.core import bbox2result
+from ..builder import DETECTORS, build_backbone, build_head, build_neck
+from .base import BaseDetector
+
+
+@DETECTORS.register_module()
+class SingleStageDetector(BaseDetector):
+    """Base class for single-stage detectors.
+
+    Single-stage detectors directly and densely predict bounding boxes on the
+    output features of the backbone+neck.
+    """
+
+    def __init__(self,
+                 backbone,
+                 neck=None,
+                 bbox_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        """Build the backbone, the optional neck and the bbox head."""
+        super().__init__(init_cfg)
+        if pretrained:
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            backbone.pretrained = pretrained
+        self.backbone = build_backbone(backbone)
+        if neck is not None:
+            self.neck = build_neck(neck)
+        bbox_head.update(train_cfg=train_cfg, test_cfg=test_cfg)
+        self.bbox_head = build_head(bbox_head)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    def extract_feat(self, img):
+        """Run the backbone, then the neck when one is configured."""
+        feats = self.backbone(img)
+        if not self.with_neck:
+            return feats
+        return self.neck(feats)
+
+    def forward_dummy(self, img):
+        """Used for computing network flops.
+
+        See `mmdetection/tools/analysis_tools/get_flops.py`
+        """
+        feats = self.extract_feat(img)
+        # call the head directly on the extracted features
+        return self.bbox_head(feats)
+
+    def forward_train(self,
+                      img,
+                      img_metas,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None):
+        """Compute the bbox-head losses for one batch.
+
+        Args:
+            img (Tensor): Input images of shape (N, C, H, W).
+                Typically these should be mean centered and std scaled.
+            img_metas (list[dict]): A List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details see :class:`mmdet.datasets.pipelines.Collect`.
+            gt_bboxes (list[Tensor]): Each item are the truth boxes for each
+                image in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box
+            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
+                boxes can be ignored when computing the loss.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        # the base-class hook runs before feature extraction
+        super(SingleStageDetector, self).forward_train(img, img_metas)
+        feats = self.extract_feat(img)
+        return self.bbox_head.forward_train(feats, img_metas, gt_bboxes,
+                                            gt_labels, gt_bboxes_ignore)
+
+    def simple_test(self, img, img_metas, rescale=False):
+        """Test function without test-time augmentation.
+
+        Args:
+            img (torch.Tensor): Images with shape (N, C, H, W).
+            img_metas (list[dict]): List of image information.
+            rescale (bool, optional): Whether to rescale the results;
+                forwarded to the bbox head. Defaults to False.
+
+        Returns:
+            list[list[np.ndarray]]: BBox results of each image and classes.
+                The outer list corresponds to each image. The inner list
+                corresponds to each class.
+        """
+        feats = self.extract_feat(img)
+        head = self.bbox_head
+        results_list = head.simple_test(feats, img_metas, rescale=rescale)
+        # one bbox2result call per (bboxes, labels) pair from the head
+        return [
+            bbox2result(bboxes, labels, head.num_classes)
+            for bboxes, labels in results_list
+        ]
+
+    def aug_test(self, imgs, img_metas, rescale=False):
+        """Test function with test time augmentation.
+
+        Args:
+            imgs (list[Tensor]): the outer list indicates test-time
+                augmentations and inner Tensor should have a shape NxCxHxW,
+                which contains all images in the batch.
+            img_metas (list[list[dict]]): the outer list indicates test-time
+                augs (multiscale, flip, etc.) and the inner list indicates
+                images in a batch. each dict has image information.
+            rescale (bool, optional): Whether to rescale the results;
+                forwarded to the bbox head. Defaults to False.
+
+        Returns:
+            list[list[np.ndarray]]: BBox results of each image and classes.
+                The outer list corresponds to each image. The inner list
+                corresponds to each class.
+        """
+        assert hasattr(self.bbox_head, 'aug_test'), \
+            f'{self.bbox_head.__class__.__name__}' \
+            ' does not support test-time augmentation'
+
+        aug_feats = self.extract_feats(imgs)
+        merged = self.bbox_head.aug_test(aug_feats, img_metas, rescale=rescale)
+
+        # one bbox2result call per (bboxes, labels) pair from the head
+        return [
+            bbox2result(bboxes, labels, self.bbox_head.num_classes)
+            for bboxes, labels in merged
+        ]
+
+    def onnx_export(self, img, img_metas, with_nms=True):
+        """Export-time forward: run the head and its ``onnx_export`` method.
+
+        Args:
+            img (torch.Tensor): input images.
+            img_metas (list[dict]): List of image information; the first
+                entry gets ``img_shape_for_onnx`` and ``pad_shape_for_onnx``.
+            with_nms (bool): Forwarded to the head's ``onnx_export``.
+                Defaults to True.
+
+        Returns:
+            tuple[Tensor, Tensor]: dets of shape [N, num_det, 5]
+                and class labels of shape [N, num_det].
+        """
+        feats = self.extract_feat(img)
+        outs = self.bbox_head(feats)
+        # get origin input shape (as tensor) to support onnx dynamic shape
+        img_shape = torch._shape_as_tensor(img)[2:]
+        first_meta = img_metas[0]
+        first_meta['img_shape_for_onnx'] = img_shape
+        # get pad input shape to support onnx dynamic shape for exporting
+        # `CornerNet` and `CentripetalNet`, which 'pad_shape' is used
+        # for inference
+        first_meta['pad_shape_for_onnx'] = img_shape
+        if len(outs) == 2:
+            # add dummy score_factor
+            outs = (*outs, None)
+        # TODO Can we change to `get_bboxes` when `onnx_export` fail
+        dets, labels = self.bbox_head.onnx_export(
+            *outs, img_metas, with_nms=with_nms)
+        return dets, labels
diff --git a/mmdet/models/detectors/single_stage_instance_seg.py b/mmdet/models/detectors/single_stage_instance_seg.py
new file mode 100755
index 0000000..239b669
--- /dev/null
+++ b/mmdet/models/detectors/single_stage_instance_seg.py
@@ -0,0 +1,363 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+
+import mmcv
+import numpy as np
+import torch
+
+from mmdet.core.visualization.image import imshow_det_bboxes
+from ..builder import DETECTORS, build_backbone, build_head, build_neck
+from .base import BaseDetector
+
+INF = 1e8
+
+
+@DETECTORS.register_module()
+class SingleStageInstanceSegmentor(BaseDetector):
+    """Base class for single-stage instance segmentors."""
+
+    def __init__(self,
+                 backbone,
+                 neck=None,
+                 bbox_head=None,
+                 mask_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        """Build backbone, optional neck/bbox head and the mask head."""
+        if pretrained:
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            backbone.pretrained = pretrained
+        super(SingleStageInstanceSegmentor, self).__init__(init_cfg=init_cfg)
+        self.backbone = build_backbone(backbone)
+        self.neck = build_neck(neck) if neck is not None else None
+        if bbox_head is None:
+            self.bbox_head = None
+        else:
+            # each head receives its own deep copy of both configs
+            bbox_head.update(
+                train_cfg=copy.deepcopy(train_cfg),
+                test_cfg=copy.deepcopy(test_cfg))
+            self.bbox_head = build_head(bbox_head)
+
+        assert mask_head, f'`mask_head` must ' \
+                          f'be implemented in {self.__class__.__name__}'
+        mask_head.update(
+            train_cfg=copy.deepcopy(train_cfg),
+            test_cfg=copy.deepcopy(test_cfg))
+        self.mask_head = build_head(mask_head)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    def extract_feat(self, img):
+        """Run the backbone and, when `with_neck` is true, the neck."""
+        feats = self.backbone(img)
+        if not self.with_neck:
+            return feats
+        return self.neck(feats)
+
+    def forward_dummy(self, img):
+        """Used for computing network flops; always raises in this class.
+
+        See `mmdetection/tools/analysis_tools/get_flops.py`
+        """
+        raise NotImplementedError(
+            f'`forward_dummy` is not implemented in {self.__class__.__name__}')
+
+    def forward_train(self,
+                      img,
+                      img_metas,
+                      gt_masks,
+                      gt_labels,
+                      gt_bboxes=None,
+                      gt_bboxes_ignore=None,
+                      **kwargs):
+        """Compute the losses of the optional bbox head and the mask head.
+
+        Args:
+            img (Tensor): Input images of shape (B, C, H, W).
+                Typically these should be mean centered and std scaled.
+            img_metas (list[dict]): A List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                :class:`mmdet.datasets.pipelines.Collect`.
+            gt_masks (list[:obj:`BitmapMasks`] | None) : The segmentation
+                masks for each box.
+            gt_labels (list[Tensor]): Class indices corresponding to each box.
+            gt_bboxes (list[Tensor]): Each item is the truth boxes
+                of each image in [tl_x, tl_y, br_x, br_y] format.
+                Default: None.
+            gt_bboxes_ignore (list[Tensor] | None): Specify which bounding
+                boxes can be ignored when computing the loss.
+            **kwargs: Forwarded to the bbox head loss and to the mask head.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+
+        gt_masks = [
+            mask.to_tensor(dtype=torch.bool, device=img.device)
+            for mask in gt_masks
+        ]
+        feats = self.extract_feat(img)
+        losses = {}
+
+        # stays None when the model has no bbox_head
+        positive_infos = None
+        # CondInst and YOLACT have bbox_head
+        if self.bbox_head:
+            # the head outputs form a tuple; `loss` additionally returns
+            # positive_infos, a list of obj:`InstanceData` that contains
+            # the information about the positive samples
+            det_losses, positive_infos = self.bbox_head.loss(
+                *self.bbox_head(feats),
+                gt_bboxes=gt_bboxes,
+                gt_labels=gt_labels,
+                gt_masks=gt_masks,
+                img_metas=img_metas,
+                gt_bboxes_ignore=gt_bboxes_ignore,
+                **kwargs)
+            losses.update(det_losses)
+
+        mask_loss = self.mask_head.forward_train(
+            feats,
+            gt_labels,
+            gt_masks,
+            img_metas,
+            positive_infos=positive_infos,
+            gt_bboxes=gt_bboxes,
+            gt_bboxes_ignore=gt_bboxes_ignore,
+            **kwargs)
+        # avoid loss override
+        assert set(mask_loss.keys()).isdisjoint(losses.keys())
+
+        losses.update(mask_loss)
+        return losses
+
+    def simple_test(self, img, img_metas, rescale=False):
+        """Test function without test-time augmentation.
+
+        Args:
+            img (torch.Tensor): Images with shape (B, C, H, W).
+            img_metas (list[dict]): List of image information.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list(tuple): Formatted bbox and mask results of multiple
+                images. The outer list corresponds to each image.
+                Each tuple contains two type of results of single image:
+
+                - bbox_results (list[np.ndarray]): BBox results of
+                  single image. The list corresponds to each class.
+                  each ndarray has a shape (N, 5), N is the number of
+                  bboxes with this category, and last dimension
+                  5 arrange as (x1, y1, x2, y2, scores).
+                - mask_results (list[np.ndarray]): Mask results of
+                  single image. The list corresponds to each class.
+                  each ndarray has a shape (N, img_h, img_w), N
+                  is the number of masks with this category.
+        """
+        feats = self.extract_feat(img)
+        # stays None when the model has no bbox_head
+        det_results = None
+        if self.bbox_head:
+            # det_results is list[obj:`InstanceData`]
+            det_results = self.bbox_head.get_results(
+                *self.bbox_head(feats),
+                img_metas=img_metas, cfg=self.test_cfg, rescale=rescale)
+
+        results_list = self.mask_head.simple_test(
+            feats, img_metas, rescale=rescale, instances_list=det_results)
+
+        # convert every per-image result into the dataset-facing format
+        return [
+            self.format_results(results)
+            for results in results_list
+        ]
+
+    def format_results(self, results):
+        """Format the model predictions according to the interface with
+        dataset.
+
+        Args:
+            results (:obj:`InstanceData`): Processed
+                results of single images. Usually contains
+                following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instances,)
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+
+        Returns:
+            tuple: Formatted bbox and mask results. It contains two items:
+
+                - bbox_results (list[np.ndarray]): BBox results of
+                  single image. The list corresponds to each class.
+                  each ndarray has a shape (N, 5), N is the number of
+                  bboxes with this category, and last dimension
+                  5 arrange as (x1, y1, x2, y2, scores).
+                - mask_results (list[np.ndarray]): Mask results of
+                  single image. The list corresponds to each class.
+                  each ndarray has shape (N, img_h, img_w), N
+                  is the number of masks with this category.
+        """
+        present_keys = results.keys()
+        assert 'scores' in present_keys
+        assert 'labels' in present_keys
+
+        assert 'masks' in present_keys, \
+            'results should contain ' \
+            'masks when format the results '
+        num_classes = self.mask_head.num_classes
+        mask_results = [[] for _ in range(num_classes)]
+
+        num_masks = len(results)
+        if num_masks == 0:
+            # nothing to report: one empty (0, 5) array per class
+            bbox_results = [
+                np.zeros((0, 5), dtype=np.float32)
+                for _ in range(num_classes)
+            ]
+            return bbox_results, mask_results
+
+        labels = results.labels.detach().cpu().numpy()
+
+        if 'bboxes' not in results:
+            # create dummy bbox results to store the scores
+            results.bboxes = results.scores.new_zeros(num_masks, 4)
+
+        scored_bboxes = torch.cat(
+            [results.bboxes, results.scores[:, None]], dim=-1)
+        scored_bboxes = scored_bboxes.detach().cpu().numpy()
+        bbox_results = [
+            scored_bboxes[labels == cls_id, :]
+            for cls_id in range(num_classes)
+        ]
+
+        masks = results.masks.detach().cpu().numpy()
+        # file every mask under its predicted class, in instance order
+        for idx in range(num_masks):
+            mask_results[labels[idx]].append(masks[idx])
+
+        return bbox_results, mask_results
+
+    def aug_test(self, imgs, img_metas, rescale=False):
+        raise NotImplementedError  # no test-time augmentation support here
+
+    def show_result(self,
+                    img,
+                    result,
+                    score_thr=0.3,
+                    bbox_color=(72, 101, 241),
+                    text_color=(72, 101, 241),
+                    mask_color=None,
+                    thickness=2,
+                    font_size=13,
+                    win_name='',
+                    show=False,
+                    wait_time=0,
+                    out_file=None):
+        """Draw `result` over `img`.
+
+        Args:
+            img (str or Tensor): The image to be displayed.
+            result (tuple): Format bbox and mask results.
+                It contains two items:
+
+                - bbox_results (list[np.ndarray]): BBox results of
+                  single image. The list corresponds to each class.
+                  each ndarray has a shape (N, 5), N is the number of
+                  bboxes with this category, and last dimension
+                  5 arrange as (x1, y1, x2, y2, scores).
+                - mask_results (list[np.ndarray]): Mask results of
+                  single image. The list corresponds to each class.
+                  each ndarray has shape (N, img_h, img_w), N
+                  is the number of masks with this category.
+
+            score_thr (float, optional): Minimum score of bboxes to be shown.
+                Default: 0.3.
+            bbox_color (str or tuple(int) or :obj:`Color`):Color of bbox lines.
+               Tuples should be in BGR order. Default: (72, 101, 241)
+            text_color (str or tuple(int) or :obj:`Color`):Color of texts.
+               Tuples should be in BGR order. Default: (72, 101, 241)
+            mask_color (None or str or tuple(int) or :obj:`Color`):
+               Color of masks. The tuple of color should be in BGR order.
+               Default: None
+            thickness (int): Thickness of lines. Default: 2
+            font_size (int): Font size of texts. Default: 13
+            win_name (str): The window name. Default: ''
+            wait_time (float): Value of waitKey param.
+                Default: 0.
+            show (bool): Whether to show the image. Forced to False when
+                `out_file` is given. Default: False.
+            out_file (str or None): The filename to write the image.
+                Default: None.
+
+        Returns:
+            img: Only returned when neither `show` nor `out_file` is set.
+        """
+
+        assert isinstance(result, tuple)
+        bbox_result, mask_result = result
+        bboxes = np.vstack(bbox_result)
+        img = mmcv.imread(img)
+        img = img.copy()
+        labels = [
+            np.full(bbox.shape[0], i, dtype=np.int32)
+            for i, bbox in enumerate(bbox_result)
+        ]
+        labels = np.concatenate(labels)
+        if len(labels) == 0:
+            bboxes = np.zeros([0, 5])
+            masks = np.zeros([0, 0, 0])
+        # otherwise stack the masks of all classes into a single array
+        else:
+            masks = mmcv.concat_list(mask_result)
+
+            if isinstance(masks[0], torch.Tensor):
+                masks = torch.stack(masks, dim=0).detach().cpu().numpy()
+            else:
+                masks = np.stack(masks, axis=0)
+            # all-zero box coordinates: derive the boxes from the masks
+            if bboxes[:, :4].sum() == 0:
+                num_masks = len(bboxes)
+                x_any = masks.any(axis=1)
+                y_any = masks.any(axis=2)
+                for idx in range(num_masks):
+                    x = np.where(x_any[idx, :])[0]
+                    y = np.where(y_any[idx, :])[0]
+                    if len(x) > 0 and len(y) > 0:
+                        bboxes[idx, :4] = np.array(
+                            [x[0], y[0], x[-1] + 1, y[-1] + 1],
+                            dtype=np.float32)
+        # if out_file specified, do not show image in window
+        if out_file is not None:
+            show = False
+        # draw bounding boxes
+        img = imshow_det_bboxes(
+            img,
+            bboxes,
+            labels,
+            masks,
+            class_names=self.CLASSES,
+            score_thr=score_thr,
+            bbox_color=bbox_color,
+            text_color=text_color,
+            mask_color=mask_color,
+            thickness=thickness,
+            font_size=font_size,
+            win_name=win_name,
+            show=show,
+            wait_time=wait_time,
+            out_file=out_file)
+
+        if not (show or out_file):
+            return img
diff --git a/mmdet/models/detectors/solo.py b/mmdet/models/detectors/solo.py
new file mode 100755
index 0000000..df6f6de
--- /dev/null
+++ b/mmdet/models/detectors/solo.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage_instance_seg import SingleStageInstanceSegmentor
+
+
+@DETECTORS.register_module()
+class SOLO(SingleStageInstanceSegmentor):
+    """`SOLO: Segmenting Objects by Locations
+    <https://arxiv.org/abs/1912.04488>`_
+
+    All constructor arguments are forwarded unchanged to
+    :class:`SingleStageInstanceSegmentor`.
+    """
+
+    def __init__(self,
+                 backbone,
+                 neck=None,
+                 bbox_head=None,
+                 mask_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None,
+                 pretrained=None):
+        """Hand every argument to the parent constructor by keyword."""
+        # keywords matter: the parent lists `pretrained` before `init_cfg`
+        super().__init__(
+            backbone=backbone, neck=neck,
+            bbox_head=bbox_head, mask_head=mask_head,
+            train_cfg=train_cfg, test_cfg=test_cfg,
+            init_cfg=init_cfg, pretrained=pretrained)
diff --git a/mmdet/models/detectors/solov2.py b/mmdet/models/detectors/solov2.py
new file mode 100755
index 0000000..711fcb4
--- /dev/null
+++ b/mmdet/models/detectors/solov2.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage_instance_seg import SingleStageInstanceSegmentor
+
+
+@DETECTORS.register_module()
+class SOLOv2(SingleStageInstanceSegmentor):
+    """`SOLOv2: Dynamic and Fast Instance Segmentation
+    <https://arxiv.org/abs/2003.10152>`_
+
+    All constructor arguments are forwarded unchanged to
+    :class:`SingleStageInstanceSegmentor`.
+    """
+
+    def __init__(self,
+                 backbone,
+                 neck=None,
+                 bbox_head=None,
+                 mask_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None,
+                 pretrained=None):
+        """Hand every argument to the parent constructor by keyword."""
+        # keywords matter: the parent lists `pretrained` before `init_cfg`
+        super().__init__(
+            backbone=backbone, neck=neck,
+            bbox_head=bbox_head, mask_head=mask_head,
+            train_cfg=train_cfg, test_cfg=test_cfg,
+            init_cfg=init_cfg, pretrained=pretrained)
diff --git a/mmdet/models/detectors/sparse_rcnn.py b/mmdet/models/detectors/sparse_rcnn.py
new file mode 100755
index 0000000..e90c2a5
--- /dev/null
+++ b/mmdet/models/detectors/sparse_rcnn.py
@@ -0,0 +1,111 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .two_stage import TwoStageDetector
+
+
+@DETECTORS.register_module()
+class SparseRCNN(TwoStageDetector):
+    r"""Implementation of `Sparse R-CNN: End-to-End Object Detection with
+    Learnable Proposals <https://arxiv.org/abs/2011.12450>`_"""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.with_rpn, ('Sparse R-CNN and QueryInst '
+                               'do not support external proposals')
+
+    def forward_train(self,
+                      img,
+                      img_metas,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None,
+                      gt_masks=None,
+                      proposals=None,
+                      **kwargs):
+        """Forward function of SparseR-CNN and QueryInst in train stage.
+
+        Args:
+            img (Tensor): of shape (N, C, H, W) encoding input images.
+                Typically these should be mean centered and std scaled.
+            img_metas (list[dict]): list of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                :class:`mmdet.datasets.pipelines.Collect`.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss.
+            gt_masks (List[Tensor], optional) : Segmentation masks for
+                each box. This is required to train QueryInst.
+            proposals (List[Tensor], optional): Must be None; external
+                proposals are rejected by an assertion.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+
+        assert proposals is None, ('Sparse R-CNN and QueryInst '
+                                   'do not support external proposals')
+
+        feats = self.extract_feat(img)
+        rpn_outs = self.rpn_head.forward_train(feats, img_metas)
+        proposal_boxes, proposal_features, imgs_whwh = rpn_outs
+        # only the RoI head's losses are returned from this method
+        return self.roi_head.forward_train(
+            feats,
+            proposal_boxes,
+            proposal_features,
+            img_metas,
+            gt_bboxes,
+            gt_labels,
+            gt_bboxes_ignore=gt_bboxes_ignore,
+            gt_masks=gt_masks,
+            imgs_whwh=imgs_whwh)
+
+    def simple_test(self, img, img_metas, rescale=False):
+        """Test function without test time augmentation.
+
+        Args:
+            img (torch.Tensor): Input images.
+            img_metas (list[dict]): List of image information.
+            rescale (bool): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[list[np.ndarray]]: BBox results of each image and classes.
+                The outer list corresponds to each image. The inner list
+                corresponds to each class.
+        """
+        feats = self.extract_feat(img)
+        rpn_outs = self.rpn_head.simple_test_rpn(feats, img_metas)
+        proposal_boxes, proposal_features, imgs_whwh = rpn_outs
+        # the RoI head's results are returned without further processing
+        return self.roi_head.simple_test(
+            feats,
+            proposal_boxes,
+            proposal_features,
+            img_metas,
+            imgs_whwh=imgs_whwh,
+            rescale=rescale)
+
+    def forward_dummy(self, img):
+        """Used for computing network flops.
+
+        See `mmdetection/tools/analysis_tools/get_flops.py`
+        """
+        # backbone
+        feats = self.extract_feat(img)
+        # rpn: every image gets its own placeholder meta dict
+        dummy_img_metas = [
+            dict(img_shape=(800, 1333, 3)) for _ in range(len(img))
+        ]
+        rpn_outs = self.rpn_head.simple_test_rpn(feats, dummy_img_metas)
+        proposal_boxes, proposal_features, imgs_whwh = rpn_outs
+        # roi_head (imgs_whwh is unpacked but not passed on)
+        return self.roi_head.forward_dummy(
+            feats,
+            proposal_boxes,
+            proposal_features,
+            dummy_img_metas)
diff --git a/mmdet/models/detectors/tood.py b/mmdet/models/detectors/tood.py
new file mode 100755
index 0000000..7dd18c3
--- /dev/null
+++ b/mmdet/models/detectors/tood.py
@@ -0,0 +1,23 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class TOOD(SingleStageDetector):
+    r"""Implementation of `TOOD: Task-aligned One-stage Object Detection.
+    <https://arxiv.org/abs/2108.07755>`_."""
+
+    def __init__(self, backbone, neck, bbox_head, train_cfg=None,
+                 test_cfg=None, pretrained=None, init_cfg=None):
+        """Forward every argument positionally to `SingleStageDetector`."""
+        super().__init__(backbone, neck, bbox_head, train_cfg, test_cfg,
+                         pretrained, init_cfg)
+
+    def set_epoch(self, epoch):
+        """Store the given epoch value on the bbox head.
+
+        Args:
+            epoch: Value assigned to `self.bbox_head.epoch`.
+        """
+        self.bbox_head.epoch = epoch
diff --git a/mmdet/models/detectors/trident_faster_rcnn.py b/mmdet/models/detectors/trident_faster_rcnn.py
new file mode 100755
index 0000000..fb26168
--- /dev/null
+++ b/mmdet/models/detectors/trident_faster_rcnn.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .faster_rcnn import FasterRCNN
+
+
+@DETECTORS.register_module()
+class TridentFasterRCNN(FasterRCNN):
+    """Implementation of `TridentNet <https://arxiv.org/abs/1901.01892>`_"""
+
+    def __init__(self,
+                 backbone,
+                 rpn_head,
+                 roi_head,
+                 train_cfg,
+                 test_cfg,
+                 neck=None,
+                 pretrained=None,
+                 init_cfg=None):
+        """Build the detector and copy the backbone's branch settings.
+
+        Asserts that backbone and RoI head agree on those settings.
+        """
+        super().__init__(
+            backbone=backbone, neck=neck,
+            rpn_head=rpn_head, roi_head=roi_head,
+            train_cfg=train_cfg, test_cfg=test_cfg,
+            pretrained=pretrained, init_cfg=init_cfg)
+        backbone_, roi_head_ = self.backbone, self.roi_head
+        assert backbone_.num_branch == roi_head_.num_branch
+        assert backbone_.test_branch_idx == roi_head_.test_branch_idx
+        self.num_branch = backbone_.num_branch
+        self.test_branch_idx = backbone_.test_branch_idx
+
+    def simple_test(self, img, img_metas, proposals=None, rescale=False):
+        """Test without augmentation.
+
+        `img_metas` is repeated once per evaluated branch. This is done up
+        front so that it is also defined when `proposals` is passed in.
+        """
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        x = self.extract_feat(img)
+        num_branch = (self.num_branch if self.test_branch_idx == -1 else 1)
+        trident_img_metas = img_metas * num_branch
+        if proposals is None:
+            proposals = self.rpn_head.simple_test_rpn(x, trident_img_metas)
+        return self.roi_head.simple_test(
+            x, proposals, trident_img_metas, rescale=rescale)
+
+    def aug_test(self, imgs, img_metas, rescale=False):
+        """Test with augmentations.
+
+        If rescale is False, then returned bboxes and masks will fit the scale
+        of imgs[0]. Only the RPN receives the branch-repeated metas.
+        """
+        feats = self.extract_feats(imgs)
+        repeat = self.num_branch if self.test_branch_idx == -1 else 1
+        rpn_metas = [metas * repeat for metas in img_metas]
+        proposal_list = self.rpn_head.aug_test_rpn(feats, rpn_metas)
+        return self.roi_head.aug_test(
+            feats, proposal_list, img_metas, rescale=rescale)
+
+    def forward_train(self, img, img_metas, gt_bboxes, gt_labels, **kwargs):
+        """Repeat metas and ground truths per branch, then train as usual.
+
+        NOTE(review): `**kwargs` is accepted but not forwarded to the parent.
+        """
+        repeat = self.num_branch
+        metas, bboxes, labels = (
+            tuple(seq * repeat) for seq in (img_metas, gt_bboxes, gt_labels))
+        return super().forward_train(img, metas, bboxes, labels)
diff --git a/mmdet/models/detectors/two_stage.py b/mmdet/models/detectors/two_stage.py
new file mode 100755
index 0000000..870e2b8
--- /dev/null
+++ b/mmdet/models/detectors/two_stage.py
@@ -0,0 +1,211 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+
+from ..builder import DETECTORS, build_backbone, build_head, build_neck
+from .base import BaseDetector
+
+
+@DETECTORS.register_module()
+class TwoStageDetector(BaseDetector):
+    """Base class for two-stage detectors.
+
+    Two-stage detectors typically consisting of a region proposal network and a
+    task-specific regression head.
+    """
+
+    def __init__(self,
+                 backbone,
+                 neck=None,
+                 rpn_head=None,
+                 roi_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        """Build the backbone and the optional neck, RPN and RoI head."""
+        super(TwoStageDetector, self).__init__(init_cfg)
+        if pretrained:
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            backbone.pretrained = pretrained
+        self.backbone = build_backbone(backbone)
+        if neck is not None:
+            self.neck = build_neck(neck)
+
+        if rpn_head is not None:
+            rpn_head_ = rpn_head.copy()
+            rpn_head_.update(
+                train_cfg=None if train_cfg is None else train_cfg.rpn,
+                test_cfg=test_cfg.rpn)
+            self.rpn_head = build_head(rpn_head_)
+        if roi_head is not None:
+            # update train and test cfg here for now
+            # TODO: refactor assigner & sampler
+            roi_head.update(
+                train_cfg=None if train_cfg is None else train_cfg.rcnn,
+                test_cfg=test_cfg.rcnn)
+            roi_head.pretrained = pretrained
+            self.roi_head = build_head(roi_head)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    @property
+    def with_rpn(self):
+        """bool: True when an `rpn_head` attribute exists and is not None"""
+        return getattr(self, 'rpn_head', None) is not None
+
+    @property
+    def with_roi_head(self):
+        """bool: True when a `roi_head` attribute exists and is not None"""
+        return getattr(self, 'roi_head', None) is not None
+
+    def extract_feat(self, img):
+        """Extract features with the backbone, then the neck if present."""
+        feats = self.backbone(img)
+        if not self.with_neck:
+            return feats
+        return self.neck(feats)
+
+    def forward_dummy(self, img):
+        """Used for computing network flops.
+
+        See `mmdetection/tools/analysis_tools/get_flops.py`
+
+        Returns:
+            tuple: RPN outputs (only when an RPN exists), then RoI outputs.
+        """
+        feats = self.extract_feat(img)
+        collected = []
+        if self.with_rpn:
+            collected.append(self.rpn_head(feats))
+        # 1000 random rows of 4 values stand in for real proposals
+        dummy_proposals = torch.randn(1000, 4).to(img.device)
+        roi_outs = self.roi_head.forward_dummy(feats, dummy_proposals)
+        collected.append(roi_outs)
+        return tuple(collected)
+
+    def forward_train(self,
+                      img,
+                      img_metas,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None,
+                      gt_masks=None,
+                      proposals=None,
+                      **kwargs):
+        """Compute the RPN (if present) and RoI head training losses.
+
+        Args:
+            img (Tensor): of shape (N, C, H, W) encoding input images.
+                Typically these should be mean centered and std scaled.
+
+            img_metas (list[dict]): list of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. See
+                `mmdet/datasets/pipelines/formatting.py:Collect` for details.
+
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+
+            gt_labels (list[Tensor]): class indices corresponding to each box
+
+            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss.
+
+            gt_masks (None | Tensor) : true segmentation masks for each box
+                used if the architecture supports a segmentation task.
+
+            proposals : override rpn proposals with custom proposals. Use when
+                `with_rpn` is False.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        feats = self.extract_feat(img)
+        losses = {}
+
+        if not self.with_rpn:
+            # no RPN: rely on the externally supplied proposals
+            proposal_list = proposals
+        else:
+            # RPN forward and loss
+            proposal_cfg = self.train_cfg.get('rpn_proposal',
+                                              self.test_cfg.rpn)
+            rpn_losses, proposal_list = self.rpn_head.forward_train(
+                feats,
+                img_metas,
+                gt_bboxes,
+                gt_labels=None,
+                gt_bboxes_ignore=gt_bboxes_ignore,
+                proposal_cfg=proposal_cfg,
+                **kwargs)
+            losses.update(rpn_losses)
+
+        # the RoI head consumes the proposals together with the ground truth
+        roi_losses = self.roi_head.forward_train(feats, img_metas,
+                                                 proposal_list, gt_bboxes,
+                                                 gt_labels, gt_bboxes_ignore,
+                                                 gt_masks, **kwargs)
+        losses.update(roi_losses)
+        return losses
+
+    async def async_simple_test(self,
+                                img,
+                                img_meta,
+                                proposals=None,
+                                rescale=False):
+        """Async test without augmentation.
+
+        The RPN is awaited only when `proposals` is None; the RoI head's
+        awaited result is returned.
+        """
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        feats = self.extract_feat(img)
+        if proposals is None:
+            proposals = await self.rpn_head.async_simple_test_rpn(
+                feats, img_meta)
+        return await self.roi_head.async_simple_test(
+            feats, proposals, img_meta, rescale=rescale)
+
+    def simple_test(self, img, img_metas, proposals=None, rescale=False):
+        """Test without augmentation.
+
+        Proposals come from the RPN unless `proposals` is passed in.
+        """
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        feats = self.extract_feat(img)
+        if proposals is None:
+            proposals = self.rpn_head.simple_test_rpn(feats, img_metas)
+
+        return self.roi_head.simple_test(
+            feats, proposals, img_metas, rescale=rescale)
+
+    def aug_test(self, imgs, img_metas, rescale=False):
+        """Test with augmentations: RPN proposals feed the RoI head.
+
+        If rescale is False, then returned bboxes and masks will fit the scale
+        of imgs[0].
+        """
+        feats = self.extract_feats(imgs)
+        rpn_proposals = self.rpn_head.aug_test_rpn(feats, img_metas)
+        return self.roi_head.aug_test(
+            feats, rpn_proposals, img_metas, rescale=rescale)
+
+    def onnx_export(self, img, img_metas):
+        """Export path: RPN proposals are handed to `roi_head.onnx_export`."""
+        img_shape = torch._shape_as_tensor(img)[2:]
+        img_metas[0]['img_shape_for_onnx'] = img_shape
+        feats = self.extract_feat(img)
+        proposals = self.rpn_head.onnx_export(feats, img_metas)
+        # the RoI head is probed only after the RPN export has run
+        if not hasattr(self.roi_head, 'onnx_export'):
+            raise NotImplementedError(
+                f'{self.__class__.__name__} can not '
+                f'be exported to ONNX. Please refer to the '
+                f'list of supported models,'
+                f'https://mmdetection.readthedocs.io/en/latest/tutorials/pytorch2onnx.html#list-of-supported-models-exportable-to-onnx'  # noqa E501
+            )
+        return self.roi_head.onnx_export(feats, proposals, img_metas)
diff --git a/mmdet/models/detectors/vfnet.py b/mmdet/models/detectors/vfnet.py
new file mode 100755
index 0000000..38ddcda
--- /dev/null
+++ b/mmdet/models/detectors/vfnet.py
@@ -0,0 +1,20 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class VFNet(SingleStageDetector):
+    """Implementation of `VarifocalNet
+    (VFNet) <https://arxiv.org/abs/2008.13367>`_"""
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 bbox_head,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super(VFNet, self).__init__(backbone, neck, bbox_head, train_cfg,
+                                    test_cfg, pretrained, init_cfg)
diff --git a/mmdet/models/detectors/yolact.py b/mmdet/models/detectors/yolact.py
new file mode 100755
index 0000000..4ddea0b
--- /dev/null
+++ b/mmdet/models/detectors/yolact.py
@@ -0,0 +1,120 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet.core import bbox2result
+from ..builder import DETECTORS, build_head
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class YOLACT(SingleStageDetector):
+    """Implementation of `YOLACT <https://arxiv.org/abs/1904.02689>`_"""
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 bbox_head,
+                 segm_head,
+                 mask_head,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super(YOLACT, self).__init__(backbone, neck, bbox_head, train_cfg,
+                                     test_cfg, pretrained, init_cfg)
+        self.segm_head = build_head(segm_head)
+        self.mask_head = build_head(mask_head)
+
+    def forward_dummy(self, img):
+        """Used for computing network flops.
+
+        See `mmdetection/tools/analysis_tools/get_flops.py`
+        """
+        feat = self.extract_feat(img)
+        bbox_outs = self.bbox_head(feat)
+        prototypes = self.mask_head.forward_dummy(feat[0])
+        return (bbox_outs, prototypes)
+
+    def forward_train(self,
+                      img,
+                      img_metas,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None,
+                      gt_masks=None):
+        """
+        Args:
+            img (Tensor): of shape (N, C, H, W) encoding input images.
+                Typically these should be mean centered and std scaled.
+            img_metas (list[dict]): list of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                `mmdet/datasets/pipelines/formatting.py:Collect`.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss.
+            gt_masks (None | Tensor) : true segmentation masks for each box
+                used if the architecture supports a segmentation task.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        # convert Bitmap mask or Polygon Mask to Tensor here
+        gt_masks = [
+            gt_mask.to_tensor(dtype=torch.uint8, device=img.device)
+            for gt_mask in gt_masks
+        ]
+
+        x = self.extract_feat(img)
+
+        cls_score, bbox_pred, coeff_pred = self.bbox_head(x)
+        bbox_head_loss_inputs = (cls_score, bbox_pred) + (gt_bboxes, gt_labels,
+                                                          img_metas)
+        losses, sampling_results = self.bbox_head.loss(
+            *bbox_head_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
+
+        segm_head_outs = self.segm_head(x[0])
+        loss_segm = self.segm_head.loss(segm_head_outs, gt_masks, gt_labels)
+        losses.update(loss_segm)
+
+        mask_pred = self.mask_head(x[0], coeff_pred, gt_bboxes, img_metas,
+                                   sampling_results)
+        loss_mask = self.mask_head.loss(mask_pred, gt_masks, gt_bboxes,
+                                        img_metas, sampling_results)
+        losses.update(loss_mask)
+
+        # check NaN and Inf
+        for loss_name in losses.keys():
+            assert torch.isfinite(torch.stack(losses[loss_name]))\
+                .all().item(), '{} becomes infinite or NaN!'\
+                .format(loss_name)
+
+        return losses
+
+    def simple_test(self, img, img_metas, rescale=False):
+        """Test function without test-time augmentation."""
+        feat = self.extract_feat(img)
+        det_bboxes, det_labels, det_coeffs = self.bbox_head.simple_test(
+            feat, img_metas, rescale=rescale)
+        bbox_results = [
+            bbox2result(det_bbox, det_label, self.bbox_head.num_classes)
+            for det_bbox, det_label in zip(det_bboxes, det_labels)
+        ]
+
+        segm_results = self.mask_head.simple_test(
+            feat,
+            det_bboxes,
+            det_labels,
+            det_coeffs,
+            img_metas,
+            rescale=rescale)
+
+        return list(zip(bbox_results, segm_results))
+
+    def aug_test(self, imgs, img_metas, rescale=False):
+        """Test with augmentations."""
+        raise NotImplementedError(
+            'YOLACT does not support test-time augmentation')
diff --git a/mmdet/models/detectors/yolo.py b/mmdet/models/detectors/yolo.py
new file mode 100755
index 0000000..0ccd417
--- /dev/null
+++ b/mmdet/models/detectors/yolo.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Copyright (c) 2019 Western Digital Corporation or its affiliates.
+import torch
+
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class YOLOV3(SingleStageDetector):
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 bbox_head,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super(YOLOV3, self).__init__(backbone, neck, bbox_head, train_cfg,
+                                     test_cfg, pretrained, init_cfg)
+
+    def onnx_export(self, img, img_metas):
+        """Test function for exporting to ONNX, without test time augmentation.
+
+        Args:
+            img (torch.Tensor): input images.
+            img_metas (list[dict]): List of image information.
+
+        Returns:
+            tuple[Tensor, Tensor]: dets of shape [N, num_det, 5]
+                and class labels of shape [N, num_det].
+        """
+        x = self.extract_feat(img)
+        outs = self.bbox_head.forward(x)
+        # get shape as tensor
+        img_shape = torch._shape_as_tensor(img)[2:]
+        img_metas[0]['img_shape_for_onnx'] = img_shape
+
+        det_bboxes, det_labels = self.bbox_head.onnx_export(*outs, img_metas)
+
+        return det_bboxes, det_labels
diff --git a/mmdet/models/detectors/yolof.py b/mmdet/models/detectors/yolof.py
new file mode 100755
index 0000000..2bc4f1a
--- /dev/null
+++ b/mmdet/models/detectors/yolof.py
@@ -0,0 +1,20 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class YOLOF(SingleStageDetector):
+    r"""Implementation of `You Only Look One-level Feature
+    <https://arxiv.org/abs/2103.09460>`_"""
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 bbox_head,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super(YOLOF, self).__init__(backbone, neck, bbox_head, train_cfg,
+                                    test_cfg, pretrained, init_cfg)
diff --git a/mmdet/models/detectors/yolox.py b/mmdet/models/detectors/yolox.py
new file mode 100755
index 0000000..34d51b1
--- /dev/null
+++ b/mmdet/models/detectors/yolox.py
@@ -0,0 +1,136 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import random
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from mmcv.runner import get_dist_info
+
+from ...utils import log_img_scale
+from ..builder import DETECTORS
+from .single_stage import SingleStageDetector
+
+
+@DETECTORS.register_module()
+class YOLOX(SingleStageDetector):
+    r"""Implementation of `YOLOX: Exceeding YOLO Series in 2021
+    <https://arxiv.org/abs/2107.08430>`_
+
+    Note: Considering the trade-off between training speed and accuracy,
+    multi-scale training is temporarily kept. More elegant implementation
+    will be adopted in the future.
+
+    Args:
+        backbone (nn.Module): The backbone module.
+        neck (nn.Module): The neck module.
+        bbox_head (nn.Module): The bbox head module.
+        train_cfg (:obj:`ConfigDict`, optional): The training config
+            of YOLOX. Default: None.
+        test_cfg (:obj:`ConfigDict`, optional): The testing config
+            of YOLOX. Default: None.
+        pretrained (str, optional): model pretrained path.
+            Default: None.
+        input_size (tuple): The model default input image size. The shape
+            order should be (height, width). Default: (640, 640).
+        size_multiplier (int): Image size multiplication factor.
+            Default: 32.
+        random_size_range (tuple): The multi-scale random range during
+            multi-scale training. The real training image size will
+            be multiplied by size_multiplier. Default: (15, 25).
+        random_size_interval (int): The interval, in iterations, at which
+            the training image size is re-sampled. Default: 10.
+        init_cfg (dict, optional): Initialization config dict.
+            Default: None.
+    """
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 bbox_head,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 input_size=(640, 640),
+                 size_multiplier=32,
+                 random_size_range=(15, 25),
+                 random_size_interval=10,
+                 init_cfg=None):
+        super(YOLOX, self).__init__(backbone, neck, bbox_head, train_cfg,
+                                    test_cfg, pretrained, init_cfg)
+        log_img_scale(input_size, skip_square=True)
+        self.rank, self.world_size = get_dist_info()
+        self._default_input_size = input_size
+        self._input_size = input_size
+        self._random_size_range = random_size_range
+        self._random_size_interval = random_size_interval
+        self._size_multiplier = size_multiplier
+        self._progress_in_iter = 0
+
+    def forward_train(self,
+                      img,
+                      img_metas,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None):
+        """
+        Args:
+            img (Tensor): Input images of shape (N, C, H, W).
+                Typically these should be mean centered and std scaled.
+            img_metas (list[dict]): A List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                :class:`mmdet.datasets.pipelines.Collect`.
+            gt_bboxes (list[Tensor]): Each item holds the ground-truth boxes
+                of one image in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box
+            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
+                boxes can be ignored when computing the loss.
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        # Multi-scale training
+        img, gt_bboxes = self._preprocess(img, gt_bboxes)
+
+        losses = super(YOLOX, self).forward_train(img, img_metas, gt_bboxes,
+                                                  gt_labels, gt_bboxes_ignore)
+
+        # random resizing
+        if (self._progress_in_iter + 1) % self._random_size_interval == 0:
+            self._input_size = self._random_resize(device=img.device)
+        self._progress_in_iter += 1
+
+        return losses
+
+    def _preprocess(self, img, gt_bboxes):
+        scale_y = self._input_size[0] / self._default_input_size[0]
+        scale_x = self._input_size[1] / self._default_input_size[1]
+        if scale_x != 1 or scale_y != 1:
+            img = F.interpolate(
+                img,
+                size=self._input_size,
+                mode='bilinear',
+                align_corners=False)
+            for gt_bbox in gt_bboxes:
+                gt_bbox[..., 0::2] = gt_bbox[..., 0::2] * scale_x
+                gt_bbox[..., 1::2] = gt_bbox[..., 1::2] * scale_y
+        return img, gt_bboxes
+
+    def _random_resize(self, device):
+        tensor = torch.LongTensor(2).to(device)
+
+        if self.rank == 0:
+            size = random.randint(*self._random_size_range)
+            aspect_ratio = float(
+                self._default_input_size[1]) / self._default_input_size[0]
+            size = (self._size_multiplier * size,
+                    self._size_multiplier * int(aspect_ratio * size))
+            tensor[0] = size[0]
+            tensor[1] = size[1]
+
+        if self.world_size > 1:
+            dist.barrier()
+            dist.broadcast(tensor, 0)
+
+        input_size = (tensor[0].item(), tensor[1].item())
+        return input_size
diff --git a/mmdet/models/losses/__init__.py b/mmdet/models/losses/__init__.py
new file mode 100755
index 0000000..068a54d
--- /dev/null
+++ b/mmdet/models/losses/__init__.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .accuracy import Accuracy, accuracy
+from .ae_loss import AssociativeEmbeddingLoss
+from .balanced_l1_loss import BalancedL1Loss, balanced_l1_loss
+from .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy,
+                                 cross_entropy, mask_cross_entropy)
+from .dice_loss import DiceLoss
+from .focal_loss import FocalLoss, sigmoid_focal_loss
+from .gaussian_focal_loss import GaussianFocalLoss
+from .gfocal_loss import DistributionFocalLoss, QualityFocalLoss
+from .ghm_loss import GHMC, GHMR
+from .iou_loss import (BoundedIoULoss, CIoULoss, DIoULoss, GIoULoss, IoULoss,
+                       bounded_iou_loss, iou_loss)
+from .kd_loss import KnowledgeDistillationKLDivLoss
+from .mse_loss import MSELoss, mse_loss
+from .pisa_loss import carl_loss, isr_p
+from .seesaw_loss import SeesawLoss
+from .smooth_l1_loss import L1Loss, SmoothL1Loss, l1_loss, smooth_l1_loss
+from .utils import reduce_loss, weight_reduce_loss, weighted_loss
+from .varifocal_loss import VarifocalLoss
+
+__all__ = [
+    'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy',
+    'mask_cross_entropy', 'CrossEntropyLoss', 'sigmoid_focal_loss',
+    'FocalLoss', 'smooth_l1_loss', 'SmoothL1Loss', 'balanced_l1_loss',
+    'BalancedL1Loss', 'mse_loss', 'MSELoss', 'iou_loss', 'bounded_iou_loss',
+    'IoULoss', 'BoundedIoULoss', 'GIoULoss', 'DIoULoss', 'CIoULoss', 'GHMC',
+    'GHMR', 'reduce_loss', 'weight_reduce_loss', 'weighted_loss', 'L1Loss',
+    'l1_loss', 'isr_p', 'carl_loss', 'AssociativeEmbeddingLoss',
+    'GaussianFocalLoss', 'QualityFocalLoss', 'DistributionFocalLoss',
+    'VarifocalLoss', 'KnowledgeDistillationKLDivLoss', 'SeesawLoss', 'DiceLoss'
+]
diff --git a/mmdet/models/losses/accuracy.py b/mmdet/models/losses/accuracy.py
new file mode 100755
index 0000000..fe765a3
--- /dev/null
+++ b/mmdet/models/losses/accuracy.py
@@ -0,0 +1,79 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch.nn as nn
+
+
+@mmcv.jit(coderize=True)
+def accuracy(pred, target, topk=1, thresh=None):
+    """Calculate accuracy according to the prediction and target.
+
+    Args:
+        pred (torch.Tensor): The model prediction, shape (N, num_class)
+        target (torch.Tensor): The target of each prediction, shape (N, )
+        topk (int | tuple[int], optional): If the predictions in ``topk``
+            matches the target, the predictions will be regarded as
+            correct ones. Defaults to 1.
+        thresh (float, optional): If not None, predictions with scores under
+            this threshold are considered incorrect. Default to None.
+
+    Returns:
+        float | tuple[float]: If the input ``topk`` is a single integer,
+            the function will return a single float as accuracy. If
+            ``topk`` is a tuple containing multiple integers, the
+            function will return a tuple containing accuracies of
+            each ``topk`` number.
+    """
+    assert isinstance(topk, (int, tuple))
+    if isinstance(topk, int):
+        topk = (topk, )
+        return_single = True
+    else:
+        return_single = False
+
+    maxk = max(topk)
+    if pred.size(0) == 0:
+        accu = [pred.new_tensor(0.) for i in range(len(topk))]
+        return accu[0] if return_single else accu
+    assert pred.ndim == 2 and target.ndim == 1
+    assert pred.size(0) == target.size(0)
+    assert maxk <= pred.size(1), \
+        f'maxk {maxk} exceeds pred dimension {pred.size(1)}'
+    pred_value, pred_label = pred.topk(maxk, dim=1)
+    pred_label = pred_label.t()  # transpose to shape (maxk, N)
+    correct = pred_label.eq(target.view(1, -1).expand_as(pred_label))
+    if thresh is not None:
+        # Only prediction values larger than thresh are counted as correct
+        correct = correct & (pred_value > thresh).t()
+    res = []
+    for k in topk:
+        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
+        res.append(correct_k.mul_(100.0 / pred.size(0)))
+    return res[0] if return_single else res
+
+
+class Accuracy(nn.Module):
+
+    def __init__(self, topk=(1, ), thresh=None):
+        """Module to calculate the accuracy.
+
+        Args:
+            topk (tuple, optional): The criterion used to calculate the
+                accuracy. Defaults to (1,).
+            thresh (float, optional): If not None, predictions with scores
+                under this threshold are considered incorrect. Default to None.
+        """
+        super().__init__()
+        self.topk = topk
+        self.thresh = thresh
+
+    def forward(self, pred, target):
+        """Forward function to calculate accuracy.
+
+        Args:
+            pred (torch.Tensor): Prediction of models.
+            target (torch.Tensor): Target for each prediction.
+
+        Returns:
+            tuple[float]: The accuracies under different topk criteria.
+        """
+        return accuracy(pred, target, self.topk, self.thresh)
diff --git a/mmdet/models/losses/ae_loss.py b/mmdet/models/losses/ae_loss.py
new file mode 100755
index 0000000..5c6da22
--- /dev/null
+++ b/mmdet/models/losses/ae_loss.py
@@ -0,0 +1,103 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..builder import LOSSES
+
+
+@mmcv.jit(derivate=True, coderize=True)
+def ae_loss_per_image(tl_preds, br_preds, match):
+    """Associative Embedding Loss in one image.
+
+    Associative Embedding Loss including two parts: pull loss and push loss.
+    Pull loss makes embedding vectors from same object closer to each other.
+    Push loss distinguishes embedding vectors from different objects, and
+        makes the gap between them large enough.
+
+    During computing, usually there are 3 cases:
+        - no object in image: both pull loss and push loss will be 0.
+        - one object in image: push loss will be 0 and pull loss is computed
+            by the two corners of the only object.
+        - more than one object in image: pull loss is computed by corner pairs
+            from each object, push loss is computed by each object with all
+            other objects. We use confusion matrix with 0 in diagonal to
+            compute the push loss.
+
+    Args:
+        tl_preds (tensor): Embedding feature map of top-left corner.
+        br_preds (tensor): Embedding feature map of bottom-right corner.
+        match (list): Downsampled coordinates pair of each ground truth box.
+    """
+
+    tl_list, br_list, me_list = [], [], []
+    if len(match) == 0:  # no object in image
+        pull_loss = tl_preds.sum() * 0.
+        push_loss = tl_preds.sum() * 0.
+    else:
+        for m in match:
+            [tl_y, tl_x], [br_y, br_x] = m
+            tl_e = tl_preds[:, tl_y, tl_x].view(-1, 1)
+            br_e = br_preds[:, br_y, br_x].view(-1, 1)
+            tl_list.append(tl_e)
+            br_list.append(br_e)
+            me_list.append((tl_e + br_e) / 2.0)
+
+        tl_list = torch.cat(tl_list)
+        br_list = torch.cat(br_list)
+        me_list = torch.cat(me_list)
+
+        assert tl_list.size() == br_list.size()
+
+        # N is object number in image, M is dimension of embedding vector
+        N, M = tl_list.size()
+
+        pull_loss = (tl_list - me_list).pow(2) + (br_list - me_list).pow(2)
+        pull_loss = pull_loss.sum() / N
+
+        margin = 1  # exp setting of CornerNet, details in section 3.3 of paper
+
+        # confusion matrix of push loss
+        conf_mat = me_list.expand((N, N, M)).permute(1, 0, 2) - me_list
+        conf_weight = 1 - torch.eye(N).type_as(me_list)
+        conf_mat = conf_weight * (margin - conf_mat.sum(-1).abs())
+
+        if N > 1:  # more than one object in current image
+            push_loss = F.relu(conf_mat).sum() / (N * (N - 1))
+        else:
+            push_loss = tl_preds.sum() * 0.
+
+    return pull_loss, push_loss
+
+
+@LOSSES.register_module()
+class AssociativeEmbeddingLoss(nn.Module):
+    """Associative Embedding Loss.
+
+    More details can be found in
+    `Associative Embedding <https://arxiv.org/abs/1611.05424>`_ and
+    `CornerNet <https://arxiv.org/abs/1808.01244>`_ .
+    Code is modified from `kp_utils.py <https://github.com/princeton-vl/CornerNet/blob/master/models/py_utils/kp_utils.py#L180>`_  # noqa: E501
+
+    Args:
+        pull_weight (float): Loss weight for corners from same object.
+        push_weight (float): Loss weight for corners from different object.
+    """
+
+    def __init__(self, pull_weight=0.25, push_weight=0.25):
+        super(AssociativeEmbeddingLoss, self).__init__()
+        self.pull_weight = pull_weight
+        self.push_weight = push_weight
+
+    def forward(self, pred, target, match):
+        """Forward function."""
+        batch = pred.size(0)
+        pull_all, push_all = 0.0, 0.0
+        for i in range(batch):
+            pull, push = ae_loss_per_image(pred[i], target[i], match[i])
+
+            pull_all += self.pull_weight * pull
+            push_all += self.push_weight * push
+
+        return pull_all, push_all
diff --git a/mmdet/models/losses/balanced_l1_loss.py b/mmdet/models/losses/balanced_l1_loss.py
new file mode 100755
index 0000000..8500345
--- /dev/null
+++ b/mmdet/models/losses/balanced_l1_loss.py
@@ -0,0 +1,124 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import torch
+import torch.nn as nn
+
+from ..builder import LOSSES
+from .utils import weighted_loss
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def balanced_l1_loss(pred,
+                     target,
+                     beta=1.0,
+                     alpha=0.5,
+                     gamma=1.5,
+                     reduction='mean'):
+    """Calculate balanced L1 loss.
+
+    Please see the `Libra R-CNN <https://arxiv.org/pdf/1904.02701.pdf>`_
+
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, 4).
+        target (torch.Tensor): The learning target of the prediction with
+            shape (N, 4).
+        beta (float): The loss is a piecewise function of prediction and target
+            and ``beta`` serves as a threshold for the difference between the
+            prediction and target. Defaults to 1.0.
+        alpha (float): The denominator ``alpha`` in the balanced L1 loss.
+            Defaults to 0.5.
+        gamma (float): The ``gamma`` in the balanced L1 loss.
+            Defaults to 1.5.
+        reduction (str, optional): The method that reduces the loss to a
+            scalar. Options are "none", "mean" and "sum".
+
+    Returns:
+        torch.Tensor: The calculated loss
+    """
+    assert beta > 0
+    if target.numel() == 0:
+        return pred.sum() * 0
+
+    assert pred.size() == target.size()
+
+    diff = torch.abs(pred - target)
+    b = np.e**(gamma / alpha) - 1
+    loss = torch.where(
+        diff < beta, alpha / b *
+        (b * diff + 1) * torch.log(b * diff / beta + 1) - alpha * diff,
+        gamma * diff + gamma / b - alpha * beta)
+
+    return loss
+
+
+@LOSSES.register_module()
+class BalancedL1Loss(nn.Module):
+    """Balanced L1 Loss.
+
+    arXiv: https://arxiv.org/pdf/1904.02701.pdf (CVPR 2019)
+
+    Args:
+        alpha (float): The denominator ``alpha`` in the balanced L1 loss.
+            Defaults to 0.5.
+        gamma (float): The ``gamma`` in the balanced L1 loss. Defaults to 1.5.
+        beta (float, optional): The loss is a piecewise function of prediction
+            and target. ``beta`` serves as a threshold for the difference
+            between the prediction and target. Defaults to 1.0.
+        reduction (str, optional): The method that reduces the loss to a
+            scalar. Options are "none", "mean" and "sum".
+        loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+    """
+
+    def __init__(self,
+                 alpha=0.5,
+                 gamma=1.5,
+                 beta=1.0,
+                 reduction='mean',
+                 loss_weight=1.0):
+        super(BalancedL1Loss, self).__init__()
+        self.alpha = alpha
+        self.gamma = gamma
+        self.beta = beta
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None,
+                **kwargs):
+        """Forward function of loss.
+
+        Args:
+            pred (torch.Tensor): The prediction with shape (N, 4).
+            target (torch.Tensor): The learning target of the prediction with
+                shape (N, 4).
+            weight (torch.Tensor, optional): Sample-wise loss weight with
+                shape (N, ).
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Options are "none", "mean" and "sum".
+
+        Returns:
+            torch.Tensor: The calculated loss
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss_bbox = self.loss_weight * balanced_l1_loss(
+            pred,
+            target,
+            weight,
+            alpha=self.alpha,
+            gamma=self.gamma,
+            beta=self.beta,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+        return loss_bbox
diff --git a/mmdet/models/losses/cross_entropy_loss.py b/mmdet/models/losses/cross_entropy_loss.py
new file mode 100755
index 0000000..41411fc
--- /dev/null
+++ b/mmdet/models/losses/cross_entropy_loss.py
@@ -0,0 +1,301 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..builder import LOSSES
+from .utils import weight_reduce_loss
+
+
+def cross_entropy(pred,
+                  label,
+                  weight=None,
+                  reduction='mean',
+                  avg_factor=None,
+                  class_weight=None,
+                  ignore_index=-100,
+                  avg_non_ignore=False):
+    """Softmax cross entropy with sample weights and custom averaging.
+
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, C), C is the number
+            of classes.
+        label (torch.Tensor): The learning label of the prediction.
+        weight (torch.Tensor, optional): Sample-wise loss weight.
+        reduction (str, optional): The method used to reduce the loss.
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+        class_weight (torch.Tensor, optional): Per-class weight, passed to
+            ``F.cross_entropy`` as ``weight``. Defaults to None.
+        ignore_index (int | None): The label index to be ignored.
+            None falls back to -100. Default: -100.
+        avg_non_ignore (bool): With 'mean' reduction and no explicit
+            ``avg_factor``, average over non-ignored labels only.
+
+    Returns:
+        torch.Tensor: The calculated loss
+    """
+    # None selects the same default as F.cross_entropy.
+    if ignore_index is None:
+        ignore_index = -100
+
+    # Unreduced, element-wise loss.
+    per_sample = F.cross_entropy(
+        pred,
+        label,
+        weight=class_weight,
+        reduction='none',
+        ignore_index=ignore_index)
+
+    # PyTorch's own 'mean' divides by the number of non-ignored elements:
+    # refer to https://github.com/pytorch/pytorch/blob/56b43f4fec1f76953f15a627694d4bba34588969/torch/nn/functional.py#L2660  # noqa
+    if avg_factor is None and avg_non_ignore and reduction == 'mean':
+        num_ignored = (label == ignore_index).sum().item()
+        avg_factor = label.numel() - num_ignored
+
+    # Cast the sample weights to float, then weight and reduce.
+    sample_weight = None if weight is None else weight.float()
+    return weight_reduce_loss(
+        per_sample, sample_weight, reduction=reduction, avg_factor=avg_factor)
+
+
+def _expand_onehot_labels(labels, label_weights, label_channels, ignore_index):
+    """Expand class-index labels into one-hot targets and matching weights."""
+    num_samples = labels.size(0)
+    onehot = labels.new_full((num_samples, label_channels), 0)
+    # Negative labels and ``ignore_index`` carry no supervision.
+    keep = (labels >= 0) & (labels != ignore_index)
+    # Rows to switch on: kept labels that address an existing channel.
+    hot_rows = torch.nonzero(keep & (labels < label_channels), as_tuple=False)
+    if hot_rows.numel() > 0:
+        onehot[hot_rows, labels[hot_rows]] = 1
+
+    # Spread the per-sample mask across all channels, as float.
+    valid_mask = keep.view(-1, 1).expand(num_samples, label_channels).float()
+    if label_weights is None:
+        onehot_weights = valid_mask
+    else:
+        onehot_weights = label_weights.view(-1, 1).repeat(1, label_channels)
+        onehot_weights *= valid_mask
+    return onehot, onehot_weights, valid_mask
+
+
+def binary_cross_entropy(pred,
+                         label,
+                         weight=None,
+                         reduction='mean',
+                         avg_factor=None,
+                         class_weight=None,
+                         ignore_index=-100,
+                         avg_non_ignore=False):
+    """Sigmoid (binary) cross entropy computed on logits.
+
+    Args:
+        pred (torch.Tensor): Logits. If ``pred`` has a different number of
+            dimensions than ``label``, the labels are treated as class
+            indices and expanded to one-hot over ``pred.size(-1)``
+            channels; otherwise ``label`` is used as the target directly.
+        label (torch.Tensor): Class indices, or targets with the same
+            number of dimensions as ``pred``.
+        weight (torch.Tensor, optional): Sample-wise loss weight.
+        reduction (str, optional): The method used to reduce the loss.
+            Options are "none", "mean" and "sum".
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+        class_weight (torch.Tensor, optional): Forwarded as ``pos_weight``.
+        ignore_index (int | None): The label index to be ignored.
+            None falls back to -100. Default: -100.
+        avg_non_ignore (bool): With 'mean' reduction and no explicit
+            ``avg_factor``, divide by the number of valid elements.
+
+    Returns:
+        torch.Tensor: The calculated loss.
+    """
+    # None selects the same default as F.cross_entropy.
+    if ignore_index is None:
+        ignore_index = -100
+
+    if pred.dim() == label.dim():
+        # Dense targets: zero the weight of negative / ignored entries.
+        valid_mask = ((label >= 0) & (label != ignore_index)).float()
+        # Multiply out of place: an in-place product raises a broadcast
+        # shape error when weight and valid_mask differ in shape, e.g.
+        # (B,N,1) against (B,N,C).
+        weight = valid_mask if weight is None else weight * valid_mask
+    else:
+        # Index targets: build one-hot labels plus matching weights/mask.
+        label, weight, valid_mask = _expand_onehot_labels(
+            label, weight, pred.size(-1), ignore_index)
+
+    # Optionally average over the valid (non-ignored) elements only.
+    if avg_factor is None and avg_non_ignore and reduction == 'mean':
+        avg_factor = valid_mask.sum().item()
+
+    # Element-wise loss first, then weighting and reduction together.
+    weight = weight.float()
+    elementwise = F.binary_cross_entropy_with_logits(
+        pred, label.float(), pos_weight=class_weight, reduction='none')
+
+    return weight_reduce_loss(
+        elementwise,
+        weight,
+        reduction=reduction,
+        avg_factor=avg_factor)
+
+
+def mask_cross_entropy(pred,
+                       target,
+                       label,
+                       reduction='mean',
+                       avg_factor=None,
+                       class_weight=None,
+                       ignore_index=None,
+                       **kwargs):
+    """Binary cross entropy between per-class mask logits and mask targets.
+
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, C, *), C is the
+            number of classes. The trailing * indicates arbitrary shape.
+        target (torch.Tensor): The learning label of the prediction.
+        label (torch.Tensor): Class label of the object each mask belongs
+            to. It selects, for every sample, the channel of ``pred``
+            that is compared with ``target`` when the mask prediction is
+            not class-agnostic.
+        reduction (str, optional): Reserved; only 'mean' is accepted.
+        avg_factor (int, optional): Reserved; must be None.
+        class_weight (torch.Tensor, optional): Forwarded as ``weight``.
+        ignore_index (None): Placeholder, to be consistent with other loss.
+            Default: None.
+
+    Returns:
+        torch.Tensor: The calculated loss
+
+    Example:
+        >>> N, C = 3, 11
+        >>> H, W = 2, 2
+        >>> pred = torch.randn(N, C, H, W) * 1000
+        >>> target = torch.rand(N, H, W)
+        >>> label = torch.randint(0, C, size=(N,))
+        >>> reduction = 'mean'
+        >>> avg_factor = None
+        >>> class_weights = None
+        >>> loss = mask_cross_entropy(pred, target, label, reduction,
+        >>>                           avg_factor, class_weights)
+        >>> assert loss.shape == (1,)
+    """
+    assert ignore_index is None, 'BCE loss does not support ignore_index'
+    # TODO: handle these two reserved arguments
+    assert reduction == 'mean' and avg_factor is None
+    # Pair sample i with channel label[i] to pick its class-specific mask.
+    roi_inds = torch.arange(
+        0, pred.size()[0], dtype=torch.long, device=pred.device)
+    picked = pred[roi_inds, label].squeeze(1)
+    bce = F.binary_cross_entropy_with_logits(
+        picked, target, weight=class_weight, reduction='mean')
+    return bce[None]
+
+
+@LOSSES.register_module()
+class CrossEntropyLoss(nn.Module):
+
+    def __init__(self,
+                 use_sigmoid=False,
+                 use_mask=False,
+                 reduction='mean',
+                 class_weight=None,
+                 ignore_index=None,
+                 loss_weight=1.0,
+                 avg_non_ignore=False):
+        """Configurable cross entropy loss module.
+
+        Args:
+            use_sigmoid (bool, optional): Whether the prediction uses sigmoid
+                or softmax. Defaults to False.
+            use_mask (bool, optional): Whether to use mask cross entropy loss.
+                Cannot be combined with ``use_sigmoid``. Defaults to False.
+            reduction (str, optional): The default reduction method.
+                Options are "none", "mean" and "sum". Defaults to 'mean'.
+            class_weight (list[float], optional): Weight of each class.
+                Defaults to None.
+            ignore_index (int | None): The label index to be ignored.
+                Defaults to None.
+            loss_weight (float, optional): Weight of the loss. Defaults to 1.0.
+            avg_non_ignore (bool): Whether the loss is only averaged over
+                non-ignored targets. Default: False.
+        """
+        super(CrossEntropyLoss, self).__init__()
+        assert (use_sigmoid is False) or (use_mask is False)
+        self.use_sigmoid = use_sigmoid
+        self.use_mask = use_mask
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.class_weight = class_weight
+        self.ignore_index = ignore_index
+        self.avg_non_ignore = avg_non_ignore
+
+        # Warn when ignored labels would still count towards the mean.
+        averages_ignored = not avg_non_ignore and reduction == 'mean'
+        if ignore_index is not None and averages_ignored:
+            warnings.warn(
+                'Default ``avg_non_ignore`` is False, if you would like to '
+                'ignore the certain label and average loss over non-ignore '
+                'labels, which is the same with PyTorch official '
+                'cross_entropy, set ``avg_non_ignore=True``.')
+
+        # Pick the functional loss; plain softmax is the fallback.
+        self.cls_criterion = cross_entropy
+        if self.use_sigmoid:
+            self.cls_criterion = binary_cross_entropy
+        elif self.use_mask:
+            self.cls_criterion = mask_cross_entropy
+
+    def extra_repr(self):
+        """Extra repr."""
+        return f'avg_non_ignore={self.avg_non_ignore}'
+
+    def forward(self,
+                cls_score,
+                label,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None,
+                ignore_index=None,
+                **kwargs):
+        """Forward function.
+
+        Args:
+            cls_score (torch.Tensor): The prediction.
+            label (torch.Tensor): The learning label of the prediction.
+            weight (torch.Tensor, optional): Sample-wise loss weight.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): Overrides the configured
+                reduction. Options are "none", "mean" and "sum".
+            ignore_index (int | None): The label index to be ignored.
+                If not None, it will override the default value. Default: None.
+        Returns:
+            torch.Tensor: The calculated loss.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = reduction_override or self.reduction
+        if ignore_index is None:
+            ignore_index = self.ignore_index
+
+        # Put the configured class weights on the prediction's device/dtype.
+        class_weight = None
+        if self.class_weight is not None:
+            class_weight = cls_score.new_tensor(
+                self.class_weight, device=cls_score.device)
+        unscaled = self.cls_criterion(
+            cls_score,
+            label,
+            weight,
+            class_weight=class_weight,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            ignore_index=ignore_index,
+            avg_non_ignore=self.avg_non_ignore,
+            **kwargs)
+        return self.loss_weight * unscaled
diff --git a/mmdet/models/losses/dice_loss.py b/mmdet/models/losses/dice_loss.py
new file mode 100755
index 0000000..585beea
--- /dev/null
+++ b/mmdet/models/losses/dice_loss.py
@@ -0,0 +1,146 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+
+from ..builder import LOSSES
+from .utils import weight_reduce_loss
+
+
+def dice_loss(pred,
+              target,
+              weight=None,
+              eps=1e-3,
+              reduction='mean',
+              naive_dice=False,
+              avg_factor=None):
+    """Dice loss between predictions and targets, in one of two forms:
+
+        - the V-Net form from `V-Net: Fully Convolutional Neural
+            Networks for Volumetric Medical Image Segmentation
+            <https://arxiv.org/abs/1606.04797>`_, whose denominator sums
+            the squared entries of both inputs.
+        - the naive form, whose denominator sums the entries themselves
+            (first power instead of second power).
+
+    Args:
+        pred (torch.Tensor): The prediction, has a shape (n, *)
+        target (torch.Tensor): The learning label of the prediction,
+            shape (n, *), same shape of pred.
+        weight (torch.Tensor, optional): The weight of loss for each
+            prediction, has a shape (n,). Defaults to None.
+        eps (float): Avoid dividing by zero. Default: 1e-3.
+        reduction (str, optional): The method used to reduce the loss into
+            a scalar. Defaults to 'mean'.
+            Options are "none", "mean" and "sum".
+        naive_dice (bool, optional): If False, use the V-Net form,
+            otherwise use the naive form. Defaults to False.
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+
+    Returns:
+        torch.Tensor: The calculated loss.
+    """
+    # Collapse everything after the first dimension: one row per sample.
+    flat_pred = pred.flatten(1)
+    flat_target = target.flatten(1).float()
+
+    overlap = torch.sum(flat_pred * flat_target, 1)
+    if naive_dice:
+        pred_mass = torch.sum(flat_pred, 1)
+        target_mass = torch.sum(flat_target, 1)
+        dice = (2 * overlap + eps) / (pred_mass + target_mass + eps)
+    else:
+        # eps is added to each squared sum, so it enters the total twice.
+        pred_mass = torch.sum(flat_pred * flat_pred, 1) + eps
+        target_mass = torch.sum(flat_target * flat_target, 1) + eps
+        dice = (2 * overlap) / (pred_mass + target_mass)
+
+    loss = 1 - dice
+    if weight is not None:
+        assert weight.ndim == loss.ndim
+        assert len(weight) == len(pred)
+    return weight_reduce_loss(loss, weight, reduction, avg_factor)
+
+
+@LOSSES.register_module()
+class DiceLoss(nn.Module):
+
+    def __init__(self,
+                 use_sigmoid=True,
+                 activate=True,
+                 reduction='mean',
+                 naive_dice=False,
+                 loss_weight=1.0,
+                 eps=1e-3):
+        """Dice loss module.
+
+        Args:
+            use_sigmoid (bool, optional): Whether the prediction is
+                activated with sigmoid (the only supported activation)
+                or softmax. Defaults to True.
+            activate (bool): Whether ``forward`` applies the activation
+                to the prediction itself; pass False when the prediction
+                is already activated. Defaults to True.
+            reduction (str, optional): The method used
+                to reduce the loss. Options are "none",
+                "mean" and "sum". Defaults to 'mean'.
+            naive_dice (bool, optional): If False, use the dice loss
+                defined in the V-Net paper; otherwise use the naive dice
+                loss whose denominator uses the first power instead of
+                the second power. Defaults to False.
+            loss_weight (float, optional): Weight of loss. Defaults to 1.0.
+            eps (float): Avoid dividing by zero. Defaults to 1e-3.
+        """
+
+        super(DiceLoss, self).__init__()
+        self.use_sigmoid = use_sigmoid
+        self.activate = activate
+        self.reduction = reduction
+        self.naive_dice = naive_dice
+        self.loss_weight = loss_weight
+        self.eps = eps
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                reduction_override=None,
+                avg_factor=None):
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): The prediction, has a shape (n, *).
+            target (torch.Tensor): The label of the prediction,
+                shape (n, *), same shape of pred.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction, has a shape (n,). Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Options are "none", "mean" and "sum".
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+
+        Returns:
+            torch.Tensor: The calculated loss
+
+        Raises:
+            NotImplementedError: If ``activate`` is set while
+                ``use_sigmoid`` is not.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = reduction_override or self.reduction
+
+        if self.activate:
+            if not self.use_sigmoid:
+                raise NotImplementedError
+            pred = pred.sigmoid()
+
+        unscaled = dice_loss(
+            pred,
+            target,
+            weight,
+            eps=self.eps,
+            reduction=reduction,
+            naive_dice=self.naive_dice,
+            avg_factor=avg_factor)
+        return self.loss_weight * unscaled
diff --git a/mmdet/models/losses/focal_loss.py b/mmdet/models/losses/focal_loss.py
new file mode 100755
index 0000000..2858c19
--- /dev/null
+++ b/mmdet/models/losses/focal_loss.py
@@ -0,0 +1,244 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.ops import sigmoid_focal_loss as _sigmoid_focal_loss
+
+from ..builder import LOSSES
+from .utils import weight_reduce_loss
+
+
+# This method is only for debugging
+def py_sigmoid_focal_loss(pred,
+                          target,
+                          weight=None,
+                          gamma=2.0,
+                          alpha=0.25,
+                          reduction='mean',
+                          avg_factor=None):
+    """PyTorch version of `Focal Loss <https://arxiv.org/abs/1708.02002>`_.
+
+    Args:
+        pred (torch.Tensor): The prediction (logits) with shape (N, C),
+            C is the number of classes
+        target (torch.Tensor): The learning target of the prediction;
+            it is cast to the dtype of ``pred``.
+        weight (torch.Tensor, optional): Loss weight, either per sample
+            or per element (possibly flattened).
+        gamma (float, optional): The gamma for calculating the modulating
+            factor. Defaults to 2.0.
+        alpha (float, optional): A balanced form for Focal Loss.
+            Defaults to 0.25.
+        reduction (str, optional): The method used to reduce the loss into
+            a scalar. Defaults to 'mean'.
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+    """
+    prob = pred.sigmoid()
+    target = target.type_as(pred)
+    # Probability mass on the wrong side: 1 - p for positives, p otherwise.
+    miss = (1 - prob) * target + prob * (1 - target)
+    balance = alpha * target + (1 - alpha) * (1 - target)
+    focal_weight = balance * miss.pow(gamma)
+    bce = F.binary_cross_entropy_with_logits(pred, target, reduction='none')
+    loss = bce * focal_weight
+    if weight is not None and weight.shape != loss.shape:
+        if weight.size(0) == loss.size(0):
+            # Common case: weight is (num_priors, ) with no class axis.
+            weight = weight.view(-1, 1)
+        else:
+            # Per-anchor-per-class weight (e.g. in FSAF) may arrive
+            # flattened as (num_priors x num_class, ) while loss is still
+            # (num_priors, num_class).
+            assert weight.numel() == loss.numel()
+            weight = weight.view(loss.size(0), -1)
+    if weight is not None:
+        assert weight.ndim == loss.ndim
+    return weight_reduce_loss(loss, weight, reduction, avg_factor)
+
+
+def py_focal_loss_with_prob(pred,
+                            target,
+                            weight=None,
+                            gamma=2.0,
+                            alpha=0.25,
+                            reduction='mean',
+                            avg_factor=None):
+    """PyTorch version of `Focal Loss <https://arxiv.org/abs/1708.02002>`_.
+    Different from `py_sigmoid_focal_loss`, this function accepts probability
+    as input.
+
+    Args:
+        pred (torch.Tensor): The prediction probability with shape (N, C),
+            C is the number of classes.
+        target (torch.Tensor): Class indices; the value C (one past the
+            last class) yields an all-zero target row.
+        weight (torch.Tensor, optional): Loss weight, either per sample
+            or per element (possibly flattened).
+        gamma (float, optional): The gamma for calculating the modulating
+            factor. Defaults to 2.0.
+        alpha (float, optional): A balanced form for Focal Loss.
+            Defaults to 0.25.
+        reduction (str, optional): The method used to reduce the loss into
+            a scalar. Defaults to 'mean'.
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+    """
+    num_classes = pred.size(1)
+    # One extra column absorbs the label ``num_classes``; dropping it
+    # leaves that row all zeros.
+    onehot = F.one_hot(target, num_classes=num_classes + 1)
+    target = onehot[:, :num_classes].type_as(pred)
+
+    miss = (1 - pred) * target + pred * (1 - target)
+    balance = alpha * target + (1 - alpha) * (1 - target)
+    focal_weight = balance * miss.pow(gamma)
+    bce = F.binary_cross_entropy(pred, target, reduction='none')
+    loss = bce * focal_weight
+    if weight is not None and weight.shape != loss.shape:
+        if weight.size(0) == loss.size(0):
+            # Common case: weight is (num_priors, ) with no class axis.
+            weight = weight.view(-1, 1)
+        else:
+            # Per-anchor-per-class weight (e.g. in FSAF) may arrive
+            # flattened as (num_priors x num_class, ) while loss is still
+            # (num_priors, num_class).
+            assert weight.numel() == loss.numel()
+            weight = weight.view(loss.size(0), -1)
+    if weight is not None:
+        assert weight.ndim == loss.ndim
+    return weight_reduce_loss(loss, weight, reduction, avg_factor)
+
+
+def sigmoid_focal_loss(pred,
+                       target,
+                       weight=None,
+                       gamma=2.0,
+                       alpha=0.25,
+                       reduction='mean',
+                       avg_factor=None):
+    r"""A wrapper of cuda version `Focal Loss
+    <https://arxiv.org/abs/1708.02002>`_.
+
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, C), C is the number
+            of classes.
+        target (torch.Tensor): The learning label of the prediction.
+        weight (torch.Tensor, optional): Sample-wise loss weight.
+        gamma (float, optional): The gamma for calculating the modulating
+            factor. Defaults to 2.0.
+        alpha (float, optional): A balanced form for Focal Loss.
+            Defaults to 0.25.
+        reduction (str, optional): The method used to reduce the loss into
+            a scalar. Defaults to 'mean'. Options are "none", "mean" and "sum".
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+
+    Returns:
+        torch.Tensor: The calculated loss.
+    """
+    # Function.apply does not accept keyword arguments, so the decorator
+    # "weighted_loss" is not applicable and weighting is done by hand.
+    loss = _sigmoid_focal_loss(pred.contiguous(), target.contiguous(), gamma,
+                               alpha, None, 'none')
+    if weight is not None and weight.shape != loss.shape:
+        if weight.size(0) == loss.size(0):
+            # Common case: weight is (num_priors, ) with no class axis.
+            weight = weight.view(-1, 1)
+        else:
+            # Per-anchor-per-class weight (e.g. in FSAF) may arrive
+            # flattened as (num_priors x num_class, ) while loss is still
+            # (num_priors, num_class).
+            assert weight.numel() == loss.numel()
+            weight = weight.view(loss.size(0), -1)
+    if weight is not None:
+        assert weight.ndim == loss.ndim
+    return weight_reduce_loss(loss, weight, reduction, avg_factor)
+
+
+@LOSSES.register_module()
+class FocalLoss(nn.Module):
+
+    def __init__(self,
+                 use_sigmoid=True,
+                 gamma=2.0,
+                 alpha=0.25,
+                 reduction='mean',
+                 loss_weight=1.0,
+                 activated=False):
+        """`Focal Loss <https://arxiv.org/abs/1708.02002>`_
+
+        Args:
+            use_sigmoid (bool, optional): Whether the prediction is
+                used for sigmoid or softmax. Must be True: only the
+                sigmoid variant is implemented. Defaults to True.
+            gamma (float, optional): The gamma for calculating the modulating
+                factor. Defaults to 2.0.
+            alpha (float, optional): A balanced form for Focal Loss.
+                Defaults to 0.25.
+            reduction (str, optional): The method used to reduce the loss into
+                a scalar. Defaults to 'mean'. Options are "none", "mean" and
+                "sum".
+            loss_weight (float, optional): Weight of loss. Defaults to 1.0.
+            activated (bool, optional): Whether the input is activated.
+                If True, the input is treated as probabilities;
+                otherwise as logits. Defaults to False.
+        """
+        super(FocalLoss, self).__init__()
+        assert use_sigmoid is True, 'Only sigmoid focal loss supported now.'
+        self.use_sigmoid = use_sigmoid
+        self.gamma = gamma
+        self.alpha = alpha
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.activated = activated
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning label of the prediction.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Options are "none", "mean" and "sum".
+
+        Returns:
+            torch.Tensor: The calculated loss
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = reduction_override or self.reduction
+        if not self.use_sigmoid:
+            raise NotImplementedError
+        if self.activated:
+            # Probabilities in: this variant one-hot encodes by itself.
+            calculate_loss_func = py_focal_loss_with_prob
+        elif torch.cuda.is_available() and pred.is_cuda:
+            # Logits on a CUDA tensor: hand the labels over unchanged.
+            calculate_loss_func = sigmoid_focal_loss
+        else:
+            # Otherwise use the pure PyTorch path with one-hot targets; the
+            # extra column absorbs the label ``num_classes`` and is dropped.
+            num_classes = pred.size(1)
+            target = F.one_hot(target, num_classes=num_classes + 1)
+            target = target[:, :num_classes]
+            calculate_loss_func = py_sigmoid_focal_loss
+
+        unscaled = calculate_loss_func(
+            pred,
+            target,
+            weight,
+            gamma=self.gamma,
+            alpha=self.alpha,
+            reduction=reduction,
+            avg_factor=avg_factor)
+        return self.loss_weight * unscaled
diff --git a/mmdet/models/losses/gaussian_focal_loss.py b/mmdet/models/losses/gaussian_focal_loss.py
new file mode 100755
index 0000000..7abcb69
--- /dev/null
+++ b/mmdet/models/losses/gaussian_focal_loss.py
@@ -0,0 +1,92 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch.nn as nn
+
+from ..builder import LOSSES
+from .utils import weighted_loss
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def gaussian_focal_loss(pred, gaussian_target, alpha=2.0, gamma=4.0):
+    """`Focal Loss <https://arxiv.org/abs/1708.02002>`_ for targets in gaussian
+    distribution.
+
+    Args:
+        pred (torch.Tensor): The prediction.
+        gaussian_target (torch.Tensor): The learning target of the prediction
+            in gaussian distribution.
+        alpha (float, optional): Power applied to the prediction term.
+            Defaults to 2.0.
+        gamma (float, optional): Power applied to ``1 - gaussian_target``
+            for negative samples. Defaults to 4.0.
+    """
+    eps = 1e-12  # keeps the logarithms finite at pred == 0 or pred == 1
+    is_pos = gaussian_target.eq(1)
+    neg_scale = (1 - gaussian_target).pow(gamma)
+    pos_term = -(pred + eps).log() * (1 - pred).pow(alpha) * is_pos
+    neg_term = -(1 - pred + eps).log() * pred.pow(alpha) * neg_scale
+    return pos_term + neg_term
+
+
+@LOSSES.register_module()
+class GaussianFocalLoss(nn.Module):
+    """Focal loss variant for gaussian heatmap targets.
+
+    More details can be found in the `paper
+    <https://arxiv.org/abs/1808.01244>`_
+    Code is modified from `kp_utils.py
+    <https://github.com/princeton-vl/CornerNet/blob/master/models/py_utils/kp_utils.py#L152>`_  # noqa: E501
+    Please notice that the target in GaussianFocalLoss is a gaussian heatmap,
+    not 0/1 binary target.
+
+    Args:
+        alpha (float): Power of prediction.
+        gamma (float): Power of target for negative samples.
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Loss weight of current loss.
+    """
+
+    def __init__(self,
+                 alpha=2.0,
+                 gamma=4.0,
+                 reduction='mean',
+                 loss_weight=1.0):
+        super(GaussianFocalLoss, self).__init__()
+        self.alpha = alpha
+        self.gamma = gamma
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction
+                in gaussian distribution.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        # A truthy override wins over the configured reduction.
+        reduction = reduction_override or self.reduction
+        unscaled = gaussian_focal_loss(
+            pred,
+            target,
+            weight,
+            alpha=self.alpha,
+            gamma=self.gamma,
+            reduction=reduction,
+            avg_factor=avg_factor)
+        return self.loss_weight * unscaled
diff --git a/mmdet/models/losses/gfocal_loss.py b/mmdet/models/losses/gfocal_loss.py
new file mode 100755
index 0000000..0e8d263
--- /dev/null
+++ b/mmdet/models/losses/gfocal_loss.py
@@ -0,0 +1,245 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..builder import LOSSES
+from .utils import weighted_loss
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def quality_focal_loss(pred, target, beta=2.0):
+    r"""Compute Quality Focal Loss (QFL) on logits, as proposed in
+    `Generalized Focal Loss: Learning Qualified and Distributed Bounding
+    Boxes for Dense Object Detection <https://arxiv.org/abs/2006.04388>`_.
+
+    Args:
+        pred (torch.Tensor): Predicted joint representation of classification
+            and quality (IoU) estimation with shape (N, C), C is the number of
+            classes.
+        target (tuple([torch.Tensor])): Target category label with shape (N,)
+            and target quality label with shape (N,).
+        beta (float): The beta parameter for calculating the modulating factor.
+            Defaults to 2.0.
+
+    Returns:
+        torch.Tensor: Loss tensor with shape (N,).
+    """
+    assert len(target) == 2, """target for QFL must be a tuple of two elements,
+        including category label and quality label, respectively"""
+    # unpack the category ids and the per-sample quality scores
+    label, score = target
+
+    # start by treating every entry as a negative with quality target 0
+    prob = pred.sigmoid()
+    neg_bce = F.binary_cross_entropy_with_logits(
+        pred, prob.new_zeros(pred.shape), reduction='none')
+    loss = neg_bce * prob.pow(beta)
+
+    # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+    num_classes = pred.size(1)
+    is_fg = (label >= 0) & (label < num_classes)
+    pos = is_fg.nonzero().squeeze(1)
+    pos_label = label[pos].long()
+    # foreground entries are overwritten with the IoU-supervised term
+    pos_bce = F.binary_cross_entropy_with_logits(
+        pred[pos, pos_label], score[pos], reduction='none')
+    gap = score[pos] - prob[pos, pos_label]
+    loss[pos, pos_label] = pos_bce * gap.abs().pow(beta)
+
+    # collapse the class dimension -> one value per sample
+    return loss.sum(dim=1, keepdim=False)
+
+
+@weighted_loss
+def quality_focal_loss_with_prob(pred, target, beta=2.0):
+    r"""Compute Quality Focal Loss (QFL) on probabilities, as proposed in
+    `Generalized Focal Loss: Learning Qualified and Distributed Bounding
+    Boxes for Dense Object Detection <https://arxiv.org/abs/2006.04388>`_.
+    Unlike `quality_focal_loss`, the input is expected to be activated
+    already, i.e. it is used as a probability and not as a logit.
+
+    Args:
+        pred (torch.Tensor): Predicted joint representation of classification
+            and quality (IoU) estimation with shape (N, C), C is the number of
+            classes.
+        target (tuple([torch.Tensor])): Target category label with shape (N,)
+            and target quality label with shape (N,).
+        beta (float): The beta parameter for calculating the modulating factor.
+            Defaults to 2.0.
+
+    Returns:
+        torch.Tensor: Loss tensor with shape (N,).
+    """
+    assert len(target) == 2, """target for QFL must be a tuple of two elements,
+        including category label and quality label, respectively"""
+    # unpack the category ids and the per-sample quality scores
+    label, score = target
+
+    # start by treating every entry as a negative with quality target 0
+    prob = pred  # already activated, so no sigmoid is applied here
+    neg_bce = F.binary_cross_entropy(
+        pred, prob.new_zeros(pred.shape), reduction='none')
+    loss = neg_bce * prob.pow(beta)
+
+    # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+    num_classes = pred.size(1)
+    is_fg = (label >= 0) & (label < num_classes)
+    pos = is_fg.nonzero().squeeze(1)
+    pos_label = label[pos].long()
+    # foreground entries are overwritten with the IoU-supervised term
+    pos_bce = F.binary_cross_entropy(
+        pred[pos, pos_label], score[pos], reduction='none')
+    gap = score[pos] - prob[pos, pos_label]
+    loss[pos, pos_label] = pos_bce * gap.abs().pow(beta)
+
+    # collapse the class dimension -> one value per sample
+    return loss.sum(dim=1, keepdim=False)
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def distribution_focal_loss(pred, label):
+    r"""Compute Distribution Focal Loss (DFL), as proposed in
+    `Generalized Focal Loss: Learning Qualified and Distributed Bounding
+    Boxes for Dense Object Detection <https://arxiv.org/abs/2006.04388>`_.
+
+    Args:
+        pred (torch.Tensor): Predicted general distribution of bounding boxes
+            (before softmax) with shape (N, n+1), n is the max value of the
+            integral set `{0, ..., n}` in paper.
+        label (torch.Tensor): Target distance label for bounding boxes with
+            shape (N,).
+
+    Returns:
+        torch.Tensor: Loss tensor with shape (N,).
+    """
+    left_idx = label.long()
+    right_idx = left_idx + 1
+    # cross-entropy against both integer neighbours, weighted by closeness
+    ce_left = F.cross_entropy(pred, left_idx, reduction='none')
+    ce_right = F.cross_entropy(pred, right_idx, reduction='none')
+    return ce_left * (right_idx.float() - label) \
+        + ce_right * (label - left_idx.float())
+
+
+@LOSSES.register_module()
+class QualityFocalLoss(nn.Module):
+    r"""Module wrapper around Quality Focal Loss (QFL), introduced in
+    `Generalized Focal Loss: Learning Qualified and Distributed Bounding
+    Boxes for Dense Object Detection <https://arxiv.org/abs/2006.04388>`_.
+
+    Args:
+        use_sigmoid (bool): Whether sigmoid operation is conducted in QFL.
+            Defaults to True.
+        beta (float): The beta parameter for calculating the modulating factor.
+            Defaults to 2.0.
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Loss weight of current loss.
+        activated (bool, optional): Whether the input is activated.
+            If True, it means the input has been activated and can be
+            treated as probabilities. Else, it should be treated as logits.
+            Defaults to False.
+    """
+
+    def __init__(self, use_sigmoid=True, beta=2.0, reduction='mean',
+                 loss_weight=1.0, activated=False):
+        """Validate the configuration and store the loss settings."""
+        super(QualityFocalLoss, self).__init__()
+        # only the sigmoid formulation exists, so reject anything else early
+        assert use_sigmoid is True, 'Only sigmoid in QFL supported now.'
+        self.use_sigmoid = use_sigmoid
+        # exponent of the modulating factor passed to the loss function
+        self.beta = beta
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        # True: inputs are probabilities; False: inputs are logits
+        self.activated = activated
+
+    def forward(self, pred, target, weight=None, avg_factor=None,
+                reduction_override=None):
+        """Compute the weighted QFL for a batch of predictions.
+
+        Args:
+            pred (torch.Tensor): Predicted joint representation of
+                classification and quality (IoU) estimation with shape (N, C),
+                C is the number of classes.
+            target (tuple([torch.Tensor])): Target category label with shape
+                (N,) and target quality label with shape (N,).
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+
+        Returns:
+            The selected QFL function's output times ``self.loss_weight``.
+
+        Raises:
+            NotImplementedError: If ``self.use_sigmoid`` is falsy.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        if not self.use_sigmoid:
+            raise NotImplementedError
+        # a truthy override replaces the configured reduction
+        if reduction_override:
+            reduction = reduction_override
+        else:
+            reduction = self.reduction
+        # probabilities and logits need different BCE variants
+        calculate_loss_func = (
+            quality_focal_loss_with_prob
+            if self.activated else quality_focal_loss)
+        loss_cls = self.loss_weight * calculate_loss_func(
+            pred, target, weight,
+            beta=self.beta, reduction=reduction, avg_factor=avg_factor)
+        return loss_cls
+
+
+@LOSSES.register_module()
+class DistributionFocalLoss(nn.Module):
+    r"""Module wrapper around Distribution Focal Loss (DFL), introduced in
+    `Generalized Focal Loss: Learning Qualified and Distributed Bounding
+    Boxes for Dense Object Detection <https://arxiv.org/abs/2006.04388>`_.
+
+    Args:
+        reduction (str): Options are `'none'`, `'mean'` and `'sum'`.
+        loss_weight (float): Loss weight of current loss.
+    """
+
+    def __init__(self, reduction='mean', loss_weight=1.0):
+        super(DistributionFocalLoss, self).__init__()
+        # default reduction and scalar multiplier used by forward()
+        self.reduction, self.loss_weight = reduction, loss_weight
+
+    def forward(self, pred, target, weight=None, avg_factor=None,
+                reduction_override=None):
+        """Compute the weighted DFL for a batch of predictions.
+
+        Args:
+            pred (torch.Tensor): Predicted general distribution of bounding
+                boxes (before softmax) with shape (N, n+1), n is the max value
+                of the integral set `{0, ..., n}` in paper.
+            target (torch.Tensor): Target distance label for bounding boxes
+                with shape (N,).
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+
+        Returns:
+            The ``distribution_focal_loss`` output times ``self.loss_weight``.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = self.reduction
+        if reduction_override:
+            reduction = reduction_override
+        unscaled = distribution_focal_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+        return self.loss_weight * unscaled
diff --git a/mmdet/models/losses/ghm_loss.py b/mmdet/models/losses/ghm_loss.py
new file mode 100755
index 0000000..a4df9fe
--- /dev/null
+++ b/mmdet/models/losses/ghm_loss.py
@@ -0,0 +1,213 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..builder import LOSSES
+from .utils import weight_reduce_loss
+
+
+def _expand_onehot_labels(labels, label_weights, label_channels):
+    """One-hot encode in-range labels and expand the weights per channel."""
+    onehot = labels.new_full((labels.size(0), label_channels), 0)
+    fg = (labels >= 0) & (labels < label_channels)
+    inds = torch.nonzero(fg, as_tuple=False).squeeze()
+    if inds.numel() > 0:
+        onehot[inds, labels[inds]] = 1
+    weights_2d = label_weights.view(-1, 1)
+    return onehot, weights_2d.expand(label_weights.size(0), label_channels)
+
+
+# TODO: code refactoring to make it consistent with other losses
+@LOSSES.register_module()
+class GHMC(nn.Module):
+    """GHM Classification Loss.
+
+    Details of the theorem can be viewed in the paper
+    `Gradient Harmonized Single-stage Detector
+    <https://arxiv.org/abs/1811.05181>`_.
+
+    Args:
+        bins (int): Number of the unit regions for distribution calculation.
+        momentum (float): Moving-average momentum for bin counts; <= 0 is off.
+        use_sigmoid (bool): Must be True, else NotImplementedError is raised.
+        loss_weight (float): The weight of the total GHM-C loss.
+        reduction (str): Options are "none", "mean" and "sum".
+            Defaults to "mean"
+    """
+
+    def __init__(self,
+                 bins=10,
+                 momentum=0,
+                 use_sigmoid=True,
+                 loss_weight=1.0,
+                 reduction='mean'):
+        super(GHMC, self).__init__()
+        self.bins = bins
+        self.momentum = momentum
+        edges = torch.arange(bins + 1).float() / bins  # uniform on [0, 1]
+        self.register_buffer('edges', edges)
+        self.edges[-1] += 1e-6  # lets g == 1 fall into the last bin
+        if momentum > 0:
+            acc_sum = torch.zeros(bins)  # running per-bin sample counts
+            self.register_buffer('acc_sum', acc_sum)
+        self.use_sigmoid = use_sigmoid
+        if not self.use_sigmoid:
+            raise NotImplementedError
+        self.loss_weight = loss_weight
+        self.reduction = reduction
+
+    def forward(self,
+                pred,
+                target,
+                label_weight,
+                reduction_override=None,
+                **kwargs):
+        """Calculate the GHM-C loss (extra ``**kwargs`` are ignored).
+
+        Args:
+            pred (float tensor of size [batch_num, class_num]):
+                The direct prediction of classification fc layer.
+            target (tensor): Binary class target of size [batch_num,
+                class_num]; one-hot expanded if its dim differs from pred.
+            label_weight (float tensor of size [batch_num, class_num]):
+                the value is 1 if the sample is valid and 0 if ignored.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        Returns:
+            The gradient harmonized loss.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        # bring index-style targets (and their weights) to the layout of pred
+        if pred.dim() != target.dim():
+            target, label_weight = _expand_onehot_labels(
+                target, label_weight, pred.size(-1))
+        target, label_weight = target.float(), label_weight.float()
+        edges = self.edges
+        mmt = self.momentum
+        weights = torch.zeros_like(pred)  # entries in no bin keep weight 0
+
+        # gradient length: |sigmoid(pred) - target|, detached from autograd
+        g = torch.abs(pred.sigmoid().detach() - target)
+
+        valid = label_weight > 0
+        tot = max(valid.float().sum().item(), 1.0)  # valid count, at least 1
+        n = 0  # number of non-empty bins
+        for i in range(self.bins):
+            inds = (g >= edges[i]) & (g < edges[i + 1]) & valid
+            num_in_bin = inds.sum().item()
+            if num_in_bin > 0:
+                if mmt > 0:
+                    self.acc_sum[i] = mmt * self.acc_sum[i] \
+                        + (1 - mmt) * num_in_bin
+                    weights[inds] = tot / self.acc_sum[i]  # smoothed count
+                else:
+                    weights[inds] = tot / num_in_bin  # raw count of this call
+                n += 1
+        if n > 0:
+            weights = weights / n  # normalise by the number of used bins
+
+        loss = F.binary_cross_entropy_with_logits(
+            pred, target, reduction='none')
+        loss = weight_reduce_loss(
+            loss, weights, reduction=reduction, avg_factor=tot)
+        return loss * self.loss_weight
+
+
+# TODO: code refactoring to make it consistent with other losses
+@LOSSES.register_module()
+class GHMR(nn.Module):
+    """GHM Regression Loss.
+
+    Details of the theorem can be viewed in the paper
+    `Gradient Harmonized Single-stage Detector
+    <https://arxiv.org/abs/1811.05181>`_.
+
+    Args:
+        mu (float): The parameter for the Authentic Smooth L1 loss.
+        bins (int): Number of the unit regions for distribution calculation.
+        momentum (float): Moving-average momentum for bin counts; <= 0 is off.
+        loss_weight (float): The weight of the total GHM-R loss.
+        reduction (str): Options are "none", "mean" and "sum".
+            Defaults to "mean"
+    """
+
+    def __init__(self,
+                 mu=0.02,
+                 bins=10,
+                 momentum=0,
+                 loss_weight=1.0,
+                 reduction='mean'):
+        super(GHMR, self).__init__()
+        self.mu = mu
+        self.bins = bins
+        edges = torch.arange(bins + 1).float() / bins  # uniform on [0, 1]
+        self.register_buffer('edges', edges)
+        self.edges[-1] = 1e3  # last bin takes every g from its lower edge on
+        self.momentum = momentum
+        if momentum > 0:
+            acc_sum = torch.zeros(bins)  # running per-bin sample counts
+            self.register_buffer('acc_sum', acc_sum)
+        self.loss_weight = loss_weight
+        self.reduction = reduction
+
+    # NOTE: avg_factor is accepted but unused; the loss is averaged by tot
+    def forward(self,
+                pred,
+                target,
+                label_weight,
+                avg_factor=None,
+                reduction_override=None):
+        """Calculate the GHM-R loss.
+
+        Args:
+            pred (float tensor of size [batch_num, 4 (* class_num)]):
+                The prediction of box regression layer. Channel number can be 4
+                or 4 * class_num depending on whether it is class-agnostic.
+            target (float tensor of size [batch_num, 4 (* class_num)]):
+                The target regression values with the same size of pred.
+            label_weight (float tensor of size [batch_num, 4 (* class_num)]):
+                The weight of each sample, 0 if ignored.
+            avg_factor (optional): Accepted but not used by this method.
+            reduction_override (str, optional): Overrides the configured
+                reduction when truthy. Defaults to None.
+        Returns:
+            The gradient harmonized loss.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        mu = self.mu
+        edges = self.edges
+        mmt = self.momentum
+
+        # ASL1 loss: sqrt(diff^2 + mu^2) - mu
+        diff = pred - target
+        loss = torch.sqrt(diff * diff + mu * mu) - mu
+
+        # gradient length: |d loss / d diff|, detached from autograd
+        g = torch.abs(diff / torch.sqrt(mu * mu + diff * diff)).detach()
+        weights = torch.zeros_like(g)  # entries in no bin keep weight 0
+
+        valid = label_weight > 0
+        tot = max(label_weight.float().sum().item(), 1.0)  # weight sum, >= 1
+        n = 0  # number of non-empty bins
+        for i in range(self.bins):
+            inds = (g >= edges[i]) & (g < edges[i + 1]) & valid
+            num_in_bin = inds.sum().item()
+            if num_in_bin > 0:
+                n += 1
+                if mmt > 0:
+                    self.acc_sum[i] = mmt * self.acc_sum[i] \
+                        + (1 - mmt) * num_in_bin
+                    weights[inds] = tot / self.acc_sum[i]  # smoothed count
+                else:
+                    weights[inds] = tot / num_in_bin  # raw count of this call
+        if n > 0:
+            weights /= n  # normalise by the number of used bins
+        loss = weight_reduce_loss(
+            loss, weights, reduction=reduction, avg_factor=tot)
+        return loss * self.loss_weight
diff --git a/mmdet/models/losses/iou_loss.py b/mmdet/models/losses/iou_loss.py
new file mode 100755
index 0000000..bf1ed04
--- /dev/null
+++ b/mmdet/models/losses/iou_loss.py
@@ -0,0 +1,474 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+import warnings
+
+import mmcv
+import torch
+import torch.nn as nn
+
+from mmdet.core import bbox_overlaps
+from ..builder import LOSSES
+from .utils import weighted_loss
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def iou_loss(pred, target, linear=False, mode='log', eps=1e-6):
+    """IoU loss.
+
+    Computing the IoU loss between a set of predicted bboxes and target bboxes.
+    The IoU is clamped to ``eps`` from below and then scaled by ``mode``.
+
+    Args:
+        pred (torch.Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+            shape (n, 4).
+        target (torch.Tensor): Corresponding gt bboxes, shape (n, 4).
+        linear (bool, optional): Deprecated. If True, ``mode`` is forced to
+            "linear" and a warning is emitted. Default: False.
+        mode (str): Loss scaling mode, including "linear", "square", and "log".
+            Default: 'log'
+        eps (float): Eps to avoid log(0).
+
+    Return:
+        torch.Tensor: Loss tensor.
+    """
+    assert mode in ['linear', 'square', 'log']
+    if linear:
+        # deprecated switch: it overrides whatever mode was passed
+        mode = 'linear'
+        warnings.warn('DeprecationWarning: Setting "linear=True" in '
+                      'iou_loss is deprecated, please use "mode=`linear`" '
+                      'instead.')
+    # clamp from below so that the log branch never sees a zero IoU
+    ious = bbox_overlaps(pred, target, is_aligned=True).clamp(min=eps)
+    if mode == 'linear':
+        return 1 - ious
+    if mode == 'square':
+        return 1 - ious**2
+    if mode == 'log':
+        return -ious.log()
+    raise NotImplementedError
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def bounded_iou_loss(pred, target, beta=0.2, eps=1e-3):
+    """BIoULoss.
+
+    This is an implementation of paper
+    `Improving Object Localization with Fitness NMS and Bounded IoU Loss.
+    <https://arxiv.org/abs/1711.00164>`_.
+
+    Args:
+        pred (torch.Tensor): Predicted bboxes.
+        target (torch.Tensor): Target bboxes.
+        beta (float): Threshold between the quadratic and linear loss branch.
+        eps (float): Added to the ratio denominators to avoid NaN.
+    """
+    # centres and extents of the predicted boxes (x1, y1, x2, y2 columns)
+    px1, py1, px2, py2 = pred[:, 0], pred[:, 1], pred[:, 2], pred[:, 3]
+    pred_ctrx, pred_ctry = (px1 + px2) * 0.5, (py1 + py2) * 0.5
+    pred_w, pred_h = px2 - px1, py2 - py1
+    with torch.no_grad():
+        # target geometry is computed outside of the autograd graph
+        tx1, ty1 = target[:, 0], target[:, 1]
+        tx2, ty2 = target[:, 2], target[:, 3]
+        target_ctrx, target_ctry = (tx1 + tx2) * 0.5, (ty1 + ty2) * 0.5
+        target_w, target_h = tx2 - tx1, ty2 - ty1
+
+    dx = target_ctrx - pred_ctrx
+    dy = target_ctry - pred_ctry
+    # centre terms: the ratio is floored at 0 before taking 1 - ratio
+    ratio_x = (target_w - 2 * dx.abs()) / (target_w + 2 * dx.abs() + eps)
+    ratio_y = (target_h - 2 * dy.abs()) / (target_h + 2 * dy.abs() + eps)
+    loss_dx = 1 - torch.max(ratio_x, torch.zeros_like(dx))
+    loss_dy = 1 - torch.max(ratio_y, torch.zeros_like(dy))
+    # size terms: the smaller of the two width / height ratios
+    loss_dw = 1 - torch.min(target_w / (pred_w + eps),
+                            pred_w / (target_w + eps))
+    loss_dh = 1 - torch.min(target_h / (pred_h + eps),
+                            pred_h / (target_h + eps))
+    # view(..., -1) does not work for empty tensor
+    terms = [loss_dx, loss_dy, loss_dw, loss_dh]
+    loss_comb = torch.stack(terms, dim=-1).flatten(1)
+
+    # quadratic below beta, linear (shifted by beta / 2) from beta upwards
+    return torch.where(loss_comb < beta, 0.5 * loss_comb * loss_comb / beta,
+                       loss_comb - 0.5 * beta)
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def giou_loss(pred, target, eps=1e-7):
+    r"""GIoU loss from `Generalized Intersection over Union: A Metric and A
+    Loss for Bounding Box Regression <https://arxiv.org/abs/1902.09630>`_.
+
+    Args:
+        pred (torch.Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+            shape (n, 4).
+        target (torch.Tensor): Corresponding gt bboxes, shape (n, 4).
+        eps (float): Eps forwarded to ``bbox_overlaps``.
+
+    Return:
+        Tensor: Loss tensor, computed as ``1 - GIoU``.
+    """
+    # the GIoU computation itself is delegated to bbox_overlaps
+    gious = bbox_overlaps(pred, target, mode='giou', is_aligned=True, eps=eps)
+    return 1 - gious
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def diou_loss(pred, target, eps=1e-7):
+    r"""Implementation of `Distance-IoU Loss: Faster and Better Learning for
+    Bounding Box Regression <https://arxiv.org/abs/1911.08287>`_.
+
+    Code is modified from https://github.com/Zzh-tju/DIoU.
+
+    Args:
+        pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+            shape (n, 4).
+        target (Tensor): Corresponding gt bboxes, shape (n, 4).
+        eps (float): Eps added to the union and to the enclosing diagonal.
+    Return:
+        Tensor: Loss tensor, computed as ``1 - DIoU``.
+    """
+    # corner coordinates of both box sets
+    b1_x1, b1_y1 = pred[:, 0], pred[:, 1]
+    b1_x2, b1_y2 = pred[:, 2], pred[:, 3]
+    b2_x1, b2_y1 = target[:, 0], target[:, 1]
+    b2_x2, b2_y2 = target[:, 2], target[:, 3]
+
+    # intersection rectangle, clamped so that disjoint boxes give area 0
+    inter_lt = torch.max(pred[:, :2], target[:, :2])
+    inter_rb = torch.min(pred[:, 2:], target[:, 2:])
+    inter_wh = (inter_rb - inter_lt).clamp(min=0)
+    overlap = inter_wh[:, 0] * inter_wh[:, 1]
+
+    # union
+    area_pred = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
+    area_target = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
+    union = area_pred + area_target - overlap + eps
+    # IoU
+    ious = overlap / union
+
+    # smallest box enclosing both prediction and target
+    enclose_lt = torch.min(pred[:, :2], target[:, :2])
+    enclose_rb = torch.max(pred[:, 2:], target[:, 2:])
+    enclose_wh = (enclose_rb - enclose_lt).clamp(min=0)
+
+    # squared diagonal length of the enclosing box
+    cw, ch = enclose_wh[:, 0], enclose_wh[:, 1]
+    c2 = cw**2 + ch**2 + eps
+
+    # squared centre distance; / 4 since corner sums are twice the centres
+    dx2 = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4
+    dy2 = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4
+    rho2 = dx2 + dy2
+
+    # DIoU: IoU reduced by the normalised centre distance
+    dious = ious - rho2 / c2
+    # the loss is the complement of DIoU
+    return 1 - dious
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def ciou_loss(pred, target, eps=1e-7):
+    r"""Implementation of paper `Enhancing Geometric Factors into
+    Model Learning and Inference for Object Detection and Instance
+    Segmentation <https://arxiv.org/abs/2005.03572>`_.
+
+    Code is modified from https://github.com/Zzh-tju/CIoU.
+
+    Args:
+        pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+            shape (n, 4).
+        target (Tensor): Corresponding gt bboxes, shape (n, 4).
+        eps (float): Eps added to the union, diagonal and box heights.
+    Return:
+        Tensor: Loss tensor, computed as ``1 - clamp(CIoU, -1, 1)``.
+    """
+    # corner coordinates of both box sets
+    b1_x1, b1_y1 = pred[:, 0], pred[:, 1]
+    b1_x2, b1_y2 = pred[:, 2], pred[:, 3]
+    b2_x1, b2_y1 = target[:, 0], target[:, 1]
+    b2_x2, b2_y2 = target[:, 2], target[:, 3]
+
+    # intersection rectangle, clamped so that disjoint boxes give area 0
+    inter_lt = torch.max(pred[:, :2], target[:, :2])
+    inter_rb = torch.min(pred[:, 2:], target[:, 2:])
+    inter_wh = (inter_rb - inter_lt).clamp(min=0)
+    overlap = inter_wh[:, 0] * inter_wh[:, 1]
+
+    # union
+    area_pred = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
+    area_target = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
+    union = area_pred + area_target - overlap + eps
+    # IoU
+    ious = overlap / union
+
+    # smallest box enclosing both prediction and target
+    enclose_lt = torch.min(pred[:, :2], target[:, :2])
+    enclose_rb = torch.max(pred[:, 2:], target[:, 2:])
+    enclose_wh = (enclose_rb - enclose_lt).clamp(min=0)
+
+    # squared diagonal length of the enclosing box
+    cw, ch = enclose_wh[:, 0], enclose_wh[:, 1]
+    c2 = cw**2 + ch**2 + eps
+
+    # squared centre distance; / 4 since corner sums are twice the centres
+    dx2 = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4
+    dy2 = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4
+    rho2 = dx2 + dy2
+
+    # aspect-ratio term; eps on the heights guards the divisions below
+    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
+    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
+    factor = 4 / math.pi**2
+    v = factor * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2)
+
+    # trade-off weight: no gradient, and zero unless IoU exceeds 0.5
+    with torch.no_grad():
+        alpha = (ious > 0.5).float() * v / (1 - ious + v)
+
+    # CIoU, clamped to [-1, 1] before being turned into a loss
+    cious = ious - (rho2 / c2 + alpha * v)
+    return 1 - cious.clamp(min=-1.0, max=1.0)
+
+
+@LOSSES.register_module()
+class IoULoss(nn.Module):
+    """IoULoss.
+
+    Module wrapper that computes ``iou_loss`` between predicted and target
+    bboxes and scales the result by ``loss_weight``.
+    Args:
+        linear (bool): Deprecated. If True, ``mode`` is forced to "linear"
+            and a warning is emitted. Default: False.
+        eps (float): Eps to avoid log(0).
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Weight of loss.
+        mode (str): "linear", "square" or "log". Default: 'log'
+    """
+
+    def __init__(self, linear=False, eps=1e-6, reduction='mean',
+                 loss_weight=1.0, mode='log'):
+        """Validate ``mode`` and store the loss configuration."""
+        super(IoULoss, self).__init__()
+        assert mode in ['linear', 'square', 'log']
+        if linear:
+            # legacy flag: it takes precedence over whatever mode was given
+            mode = 'linear'
+            warnings.warn('DeprecationWarning: Setting "linear=True" in '
+                          'IOULoss is deprecated, please use "mode=`linear`" '
+                          'instead.')
+        # effective scaling mode handed to iou_loss in forward()
+        self.mode = mode
+        self.linear = linear
+        # lower clamp for the IoU, handed to iou_loss in forward()
+        self.eps = eps
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self, pred, target, weight=None, avg_factor=None,
+                reduction_override=None, **kwargs):
+        """Compute the weighted IoU loss.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None. Options are "none", "mean" and "sum".
+
+        Returns:
+            The ``iou_loss`` output multiplied by ``self.loss_weight``.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        # a truthy override replaces the configured reduction
+        reduction = self.reduction
+        if reduction_override:
+            reduction = reduction_override
+        if weight is not None:
+            # no positive weight: return (pred * weight).sum(), i.e. 0 for
+            # all-zero weights, unless per-element output was requested
+            if not torch.any(weight > 0) and reduction != 'none':
+                if pred.dim() == weight.dim() + 1:
+                    weight = weight.unsqueeze(1)
+                return (pred * weight).sum()  # 0
+            if weight.dim() > 1:
+                # TODO: remove this in the future
+                # reduce the weight of shape (n, 4) to (n,) to match the
+                # iou_loss of shape (n,)
+                assert weight.shape == pred.shape
+                weight = weight.mean(-1)
+        unscaled = iou_loss(
+            pred,
+            target,
+            weight,
+            mode=self.mode, eps=self.eps,
+            reduction=reduction, avg_factor=avg_factor, **kwargs)
+        # apply the configured scalar weight last
+        return self.loss_weight * unscaled
+
+
+@LOSSES.register_module()
+class BoundedIoULoss(nn.Module):
+    """Module wrapper around ``bounded_iou_loss``.
+
+    Args:
+        beta (float): beta passed to ``bounded_iou_loss``.
+        eps (float): eps passed to ``bounded_iou_loss``.
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Weight of loss.
+    """
+
+    def __init__(self, beta=0.2, eps=1e-3, reduction='mean', loss_weight=1.0):
+        super(BoundedIoULoss, self).__init__()
+        self.beta, self.eps = beta, eps
+        self.reduction, self.loss_weight = reduction, loss_weight
+
+    def forward(self, pred, target, weight=None, avg_factor=None,
+                reduction_override=None, **kwargs):
+        """Return the weighted bounded IoU loss of ``pred`` vs ``target``."""
+        # validate first so that the shortcut below cannot skip the check
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = reduction_override or self.reduction
+        # no positive weight: return (pred * weight).sum(), i.e. 0 for
+        # all-zero weights; 'none' must still yield per-element losses
+        if (weight is not None) and (not torch.any(weight > 0)) and (
+                reduction != 'none'):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()  # 0
+        # scale the reduced (or per-element) loss by the configured weight
+        return self.loss_weight * bounded_iou_loss(
+            pred, target, weight,
+            beta=self.beta, eps=self.eps,
+            reduction=reduction, avg_factor=avg_factor, **kwargs)
+
+
+@LOSSES.register_module()
+class GIoULoss(nn.Module):
+    """Module wrapper around ``giou_loss``.
+
+    Args:
+        eps (float): eps passed to ``giou_loss``.
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Weight of loss.
+    """
+
+    def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0):
+        super(GIoULoss, self).__init__()
+        self.eps = eps
+        self.reduction, self.loss_weight = reduction, loss_weight
+
+    def forward(self, pred, target, weight=None, avg_factor=None,
+                reduction_override=None, **kwargs):
+        """Return the weighted GIoU loss of ``pred`` against ``target``."""
+        # validate first so that the shortcut below cannot skip the check
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = reduction_override or self.reduction
+        # no positive weight: return (pred * weight).sum(), i.e. 0 for
+        # all-zero weights; 'none' must still yield per-element losses
+        if (weight is not None) and (not torch.any(weight > 0)) and (
+                reduction != 'none'):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()  # 0
+        if weight is not None and weight.dim() > 1:
+            # TODO: remove this in the future
+            # reduce the weight of shape (n, 4) to (n,) to match the
+            # giou_loss of shape (n,)
+            assert weight.shape == pred.shape
+            weight = weight.mean(-1)
+        # scale the reduced (or per-element) loss by the configured weight
+        return self.loss_weight * giou_loss(
+            pred, target, weight, eps=self.eps,
+            reduction=reduction, avg_factor=avg_factor, **kwargs)
+
+
+@LOSSES.register_module()
+class DIoULoss(nn.Module):
+    """Module wrapper around ``diou_loss``.
+
+    Args:
+        eps (float): eps passed to ``diou_loss``.
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Weight of loss.
+    """
+
+    def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0):
+        super(DIoULoss, self).__init__()
+        self.eps = eps
+        self.reduction, self.loss_weight = reduction, loss_weight
+
+    def forward(self, pred, target, weight=None, avg_factor=None,
+                reduction_override=None, **kwargs):
+        """Return the weighted DIoU loss of ``pred`` against ``target``."""
+        # validate first so that the shortcut below cannot skip the check
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = reduction_override or self.reduction
+        # no positive weight: return (pred * weight).sum(), i.e. 0 for
+        # all-zero weights; 'none' must still yield per-element losses
+        if (weight is not None) and (not torch.any(weight > 0)) and (
+                reduction != 'none'):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()  # 0
+        if weight is not None and weight.dim() > 1:
+            # TODO: remove this in the future
+            # reduce the weight of shape (n, 4) to (n,) to match the
+            # diou_loss of shape (n,)
+            assert weight.shape == pred.shape
+            weight = weight.mean(-1)
+        # scale the reduced (or per-element) loss by the configured weight
+        return self.loss_weight * diou_loss(
+            pred, target, weight, eps=self.eps,
+            reduction=reduction, avg_factor=avg_factor, **kwargs)
+
+
+@LOSSES.register_module()
+class CIoULoss(nn.Module):
+    """Module wrapper around ``ciou_loss``.
+
+    Args:
+        eps (float): eps passed to ``ciou_loss``.
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Weight of loss.
+    """
+
+    def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0):
+        super(CIoULoss, self).__init__()
+        self.eps = eps
+        self.reduction, self.loss_weight = reduction, loss_weight
+
+    def forward(self, pred, target, weight=None, avg_factor=None,
+                reduction_override=None, **kwargs):
+        """Return the weighted CIoU loss of ``pred`` against ``target``."""
+        # validate first so that the shortcut below cannot skip the check
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = reduction_override or self.reduction
+        # no positive weight: return (pred * weight).sum(), i.e. 0 for
+        # all-zero weights; 'none' must still yield per-element losses
+        if (weight is not None) and (not torch.any(weight > 0)) and (
+                reduction != 'none'):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()  # 0
+        if weight is not None and weight.dim() > 1:
+            # TODO: remove this in the future
+            # reduce the weight of shape (n, 4) to (n,) to match the
+            # ciou_loss of shape (n,)
+            assert weight.shape == pred.shape
+            weight = weight.mean(-1)
+        # scale the reduced (or per-element) loss by the configured weight
+        return self.loss_weight * ciou_loss(
+            pred, target, weight, eps=self.eps,
+            reduction=reduction, avg_factor=avg_factor, **kwargs)
diff --git a/mmdet/models/losses/kd_loss.py b/mmdet/models/losses/kd_loss.py
new file mode 100755
index 0000000..75c1935
--- /dev/null
+++ b/mmdet/models/losses/kd_loss.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..builder import LOSSES
+from .utils import weighted_loss
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def knowledge_distillation_kl_div_loss(pred,
+                                       soft_label,
+                                       T,
+                                       detach_target=True):
+    r"""Temperature-scaled KL divergence between student and teacher logits.
+
+    Args:
+        pred (Tensor): Predicted logits with shape (N, n + 1).
+        soft_label (Tensor): Target logits, same shape as ``pred``.
+        T (int): Temperature for distillation.
+        detach_target (bool): Cut the softened target out of autograd.
+
+    Returns:
+        torch.Tensor: Loss tensor with shape (N,).
+    """
+    assert pred.size() == soft_label.size()
+    teacher_prob = F.softmax(soft_label / T, dim=1)
+    if detach_target:
+        teacher_prob = teacher_prob.detach()
+    student_log_prob = F.log_softmax(pred / T, dim=1)
+
+    # per-class KL terms averaged over dim 1, then rescaled by T * T
+    per_class = F.kl_div(student_log_prob, teacher_prob, reduction='none')
+    kd_loss = per_class.mean(1) * (T * T)
+    return kd_loss
+
+
+@LOSSES.register_module()
+class KnowledgeDistillationKLDivLoss(nn.Module):
+    """KL-divergence distillation loss wrapped as a registered module.
+
+    Args:
+        reduction (str): Options are `'none'`, `'mean'` and `'sum'`.
+        loss_weight (float): Loss weight of current loss.
+        T (int): Temperature for distillation.
+    """
+
+    def __init__(self, reduction='mean', loss_weight=1.0, T=10):
+        super().__init__()
+        assert T >= 1
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.T = T
+
+    def forward(self,
+                pred,
+                soft_label,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Forward function.
+
+        Args:
+            pred (Tensor): Predicted logits with shape (N, n + 1).
+            soft_label (Tensor): Target logits, same shape as ``pred``.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+
+        # a truthy override wins over the configured reduction
+        reduction = self.reduction
+        if reduction_override:
+            reduction = reduction_override
+        raw_loss = knowledge_distillation_kl_div_loss(
+            pred,
+            soft_label,
+            weight,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            T=self.T)
+        return self.loss_weight * raw_loss
diff --git a/mmdet/models/losses/mse_loss.py b/mmdet/models/losses/mse_loss.py
new file mode 100755
index 0000000..2ebd161
--- /dev/null
+++ b/mmdet/models/losses/mse_loss.py
@@ -0,0 +1,57 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..builder import LOSSES
+from .utils import weighted_loss
+
+
+@weighted_loss
+def mse_loss(pred, target):
+    """Element-wise squared error; reduction is left to ``weighted_loss``."""
+    return F.mse_loss(input=pred, target=target, reduction='none')
+
+
+@LOSSES.register_module()
+class MSELoss(nn.Module):
+    """Mean-squared-error loss module built on ``mse_loss``.
+
+    Args:
+        reduction (str, optional): The method that reduces the loss to a
+            scalar. Options are "none", "mean" and "sum".
+        loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+    """
+
+    def __init__(self, reduction='mean', loss_weight=1.0):
+        super().__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Compute the weighted, reduced MSE and scale it by ``loss_weight``.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction.
+            weight (torch.Tensor, optional): Weight of the loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+
+        Returns:
+            torch.Tensor: The calculated loss
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = self.reduction
+        if reduction_override:
+            reduction = reduction_override
+        return self.loss_weight * mse_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
diff --git a/mmdet/models/losses/pisa_loss.py b/mmdet/models/losses/pisa_loss.py
new file mode 100755
index 0000000..6afea0e
--- /dev/null
+++ b/mmdet/models/losses/pisa_loss.py
@@ -0,0 +1,184 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.core import bbox_overlaps
+
+
+@mmcv.jit(derivate=True, coderize=True)
+def isr_p(cls_score,
+          bbox_pred,
+          bbox_targets,
+          rois,
+          sampling_results,
+          loss_cls,
+          bbox_coder,
+          k=2,
+          bias=0,
+          num_class=80):
+    """Importance-based Sample Reweighting (ISR_P), positive part.
+
+    Args:
+        cls_score (Tensor): Predicted classification scores.
+        bbox_pred (Tensor): Predicted bbox deltas.
+        bbox_targets (tuple[Tensor]): A tuple of bbox targets, they are
+            labels, label_weights, bbox_targets, bbox_weights, respectively.
+        rois (Tensor): Anchors (single_stage) in shape (n, 4) or RoIs
+            (two_stage) in shape (n, 5).
+        sampling_results (list[obj]): Sampling results, one per image.
+        loss_cls (func): Classification loss func of the head.
+        bbox_coder (obj): BBox coder of the head.
+        k (float): Power of the non-linear mapping.
+        bias (float): Shift of the non-linear mapping.
+        num_class (int): Number of classes, default: 80.
+
+    Return:
+        tuple([Tensor]): labels, imp_based_label_weights, bbox_targets,
+            bbox_target_weights
+    """
+
+    labels, label_weights, bbox_targets, bbox_weights = bbox_targets
+    pos_label_inds = ((labels >= 0) &
+                      (labels < num_class)).nonzero().reshape(-1)
+    pos_labels = labels[pos_label_inds]
+
+    # if no positive samples, return the original targets
+    num_pos = float(pos_label_inds.size(0))
+    if num_pos == 0:
+        return labels, label_weights, bbox_targets, bbox_weights
+
+    # merge per-image pos_assigned_gt_inds; offsets keep GT ids unique
+    gts = []
+    last_max_gt = 0
+    for sampling_result in sampling_results:
+        gt_i = sampling_result.pos_assigned_gt_inds
+        gts.append(gt_i + last_max_gt)
+        if len(gt_i) != 0:
+            last_max_gt = gts[-1].max() + 1
+    gts = torch.cat(gts)
+    assert len(gts) == num_pos
+
+    cls_score = cls_score.detach()
+    bbox_pred = bbox_pred.detach()
+
+    # For single stage detectors, rois here indicate anchors, in shape (N, 4)
+    # For two stage detectors, rois are in shape (N, 5)
+    if rois.size(-1) == 5:
+        pos_rois = rois[pos_label_inds][:, 1:]
+    else:
+        pos_rois = rois[pos_label_inds]
+
+    if bbox_pred.size(-1) > 4:
+        bbox_pred = bbox_pred.view(bbox_pred.size(0), -1, 4)
+        pos_delta_pred = bbox_pred[pos_label_inds, pos_labels].view(-1, 4)
+    else:
+        pos_delta_pred = bbox_pred[pos_label_inds].view(-1, 4)
+
+    # compute iou of the predicted bbox and the corresponding GT
+    pos_delta_target = bbox_targets[pos_label_inds].view(-1, 4)
+    pos_bbox_pred = bbox_coder.decode(pos_rois, pos_delta_pred)
+    target_bbox_pred = bbox_coder.decode(pos_rois, pos_delta_target)
+    ious = bbox_overlaps(pos_bbox_pred, target_bbox_pred, is_aligned=True)
+
+    pos_imp_weights = label_weights[pos_label_inds]
+    # Two steps to compute IoU-HLR. Samples are first sorted by IoU locally,
+    # then sorted again within the same-rank group
+    max_l_num = pos_labels.bincount().max()
+    for label in pos_labels.unique():
+        l_inds = (pos_labels == label).nonzero().view(-1)
+        l_gts = gts[l_inds]
+        for t in l_gts.unique():
+            t_inds = l_inds[l_gts == t]
+            t_ious = ious[t_inds]
+            _, t_iou_rank_idx = t_ious.sort(descending=True)
+            _, t_iou_rank = t_iou_rank_idx.sort()
+            ious[t_inds] += max_l_num - t_iou_rank.float()
+        l_ious = ious[l_inds]
+        _, l_iou_rank_idx = l_ious.sort(descending=True)
+        _, l_iou_rank = l_iou_rank_idx.sort()  # IoU-HLR
+        # linearly map HLR to label weights
+        pos_imp_weights[l_inds] *= (max_l_num - l_iou_rank.float()) / max_l_num
+
+    pos_imp_weights = (bias + pos_imp_weights * (1 - bias)).pow(k)
+
+    # normalize to make the new weighted loss value equal to the original loss
+    pos_loss_cls = loss_cls(
+        cls_score[pos_label_inds], pos_labels, reduction_override='none')
+    if pos_loss_cls.dim() > 1:
+        ori_pos_loss_cls = pos_loss_cls * label_weights[pos_label_inds][:,
+                                                                        None]
+        new_pos_loss_cls = pos_loss_cls * pos_imp_weights[:, None]
+    else:
+        ori_pos_loss_cls = pos_loss_cls * label_weights[pos_label_inds]
+        new_pos_loss_cls = pos_loss_cls * pos_imp_weights
+    pos_loss_cls_ratio = ori_pos_loss_cls.sum() / new_pos_loss_cls.sum()
+    pos_imp_weights = pos_imp_weights * pos_loss_cls_ratio
+    label_weights[pos_label_inds] = pos_imp_weights
+
+    bbox_targets = labels, label_weights, bbox_targets, bbox_weights
+    return bbox_targets
+
+
+@mmcv.jit(derivate=True, coderize=True)
+def carl_loss(cls_score,
+              labels,
+              bbox_pred,
+              bbox_targets,
+              loss_bbox,
+              k=1,
+              bias=0.2,
+              avg_factor=None,
+              sigmoid=False,
+              num_class=80):
+    """Classification-Aware Regression Loss (CARL).
+
+    Args:
+        cls_score (Tensor): Predicted classification scores.
+        labels (Tensor): Targets of classification.
+        bbox_pred (Tensor): Predicted bbox deltas.
+        bbox_targets (Tensor): Target of bbox regression.
+        loss_bbox (func): Regression loss func of the head; it is called
+            with ``reduction_override='none'``.
+        k (float): Power of the non-linear mapping.
+        bias (float): Shift of the non-linear mapping.
+        avg_factor (int): Average factor used in regression loss.
+        sigmoid (bool): Activation of the classification score.
+        num_class (int): Number of classes, default: 80.
+
+    Return:
+        dict: CARL loss dict.
+    """
+    pos_label_inds = ((labels >= 0) &
+                      (labels < num_class)).nonzero().reshape(-1)
+    if pos_label_inds.numel() == 0:
+        return dict(loss_carl=cls_score.sum()[None] * 0.)
+    pos_labels = labels[pos_label_inds]
+
+    # turn the (non-detached) score of each positive's own class into
+    # its regression weight, so the gradient also reaches cls_score
+    if sigmoid:
+        pos_cls_score = cls_score.sigmoid()[pos_label_inds, pos_labels]
+    else:
+        pos_cls_score = cls_score.softmax(-1)[pos_label_inds, pos_labels]
+    carl_loss_weights = (bias + (1 - bias) * pos_cls_score).pow(k)
+
+    # normalize carl_loss_weight to make its sum equal to num positive
+    num_pos = float(pos_cls_score.size(0))
+    weight_ratio = num_pos / carl_loss_weights.sum()
+    carl_loss_weights *= weight_ratio
+
+    if avg_factor is None:
+        avg_factor = bbox_targets.size(0)
+    # if is class agnostic, bbox pred is in shape (N, 4)
+    # otherwise, bbox pred is in shape (N, #classes, 4)
+    if bbox_pred.size(-1) > 4:
+        bbox_pred = bbox_pred.view(bbox_pred.size(0), -1, 4)
+        pos_bbox_preds = bbox_pred[pos_label_inds, pos_labels]
+    else:
+        pos_bbox_preds = bbox_pred[pos_label_inds]
+    ori_loss_reg = loss_bbox(
+        pos_bbox_preds,
+        bbox_targets[pos_label_inds],
+        reduction_override='none') / avg_factor
+    loss_carl = (ori_loss_reg * carl_loss_weights[:, None]).sum()
+    return dict(loss_carl=loss_carl[None])
diff --git a/mmdet/models/losses/seesaw_loss.py b/mmdet/models/losses/seesaw_loss.py
new file mode 100755
index 0000000..0104047
--- /dev/null
+++ b/mmdet/models/losses/seesaw_loss.py
@@ -0,0 +1,262 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..builder import LOSSES
+from .accuracy import accuracy
+from .cross_entropy_loss import cross_entropy
+from .utils import weight_reduce_loss
+
+
+def seesaw_ce_loss(cls_score,
+                   labels,
+                   label_weights,
+                   cum_samples,
+                   num_classes,
+                   p,
+                   q,
+                   eps,
+                   reduction='mean',
+                   avg_factor=None):
+    """Cross entropy whose off-target logits are re-weighted (Seesaw).
+
+    Args:
+        cls_score (torch.Tensor): The prediction with shape (N, C),
+             C is the number of classes.
+        labels (torch.Tensor): The learning label of the prediction.
+        label_weights (torch.Tensor): Sample-wise loss weight.
+        cum_samples (torch.Tensor): Cumulative samples for each category.
+        num_classes (int): The number of classes.
+        p (float): The ``p`` in the mitigation factor.
+        q (float): The ``q`` in the compensation factor.
+        eps (float): The minimal value of divisor to smooth
+             the computation of compensation factor
+        reduction (str, optional): The method used to reduce the loss.
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+
+    Returns:
+        torch.Tensor: The calculated loss
+    """
+    assert cls_score.size(-1) == num_classes
+    assert len(cum_samples) == num_classes
+
+    onehot_labels = F.one_hot(labels, num_classes)
+    seesaw_weights = cls_score.new_ones(onehot_labels.size())
+
+    # mitigation factor: built from ratios of cumulative sample counts
+    if p > 0:
+        clamped = cum_samples.clamp(min=1)
+        ratio = clamped[None, :] / clamped[:, None]
+        below_one = (ratio < 1.0).float()
+        pair_weights = ratio.pow(p) * below_one + (1 - below_one)
+        mitigation_factor = pair_weights[labels.long(), :]
+        seesaw_weights = seesaw_weights * mitigation_factor
+
+    # compensation factor: built from detached softmax scores
+    if q > 0:
+        probs = F.softmax(cls_score.detach(), dim=1)
+        rows = torch.arange(0, len(probs)).to(probs.device).long()
+        # probability each sample assigns to its own label
+        true_probs = probs[rows, labels.long()]
+        rel_probs = probs / true_probs[:, None].clamp(min=eps)
+        above_one = (rel_probs > 1.0).float()
+        compensation_factor = rel_probs.pow(q) * above_one + (1 - above_one)
+        seesaw_weights = seesaw_weights * compensation_factor
+
+    # off-target logits are shifted by the log of their seesaw weight
+    adjusted = cls_score + (seesaw_weights.log() * (1 - onehot_labels))
+
+    loss = F.cross_entropy(adjusted, labels, weight=None, reduction='none')
+
+    if label_weights is not None:
+        label_weights = label_weights.float()
+    return weight_reduce_loss(
+        loss, weight=label_weights, reduction=reduction, avg_factor=avg_factor)
+
+
+@LOSSES.register_module()
+class SeesawLoss(nn.Module):
+    """
+    Seesaw Loss for Long-Tailed Instance Segmentation (CVPR 2021)
+    arXiv: https://arxiv.org/abs/2008.10032
+
+    Args:
+        use_sigmoid (bool, optional): Whether the prediction uses sigmoid
+             or softmax. Only False is supported.
+        p (float, optional): The ``p`` in the mitigation factor.
+             Defaults to 0.8.
+        q (float, optional): The ``q`` in the compensation factor.
+             Defaults to 2.0.
+        num_classes (int, optional): The number of classes.
+             Default to 1203 for LVIS v1 dataset.
+        eps (float, optional): The minimal value of divisor to smooth
+             the computation of compensation factor
+        reduction (str, optional): The method that reduces the loss to a
+             scalar. Options are "none", "mean" and "sum".
+        loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+        return_dict (bool, optional): Whether return the losses as a dict.
+             Default to True.
+    """
+
+    def __init__(self,
+                 use_sigmoid=False,
+                 p=0.8,
+                 q=2.0,
+                 num_classes=1203,
+                 eps=1e-2,
+                 reduction='mean',
+                 loss_weight=1.0,
+                 return_dict=True):
+        super(SeesawLoss, self).__init__()
+        assert not use_sigmoid
+        self.use_sigmoid = False
+        self.p = p
+        self.q = q
+        self.num_classes = num_classes
+        self.eps = eps
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.return_dict = return_dict
+
+        # criterion for the class logits (objectness uses cross_entropy)
+        self.cls_criterion = seesaw_ce_loss
+
+        # cumulative samples per label; last slot is label == num_classes
+        self.register_buffer(
+            'cum_samples',
+            torch.zeros(self.num_classes + 1, dtype=torch.float))
+
+        # custom output channels of the classifier
+        self.custom_cls_channels = True
+        # custom activation of cls_score
+        self.custom_activation = True
+        # custom accuracy of the classifier
+        self.custom_accuracy = True
+
+    def _split_cls_score(self, cls_score):
+        """Split cls_score into class logits and the 2 objectness logits."""
+        assert cls_score.size(-1) == self.num_classes + 2
+        cls_score_classes = cls_score[..., :-2]
+        cls_score_objectness = cls_score[..., -2:]
+        return cls_score_classes, cls_score_objectness
+
+    def get_cls_channels(self, num_classes):
+        """Get custom classification channels.
+
+        Args:
+            num_classes (int): The number of classes.
+
+        Returns:
+            int: The custom classification channels.
+        """
+        assert num_classes == self.num_classes
+        return num_classes + 2
+
+    def get_activation(self, cls_score):
+        """Get custom activation of cls_score.
+
+        Args:
+            cls_score (torch.Tensor): The prediction with shape (N, C + 2).
+
+        Returns:
+            torch.Tensor: The custom activation of cls_score with shape
+                 (N, C + 1).
+        """
+        cls_score_classes, cls_score_objectness = self._split_cls_score(
+            cls_score)
+        score_classes = F.softmax(cls_score_classes, dim=-1)
+        score_objectness = F.softmax(cls_score_objectness, dim=-1)
+        score_pos = score_objectness[..., [0]]
+        score_neg = score_objectness[..., [1]]
+        score_classes = score_classes * score_pos
+        scores = torch.cat([score_classes, score_neg], dim=-1)
+        return scores
+
+    def get_accuracy(self, cls_score, labels):
+        """Get custom accuracy w.r.t. cls_score and labels.
+
+        Args:
+            cls_score (torch.Tensor): The prediction with shape (N, C + 2).
+            labels (torch.Tensor): The learning label of the prediction.
+
+        Returns:
+            Dict [str, torch.Tensor]: The accuracy for objectness and classes,
+                 respectively.
+        """
+        pos_inds = labels < self.num_classes
+        obj_labels = (labels == self.num_classes).long()
+        cls_score_classes, cls_score_objectness = self._split_cls_score(
+            cls_score)
+        acc_objectness = accuracy(cls_score_objectness, obj_labels)
+        acc_classes = accuracy(cls_score_classes[pos_inds], labels[pos_inds])
+        acc = dict()
+        acc['acc_objectness'] = acc_objectness
+        acc['acc_classes'] = acc_classes
+        return acc
+
+    def forward(self,
+                cls_score,
+                labels,
+                label_weights=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Forward function.
+
+        Args:
+            cls_score (torch.Tensor): The prediction with shape (N, C + 2).
+            labels (torch.Tensor): The learning label of the prediction.
+            label_weights (torch.Tensor, optional): Sample-wise loss weight.
+            avg_factor (int, optional): Average factor that is used to average
+                 the loss. Defaults to None.
+            reduction_override (str, optional): Overrides ``self.reduction``.
+                 Options are "none", "mean" and "sum".
+        Returns:
+            torch.Tensor | Dict [str, torch.Tensor]:
+                 if return_dict == False: The calculated loss |
+                 if return_dict == True: The dict of calculated losses
+                 for objectness and classes, respectively.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        assert cls_score.size(-1) == self.num_classes + 2
+        pos_inds = labels < self.num_classes
+        # 0 for pos, 1 for neg
+        obj_labels = (labels == self.num_classes).long()
+
+        # accumulate the samples for each category
+        unique_labels = labels.unique()
+        for u_l in unique_labels:
+            inds_ = labels == u_l.item()
+            self.cum_samples[u_l] += inds_.sum()
+
+        if label_weights is not None:
+            label_weights = label_weights.float()
+        else:
+            label_weights = labels.new_ones(labels.size(), dtype=torch.float)
+
+        cls_score_classes, cls_score_objectness = self._split_cls_score(
+            cls_score)
+        # calculate loss_cls_classes (only need pos samples)
+        if pos_inds.sum() > 0:
+            loss_cls_classes = self.loss_weight * self.cls_criterion(
+                cls_score_classes[pos_inds], labels[pos_inds],
+                label_weights[pos_inds], self.cum_samples[:self.num_classes],
+                self.num_classes, self.p, self.q, self.eps, reduction,
+                avg_factor)
+        else:
+            loss_cls_classes = cls_score_classes[pos_inds].sum()
+        # calculate loss_cls_objectness
+        loss_cls_objectness = self.loss_weight * cross_entropy(
+            cls_score_objectness, obj_labels, label_weights, reduction,
+            avg_factor)
+
+        if self.return_dict:
+            loss_cls = dict()
+            loss_cls['loss_cls_objectness'] = loss_cls_objectness
+            loss_cls['loss_cls_classes'] = loss_cls_classes
+        else:
+            loss_cls = loss_cls_classes + loss_cls_objectness
+        return loss_cls
diff --git a/mmdet/models/losses/smooth_l1_loss.py b/mmdet/models/losses/smooth_l1_loss.py
new file mode 100755
index 0000000..5511746
--- /dev/null
+++ b/mmdet/models/losses/smooth_l1_loss.py
@@ -0,0 +1,146 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+import torch.nn as nn
+
+from ..builder import LOSSES
+from .utils import weighted_loss
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def smooth_l1_loss(pred, target, beta=1.0):
+    """Element-wise Smooth L1: quadratic below ``beta``, linear above.
+
+    Args:
+        pred (torch.Tensor): The prediction.
+        target (torch.Tensor): The learning target of the prediction.
+        beta (float, optional): The threshold in the piecewise function.
+            Defaults to 1.0.
+
+    Returns:
+        torch.Tensor: Calculated loss
+    """
+    assert beta > 0
+    if target.numel() == 0:
+        return pred.sum() * 0
+
+    assert pred.size() == target.size()
+    abs_err = (pred - target).abs()
+    quadratic = 0.5 * abs_err * abs_err / beta
+    linear = abs_err - 0.5 * beta
+    return torch.where(abs_err < beta, quadratic, linear)
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def l1_loss(pred, target):
+    """Element-wise absolute error between ``pred`` and ``target``.
+
+    Args:
+        pred (torch.Tensor): The prediction.
+        target (torch.Tensor): The learning target of the prediction.
+
+    Returns:
+        torch.Tensor: Calculated loss
+    """
+    if target.numel() == 0:
+        return pred.sum() * 0
+
+    assert pred.size() == target.size()
+    # absolute residual per element
+    return (pred - target).abs()
+
+
+@LOSSES.register_module()
+class SmoothL1Loss(nn.Module):
+    """Smooth L1 loss module built on ``smooth_l1_loss``.
+
+    Args:
+        beta (float, optional): The threshold in the piecewise function.
+            Defaults to 1.0.
+        reduction (str, optional): The method to reduce the loss.
+            Options are "none", "mean" and "sum". Defaults to "mean".
+        loss_weight (float, optional): The weight of loss.
+    """
+
+    def __init__(self, beta=1.0, reduction='mean', loss_weight=1.0):
+        super().__init__()
+        self.beta = beta
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None,
+                **kwargs):
+        """Compute the reduced Smooth L1 loss scaled by ``loss_weight``.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = self.reduction
+        if reduction_override:
+            reduction = reduction_override
+        return self.loss_weight * smooth_l1_loss(
+            pred,
+            target,
+            weight,
+            beta=self.beta,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+
+
+@LOSSES.register_module()
+class L1Loss(nn.Module):
+    """L1 loss module built on ``l1_loss``.
+
+    Args:
+        reduction (str, optional): The method to reduce the loss.
+            Options are "none", "mean" and "sum".
+        loss_weight (float, optional): The weight of loss.
+    """
+
+    def __init__(self, reduction='mean', loss_weight=1.0):
+        super().__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Compute the reduced L1 loss scaled by ``loss_weight``.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        # a falsy override (None) falls back to the configured reduction
+        reduction = reduction_override or self.reduction
+        loss_bbox = l1_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+        return self.loss_weight * loss_bbox
diff --git a/mmdet/models/losses/utils.py b/mmdet/models/losses/utils.py
new file mode 100755
index 0000000..778237e
--- /dev/null
+++ b/mmdet/models/losses/utils.py
@@ -0,0 +1,105 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import functools
+
+import mmcv
+import torch
+import torch.nn.functional as F
+
+
+def reduce_loss(loss, reduction):
+    """Collapse an element-wise loss according to ``reduction``.
+
+    Args:
+        loss (Tensor): Elementwise loss tensor.
+        reduction (str): Options are "none", "mean" and "sum".
+
+    Return:
+        Tensor: Reduced loss tensor.
+    """
+    # enum codes compared below: 0 = none, 1 = mean, 2 = sum
+    mode = F._Reduction.get_enum(reduction)
+    if mode == 1:
+        return loss.mean()
+    if mode == 2:
+        return loss.sum()
+    if mode == 0:
+        return loss
+
+
+@mmcv.jit(derivate=True, coderize=True)
+def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None):
+    """Weight an element-wise loss, then reduce or average it.
+
+    Args:
+        loss (Tensor): Element-wise loss.
+        weight (Tensor): Element-wise weights.
+        reduction (str): Same as built-in losses of PyTorch.
+        avg_factor (float): Average factor when computing the mean of losses.
+
+    Returns:
+        Tensor: Processed loss values.
+    """
+    # optional element-wise weighting
+    weighted = loss if weight is None else loss * weight
+
+    # without an explicit avg_factor, fall back to the plain reduction
+    if avg_factor is None:
+        return reduce_loss(weighted, reduction)
+
+    if reduction == 'mean':
+        # eps keeps the division finite when avg_factor is 0.0,
+        # i.e., all labels of an image belong to ignore index.
+        eps = torch.finfo(torch.float32).eps
+        return weighted.sum() / (avg_factor + eps)
+
+    # only 'mean' and 'none' are accepted together with avg_factor
+    if reduction != 'none':
+        raise ValueError('avg_factor can not be used with reduction="sum"')
+
+    return weighted
+
+
+def weighted_loss(loss_func):
+    """Decorate an element-wise loss with weighting and reduction.
+
+    To use this decorator, the loss function must have the signature like
+    `loss_func(pred, target, **kwargs)`. The function only needs to compute
+    element-wise loss without any reduction. This decorator will add weight
+    and reduction arguments to the function. The decorated function will have
+    the signature like `loss_func(pred, target, weight=None, reduction='mean',
+    avg_factor=None, **kwargs)`.
+
+    :Example:
+
+    >>> import torch
+    >>> @weighted_loss
+    ... def l1_loss(pred, target):
+    ...     return (pred - target).abs()
+
+    >>> pred = torch.Tensor([0, 2, 3])
+    >>> target = torch.Tensor([1, 1, 1])
+    >>> weight = torch.Tensor([1, 0, 1])
+
+    >>> l1_loss(pred, target)
+    tensor(1.3333)
+    >>> l1_loss(pred, target, weight)
+    tensor(1.)
+    >>> l1_loss(pred, target, reduction='none')
+    tensor([1., 1., 2.])
+    >>> l1_loss(pred, target, weight, avg_factor=2)
+    tensor(1.5000)
+    """
+
+    @functools.wraps(loss_func)
+    def weighted_wrapper(pred,
+                         target,
+                         weight=None,
+                         reduction='mean',
+                         avg_factor=None,
+                         **kwargs):
+        # raw element-wise values first; extra kwargs go to loss_func only
+        elementwise = loss_func(pred, target, **kwargs)
+        # weighting and reduction are shared by every decorated loss
+        return weight_reduce_loss(elementwise, weight, reduction, avg_factor)
+
+    return weighted_wrapper
diff --git a/mmdet/models/losses/varifocal_loss.py b/mmdet/models/losses/varifocal_loss.py
new file mode 100755
index 0000000..42f0eef
--- /dev/null
+++ b/mmdet/models/losses/varifocal_loss.py
@@ -0,0 +1,134 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..builder import LOSSES
+from .utils import weight_reduce_loss
+
+
+@mmcv.jit(derivate=True, coderize=True)
+def varifocal_loss(pred, target, weight=None, alpha=0.75, gamma=2.0,
+                   iou_weighted=True, reduction='mean', avg_factor=None):
+    """`Varifocal Loss <https://arxiv.org/abs/2008.13367>`_
+
+    Computes binary cross entropy with logits on ``pred`` and scales it
+    element-wise by a focal weight: entries with ``target > 0`` use the
+    target itself (or 1 when ``iou_weighted`` is False), all other entries
+    use ``alpha * |sigmoid(pred) - target| ** gamma``.
+
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, C), C is the
+            number of classes
+        target (torch.Tensor): The learning target of the iou-aware
+            classification score with shape (N, C), C is the number of classes.
+        weight (torch.Tensor, optional): The weight of loss for each
+            prediction. Defaults to None.
+        alpha (float, optional): A balance factor for the negative part of
+            Varifocal Loss, which is different from the alpha of Focal Loss.
+            Defaults to 0.75.
+        gamma (float, optional): The gamma for calculating the modulating
+            factor. Defaults to 2.0.
+        iou_weighted (bool, optional): Whether to weight the loss of the
+            positive example with the iou target. Defaults to True.
+        reduction (str, optional): The method used to reduce the loss into
+            a scalar. Defaults to 'mean'. Options are "none", "mean" and
+            "sum".
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+
+    Returns:
+        torch.Tensor: The loss after weighting and reduction.
+    """
+    # pred and target should be of the same size
+    assert pred.size() == target.size()
+    target = target.type_as(pred)
+    positive = (target > 0.0).float()
+    negative = (target <= 0.0).float()
+    # positives: weighted by the iou target itself, or uniformly by 1
+    pos_weight = target * positive if iou_weighted else positive
+    # negatives: weighted by alpha * |sigmoid(pred) - target| ** gamma
+    modulating = (pred.sigmoid() - target).abs().pow(gamma)
+    neg_weight = alpha * modulating * negative
+    focal_weight = pos_weight + neg_weight
+    bce = F.binary_cross_entropy_with_logits(pred, target, reduction='none')
+    loss = bce * focal_weight
+    return weight_reduce_loss(loss, weight, reduction, avg_factor)
+
+
+@LOSSES.register_module()
+class VarifocalLoss(nn.Module):
+    """Module wrapper around :func:`varifocal_loss`."""
+
+    def __init__(self, use_sigmoid=True, alpha=0.75, gamma=2.0,
+                 iou_weighted=True, reduction='mean', loss_weight=1.0):
+        """`Varifocal Loss <https://arxiv.org/abs/2008.13367>`_
+
+        Args:
+            use_sigmoid (bool, optional): Whether the prediction is
+                used for sigmoid or softmax. Defaults to True.
+            alpha (float, optional): A balance factor for the negative part of
+                Varifocal Loss, which is different from the alpha of Focal
+                Loss. Defaults to 0.75.
+            gamma (float, optional): The gamma for calculating the modulating
+                factor. Defaults to 2.0.
+            iou_weighted (bool, optional): Whether to weight the loss of the
+                positive examples with the iou target. Defaults to True.
+            reduction (str, optional): The method used to reduce the loss into
+                a scalar. Defaults to 'mean'. Options are "none", "mean" and
+                "sum".
+            loss_weight (float, optional): Weight of loss. Defaults to 1.0.
+
+        Raises:
+            AssertionError: If ``use_sigmoid`` is not True or ``alpha`` is
+                negative.
+        """
+        super().__init__()
+        assert use_sigmoid is True, \
+            'Only sigmoid varifocal loss supported now.'
+        assert alpha >= 0.0
+        # hyper-parameters forwarded to varifocal_loss() in forward()
+        self.alpha = alpha
+        self.gamma = gamma
+        self.iou_weighted = iou_weighted
+        # default reduction and the scale applied to the final loss
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.use_sigmoid = use_sigmoid
+
+    def forward(self, pred, target, weight=None, avg_factor=None,
+                reduction_override=None):
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Options are "none", "mean" and "sum".
+
+        Returns:
+            torch.Tensor: The calculated loss
+
+        Raises:
+            NotImplementedError: If ``self.use_sigmoid`` is False.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        if not self.use_sigmoid:
+            raise NotImplementedError
+        # a given override takes precedence over the configured reduction
+        reduction = reduction_override or self.reduction
+        loss_cls = varifocal_loss(
+            pred,
+            target,
+            weight,
+            alpha=self.alpha,
+            gamma=self.gamma,
+            iou_weighted=self.iou_weighted,
+            reduction=reduction,
+            avg_factor=avg_factor)
+        return self.loss_weight * loss_cls
diff --git a/mmdet/models/necks/__init__.py b/mmdet/models/necks/__init__.py
new file mode 100755
index 0000000..6f2fa82
--- /dev/null
+++ b/mmdet/models/necks/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .bfp import BFP
+from .channel_mapper import ChannelMapper
+from .ct_resnet_neck import CTResNetNeck
+from .dilated_encoder import DilatedEncoder
+from .dyhead import DyHead
+from .fpg import FPG
+from .fpn import FPN
+from .fpn_carafe import FPN_CARAFE
+from .hrfpn import HRFPN
+from .nas_fpn import NASFPN
+from .nasfcos_fpn import NASFCOS_FPN
+from .pafpn import PAFPN
+from .rfp import RFP
+from .ssd_neck import SSDNeck
+from .yolo_neck import YOLOV3Neck
+from .yolox_pafpn import YOLOXPAFPN
+
+__all__ = [
+    'FPN', 'BFP', 'ChannelMapper', 'HRFPN', 'NASFPN', 'FPN_CARAFE', 'PAFPN',
+    'NASFCOS_FPN', 'RFP', 'YOLOV3Neck', 'FPG', 'DilatedEncoder',
+    'CTResNetNeck', 'SSDNeck', 'YOLOXPAFPN', 'DyHead'
+]
diff --git a/mmdet/models/necks/bfp.py b/mmdet/models/necks/bfp.py
new file mode 100755
index 0000000..9fdfa03
--- /dev/null
+++ b/mmdet/models/necks/bfp.py
@@ -0,0 +1,102 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.cnn.bricks import NonLocal2d
+from mmcv.runner import BaseModule
+
+from ..builder import NECKS
+
+
+@NECKS.register_module()
+class BFP(BaseModule):
+    """BFP (Balanced Feature Pyramids)
+
+    BFP gathers multi-level features into a single map at the size of one
+    reference level, optionally refines that map, and scatters it back to
+    every level through a residual path. This module is used in Libra R-CNN
+    (CVPR 2019), see the paper `Libra R-CNN: Towards Balanced Learning for
+    Object Detection <https://arxiv.org/abs/1904.02701>`_ for details.
+
+    Args:
+        in_channels (int): Number of input channels (feature maps of all levels
+            should have the same channels).
+        num_levels (int): Number of input feature levels.
+        refine_level (int): Index of integration and refine level of BSF in
+            multi-level features from bottom to top. Default: 2.
+        refine_type (str): Type of the refine op, currently support
+            [None, 'conv', 'non_local']. Default: None.
+        conv_cfg (dict): The config dict for convolution layers.
+        norm_cfg (dict): The config dict for normalization layers.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self, in_channels, num_levels, refine_level=2,
+                 refine_type=None, conv_cfg=None, norm_cfg=None,
+                 init_cfg=dict(
+                     type='Xavier', layer='Conv2d', distribution='uniform')):
+        super().__init__(init_cfg)
+        assert refine_type in [None, 'conv', 'non_local']
+
+        self.in_channels = in_channels
+        self.num_levels = num_levels
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.refine_level = refine_level
+        self.refine_type = refine_type
+        # the reference level has to be one of the input levels
+        assert 0 <= self.refine_level < self.num_levels
+
+        # Build the optional refine op. With refine_type=None no
+        # `self.refine` attribute is created and forward() skips step 2.
+        shared_cfg = dict(conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg)
+        if self.refine_type == 'conv':
+            self.refine = ConvModule(
+                self.in_channels, self.in_channels, 3, padding=1, **shared_cfg)
+        elif self.refine_type == 'non_local':
+            self.refine = NonLocal2d(
+                self.in_channels, reduction=1, use_scale=False, **shared_cfg)
+
+    def forward(self, inputs):
+        """Gather, refine and scatter the multi-level features.
+
+        Args:
+            inputs (Sequence[Tensor]): One feature map per level; its length
+                must equal ``num_levels``.
+
+        Returns:
+            tuple[Tensor]: One output per level, each the sum of the input
+                and the resized balanced feature.
+        """
+        assert len(inputs) == self.num_levels
+        ref = self.refine_level
+
+        # step 1: gather multi-level features by resize and average
+        gather_size = inputs[ref].size()[2:]
+        feats = []
+        for lvl in range(self.num_levels):
+            src = inputs[lvl]
+            # levels below the reference are max-pooled, the rest resized
+            if lvl < ref:
+                feat = F.adaptive_max_pool2d(src, output_size=gather_size)
+            else:
+                feat = F.interpolate(src, size=gather_size, mode='nearest')
+            feats.append(feat)
+        bsf = sum(feats) / len(feats)
+
+        # step 2: refine gathered features
+        if self.refine_type is not None:
+            bsf = self.refine(bsf)
+
+        # step 3: scatter refined features to multi-levels by a residual path
+        outs = []
+        for lvl in range(self.num_levels):
+            src = inputs[lvl]
+            out_size = src.size()[2:]
+            # mirror of step 1: resize below the reference, max-pool the rest
+            if lvl < ref:
+                residual = F.interpolate(bsf, size=out_size, mode='nearest')
+            else:
+                residual = F.adaptive_max_pool2d(bsf, output_size=out_size)
+            outs.append(residual + src)
+
+        return tuple(outs)
diff --git a/mmdet/models/necks/channel_mapper.py b/mmdet/models/necks/channel_mapper.py
new file mode 100755
index 0000000..774bdb1
--- /dev/null
+++ b/mmdet/models/necks/channel_mapper.py
@@ -0,0 +1,100 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule
+
+from ..builder import NECKS
+
+
+@NECKS.register_module()
+class ChannelMapper(BaseModule):
+    r"""Channel Mapper to reduce/increase channels of backbone features.
+
+    Every input level goes through its own ConvModule. If ``num_outs`` is
+    larger than the number of inputs, extra stride-2 3x3 ConvModules produce
+    the additional levels: the first from the last input, each following
+    one from the previous extra output.
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale).
+        kernel_size (int, optional): kernel_size for reducing channels (used
+            at each scale). Default: 3.
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Default: None.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Default: None.
+        act_cfg (dict, optional): Config dict for activation layer in
+            ConvModule. Default: dict(type='ReLU').
+        num_outs (int, optional): Number of output feature maps. There
+            would be extra_convs when num_outs larger than the length
+            of in_channels.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+
+    Example:
+        >>> import torch
+        >>> in_channels = [2, 3, 5, 7]
+        >>> scales = [340, 170, 84, 43]
+        >>> inputs = [torch.rand(1, c, s, s)
+        ...           for c, s in zip(in_channels, scales)]
+        >>> self = ChannelMapper(in_channels, 11, 3).eval()
+        >>> outputs = self.forward(inputs)
+        >>> for i in range(len(outputs)):
+        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
+        outputs[0].shape = torch.Size([1, 11, 340, 340])
+        outputs[1].shape = torch.Size([1, 11, 170, 170])
+        outputs[2].shape = torch.Size([1, 11, 84, 84])
+        outputs[3].shape = torch.Size([1, 11, 43, 43])
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size=3,
+                 conv_cfg=None, norm_cfg=None, act_cfg=dict(type='ReLU'),
+                 num_outs=None,
+                 init_cfg=dict(
+                     type='Xavier', layer='Conv2d', distribution='uniform')):
+        super().__init__(init_cfg)
+        assert isinstance(in_channels, list)
+        self.extra_convs = None
+        num_inputs = len(in_channels)
+        if num_outs is None:
+            num_outs = num_inputs
+        # settings shared by the per-level convs and the extra convs
+        common = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+
+        # one channel-mapping conv per input level
+        self.convs = nn.ModuleList()
+        for in_channel in in_channels:
+            self.convs.append(
+                ConvModule(
+                    in_channel, out_channels, kernel_size,
+                    padding=(kernel_size - 1) // 2, **common))
+
+        if num_outs > num_inputs:
+            # extra stride-2 levels: the first takes the last raw input,
+            # later ones take the previous extra output (see forward)
+            self.extra_convs = nn.ModuleList()
+            for i in range(num_inputs, num_outs):
+                first_extra = i == num_inputs
+                self.extra_convs.append(
+                    ConvModule(
+                        in_channels[-1] if first_extra else out_channels,
+                        out_channels, 3, stride=2, padding=1, **common))
+
+    def forward(self, inputs):
+        """Map every input level and append the extra levels, if any.
+
+        Args:
+            inputs (list[Tensor] | tuple[Tensor]): One feature map per entry
+                of ``in_channels``.
+
+        Returns:
+            tuple[Tensor]: The mapped levels followed by any extra levels.
+        """
+        assert len(inputs) == len(self.convs)
+        outs = [self.convs[idx](feat) for idx, feat in enumerate(inputs)]
+        if self.extra_convs:
+            prev = inputs[-1]
+            for extra_conv in self.extra_convs:
+                prev = extra_conv(prev)
+                outs.append(prev)
+        return tuple(outs)
diff --git a/mmdet/models/necks/ct_resnet_neck.py b/mmdet/models/necks/ct_resnet_neck.py
new file mode 100755
index 0000000..40eb268
--- /dev/null
+++ b/mmdet/models/necks/ct_resnet_neck.py
@@ -0,0 +1,94 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule, auto_fp16
+
+from mmdet.models.builder import NECKS
+
+
+@NECKS.register_module()
+class CTResNetNeck(BaseModule):
+    """The neck used in `CenterNet <https://arxiv.org/abs/1904.07850>`_ for
+    object classification and box regression.
+
+    Args:
+        in_channel (int): Number of input channels.
+        num_deconv_filters (tuple[int]): Number of filters per stage.
+        num_deconv_kernels (tuple[int]): Deconv kernel size per stage.
+        use_dcn (bool): If True, use DCNv2. Default: True.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self, in_channel, num_deconv_filters, num_deconv_kernels,
+                 use_dcn=True, init_cfg=None):
+        super().__init__(init_cfg)
+        assert len(num_deconv_filters) == len(num_deconv_kernels)
+        self.fp16_enabled = False
+        self.use_dcn = use_dcn
+        # NOTE: _make_deconv_layer() overwrites this with each stage's width
+        self.in_channel = in_channel
+        self.deconv_layers = self._make_deconv_layer(num_deconv_filters,
+                                                     num_deconv_kernels)
+
+    def _make_deconv_layer(self, num_deconv_filters, num_deconv_kernels):
+        """use deconv layers to upsample backbone's output.
+
+        Each stage is a 3x3 ConvModule (DCNv2 when ``use_dcn``) followed by
+        a stride-2 'deconv' ConvModule, both with BN.
+        """
+        stages = []
+        for i, feat_channel in enumerate(num_deconv_filters):
+            # feature conv of the stage
+            stages.append(
+                ConvModule(
+                    self.in_channel, feat_channel, 3, padding=1,
+                    conv_cfg=dict(type='DCNv2') if self.use_dcn else None,
+                    norm_cfg=dict(type='BN')))
+            # stride-2 'deconv' (upsampling) layer of the stage
+            stages.append(
+                ConvModule(
+                    feat_channel, feat_channel, num_deconv_kernels[i],
+                    stride=2, padding=1,
+                    conv_cfg=dict(type='deconv'),
+                    norm_cfg=dict(type='BN')))
+            # the next stage consumes this stage's output channels
+            self.in_channel = feat_channel
+
+        return nn.Sequential(*stages)
+
+    def init_weights(self):
+        """Initialize deconv, BN and (when DCN is off) conv weights."""
+        for m in self.modules():
+            if isinstance(m, nn.ConvTranspose2d):
+                # In order to be consistent with the source code,
+                # reset the ConvTranspose2d initialization parameters
+                m.reset_parameters()
+                # Simulated bilinear upsampling kernel
+                weight = m.weight.data
+                kernel_h, kernel_w = weight.size(2), weight.size(3)
+                f = math.ceil(kernel_h / 2)
+                center = (2 * f - 1 - f % 2) / (2. * f)
+                for row in range(kernel_h):
+                    row_coef = 1 - math.fabs(row / f - center)
+                    for col in range(kernel_w):
+                        weight[0, 0, row, col] = \
+                            row_coef * (1 - math.fabs(col / f - center))
+                # replicate the [0, 0] kernel to [idx, 0] for every idx >= 1
+                for idx in range(1, weight.size(0)):
+                    weight[idx, 0, :, :] = weight[0, 0, :, :]
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            # self.use_dcn is False
+            elif not self.use_dcn and isinstance(m, nn.Conv2d):
+                # In order to be consistent with the source code,
+                # reset the Conv2d initialization parameters
+                m.reset_parameters()
+
+    @auto_fp16()
+    def forward(self, inputs):
+        """Upsample the last input feature; returns a one-element tuple."""
+        assert isinstance(inputs, (list, tuple))
+        return (self.deconv_layers(inputs[-1]), )
diff --git a/mmdet/models/necks/dilated_encoder.py b/mmdet/models/necks/dilated_encoder.py
new file mode 100755
index 0000000..79a8f4b
--- /dev/null
+++ b/mmdet/models/necks/dilated_encoder.py
@@ -0,0 +1,109 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import (ConvModule, caffe2_xavier_init, constant_init, is_norm,
+                      normal_init)
+from torch.nn import BatchNorm2d
+
+from ..builder import NECKS
+
+
+class Bottleneck(nn.Module):
+    """Bottleneck block for DilatedEncoder used in `YOLOF
+    <https://arxiv.org/abs/2103.09460>`_.
+
+    The block stacks a 1x1, a dilated 3x3 and a 1x1 ConvModule and adds the
+    input back through a residual connection, so its output has
+    ``in_channels`` channels again.
+
+    Args:
+        in_channels (int): The number of input channels.
+        mid_channels (int): The number of middle output channels.
+        dilation (int): Dilation rate.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 dilation,
+                 norm_cfg=dict(type='BN', requires_grad=True)):
+        super().__init__()
+        # 1x1 conv: in_channels -> mid_channels
+        self.conv1 = ConvModule(
+            in_channels, mid_channels, 1, norm_cfg=norm_cfg)
+        # padding == dilation keeps the spatial size for the 3x3 kernel
+        self.conv2 = ConvModule(
+            mid_channels, mid_channels, 3,
+            padding=dilation, dilation=dilation, norm_cfg=norm_cfg)
+        self.conv3 = ConvModule(
+            mid_channels, in_channels, 1, norm_cfg=norm_cfg)
+
+    def forward(self, x):
+        """Apply the three convs and add the input as a residual.
+
+        Returns:
+            Tensor: ``conv3(conv2(conv1(x))) + x``.
+        """
+        residual = self.conv3(self.conv2(self.conv1(x)))
+        return residual + x
+
+
+@NECKS.register_module()
+class DilatedEncoder(nn.Module):
+    """Dilated Encoder for `YOLOF <https://arxiv.org/abs/2103.09460>`_.
+
+    This module contains two types of components:
+        - the original FPN lateral convolution layer and fpn convolution
+          layer, which are 1x1 conv + 3x3 conv
+        - the dilated residual block
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        block_mid_channels (int): The number of middle block output channels.
+        num_residual_blocks (int): The number of residual blocks.
+        block_dilations (list): The list of residual blocks dilation.
+    """
+
+    def __init__(self, in_channels, out_channels, block_mid_channels,
+                 num_residual_blocks, block_dilations):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.block_mid_channels = block_mid_channels
+        self.num_residual_blocks = num_residual_blocks
+        self.block_dilations = block_dilations
+        self._init_layers()
+
+    def _init_layers(self):
+        """Create the two projection conv/BN pairs and the dilated blocks."""
+        channels = self.out_channels
+        self.lateral_conv = nn.Conv2d(
+            self.in_channels, channels, kernel_size=1)
+        self.lateral_norm = BatchNorm2d(channels)
+        self.fpn_conv = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
+        self.fpn_norm = BatchNorm2d(channels)
+        # block i takes its dilation from block_dilations[i]
+        self.dilated_encoder_blocks = nn.Sequential(*[
+            Bottleneck(
+                channels, self.block_mid_channels,
+                dilation=self.block_dilations[i])
+            for i in range(self.num_residual_blocks)])
+
+    def init_weights(self):
+        """Initialize the convs and norm layers of the encoder."""
+        for conv in (self.lateral_conv, self.fpn_conv):
+            caffe2_xavier_init(conv)
+        for norm in (self.lateral_norm, self.fpn_norm):
+            constant_init(norm, 1)
+        for m in self.dilated_encoder_blocks.modules():
+            if isinstance(m, nn.Conv2d):
+                normal_init(m, mean=0, std=0.01)
+            if is_norm(m):
+                constant_init(m, 1)
+
+    def forward(self, feature):
+        """Encode the last entry of ``feature``; returns a 1-tuple."""
+        out = self.lateral_norm(self.lateral_conv(feature[-1]))
+        out = self.fpn_norm(self.fpn_conv(out))
+        return (self.dilated_encoder_blocks(out), )
diff --git a/mmdet/models/necks/dyhead.py b/mmdet/models/necks/dyhead.py
new file mode 100755
index 0000000..649bb4c
--- /dev/null
+++ b/mmdet/models/necks/dyhead.py
@@ -0,0 +1,176 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import (build_activation_layer, build_norm_layer, constant_init,
+                      normal_init)
+from mmcv.ops.modulated_deform_conv import ModulatedDeformConv2d
+from mmcv.runner import BaseModule
+
+from ..builder import NECKS
+from ..utils import DyReLU
+
+# Reference:
+# https://github.com/microsoft/DynamicHead
+# https://github.com/jshilong/SEPC
+
+
+class DyDCNv2(nn.Module):
+    """ModulatedDeformConv2d with normalization layer used in DyHead.
+
+    Unlike a conv built with `conv_cfg=dict(type='DCNv2')`, this module does
+    not predict offset and mask itself: DyHead calculates them from the
+    middle-level feature and passes them to ``forward``.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        stride (int | tuple[int], optional): Stride of the convolution.
+            Default: 1.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Default: dict(type='GN', num_groups=16, requires_grad=True).
+    """
+
+    def __init__(self, in_channels, out_channels, stride=1,
+                 norm_cfg=dict(type='GN', num_groups=16, requires_grad=True)):
+        super().__init__()
+        self.with_norm = norm_cfg is not None
+        # the conv gets a bias only when no norm layer follows it
+        self.conv = ModulatedDeformConv2d(
+            in_channels, out_channels, 3, stride=stride, padding=1,
+            bias=not self.with_norm)
+        if self.with_norm:
+            self.norm = build_norm_layer(norm_cfg, out_channels)[1]
+
+    def forward(self, x, offset, mask):
+        """Forward function.
+
+        ``offset`` and ``mask`` are supplied by the caller.
+        """
+        out = self.conv(x.contiguous(), offset.contiguous(), mask)
+        return self.norm(out) if self.with_norm else out
+
+
+class DyHeadBlock(nn.Module):
+    """DyHead Block with three types of attention.
+
+    HSigmoid arguments in default act_cfg follow official code, not paper.
+    https://github.com/microsoft/DynamicHead/blob/master/dyhead/dyrelu.py
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        zero_init_offset (bool, optional): Whether to use zero init for
+            `spatial_conv_offset`. Default: True.
+        act_cfg (dict, optional): Config dict for the last activation layer of
+            scale-aware attention. Default: dict(type='HSigmoid', bias=3.0,
+            divisor=6.0).
+    """
+
+    def __init__(self, in_channels, out_channels, zero_init_offset=True,
+                 act_cfg=dict(type='HSigmoid', bias=3.0, divisor=6.0)):
+        super().__init__()
+        self.zero_init_offset = zero_init_offset
+        # The offset conv predicts (offset_x, offset_y, mask) for each of the
+        # 3 x 3 kernel positions: the first 2 * 3 * 3 channels are offsets,
+        # the remaining 3 * 3 channels are masks (see forward).
+        self.offset_and_mask_dim = 3 * 3 * 3
+        self.offset_dim = 2 * 3 * 3
+
+        self.spatial_conv_high = DyDCNv2(in_channels, out_channels)
+        self.spatial_conv_mid = DyDCNv2(in_channels, out_channels)
+        self.spatial_conv_low = DyDCNv2(in_channels, out_channels, stride=2)
+        self.spatial_conv_offset = nn.Conv2d(
+            in_channels, self.offset_and_mask_dim, 3, padding=1)
+        # scale-aware attention: global pool + 1x1 conv to a single channel
+        self.scale_attn_module = nn.Sequential(
+            nn.AdaptiveAvgPool2d(1), nn.Conv2d(out_channels, 1, 1),
+            nn.ReLU(inplace=True), build_activation_layer(act_cfg))
+        self.task_attn_module = DyReLU(out_channels)
+        self._init_weights()
+
+    def _init_weights(self):
+        """Normal-init all Conv2d layers; optionally zero the offset conv."""
+        conv_layers = (m for m in self.modules() if isinstance(m, nn.Conv2d))
+        for conv in conv_layers:
+            normal_init(conv, 0, 0.01)
+        if self.zero_init_offset:
+            constant_init(self.spatial_conv_offset, 0)
+
+    def forward(self, x):
+        """Forward function."""
+        attn = self.scale_attn_module
+        top_level = len(x) - 1
+        outs = []
+        for level, feat in enumerate(x):
+            # calculate offset and mask of DCNv2 from middle-level feature
+            offset_and_mask = self.spatial_conv_offset(feat)
+            offset = offset_and_mask[:, :self.offset_dim, :, :]
+            mask = offset_and_mask[:, self.offset_dim:, :, :].sigmoid()
+
+            mid_feat = self.spatial_conv_mid(feat, offset, mask)
+            sum_feat = mid_feat * attn(mid_feat)
+            if level > 0:
+                low_feat = self.spatial_conv_low(x[level - 1], offset, mask)
+                sum_feat = sum_feat + low_feat * attn(low_feat)
+            if level < top_level:
+                # this upsample order is weird, but faster than natural order
+                # https://github.com/microsoft/DynamicHead/issues/25
+                high_feat = F.interpolate(
+                    self.spatial_conv_high(x[level + 1], offset, mask),
+                    size=feat.shape[-2:],
+                    mode='bilinear',
+                    align_corners=True)
+                sum_feat = sum_feat + high_feat * attn(high_feat)
+            # average over the levels that contributed (mid, plus low/high)
+            summed_levels = 1 + (level > 0) + (level < top_level)
+            outs.append(self.task_attn_module(sum_feat / summed_levels))
+
+        return outs
+
+
+@NECKS.register_module()
+class DyHead(BaseModule):
+    """DyHead neck consisting of multiple DyHead Blocks.
+
+    See `Dynamic Head: Unifying Object Detection Heads with Attentions
+    <https://arxiv.org/abs/2106.08322>`_ for details.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        num_blocks (int, optional): Number of DyHead Blocks. Default: 6.
+        zero_init_offset (bool, optional): Whether to use zero init for
+            `spatial_conv_offset`. Default: True.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Must be None (enforced by an assertion). Default: None.
+    """
+
+    def __init__(self, in_channels, out_channels, num_blocks=6,
+                 zero_init_offset=True, init_cfg=None):
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_blocks = num_blocks
+        self.zero_init_offset = zero_init_offset
+
+        # only the first block maps in_channels -> out_channels; every later
+        # block maps out_channels -> out_channels
+        self.dyhead_blocks = nn.Sequential(*[
+            DyHeadBlock(
+                self.in_channels if idx == 0 else self.out_channels,
+                self.out_channels,
+                zero_init_offset=zero_init_offset)
+            for idx in range(num_blocks)
+        ])
+
+    def forward(self, inputs):
+        """Run all DyHead blocks on the multi-level features.
+
+        Returns:
+            tuple: Output of the last block, converted to a tuple.
+        """
+        assert isinstance(inputs, (tuple, list))
+        # each block takes and returns the full list of per-level features
+        return tuple(self.dyhead_blocks(inputs))
diff --git a/mmdet/models/necks/fpg.py b/mmdet/models/necks/fpg.py
new file mode 100755
index 0000000..a6a2a12
--- /dev/null
+++ b/mmdet/models/necks/fpg.py
@@ -0,0 +1,406 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule
+
+from ..builder import NECKS
+
+
+class Transition(BaseModule):
+    """Base class for transitions; stores the input/output channel counts.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+    """
+
+    def __init__(self, in_channels, out_channels, init_cfg=None):
+        super().__init__(init_cfg)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+    def forward(self, x):
+        """Placeholder that returns None; subclasses override it."""
+
+
+class UpInterpolationConv(Transition):
+    """Transition that up-samples by interpolation and then convolves.
+
+    The input is resized with ``F.interpolate`` and the result is refined
+    by a single ``ConvModule``.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        scale_factor (int): Up-sampling factor. Default: 2.
+        mode (str): Interpolation mode. Default: nearest.
+        align_corners (bool): Whether align corners when interpolation.
+            Default: None.
+        kernel_size (int): Kernel size for the conv. Default: 3.
+        init_cfg (dict or list[dict], optional): Initialization config dict
+            handed to the base class. Default: None.
+        **kwargs: Extra keyword arguments forwarded to ``ConvModule``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 scale_factor=2,
+                 mode='nearest',
+                 align_corners=None,
+                 kernel_size=3,
+                 init_cfg=None,
+                 **kwargs):
+        super().__init__(in_channels, out_channels, init_cfg)
+        self.scale_factor = scale_factor
+        self.mode = mode
+        self.align_corners = align_corners
+        pad = (kernel_size - 1) // 2
+        self.conv = ConvModule(
+            in_channels, out_channels, kernel_size, padding=pad, **kwargs)
+
+    def forward(self, x):
+        """Interpolate ``x`` with the stored settings, then apply the conv."""
+        resized = F.interpolate(
+            x,
+            scale_factor=self.scale_factor,
+            mode=self.mode,
+            align_corners=self.align_corners)
+        return self.conv(resized)
+
+
+class LastConv(Transition):
+    """Transition that refines the last-stage output with one conv.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        num_inputs (int): Number of inputs of the FPN features.
+        kernel_size (int): Kernel size for the conv. Default: 3.
+        **kwargs: Extra keyword arguments forwarded to ``ConvModule``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_inputs,
+                 kernel_size=3,
+                 init_cfg=None,
+                 **kwargs):
+        super().__init__(in_channels, out_channels, init_cfg)
+        self.num_inputs = num_inputs
+        pad = (kernel_size - 1) // 2
+        self.conv_out = ConvModule(
+            in_channels, out_channels, kernel_size, padding=pad, **kwargs)
+
+    def forward(self, inputs):
+        """Convolve only the final entry of ``inputs``."""
+        assert len(inputs) == self.num_inputs
+        newest = inputs[-1]
+        return self.conv_out(newest)
+
+
+@NECKS.register_module()
+class FPG(BaseModule):
+    """FPG.
+
+    Implementation of `Feature Pyramid Grids (FPG)
+    <https://arxiv.org/abs/2004.03580>`_.
+    This implementation only gives the basic structure stated in the paper.
+    But users can implement different types of transitions to fully explore
+    the potential power of the structure of FPG.
+
+    Args:
+        in_channels (list[int]): Number of input channels per level (feature
+            maps of all levels should have the same channels).
+        out_channels (int): Number of output channels (used at each scale).
+        num_outs (int): Number of output scales.
+        stack_times (int): Number of times the pyramid is stacked.
+        paths (list[str]): Path order of each stack level; each element must
+            be either 'bu' (bottom-up) or 'td' (top-down).
+        inter_channels (int | list[int]): Number of inter channels.
+        same_up_trans (dict): Same-stage transition, level i-1 to i ('bu').
+        same_down_trans (dict): Same-stage transition, level i+1 to i ('td').
+        across_lateral_trans (dict): Across-pathway same-stage connection.
+        across_down_trans (dict): Across-pathway connection, level i+1 to i.
+        across_up_trans (dict): Across-pathway connection, level i-1 to i.
+        across_skip_trans (dict): Across-pathway skip connection. The default
+            type 'identity' is not in `transition_types`; pass None to disable.
+        output_trans (dict): Transition that transforms the output of the
+            last stage.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Default: 0.
+        end_level (int): Index of the end input backbone level (inclusive) to
+            build the feature pyramid. Default: -1, which means the last level.
+        add_extra_convs (bool): Whether extra output levels are produced by
+            stride-2 3x3 convs (True) or by ``MaxPool2d(1, stride=2)`` (False).
+            Default: False.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        skip_inds (list[tuple[int]]): Per output level, the stack indices at
+            which the level is passed through unchanged. None: skip nothing.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    transition_types = {
+        'conv': ConvModule,
+        'interpolation_conv': UpInterpolationConv,
+        'last_conv': LastConv,
+    }
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 stack_times,
+                 paths,
+                 inter_channels=None,
+                 same_down_trans=None,
+                 same_up_trans=dict(
+                     type='conv', kernel_size=3, stride=2, padding=1),
+                 across_lateral_trans=dict(type='conv', kernel_size=1),
+                 across_down_trans=dict(type='conv', kernel_size=3),
+                 across_up_trans=None,
+                 across_skip_trans=dict(type='identity'),
+                 output_trans=dict(type='last_conv', kernel_size=3),
+                 start_level=0,
+                 end_level=-1,
+                 add_extra_convs=False,
+                 norm_cfg=None,
+                 skip_inds=None,
+                 init_cfg=[
+                     dict(type='Caffe2Xavier', layer='Conv2d'),
+                     dict(
+                         type='Constant',
+                         layer=[
+                             '_BatchNorm', '_InstanceNorm', 'GroupNorm',
+                             'LayerNorm'
+                         ],
+                         val=1.0)
+                 ]):
+        super(FPG, self).__init__(init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        if inter_channels is None:
+            self.inter_channels = [out_channels for _ in range(num_outs)]
+        elif isinstance(inter_channels, int):
+            self.inter_channels = [inter_channels for _ in range(num_outs)]
+        else:
+            assert isinstance(inter_channels, list)
+            assert len(inter_channels) == num_outs
+            self.inter_channels = inter_channels
+        self.stack_times = stack_times
+        self.paths = paths
+        assert isinstance(paths, list) and len(paths) == stack_times
+        for d in paths:
+            assert d in ('bu', 'td')
+
+        self.same_down_trans = same_down_trans
+        self.same_up_trans = same_up_trans
+        self.across_lateral_trans = across_lateral_trans
+        self.across_down_trans = across_down_trans
+        self.across_up_trans = across_up_trans
+        self.output_trans = output_trans
+        self.across_skip_trans = across_skip_trans
+
+        self.with_bias = norm_cfg is None
+        # skip inds must be specified if across skip trans is not None
+        if self.across_skip_trans is not None:
+            assert skip_inds is not None, 'skip_inds must be specified'
+        self.skip_inds = [()] * num_outs if skip_inds is None else skip_inds
+        assert len(self.skip_inds[0]) <= self.stack_times
+
+        if end_level == -1 or end_level == self.num_ins - 1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level is not the last level, no extra level is allowed
+            self.backbone_end_level = end_level + 1
+            assert end_level < self.num_ins
+            assert num_outs == end_level - start_level + 1
+        self.start_level = start_level
+        self.end_level = end_level
+        self.add_extra_convs = add_extra_convs
+
+        # build lateral 1x1 convs to reduce channels
+        self.lateral_convs = nn.ModuleList()
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = nn.Conv2d(self.in_channels[i],
+                               self.inter_channels[i - self.start_level], 1)
+            self.lateral_convs.append(l_conv)
+
+        extra_levels = num_outs - self.backbone_end_level + self.start_level
+        self.extra_downsamples = nn.ModuleList()
+        for i in range(extra_levels):
+            if self.add_extra_convs:
+                fpn_idx = self.backbone_end_level - self.start_level + i
+                extra_conv = nn.Conv2d(
+                    self.inter_channels[fpn_idx - 1],
+                    self.inter_channels[fpn_idx],
+                    3,
+                    stride=2,
+                    padding=1)
+                self.extra_downsamples.append(extra_conv)
+            else:
+                self.extra_downsamples.append(nn.MaxPool2d(1, stride=2))
+
+        self.fpn_transitions = nn.ModuleList()  # stack times
+        for s in range(self.stack_times):
+            stage_trans = nn.ModuleList()  # num of feature levels
+            for i in range(self.num_outs):
+                # same, across_lateral, across_down, across_up
+                trans = nn.ModuleDict()
+                if s in self.skip_inds[i]:
+                    stage_trans.append(trans)
+                    continue
+                # build same-stage down trans (used in bottom-up paths)
+                if i == 0 or self.same_up_trans is None:
+                    same_up_trans = None
+                else:
+                    same_up_trans = self.build_trans(
+                        self.same_up_trans, self.inter_channels[i - 1],
+                        self.inter_channels[i])
+                trans['same_up'] = same_up_trans
+                # build same-stage up trans (used in top-down paths)
+                if i == self.num_outs - 1 or self.same_down_trans is None:
+                    same_down_trans = None
+                else:
+                    same_down_trans = self.build_trans(
+                        self.same_down_trans, self.inter_channels[i + 1],
+                        self.inter_channels[i])
+                trans['same_down'] = same_down_trans
+                # build across lateral trans
+                across_lateral_trans = self.build_trans(
+                    self.across_lateral_trans, self.inter_channels[i],
+                    self.inter_channels[i])
+                trans['across_lateral'] = across_lateral_trans
+                # build across down trans
+                if i == self.num_outs - 1 or self.across_down_trans is None:
+                    across_down_trans = None
+                else:
+                    across_down_trans = self.build_trans(
+                        self.across_down_trans, self.inter_channels[i + 1],
+                        self.inter_channels[i])
+                trans['across_down'] = across_down_trans
+                # build across up trans
+                if i == 0 or self.across_up_trans is None:
+                    across_up_trans = None
+                else:
+                    across_up_trans = self.build_trans(
+                        self.across_up_trans, self.inter_channels[i - 1],
+                        self.inter_channels[i])
+                trans['across_up'] = across_up_trans
+                # build across skip trans; forward feeds it outs[0][i]
+                if self.across_skip_trans is None:
+                    across_skip_trans = None
+                else:
+                    across_skip_trans = self.build_trans(
+                        self.across_skip_trans, self.inter_channels[i],
+                        self.inter_channels[i])
+                trans['across_skip'] = across_skip_trans
+                stage_trans.append(trans)
+            self.fpn_transitions.append(stage_trans)
+
+        self.output_transition = nn.ModuleList()  # output levels
+        for i in range(self.num_outs):
+            trans = self.build_trans(
+                self.output_trans,
+                self.inter_channels[i],
+                self.out_channels,
+                num_inputs=self.stack_times + 1)
+            self.output_transition.append(trans)
+
+        self.relu = nn.ReLU(inplace=True)
+
+    def build_trans(self, cfg, in_channels, out_channels, **extra_args):
+        """Instantiate the transition class named by ``cfg['type']``."""
+        cfg_ = cfg.copy()
+        trans_type = cfg_.pop('type')
+        trans_cls = self.transition_types[trans_type]
+        return trans_cls(in_channels, out_channels, **cfg_, **extra_args)
+
+    def fuse(self, fuse_dict):
+        """Sum the non-None values of ``fuse_dict``; None if all are None."""
+        out = None
+        for item in fuse_dict.values():
+            if item is None:
+                continue
+            out = item if out is None else out + item
+        return out
+
+    def forward(self, inputs):
+        """Run all stacked pathways and return one output per level."""
+        assert len(inputs) == len(self.in_channels)
+
+        # build all levels from original feature maps
+        feats = [
+            lateral_conv(inputs[i + self.start_level])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+        for downsample in self.extra_downsamples:
+            feats.append(downsample(feats[-1]))
+
+        outs = [feats]
+
+        for i in range(self.stack_times):
+            current_outs = outs[-1]
+            next_outs = []
+            direction = self.paths[i]
+            for j in range(self.num_outs):
+                if i in self.skip_inds[j]:
+                    next_outs.append(outs[-1][j])
+                    continue
+                # feature level; NOTE(review): skip test uses j, not lvl
+                if direction == 'td':
+                    lvl = self.num_outs - j - 1
+                else:
+                    lvl = j
+                # get transitions
+                if direction == 'td':
+                    same_trans = self.fpn_transitions[i][lvl]['same_down']
+                else:
+                    same_trans = self.fpn_transitions[i][lvl]['same_up']
+                across_lateral_trans = self.fpn_transitions[i][lvl][
+                    'across_lateral']
+                across_down_trans = self.fpn_transitions[i][lvl]['across_down']
+                across_up_trans = self.fpn_transitions[i][lvl]['across_up']
+                across_skip_trans = self.fpn_transitions[i][lvl]['across_skip']
+                # init output
+                to_fuse = dict(
+                    same=None, lateral=None, across_up=None, across_down=None)
+                # same downsample/upsample
+                if same_trans is not None:
+                    to_fuse['same'] = same_trans(next_outs[-1])
+                # across lateral
+                if across_lateral_trans is not None:
+                    to_fuse['lateral'] = across_lateral_trans(
+                        current_outs[lvl])
+                # across downsample
+                if lvl > 0 and across_up_trans is not None:
+                    to_fuse['across_up'] = across_up_trans(current_outs[lvl -
+                                                                        1])
+                # across upsample
+                if (lvl < self.num_outs - 1 and across_down_trans is not None):
+                    to_fuse['across_down'] = across_down_trans(
+                        current_outs[lvl + 1])
+                if across_skip_trans is not None:
+                    to_fuse['across_skip'] = across_skip_trans(outs[0][lvl])
+                x = self.fuse(to_fuse)
+                next_outs.append(x)
+
+            if direction == 'td':
+                outs.append(next_outs[::-1])
+            else:
+                outs.append(next_outs)
+
+        # output trans
+        final_outs = []
+        for i in range(self.num_outs):
+            lvl_out_list = [stage_outs[i] for stage_outs in outs]
+            lvl_out = self.output_transition[i](lvl_out_list)
+            final_outs.append(lvl_out)
+
+        return final_outs
diff --git a/mmdet/models/necks/fpn.py b/mmdet/models/necks/fpn.py
new file mode 100755
index 0000000..4bdb5b2
--- /dev/null
+++ b/mmdet/models/necks/fpn.py
@@ -0,0 +1,204 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule, auto_fp16
+
+from ..builder import NECKS
+
+
+@NECKS.register_module()
+class FPN(BaseModule):
+    r"""Feature Pyramid Network.
+
+    This is an implementation of paper `Feature Pyramid Networks for Object
+    Detection <https://arxiv.org/abs/1612.03144>`_.
+
+    Args:
+        in_channels (list[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale).
+        num_outs (int): Number of output scales.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Default: 0.
+        end_level (int): Index of the end input backbone level (inclusive) to
+            build the feature pyramid. Default: -1, which means the last level.
+        add_extra_convs (bool | str): If bool, it decides whether to add conv
+            layers on top of the original feature maps. Default to False.
+            If True, it is equivalent to `add_extra_convs='on_input'`.
+            If str, it specifies the source feature map of the extra convs.
+            Only the following options are allowed
+
+            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
+            - 'on_lateral': Last feature map after lateral convs.
+            - 'on_output': The last output feature map after fpn convs.
+        relu_before_extra_convs (bool): Whether to apply relu before the extra
+            conv. Default: False.
+        no_norm_on_lateral (bool): Whether to apply norm on lateral.
+            Default: False.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        act_cfg (dict): Config dict for activation layer in ConvModule.
+            Default: None.
+        upsample_cfg (dict): Config dict for interpolate layer.
+            Default: dict(mode='nearest').
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+
+    Example:
+        >>> import torch
+        >>> in_channels = [2, 3, 5, 7]
+        >>> scales = [340, 170, 84, 43]
+        >>> inputs = [torch.rand(1, c, s, s)
+        ...           for c, s in zip(in_channels, scales)]
+        >>> self = FPN(in_channels, 11, len(in_channels)).eval()
+        >>> outputs = self.forward(inputs)
+        >>> for i in range(len(outputs)):
+        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
+        outputs[0].shape = torch.Size([1, 11, 340, 340])
+        outputs[1].shape = torch.Size([1, 11, 170, 170])
+        outputs[2].shape = torch.Size([1, 11, 84, 84])
+        outputs[3].shape = torch.Size([1, 11, 43, 43])
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 start_level=0,
+                 end_level=-1,
+                 add_extra_convs=False,
+                 relu_before_extra_convs=False,
+                 no_norm_on_lateral=False,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 act_cfg=None,
+                 upsample_cfg=dict(mode='nearest'),
+                 init_cfg=dict(
+                     type='Xavier', layer='Conv2d', distribution='uniform')):
+        super(FPN, self).__init__(init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        self.relu_before_extra_convs = relu_before_extra_convs
+        self.no_norm_on_lateral = no_norm_on_lateral
+        self.fp16_enabled = False
+        self.upsample_cfg = upsample_cfg.copy()
+
+        if end_level == -1 or end_level == self.num_ins - 1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level is not the last level, no extra level is allowed
+            self.backbone_end_level = end_level + 1
+            assert end_level < self.num_ins
+            assert num_outs == end_level - start_level + 1
+        self.start_level = start_level
+        self.end_level = end_level
+        self.add_extra_convs = add_extra_convs
+        assert isinstance(add_extra_convs, (str, bool))
+        if isinstance(add_extra_convs, str):
+            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
+            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
+        elif add_extra_convs:  # True is normalized to the 'on_input' mode
+            self.add_extra_convs = 'on_input'
+
+        self.lateral_convs = nn.ModuleList()
+        self.fpn_convs = nn.ModuleList()
+
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = ConvModule(
+                in_channels[i],
+                out_channels,
+                1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
+                act_cfg=act_cfg,
+                inplace=False)
+            fpn_conv = ConvModule(
+                out_channels,
+                out_channels,
+                3,
+                padding=1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg,
+                inplace=False)
+
+            self.lateral_convs.append(l_conv)
+            self.fpn_convs.append(fpn_conv)
+
+        # add extra conv layers (e.g., RetinaNet)
+        extra_levels = num_outs - self.backbone_end_level + self.start_level
+        if self.add_extra_convs and extra_levels >= 1:
+            for i in range(extra_levels):
+                if i == 0 and self.add_extra_convs == 'on_input':
+                    in_channels = self.in_channels[self.backbone_end_level - 1]
+                else:
+                    in_channels = out_channels
+                extra_fpn_conv = ConvModule(
+                    in_channels,
+                    out_channels,
+                    3,
+                    stride=2,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    inplace=False)
+                self.fpn_convs.append(extra_fpn_conv)
+
+    @auto_fp16()
+    def forward(self, inputs):
+        """Build the pyramid from ``inputs`` and return a tuple of outputs."""
+        assert len(inputs) == len(self.in_channels)
+
+        # build laterals (one lateral conv per used backbone level)
+        laterals = [
+            lateral_conv(inputs[i + self.start_level])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+
+        # build top-down path
+        used_backbone_levels = len(laterals)
+        for i in range(used_backbone_levels - 1, 0, -1):
+            # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
+            #  it cannot co-exist with `size` in `F.interpolate`.
+            if 'scale_factor' in self.upsample_cfg:
+                # fix runtime error of "+=" inplace operation in PyTorch 1.10
+                laterals[i - 1] = laterals[i - 1] + F.interpolate(
+                    laterals[i], **self.upsample_cfg)
+            else:
+                prev_shape = laterals[i - 1].shape[2:]
+                laterals[i - 1] = laterals[i - 1] + F.interpolate(
+                    laterals[i], size=prev_shape, **self.upsample_cfg)
+
+        # build outputs
+        # part 1: from original levels
+        outs = [
+            self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
+        ]
+        # part 2: add extra levels
+        if self.num_outs > len(outs):
+            # use max pool to get more levels on top of outputs
+            # (e.g., Faster R-CNN, Mask R-CNN)
+            if not self.add_extra_convs:
+                for i in range(self.num_outs - used_backbone_levels):
+                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
+            # add conv layers on top of original feature maps (RetinaNet)
+            else:
+                if self.add_extra_convs == 'on_input':
+                    extra_source = inputs[self.backbone_end_level - 1]
+                elif self.add_extra_convs == 'on_lateral':
+                    extra_source = laterals[-1]
+                elif self.add_extra_convs == 'on_output':
+                    extra_source = outs[-1]
+                else:
+                    raise NotImplementedError
+                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
+                for i in range(used_backbone_levels + 1, self.num_outs):
+                    if self.relu_before_extra_convs:
+                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
+                    else:
+                        outs.append(self.fpn_convs[i](outs[-1]))
+        return tuple(outs)
diff --git a/mmdet/models/necks/fpn_carafe.py b/mmdet/models/necks/fpn_carafe.py
new file mode 100755
index 0000000..fdd91f3
--- /dev/null
+++ b/mmdet/models/necks/fpn_carafe.py
@@ -0,0 +1,275 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import ConvModule, build_upsample_layer, xavier_init
+from mmcv.ops.carafe import CARAFEPack
+from mmcv.runner import BaseModule, ModuleList
+
+from ..builder import NECKS
+
+
+@NECKS.register_module()
+class FPN_CARAFE(BaseModule):
+    """FPN_CARAFE is a more flexible implementation of FPN. It allows more
+    choice for upsample methods during the top-down pathway.
+
+    It can reproduce the performance of ICCV 2019 paper
+    CARAFE: Content-Aware ReAssembly of FEatures
+    Please refer to https://arxiv.org/abs/1905.02188 for more details.
+
+    Args:
+        in_channels (list[int]): Number of channels for each input feature map.
+        out_channels (int): Output channels of feature pyramids.
+        num_outs (int): Number of output stages.
+        start_level (int): Start level of feature pyramids.
+            (Default: 0)
+        end_level (int): End level of feature pyramids.
+            (Default: -1 indicates the last level).
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        act_cfg (dict): Config of the activation layer in ConvModule
+            (Default: None indicates w/o activation).
+        order (tuple[str]): Order of components in ConvModule.
+        upsample_cfg (dict): Dictionary to construct and config upsample layer;
+            its ``type`` key selects the upsample method.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 start_level=0,
+                 end_level=-1,
+                 norm_cfg=None,
+                 act_cfg=None,
+                 order=('conv', 'norm', 'act'),
+                 upsample_cfg=dict(
+                     type='carafe',
+                     up_kernel=5,
+                     up_group=1,
+                     encoder_kernel=3,
+                     encoder_dilation=1),
+                 init_cfg=None):
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super(FPN_CARAFE, self).__init__(init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.with_bias = norm_cfg is None
+        self.upsample_cfg = upsample_cfg.copy()
+        self.upsample = self.upsample_cfg.get('type')
+        self.relu = nn.ReLU(inplace=False)
+
+        self.order = order
+        assert order in [('conv', 'norm', 'act'), ('act', 'conv', 'norm')]
+
+        assert self.upsample in [
+            'nearest', 'bilinear', 'deconv', 'pixel_shuffle', 'carafe', None
+        ]
+        if self.upsample in ['deconv', 'pixel_shuffle']:
+            # membership test / item access work for plain dicts as well
+            assert ('upsample_kernel' in self.upsample_cfg
+                    and self.upsample_cfg['upsample_kernel'] > 0)
+            self.upsample_kernel = self.upsample_cfg.pop('upsample_kernel')
+
+        if end_level == -1 or end_level == self.num_ins - 1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level is not the last level, no extra level is allowed
+            self.backbone_end_level = end_level + 1
+            assert end_level < self.num_ins
+            assert num_outs == end_level - start_level + 1
+        self.start_level = start_level
+        self.end_level = end_level
+
+        self.lateral_convs = ModuleList()
+        self.fpn_convs = ModuleList()
+        self.upsample_modules = ModuleList()
+
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = ConvModule(
+                in_channels[i],
+                out_channels,
+                1,
+                norm_cfg=norm_cfg,
+                bias=self.with_bias,
+                act_cfg=act_cfg,
+                inplace=False,
+                order=self.order)
+            fpn_conv = ConvModule(
+                out_channels,
+                out_channels,
+                3,
+                padding=1,
+                norm_cfg=self.norm_cfg,
+                bias=self.with_bias,
+                act_cfg=act_cfg,
+                inplace=False,
+                order=self.order)
+            if i != self.backbone_end_level - 1:
+                upsample_cfg_ = self.upsample_cfg.copy()
+                if self.upsample == 'deconv':
+                    upsample_cfg_.update(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        kernel_size=self.upsample_kernel,
+                        stride=2,
+                        padding=(self.upsample_kernel - 1) // 2,
+                        output_padding=(self.upsample_kernel - 1) // 2)
+                elif self.upsample == 'pixel_shuffle':
+                    upsample_cfg_.update(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        scale_factor=2,
+                        upsample_kernel=self.upsample_kernel)
+                elif self.upsample == 'carafe':
+                    upsample_cfg_.update(channels=out_channels, scale_factor=2)
+                else:
+                    # suppress warnings
+                    align_corners = (None
+                                     if self.upsample == 'nearest' else False)
+                    upsample_cfg_.update(
+                        scale_factor=2,
+                        mode=self.upsample,
+                        align_corners=align_corners)
+                upsample_module = build_upsample_layer(upsample_cfg_)
+                self.upsample_modules.append(upsample_module)
+            self.lateral_convs.append(l_conv)
+            self.fpn_convs.append(fpn_conv)
+
+        # add extra conv layers (e.g., RetinaNet)
+        extra_out_levels = (
+            num_outs - self.backbone_end_level + self.start_level)
+        if extra_out_levels >= 1:
+            for i in range(extra_out_levels):
+                extra_in_channels = (
+                    self.in_channels[self.backbone_end_level - 1]
+                    if i == 0 else out_channels)
+                extra_l_conv = ConvModule(
+                    extra_in_channels,
+                    out_channels,
+                    3,
+                    stride=2,
+                    padding=1,
+                    norm_cfg=norm_cfg,
+                    bias=self.with_bias,
+                    act_cfg=act_cfg,
+                    inplace=False,
+                    order=self.order)
+                if self.upsample == 'deconv':
+                    upsampler_cfg_ = dict(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        kernel_size=self.upsample_kernel,
+                        stride=2,
+                        padding=(self.upsample_kernel - 1) // 2,
+                        output_padding=(self.upsample_kernel - 1) // 2)
+                elif self.upsample == 'pixel_shuffle':
+                    upsampler_cfg_ = dict(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        scale_factor=2,
+                        upsample_kernel=self.upsample_kernel)
+                elif self.upsample == 'carafe':
+                    upsampler_cfg_ = dict(
+                        channels=out_channels,
+                        scale_factor=2,
+                        **self.upsample_cfg)
+                else:
+                    # suppress warnings
+                    align_corners = (None
+                                     if self.upsample == 'nearest' else False)
+                    upsampler_cfg_ = dict(
+                        scale_factor=2,
+                        mode=self.upsample,
+                        align_corners=align_corners)
+                upsampler_cfg_['type'] = self.upsample
+                upsample_module = build_upsample_layer(upsampler_cfg_)
+                extra_fpn_conv = ConvModule(
+                    out_channels,
+                    out_channels,
+                    3,
+                    padding=1,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.with_bias,
+                    act_cfg=act_cfg,
+                    inplace=False,
+                    order=self.order)
+                self.upsample_modules.append(upsample_module)
+                self.fpn_convs.append(extra_fpn_conv)
+                self.lateral_convs.append(extra_l_conv)
+
+    # default init_weights for conv(msra) and norm in ConvModule
+    def init_weights(self):
+        """Initialize the weights of module."""
+        super(FPN_CARAFE, self).init_weights()
+        for m in self.modules():
+            if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
+                xavier_init(m, distribution='uniform')
+        for m in self.modules():
+            if isinstance(m, CARAFEPack):
+                m.init_weights()
+
+    def slice_as(self, src, dst):
+        """Slice ``src`` as ``dst``
+
+        Note:
+            ``src`` should have the same or larger size than ``dst``.
+
+        Args:
+            src (torch.Tensor): Tensors to be sliced.
+            dst (torch.Tensor): ``src`` will be sliced to have the same
+                size as ``dst``.
+
+        Returns:
+            torch.Tensor: Sliced tensor.
+        """
+        assert (src.size(2) >= dst.size(2)) and (src.size(3) >= dst.size(3))
+        if src.size(2) == dst.size(2) and src.size(3) == dst.size(3):
+            return src
+        else:
+            return src[:, :, :dst.size(2), :dst.size(3)]
+
+    def tensor_add(self, a, b):
+        """Add tensors ``a`` and ``b`` that might have different sizes."""
+        if a.size() == b.size():
+            c = a + b
+        else:
+            c = a + self.slice_as(b, a)
+        return c
+
+    def forward(self, inputs):
+        """Forward function."""
+        assert len(inputs) == len(self.in_channels)
+
+        # build laterals
+        laterals = []
+        for i, lateral_conv in enumerate(self.lateral_convs):
+            # backbone levels read their own input; the first extra level
+            # re-reads the last input and every later extra level is fed by
+            # the previous lateral
+            if i <= self.backbone_end_level - self.start_level:
+                feat = inputs[min(i + self.start_level, len(inputs) - 1)]
+            else:
+                feat = laterals[-1]
+            laterals.append(lateral_conv(feat))
+
+        # build top-down path
+        for i in range(len(laterals) - 1, 0, -1):
+            if self.upsample is not None:
+                upsample_feat = self.upsample_modules[i - 1](laterals[i])
+            else:
+                upsample_feat = laterals[i]
+            laterals[i - 1] = self.tensor_add(laterals[i - 1], upsample_feat)
+
+        # build outputs: every merged lateral goes through its own fpn conv
+        outs = [
+            fpn_conv(laterals[i]) for i, fpn_conv in enumerate(self.fpn_convs)
+        ]
+        return tuple(outs)
diff --git a/mmdet/models/necks/hrfpn.py b/mmdet/models/necks/hrfpn.py
new file mode 100755
index 0000000..ca15be6
--- /dev/null
+++ b/mmdet/models/necks/hrfpn.py
@@ -0,0 +1,100 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule
+from torch.utils.checkpoint import checkpoint
+
+from ..builder import NECKS
+
+
+@NECKS.register_module()
+class HRFPN(BaseModule):
+    """HRFPN (High Resolution Feature Pyramids)
+
+    paper: `High-Resolution Representations for Labeling Pixels and Regions
+    <https://arxiv.org/abs/1904.04514>`_.
+
+    Args:
+        in_channels (list): Channels of each input branch.
+        out_channels (int): Channels of every output level.
+        num_outs (int): How many pyramid levels are produced.
+        pooling_type (str): ``'MAX'`` picks max pooling for building the
+            pyramid; any other value picks average pooling.
+        conv_cfg (dict): Config dict used to build the conv layers.
+        norm_cfg (dict): Config dict for a norm layer (stored, not used here).
+        with_cp (bool): Whether to use checkpoint. Checkpointing saves some
+            memory at the cost of slower training.
+        stride (int): Stride of the 3x3 convolutional layers.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs=5,
+                 pooling_type='AVG',
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 with_cp=False,
+                 stride=1,
+                 init_cfg=dict(type='Caffe2Xavier', layer='Conv2d')):
+        super(HRFPN, self).__init__(init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        self.with_cp = with_cp
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+
+        # 1x1 conv that fuses the concatenated branches
+        self.reduction_conv = ConvModule(
+            sum(in_channels),
+            out_channels,
+            kernel_size=1,
+            conv_cfg=conv_cfg,
+            act_cfg=None)
+
+        # one 3x3 conv per output level, all built with the same settings
+        level_conv_kwargs = dict(
+            kernel_size=3,
+            padding=1,
+            stride=stride,
+            conv_cfg=conv_cfg,
+            act_cfg=None)
+        self.fpn_convs = nn.ModuleList([
+            ConvModule(out_channels, out_channels, **level_conv_kwargs)
+            for _ in range(num_outs)
+        ])
+
+        # pooling function used to derive the coarser levels
+        self.pooling = (
+            F.max_pool2d if pooling_type == 'MAX' else F.avg_pool2d)
+
+    def _maybe_checkpoint(self, module, feat):
+        """Apply ``module`` to ``feat``, via checkpoint if enabled."""
+        if self.with_cp and feat.requires_grad:
+            return checkpoint(module, feat)
+        return module(feat)
+
+    def forward(self, inputs):
+        """Forward function."""
+        assert len(inputs) == self.num_ins
+        # branch i is upsampled by 2**i before channel-wise concatenation
+        branches = [inputs[0]] + [
+            F.interpolate(feat, scale_factor=2**lvl, mode='bilinear')
+            for lvl, feat in enumerate(inputs[1:self.num_ins], start=1)
+        ]
+        fused = self._maybe_checkpoint(self.reduction_conv,
+                                       torch.cat(branches, dim=1))
+        # level 0 keeps the fused map; level i pools it with kernel 2**i
+        pyramid = [fused] + [
+            self.pooling(fused, kernel_size=2**lvl, stride=2**lvl)
+            for lvl in range(1, self.num_outs)
+        ]
+        return tuple(
+            self._maybe_checkpoint(conv, feat)
+            for conv, feat in zip(self.fpn_convs, pyramid))
diff --git a/mmdet/models/necks/nas_fpn.py b/mmdet/models/necks/nas_fpn.py
new file mode 100755
index 0000000..710592e
--- /dev/null
+++ b/mmdet/models/necks/nas_fpn.py
@@ -0,0 +1,158 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.ops.merge_cells import GlobalPoolingCell, SumCell
+from mmcv.runner import BaseModule, ModuleList
+
+from ..builder import NECKS
+
+
+@NECKS.register_module()
+class NASFPN(BaseModule):
+    """NAS-FPN.
+
+    Implementation of `NAS-FPN: Learning Scalable Feature Pyramid Architecture
+    for Object Detection <https://arxiv.org/abs/1904.07392>`_
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale)
+        num_outs (int): Number of output scales. ``forward`` unpacks exactly
+            five levels (p3-p7), so this has to be 5.
+        stack_times (int): Number of times the NAS-FPN stage is stacked.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Default: 0.
+        end_level (int): Index of the end input backbone level (inclusive) to
+            build the feature pyramid. Default: -1, which means the last level.
+        add_extra_convs (bool): Stored as an attribute but not read anywhere
+            in this class. Default: False.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 stack_times,
+                 start_level=0,
+                 end_level=-1,
+                 add_extra_convs=False,
+                 norm_cfg=None,
+                 init_cfg=dict(type='Caffe2Xavier', layer='Conv2d')):
+        super(NASFPN, self).__init__(init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)  # num of input feature levels
+        self.num_outs = num_outs  # num of output feature levels
+        self.stack_times = stack_times
+        self.norm_cfg = norm_cfg
+
+        if end_level == -1 or end_level == self.num_ins - 1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level is not the last level, no extra level is allowed
+            self.backbone_end_level = end_level + 1
+            assert end_level < self.num_ins
+            assert num_outs == end_level - start_level + 1
+        self.start_level = start_level
+        self.end_level = end_level
+        self.add_extra_convs = add_extra_convs
+
+        # add lateral connections
+        self.lateral_convs = nn.ModuleList()
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = ConvModule(
+                in_channels[i],
+                out_channels,
+                1,
+                norm_cfg=norm_cfg,
+                act_cfg=None)
+            self.lateral_convs.append(l_conv)
+
+        # extra downsample layers: 1x1 conv followed by stride-2 max pooling
+        extra_levels = num_outs - self.backbone_end_level + self.start_level
+        self.extra_downsamples = nn.ModuleList()
+        for i in range(extra_levels):
+            extra_conv = ConvModule(
+                out_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=None)
+            self.extra_downsamples.append(
+                nn.Sequential(extra_conv, nn.MaxPool2d(2, 2)))
+
+        # add NAS FPN connections
+        self.fpn_stages = ModuleList()
+        for _ in range(self.stack_times):
+            stage = nn.ModuleDict()
+            # gp(p6, p4) -> p4_1
+            stage['gp_64_4'] = GlobalPoolingCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                out_norm_cfg=norm_cfg)
+            # sum(p4_1, p4) -> p4_2
+            stage['sum_44_4'] = SumCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                out_norm_cfg=norm_cfg)
+            # sum(p4_2, p3) -> p3_out
+            stage['sum_43_3'] = SumCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                out_norm_cfg=norm_cfg)
+            # sum(p3_out, p4_2) -> p4_out
+            stage['sum_34_4'] = SumCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                out_norm_cfg=norm_cfg)
+            # sum(p5, gp(p4_out, p3_out)) -> p5_out
+            stage['gp_43_5'] = GlobalPoolingCell(with_out_conv=False)
+            stage['sum_55_5'] = SumCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                out_norm_cfg=norm_cfg)
+            # sum(p7, gp(p5_out, p4_2)) -> p7_out
+            stage['gp_54_7'] = GlobalPoolingCell(with_out_conv=False)
+            stage['sum_77_7'] = SumCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                out_norm_cfg=norm_cfg)
+            # gp(p7_out, p5_out) -> p6_out
+            stage['gp_75_6'] = GlobalPoolingCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                out_norm_cfg=norm_cfg)
+            self.fpn_stages.append(stage)
+
+    def forward(self, inputs):
+        """Forward function."""
+        # build P3-P5
+        feats = [
+            lateral_conv(inputs[i + self.start_level])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+        # build P6-P7 on top of P5
+        for downsample in self.extra_downsamples:
+            feats.append(downsample(feats[-1]))
+
+        p3, p4, p5, p6, p7 = feats
+
+        for stage in self.fpn_stages:
+            # gp(p6, p4) -> p4_1
+            p4_1 = stage['gp_64_4'](p6, p4, out_size=p4.shape[-2:])
+            # sum(p4_1, p4) -> p4_2
+            p4_2 = stage['sum_44_4'](p4_1, p4, out_size=p4.shape[-2:])
+            # sum(p4_2, p3) -> p3_out
+            p3 = stage['sum_43_3'](p4_2, p3, out_size=p3.shape[-2:])
+            # sum(p3_out, p4_2) -> p4_out
+            p4 = stage['sum_34_4'](p3, p4_2, out_size=p4.shape[-2:])
+            # sum(p5, gp(p4_out, p3_out)) -> p5_out
+            p5_tmp = stage['gp_43_5'](p4, p3, out_size=p5.shape[-2:])
+            p5 = stage['sum_55_5'](p5, p5_tmp, out_size=p5.shape[-2:])
+            # sum(p7, gp(p5_out, p4_2)) -> p7_out
+            p7_tmp = stage['gp_54_7'](p5, p4_2, out_size=p7.shape[-2:])
+            p7 = stage['sum_77_7'](p7, p7_tmp, out_size=p7.shape[-2:])
+            # gp(p7_out, p5_out) -> p6_out
+            p6 = stage['gp_75_6'](p7, p5, out_size=p6.shape[-2:])
+
+        return p3, p4, p5, p6, p7
diff --git a/mmdet/models/necks/nasfcos_fpn.py b/mmdet/models/necks/nasfcos_fpn.py
new file mode 100755
index 0000000..c4abfe7
--- /dev/null
+++ b/mmdet/models/necks/nasfcos_fpn.py
@@ -0,0 +1,170 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, caffe2_xavier_init
+from mmcv.ops.merge_cells import ConcatCell
+from mmcv.runner import BaseModule
+
+from ..builder import NECKS
+
+
+@NECKS.register_module()
+class NASFCOS_FPN(BaseModule):
+    """FPN structure in NAS-FCOS.
+
+    Implementation of paper `NAS-FCOS: Fast Neural Architecture Search for
+    Object Detection <https://arxiv.org/abs/1906.04423>`_
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale)
+        num_outs (int): Number of output scales. ``forward`` emits three
+            levels from the cells plus one per extra downsample conv.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Default: 1.
+        end_level (int): Index of the end input backbone level (inclusive) to
+            build the feature pyramid. Default: -1, which means the last level.
+        add_extra_convs (bool): Stored as an attribute but not read anywhere
+            in this class. Default: False.
+        conv_cfg (dict): Passed to the cells as ``input_conv_cfg``.
+        norm_cfg (dict): Passed to the cells as ``input_norm_cfg``.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 start_level=1,
+                 end_level=-1,
+                 add_extra_convs=False,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 init_cfg=None):
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super(NASFCOS_FPN, self).__init__(init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        self.norm_cfg = norm_cfg
+        self.conv_cfg = conv_cfg
+
+        if end_level == -1 or end_level == self.num_ins - 1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level is not the last level, no extra level is allowed
+            self.backbone_end_level = end_level + 1
+            assert end_level < self.num_ins
+            assert num_outs == end_level - start_level + 1
+        self.start_level = start_level
+        self.end_level = end_level
+        self.add_extra_convs = add_extra_convs
+
+        self.adapt_convs = nn.ModuleList()
+        for i in range(self.start_level, self.backbone_end_level):
+            adapt_conv = ConvModule(
+                in_channels[i],
+                out_channels,
+                1,
+                stride=1,
+                padding=0,
+                bias=False,
+                norm_cfg=dict(type='BN'),
+                act_cfg=dict(type='ReLU', inplace=False))
+            self.adapt_convs.append(adapt_conv)
+
+        # C2 is omitted according to the paper
+        extra_levels = num_outs - self.backbone_end_level + self.start_level
+
+        def build_concat_cell(with_input1_conv, with_input2_conv):
+            cell_conv_cfg = dict(
+                kernel_size=1, padding=0, bias=False, groups=out_channels)
+            return ConcatCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                with_out_conv=True,
+                out_conv_cfg=cell_conv_cfg,
+                out_norm_cfg=dict(type='BN'),
+                out_conv_order=('norm', 'act', 'conv'),
+                with_input1_conv=with_input1_conv,
+                with_input2_conv=with_input2_conv,
+                input_conv_cfg=conv_cfg,
+                input_norm_cfg=norm_cfg,
+                upsample_mode='nearest')
+
+        # Denote c3=f0, c4=f1, c5=f2 for convenience
+        self.fpn = nn.ModuleDict()
+        self.fpn['c22_1'] = build_concat_cell(True, True)
+        self.fpn['c22_2'] = build_concat_cell(True, True)
+        self.fpn['c32'] = build_concat_cell(True, False)
+        self.fpn['c02'] = build_concat_cell(True, False)
+        self.fpn['c42'] = build_concat_cell(True, True)
+        self.fpn['c36'] = build_concat_cell(True, True)
+        self.fpn['c61'] = build_concat_cell(True, True)  # f9
+        self.extra_downsamples = nn.ModuleList()
+        for i in range(extra_levels):
+            extra_act_cfg = None if i == 0 \
+                else dict(type='ReLU', inplace=False)
+            self.extra_downsamples.append(
+                ConvModule(
+                    out_channels,
+                    out_channels,
+                    3,
+                    stride=2,
+                    padding=1,
+                    act_cfg=extra_act_cfg,
+                    order=('act', 'norm', 'conv')))
+
+    def forward(self, inputs):
+        """Forward function."""
+        feats = [
+            adapt_conv(inputs[i + self.start_level])
+            for i, adapt_conv in enumerate(self.adapt_convs)
+        ]
+
+        for module_name in self.fpn:
+            # key 'cXY...' means: merge feats[X] with feats[Y]
+            idx_1, idx_2 = int(module_name[1]), int(module_name[2])
+            feats.append(self.fpn[module_name](feats[idx_1], feats[idx_2]))
+
+        ret = []
+        for (idx, input_idx) in zip([9, 8, 7], [1, 2, 3]):  # add P3, P4, P5
+            feats1, feats2 = feats[idx], feats[5]
+            feats2_resize = F.interpolate(
+                feats2,
+                size=feats1.size()[2:],
+                mode='bilinear',
+                align_corners=False)
+
+            feats_sum = feats1 + feats2_resize
+            ret.append(
+                F.interpolate(
+                    feats_sum,
+                    size=inputs[input_idx].size()[2:],
+                    mode='bilinear',
+                    align_corners=False))
+
+        for submodule in self.extra_downsamples:
+            ret.append(submodule(ret[-1]))
+
+        return tuple(ret)
+
+    def init_weights(self):
+        """Initialize the weights of module."""
+        super(NASFCOS_FPN, self).init_weights()
+        for module in self.fpn.values():
+            # the conv initialised below lives in ``out_conv``, so that is
+            # the attribute to test for (any other name skips this init)
+            if hasattr(module, 'out_conv'):
+                caffe2_xavier_init(module.out_conv.conv)
+
+        # plain convs of the adapt and extra-downsample branches
+        for container in (self.adapt_convs, self.extra_downsamples):
+            for module in container.modules():
+                if isinstance(module, nn.Conv2d):
+                    caffe2_xavier_init(module)
diff --git a/mmdet/models/necks/pafpn.py b/mmdet/models/necks/pafpn.py
new file mode 100755
index 0000000..2edd348
--- /dev/null
+++ b/mmdet/models/necks/pafpn.py
@@ -0,0 +1,159 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.runner import auto_fp16
+
+from ..builder import NECKS
+from .fpn import FPN
+
+
+@NECKS.register_module()
+class PAFPN(FPN):
+    """Path Aggregation Network for Instance Segmentation.
+
+    This is an implementation of the `PAFPN in Path Aggregation Network
+    <https://arxiv.org/abs/1803.01534>`_.
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale)
+        num_outs (int): Number of output scales.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Default: 0.
+        end_level (int): Index of the end input backbone level (exclusive) to
+            build the feature pyramid. Default: -1, which means the last level.
+        add_extra_convs (bool | str): If bool, it decides whether to add conv
+            layers on top of the original feature maps. Default to False.
+            If True, it is equivalent to `add_extra_convs='on_input'`.
+            If str, it specifies the source feature map of the extra convs.
+            Only the following options are allowed
+
+            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
+            - 'on_lateral':  Last feature map after lateral convs.
+            - 'on_output': The last output feature map after fpn convs.
+        relu_before_extra_convs (bool): Whether to apply relu before the extra
+            conv. Default: False.
+        no_norm_on_lateral (bool): Whether to apply norm on lateral.
+            Default: False.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        act_cfg (dict): Config dict for activation layer in ConvModule.
+            Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 start_level=0,
+                 end_level=-1,
+                 add_extra_convs=False,
+                 relu_before_extra_convs=False,
+                 no_norm_on_lateral=False,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 act_cfg=None,
+                 init_cfg=dict(
+                     type='Xavier', layer='Conv2d', distribution='uniform')):
+        super(PAFPN, self).__init__(
+            in_channels,
+            out_channels,
+            num_outs,
+            start_level,
+            end_level,
+            add_extra_convs,
+            relu_before_extra_convs,
+            no_norm_on_lateral,
+            conv_cfg,
+            norm_cfg,
+            act_cfg,
+            init_cfg=init_cfg)
+        # add extra bottom up pathway
+        self.downsample_convs = nn.ModuleList()
+        self.pafpn_convs = nn.ModuleList()
+        for i in range(self.start_level + 1, self.backbone_end_level):
+            d_conv = ConvModule(
+                out_channels,
+                out_channels,
+                3,
+                stride=2,
+                padding=1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg,
+                inplace=False)
+            pafpn_conv = ConvModule(
+                out_channels,
+                out_channels,
+                3,
+                padding=1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg,
+                inplace=False)
+            self.downsample_convs.append(d_conv)
+            self.pafpn_convs.append(pafpn_conv)
+
+    @auto_fp16()
+    def forward(self, inputs):
+        """Forward function."""
+        assert len(inputs) == len(self.in_channels)
+
+        # build laterals
+        laterals = [
+            lateral_conv(inputs[i + self.start_level])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+
+        # build top-down path
+        used_backbone_levels = len(laterals)
+        for i in range(used_backbone_levels - 1, 0, -1):
+            prev_shape = laterals[i - 1].shape[2:]
+            # fix runtime error of "+=" inplace operation in PyTorch 1.10
+            laterals[i - 1] = laterals[i - 1] + F.interpolate(
+                laterals[i], size=prev_shape, mode='nearest')
+
+        # build outputs
+        # part 1: from original levels
+        inter_outs = [
+            self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
+        ]
+
+        # part 2: add bottom-up path; out-of-place add keeps autograd happy
+        for i in range(0, used_backbone_levels - 1):
+            inter_outs[i + 1] = inter_outs[i + 1] + self.downsample_convs[i](
+                inter_outs[i])
+
+        outs = [inter_outs[0]]
+        outs.extend([
+            self.pafpn_convs[i - 1](inter_outs[i])
+            for i in range(1, used_backbone_levels)
+        ])
+
+        # part 3: add extra levels
+        if self.num_outs > len(outs):
+            # use max pool to get more levels on top of outputs
+            # (e.g., Faster R-CNN, Mask R-CNN)
+            if not self.add_extra_convs:
+                for i in range(self.num_outs - used_backbone_levels):
+                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
+            # add conv layers on top of original feature maps (RetinaNet)
+            else:
+                if self.add_extra_convs == 'on_input':
+                    orig = inputs[self.backbone_end_level - 1]
+                    outs.append(self.fpn_convs[used_backbone_levels](orig))
+                elif self.add_extra_convs == 'on_lateral':
+                    outs.append(self.fpn_convs[used_backbone_levels](
+                        laterals[-1]))
+                elif self.add_extra_convs == 'on_output':
+                    outs.append(self.fpn_convs[used_backbone_levels](outs[-1]))
+                else:
+                    raise NotImplementedError
+                for i in range(used_backbone_levels + 1, self.num_outs):
+                    if self.relu_before_extra_convs:
+                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
+                    else:
+                        outs.append(self.fpn_convs[i](outs[-1]))
+        return tuple(outs)
diff --git a/mmdet/models/necks/rfp.py b/mmdet/models/necks/rfp.py
new file mode 100755
index 0000000..6976f4d
--- /dev/null
+++ b/mmdet/models/necks/rfp.py
@@ -0,0 +1,135 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import constant_init, xavier_init
+from mmcv.runner import BaseModule, ModuleList
+
+from ..builder import NECKS, build_backbone
+from .fpn import FPN
+
+
+class ASPP(BaseModule):
+    """ASPP (Atrous Spatial Pyramid Pooling)
+
+    This is an implementation of the ASPP module used in DetectoRS
+    (https://arxiv.org/pdf/2006.02334.pdf)
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of channels produced by each branch.
+        dilations (tuple[int]): Dilations of the branches. The last one
+            must be 1. Default: (1, 3, 6, 1)
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 dilations=(1, 3, 6, 1),
+                 init_cfg=dict(type='Kaiming', layer='Conv2d')):
+        super().__init__(init_cfg)
+        assert dilations[-1] == 1
+        self.aspp = nn.ModuleList()
+        for rate in dilations:
+            # padded 3x3 conv for rates above 1, unpadded 1x1 conv otherwise
+            if rate > 1:
+                kernel_size, padding = 3, rate
+            else:
+                kernel_size, padding = 1, 0
+            self.aspp.append(
+                nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size,
+                          stride=1, dilation=rate, padding=padding, bias=True))
+        self.gap = nn.AdaptiveAvgPool2d(1)
+
+    def forward(self, x):
+        """Concatenate the outputs of all branches along dim 1."""
+        pooled = self.gap(x)
+        last_idx = len(self.aspp) - 1
+        out = []
+        for branch_idx, branch in enumerate(self.aspp):
+            # only the last branch consumes the globally pooled input
+            branch_in = pooled if branch_idx == last_idx else x
+            out.append(F.relu_(branch(branch_in)))
+        # broadcast the pooled branch to the shape of the other branches
+        out[-1] = out[-1].expand_as(out[-2])
+        return torch.cat(out, dim=1)
+
+
+@NECKS.register_module()
+class RFP(FPN):
+    """RFP (Recursive Feature Pyramid)
+
+    This is an implementation of RFP in `DetectoRS
+    <https://arxiv.org/pdf/2006.02334.pdf>`_. Different from standard FPN, the
+    first element of the RFP input is the original input image of the
+    backbone; the multi level features follow it.
+
+    Args:
+        rfp_steps (int): Number of unrolled steps of RFP.
+        rfp_backbone (dict): Configuration of the backbone for RFP.
+        aspp_out_channels (int): Number of output channels of ASPP module.
+        aspp_dilations (tuple[int]): Dilation rates of four branches.
+            Default: (1, 3, 6, 1)
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Must be left as None. Default: None
+    """
+
+    def __init__(self,
+                 rfp_steps,
+                 rfp_backbone,
+                 aspp_out_channels,
+                 aspp_dilations=(1, 3, 6, 1),
+                 init_cfg=None,
+                 **kwargs):
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super().__init__(init_cfg=init_cfg, **kwargs)
+        self.rfp_steps = rfp_steps
+        # Be careful! Pretrained weights cannot be loaded when use
+        # nn.ModuleList
+        self.rfp_modules = ModuleList()
+        # one extra backbone for every step after the first
+        for _ in range(rfp_steps - 1):
+            self.rfp_modules.append(build_backbone(rfp_backbone))
+        self.rfp_aspp = ASPP(
+            in_channels=self.out_channels,
+            out_channels=aspp_out_channels,
+            dilations=aspp_dilations)
+        # 1x1 conv producing the single-channel fusion gate used in forward
+        self.rfp_weight = nn.Conv2d(
+            self.out_channels, 1, kernel_size=1, stride=1, padding=0,
+            bias=True)
+
+    def init_weights(self):
+        # Avoid using super().init_weights(), which may alter the default
+        # initialization of the modules in self.rfp_modules that have missing
+        # keys in the pretrained checkpoint.
+        for convs in (self.lateral_convs, self.fpn_convs):
+            for layer in convs.modules():
+                if isinstance(layer, nn.Conv2d):
+                    xavier_init(layer, distribution='uniform')
+        for step in range(self.rfp_steps - 1):
+            self.rfp_modules[step].init_weights()
+        constant_init(self.rfp_weight, 0)
+
+    def forward(self, inputs):
+        inputs = list(inputs)
+        assert len(inputs) == len(self.in_channels) + 1  # +1 for input image
+        img = inputs.pop(0)
+        # FPN forward
+        x = super().forward(tuple(inputs))
+        for step in range(self.rfp_steps - 1):
+            # level 0 is fed back unchanged, higher levels go through ASPP
+            rfp_feats = [x[0]]
+            rfp_feats += [self.rfp_aspp(x[lvl]) for lvl in range(1, len(x))]
+            x_idx = self.rfp_modules[step].rfp_forward(img, rfp_feats)
+            # FPN forward
+            x_idx = super().forward(x_idx)
+            x_new = []
+            for ft_idx, refined in enumerate(x_idx):
+                # per-pixel gate in (0, 1) blending new and previous features
+                gate = torch.sigmoid(self.rfp_weight(refined))
+                x_new.append(gate * refined + (1 - gate) * x[ft_idx])
+            x = x_new
+        return x
diff --git a/mmdet/models/necks/ssd_neck.py b/mmdet/models/necks/ssd_neck.py
new file mode 100755
index 0000000..179d575
--- /dev/null
+++ b/mmdet/models/necks/ssd_neck.py
@@ -0,0 +1,129 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmcv.runner import BaseModule
+
+from ..builder import NECKS
+
+
+@NECKS.register_module()
+class SSDNeck(BaseModule):
+    """Extra layers of SSD backbone to generate multi-scale feature maps.
+
+    Args:
+        in_channels (Sequence[int]): Number of input channels per scale.
+        out_channels (Sequence[int]): Number of output channels per scale.
+        level_strides (Sequence[int]): Stride of 3x3 conv per level.
+        level_paddings (Sequence[int]): Padding size of 3x3 conv per level.
+        l2_norm_scale (float|None): L2 normalization layer init scale.
+            If None, not use L2 normalization on the first input feature.
+        last_kernel_size (int): Kernel size of the last conv layer.
+            Default: 3.
+        use_depthwise (bool): Whether to use DepthwiseSeparableConv.
+            Default: False.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: None.
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='ReLU').
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 level_strides,
+                 level_paddings,
+                 l2_norm_scale=20.,
+                 last_kernel_size=3,
+                 use_depthwise=False,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 act_cfg=dict(type='ReLU'),
+                 init_cfg=[
+                     dict(
+                         type='Xavier', distribution='uniform',
+                         layer='Conv2d'),
+                     dict(type='Constant', val=1, layer='BatchNorm2d'),
+                 ]):
+        super(SSDNeck, self).__init__(init_cfg)
+        num_inputs = len(in_channels)
+        assert len(out_channels) > num_inputs
+        assert len(out_channels) - num_inputs == len(level_strides)
+        assert len(level_strides) == len(level_paddings)
+        assert in_channels == out_channels[:num_inputs]
+
+        if l2_norm_scale:
+            self.l2_norm = L2Norm(in_channels[0], l2_norm_scale)
+            # extra init entry: constant ``scale`` aimed at 'l2_norm'
+            l2_norm_init = dict(
+                type='Constant',
+                val=self.l2_norm.scale,
+                override=dict(name='l2_norm'))
+            self.init_cfg += [l2_norm_init]
+
+        self.extra_layers = nn.ModuleList()
+        extra_layer_channels = out_channels[num_inputs:]
+        if use_depthwise:
+            second_conv = DepthwiseSeparableConvModule
+        else:
+            second_conv = ConvModule
+        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+        last_level = len(extra_layer_channels) - 1
+
+        level_specs = zip(extra_layer_channels, level_strides, level_paddings)
+        for i, (out_channel, stride, padding) in enumerate(level_specs):
+            # only the final extra level may use a non-default kernel size
+            kernel_size = last_kernel_size if i == last_level else 3
+            mid_channel = out_channel // 2
+            # a 1x1 conv to half of the level's output channels, followed by
+            # the conv that applies the level's stride and padding
+            squeeze = ConvModule(
+                out_channels[num_inputs - 1 + i], mid_channel, 1, **cfg)
+            expand = second_conv(
+                mid_channel,
+                out_channel,
+                kernel_size,
+                stride=stride,
+                padding=padding,
+                **cfg)
+            self.extra_layers.append(nn.Sequential(squeeze, expand))
+
+    def forward(self, inputs):
+        """Append the feature maps of the extra layers to the inputs."""
+        outs = list(inputs)
+        if hasattr(self, 'l2_norm'):
+            outs[0] = self.l2_norm(outs[0])
+
+        # each extra layer consumes the newest feature map
+        for layer in self.extra_layers:
+            outs.append(layer(outs[-1]))
+
+        return tuple(outs)
+
+
+class L2Norm(nn.Module):
+
+    def __init__(self, n_dims, scale=20., eps=1e-10):
+        """L2 normalization layer.
+
+        Args:
+            n_dims (int): Number of dimensions to be normalized
+            scale (float, optional): Initial weight value. Defaults to 20..
+            eps (float, optional): Used to avoid division by zero.
+                Defaults to 1e-10.
+        """
+        super(L2Norm, self).__init__()
+        self.n_dims = n_dims
+        # torch.Tensor(n) would leave the weight uninitialized
+        self.weight = nn.Parameter(torch.full((n_dims, ), float(scale)))
+        self.eps = eps
+        self.scale = scale
+
+    def forward(self, x):
+        """Normalize ``x`` over dim 1 in FP32, then cast back to its dtype."""
+        x_float = x.float()
+        norm = x_float.pow(2).sum(1, keepdim=True).sqrt() + self.eps
+        return (self.weight[None, :, None, None].float().expand_as(x_float) *
+                x_float / norm).type_as(x)
diff --git a/mmdet/models/necks/yolo_neck.py b/mmdet/models/necks/yolo_neck.py
new file mode 100755
index 0000000..c8eeb57
--- /dev/null
+++ b/mmdet/models/necks/yolo_neck.py
@@ -0,0 +1,140 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Copyright (c) 2019 Western Digital Corporation or its affiliates.
+
+import torch
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule
+
+from ..builder import NECKS
+
+
+class DetectionBlock(BaseModule):
+    """Detection block in YOLO neck.
+
+    Let out_channels = n, the DetectionBlock stacks five ConvModules that
+    are applied one after another:
+        1x1xn, 3x3x2n, 1x1xn, 3x3x2n, 1x1xn.
+    The 1x1 layers output n channels and the 3x3 layers output 2n channels,
+    so the output of the block has n channels.
+    All five layers share the same conv / norm / activation config.
+    The input channel is arbitrary (in_channels)
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: dict(type='BN', requires_grad=True)
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='LeakyReLU', negative_slope=0.1).
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
+                 init_cfg=None):
+        super(DetectionBlock, self).__init__(init_cfg)
+        wide_channels = out_channels * 2
+
+        # settings shared by every ConvModule of the block
+        shared = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+        squeeze_cfg = dict(kernel_size=1, **shared)
+        expand_cfg = dict(kernel_size=3, padding=1, **shared)
+        self.conv1 = ConvModule(in_channels, out_channels, **squeeze_cfg)
+        self.conv2 = ConvModule(out_channels, wide_channels, **expand_cfg)
+        self.conv3 = ConvModule(wide_channels, out_channels, **squeeze_cfg)
+        self.conv4 = ConvModule(out_channels, wide_channels, **expand_cfg)
+        self.conv5 = ConvModule(wide_channels, out_channels, **squeeze_cfg)
+
+    def forward(self, x):
+        """Run ``conv1`` ... ``conv5`` in order on ``x``."""
+        out = x
+        stages = (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5)
+        for stage in stages:
+            out = stage(out)
+        return out
+
+
+@NECKS.register_module()
+class YOLOV3Neck(BaseModule):
+    """The neck of YOLOV3.
+
+    It can be treated as a simplified version of FPN. It takes the multi-
+    scale features of the backbone, and repeatedly upsamples the current
+    result by 2 and concatenates it with the next input feature.
+
+    Note:
+        Processing starts from the last element of the input feats and
+        walks towards the first one, upsampling by 2 at every step.
+        ``in_channels``, ``out_channels`` and the returned features follow
+        this processing order, i.e. the reverse of the input order.
+
+    Args:
+        num_scales (int): The number of scales / stages.
+        in_channels (List[int]): The number of input channels per scale.
+        out_channels (List[int]): The number of output channels  per scale.
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Default: None.
+        norm_cfg (dict, optional): Dictionary to construct and config norm
+            layer. Default: dict(type='BN', requires_grad=True)
+        act_cfg (dict, optional): Config dict for activation layer.
+            Default: dict(type='LeakyReLU', negative_slope=0.1).
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 num_scales,
+                 in_channels,
+                 out_channels,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
+                 init_cfg=None):
+        super(YOLOV3Neck, self).__init__(init_cfg)
+        assert (num_scales == len(in_channels) == len(out_channels))
+        self.num_scales = num_scales
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        # settings shared by all sub-modules
+        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+
+        # Sub-modules are registered by name ('detect1', 'conv1', 'detect2',
+        # ...) so that an arbitrary number of scales is supported.
+        self.detect1 = DetectionBlock(in_channels[0], out_channels[0], **cfg)
+        for lvl in range(1, self.num_scales):
+            in_c, out_c = self.in_channels[lvl], self.out_channels[lvl]
+            prev_c = out_channels[lvl - 1]
+            self.add_module(f'conv{lvl}', ConvModule(prev_c, out_c, 1, **cfg))
+            # in_c + out_c : High-lvl feats will be cat with low-lvl feats
+            self.add_module(f'detect{lvl + 1}',
+                            DetectionBlock(in_c + out_c, out_c, **cfg))
+
+    def forward(self, feats):
+        """Fuse ``feats`` from the last element towards the first one."""
+        assert len(feats) == self.num_scales
+
+        # processed from bottom (high-lvl) to top (low-lvl)
+        out = self.detect1(feats[-1])
+        outs = [out]
+
+        # remaining inputs, visited from the second-to-last to the first
+        for step, skip in enumerate(reversed(feats[:-1]), start=1):
+            reduced = getattr(self, f'conv{step}')(out)
+
+            # Cat with low-lvl feats
+            upsampled = F.interpolate(reduced, scale_factor=2)
+            merged = torch.cat((upsampled, skip), 1)
+
+            # detection block registered for this scale
+            out = getattr(self, f'detect{step + 1}')(merged)
+            outs.append(out)
+
+        return tuple(outs)
diff --git a/mmdet/models/necks/yolox_pafpn.py b/mmdet/models/necks/yolox_pafpn.py
new file mode 100755
index 0000000..b0f6f70
--- /dev/null
+++ b/mmdet/models/necks/yolox_pafpn.py
@@ -0,0 +1,156 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmcv.runner import BaseModule
+
+from ..builder import NECKS
+from ..utils import CSPLayer
+
+
+@NECKS.register_module()
+class YOLOXPAFPN(BaseModule):
+    """Path Aggregation Network used in YOLOX.
+
+    A top-down path reduces each coarser feature with a 1x1 conv, upsamples
+    it and fuses it with the next finer input in a CSPLayer. A bottom-up
+    path then downsamples the results with stride-2 convs and fuses them
+    level by level. Every level is finally mapped to ``out_channels``.
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale)
+        num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 3
+        use_depthwise (bool): Whether to depthwise separable convolution in
+            blocks. Default: False
+        upsample_cfg (dict): Config dict for interpolate layer.
+            Default: `dict(scale_factor=2, mode='nearest')`
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN', momentum=0.03, eps=0.001)
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='Swish')
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_csp_blocks=3,
+                 use_depthwise=False,
+                 upsample_cfg=dict(scale_factor=2, mode='nearest'),
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+                 act_cfg=dict(type='Swish'),
+                 init_cfg=dict(
+                     type='Kaiming',
+                     layer='Conv2d',
+                     a=math.sqrt(5),
+                     distribution='uniform',
+                     mode='fan_in',
+                     nonlinearity='leaky_relu')):
+        super(YOLOXPAFPN, self).__init__(init_cfg)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        num_levels = len(in_channels)
+        # layer settings shared by every conv-like sub-module
+        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+
+        # settings shared by every CSP fusion block
+        csp_cfg = dict(
+            num_blocks=num_csp_blocks,
+            add_identity=False,
+            use_depthwise=use_depthwise,
+            **cfg)
+
+        # conv type of the downsampling layers in the bottom-up path
+        if use_depthwise:
+            conv = DepthwiseSeparableConvModule
+        else:
+            conv = ConvModule
+
+        # build top-down blocks
+        self.upsample = nn.Upsample(**upsample_cfg)
+        self.reduce_layers = nn.ModuleList()
+        self.top_down_blocks = nn.ModuleList()
+        for idx in range(num_levels - 1, 0, -1):
+            high_c = in_channels[idx]
+            low_c = in_channels[idx - 1]
+            # 1x1 conv changing the channel count of level idx to that of
+            # level idx - 1
+            self.reduce_layers.append(
+                ConvModule(high_c, low_c, 1, **cfg))
+            # the block input is the concatenation of two low_c-channel maps
+            self.top_down_blocks.append(
+                CSPLayer(low_c * 2, low_c, **csp_cfg))
+
+        # build bottom-up blocks
+        self.downsamples = nn.ModuleList()
+        self.bottom_up_blocks = nn.ModuleList()
+        for idx in range(num_levels - 1):
+            low_c = in_channels[idx]
+            high_c = in_channels[idx + 1]
+            # stride-2 3x3 conv that keeps the channel count
+            self.downsamples.append(
+                conv(
+                    low_c,
+                    low_c,
+                    3,
+                    stride=2,
+                    padding=1,
+                    **cfg))
+            # the block input is the concatenation of two low_c-channel maps
+            self.bottom_up_blocks.append(
+                CSPLayer(low_c * 2, high_c, **csp_cfg))
+
+        # one 1x1 conv per level mapping it to the common output width
+        self.out_convs = nn.ModuleList()
+        for level_channels in in_channels:
+            self.out_convs.append(
+                ConvModule(level_channels, out_channels, 1, **cfg))
+
+    def forward(self, inputs):
+        """Fuse the inputs along a top-down and then a bottom-up path.
+
+        Args:
+            inputs (tuple[Tensor]): input features, one per entry of
+                ``in_channels``.
+
+        Returns:
+            tuple[Tensor]: YOLOXPAFPN features.
+        """
+        num_levels = len(self.in_channels)
+        assert len(inputs) == num_levels
+
+        # top-down path
+        inner_outs = [inputs[-1]]
+        for stage, idx in enumerate(range(num_levels - 1, 0, -1)):
+            # reduce the current top feature and keep the reduced version
+            feat_high = self.reduce_layers[stage](inner_outs[0])
+            inner_outs[0] = feat_high
+
+            # upsample it and fuse it with the next input feature
+            feat_low = inputs[idx - 1]
+            upsample_feat = self.upsample(feat_high)
+            fused = torch.cat([upsample_feat, feat_low], 1)
+            inner_outs.insert(0, self.top_down_blocks[stage](fused))
+
+        # bottom-up path
+        outs = [inner_outs[0]]
+        for idx in range(num_levels - 1):
+            # downsample the newest output and fuse it with the top-down
+            # feature of the next level
+            downsample_feat = self.downsamples[idx](outs[-1])
+            fused = torch.cat([downsample_feat, inner_outs[idx + 1]], 1)
+            outs.append(self.bottom_up_blocks[idx](fused))
+
+        # out convs
+        for idx, conv in enumerate(self.out_convs):
+            outs[idx] = conv(outs[idx])
+
+        return tuple(outs)
diff --git a/mmdet/models/plugins/__init__.py b/mmdet/models/plugins/__init__.py
new file mode 100755
index 0000000..a455c07
--- /dev/null
+++ b/mmdet/models/plugins/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .dropblock import DropBlock
+from .msdeformattn_pixel_decoder import MSDeformAttnPixelDecoder
+from .pixel_decoder import PixelDecoder, TransformerEncoderPixelDecoder
+
+__all__ = [
+    'DropBlock', 'PixelDecoder', 'TransformerEncoderPixelDecoder',
+    'MSDeformAttnPixelDecoder'
+]
diff --git a/mmdet/models/plugins/dropblock.py b/mmdet/models/plugins/dropblock.py
new file mode 100755
index 0000000..bb00ade
--- /dev/null
+++ b/mmdet/models/plugins/dropblock.py
@@ -0,0 +1,85 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import PLUGIN_LAYERS
+
+eps = 1e-6
+
+
+@PLUGIN_LAYERS.register_module()
+class DropBlock(nn.Module):
+    """Randomly drop some regions of feature maps.
+
+     Please refer to the method proposed in `DropBlock
+     <https://arxiv.org/abs/1810.12890>`_ for details.
+
+    Args:
+        drop_prob (float): The probability of dropping each block.
+        block_size (int): The size of dropped blocks. Must be odd.
+        warmup_iters (int): The drop probability will linearly increase
+            from `0` to `drop_prob` during the first `warmup_iters` iterations.
+            Default: 2000.
+    """
+
+    def __init__(self, drop_prob, block_size, warmup_iters=2000, **kwargs):
+        super(DropBlock, self).__init__()
+        assert block_size % 2 == 1
+        assert 0 < drop_prob <= 1
+        assert warmup_iters >= 0
+        self.drop_prob = drop_prob
+        self.block_size = block_size
+        self.warmup_iters = warmup_iters
+        # number of training-mode forward calls seen so far
+        self.iter_cnt = 0
+
+    def forward(self, x):
+        """
+        Args:
+            x (Tensor): Input feature map on which some areas will be randomly
+                dropped.
+
+        Returns:
+            Tensor: The tensor after DropBlock layer.
+        """
+        if not self.training:
+            return x
+        self.iter_cnt += 1
+        batch, channels, height, width = x.shape
+        half_block = self.block_size // 2
+        gamma = self._compute_gamma((height, width))
+        # sample block centres only where a whole block fits into the map
+        mask_shape = (batch, channels, height - self.block_size + 1,
+                      width - self.block_size + 1)
+        seeds = torch.bernoulli(torch.full(mask_shape, gamma, device=x.device))
+        # pad back to the input size, then grow every seed to a full block
+        seeds = F.pad(seeds, [half_block] * 4, value=0)
+        dropped = F.max_pool2d(
+            seeds, self.block_size, stride=1, padding=half_block)
+        mask = 1 - dropped
+        return x * mask * mask.numel() / (eps + mask.sum())
+
+    def _compute_gamma(self, feat_size):
+        """Compute the value of gamma according to paper. gamma is the
+        parameter of bernoulli distribution, which controls the number of
+        features to drop.
+
+        gamma = (drop_prob * fm_area) / (drop_area * keep_area)
+
+        Args:
+            feat_size (tuple[int, int]): The height and width of feature map.
+
+        Returns:
+            float: The value of gamma.
+        """
+        valid_area = ((feat_size[0] - self.block_size + 1) *
+                      (feat_size[1] - self.block_size + 1))
+        gamma = self.drop_prob * feat_size[0] * feat_size[1] / valid_area
+        gamma = gamma / self.block_size**2
+        warming_up = self.iter_cnt <= self.warmup_iters
+        factor = self.iter_cnt / self.warmup_iters if warming_up else 1.0
+        return gamma * factor
+
+    def extra_repr(self):
+        return (f'drop_prob={self.drop_prob}, block_size={self.block_size}, '
+                f'warmup_iters={self.warmup_iters}')
diff --git a/mmdet/models/plugins/msdeformattn_pixel_decoder.py b/mmdet/models/plugins/msdeformattn_pixel_decoder.py
new file mode 100755
index 0000000..d553582
--- /dev/null
+++ b/mmdet/models/plugins/msdeformattn_pixel_decoder.py
@@ -0,0 +1,269 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import (PLUGIN_LAYERS, Conv2d, ConvModule, caffe2_xavier_init,
+                      normal_init, xavier_init)
+from mmcv.cnn.bricks.transformer import (build_positional_encoding,
+                                         build_transformer_layer_sequence)
+from mmcv.runner import BaseModule, ModuleList
+
+from mmdet.core.anchor import MlvlPointGenerator
+from mmdet.models.utils.transformer import MultiScaleDeformableAttention
+
+
+@PLUGIN_LAYERS.register_module()
+class MSDeformAttnPixelDecoder(BaseModule):
+    """Pixel decoder with multi-scale deformable attention.
+
+    Args:
+        in_channels (list[int] | tuple[int]): Number of channels in the
+            input feature maps.
+        strides (list[int] | tuple[int]): Output strides of feature from
+            backbone.
+        feat_channels (int): Number of channels for feature.
+        out_channels (int): Number of channels for output.
+        num_outs (int): Number of output scales.
+        norm_cfg (:obj:`mmcv.ConfigDict` | dict): Config for normalization.
+            Defaults to dict(type='GN', num_groups=32).
+        act_cfg (:obj:`mmcv.ConfigDict` | dict): Config for activation.
+            Defaults to dict(type='ReLU').
+        encoder (:obj:`mmcv.ConfigDict` | dict): Config for transformer
+            encoder. Defaults to `DetrTransformerEncoder`.
+        positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for
+            transformer encoder position encoding. Defaults to
+            dict(type='SinePositionalEncoding', num_feats=128,
+            normalize=True).
+        init_cfg (:obj:`mmcv.ConfigDict` | dict): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels=[256, 512, 1024, 2048],
+                 strides=[4, 8, 16, 32],
+                 feat_channels=256,
+                 out_channels=256,
+                 num_outs=3,
+                 norm_cfg=dict(type='GN', num_groups=32),
+                 act_cfg=dict(type='ReLU'),
+                 encoder=dict(
+                     type='DetrTransformerEncoder',
+                     num_layers=6,
+                     transformerlayers=dict(
+                         type='BaseTransformerLayer',
+                         attn_cfgs=dict(
+                             type='MultiScaleDeformableAttention',
+                             embed_dims=256,
+                             num_heads=8,
+                             num_levels=3,
+                             num_points=4,
+                             im2col_step=64,
+                             dropout=0.0,
+                             batch_first=False,
+                             norm_cfg=None,
+                             init_cfg=None),
+                         feedforward_channels=1024,
+                         ffn_dropout=0.0,
+                         operation_order=('self_attn', 'norm', 'ffn', 'norm')),
+                     init_cfg=None),
+                 positional_encoding=dict(
+                     type='SinePositionalEncoding',
+                     num_feats=128,
+                     normalize=True),
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+        self.strides = strides
+        self.num_input_levels = len(in_channels)
+        self.num_encoder_levels = (  # key access also works for plain dict
+            encoder['transformerlayers']['attn_cfgs']['num_levels'])
+        assert self.num_encoder_levels >= 1, \
+            'num_levels in attn_cfgs must be at least one'
+        input_conv_list = []
+        # from top to down (low to high resolution)
+        for i in range(self.num_input_levels - 1,
+                       self.num_input_levels - self.num_encoder_levels - 1,
+                       -1):
+            input_conv = ConvModule(
+                in_channels[i],
+                feat_channels,
+                kernel_size=1,
+                norm_cfg=norm_cfg,
+                act_cfg=None,
+                bias=True)
+            input_conv_list.append(input_conv)
+        self.input_convs = ModuleList(input_conv_list)
+
+        self.encoder = build_transformer_layer_sequence(encoder)
+        self.postional_encoding = build_positional_encoding(
+            positional_encoding)
+        # one embedding per encoder level, in the same order as input_convs
+        self.level_encoding = nn.Embedding(self.num_encoder_levels,
+                                           feat_channels)
+
+        # fpn-like structure
+        self.lateral_convs = ModuleList()
+        self.output_convs = ModuleList()
+        self.use_bias = norm_cfg is None
+        # from top to down (low to high resolution)
+        # fpn for the rest features that didn't pass in encoder
+        for i in range(self.num_input_levels - self.num_encoder_levels - 1, -1,
+                       -1):
+            lateral_conv = ConvModule(
+                in_channels[i],
+                feat_channels,
+                kernel_size=1,
+                bias=self.use_bias,
+                norm_cfg=norm_cfg,
+                act_cfg=None)
+            output_conv = ConvModule(
+                feat_channels,
+                feat_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=self.use_bias,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+            self.lateral_convs.append(lateral_conv)
+            self.output_convs.append(output_conv)
+
+        self.mask_feature = Conv2d(
+            feat_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+        self.num_outs = num_outs
+        self.point_generator = MlvlPointGenerator(strides)
+
+    def init_weights(self):
+        """Initialize the convs, level embedding and transformer encoder."""
+        for input_conv in self.input_convs:
+            xavier_init(
+                input_conv.conv, gain=1, bias=0, distribution='uniform')
+
+        # FPN branch for the levels that bypass the encoder
+        for lateral, output in zip(self.lateral_convs, self.output_convs):
+            caffe2_xavier_init(lateral.conv, bias=0)
+            caffe2_xavier_init(output.conv, bias=0)
+
+        caffe2_xavier_init(self.mask_feature, bias=0)
+        normal_init(self.level_encoding, mean=0, std=1)
+
+        # generic Xavier init for every >1-dim parameter of the encoder ...
+        matrices = (p for p in self.encoder.parameters() if p.dim() > 1)
+        for weight in matrices:
+            nn.init.xavier_normal_(weight)
+
+        # ... after which each deformable attention module re-applies the
+        # init_weights defined in MultiScaleDeformableAttention
+        for encoder_layer in self.encoder.layers:
+            for attention in encoder_layer.attentions:
+                if isinstance(attention, MultiScaleDeformableAttention):
+                    attention.init_weights()
+
+    def forward(self, feats):
+        """
+        Args:
+            feats (list[Tensor]): Feature maps of each level. Each has
+                shape of (batch_size, c, h, w).
+
+        Returns:
+            tuple: A tuple containing the following:
+
+            - mask_feature (Tensor): shape (batch_size, c, h, w).
+            - multi_scale_features (list[Tensor]): Multi scale \
+                    features, each in shape (batch_size, c, h, w).
+        """
+        # generate padding mask for each level, for each image
+        batch_size = feats[0].shape[0]
+        encoder_input_list = []
+        padding_mask_list = []
+        level_positional_encoding_list = []
+        spatial_shapes = []
+        reference_points_list = []
+        for i in range(self.num_encoder_levels):
+            level_idx = self.num_input_levels - i - 1
+            feat = feats[level_idx]
+            feat_projected = self.input_convs[i](feat)
+            h, w = feat.shape[-2:]
+
+            # no padding
+            padding_mask_resized = feat.new_zeros(
+                (batch_size, ) + feat.shape[-2:], dtype=torch.bool)
+            pos_embed = self.postional_encoding(padding_mask_resized)
+            level_embed = self.level_encoding.weight[i]
+            level_pos_embed = level_embed.view(1, -1, 1, 1) + pos_embed
+            # (h_i * w_i, 2)
+            reference_points = self.point_generator.single_level_grid_priors(
+                feat.shape[-2:], level_idx, device=feat.device)
+            # normalize
+            factor = feat.new_tensor([[w, h]]) * self.strides[level_idx]
+            reference_points = reference_points / factor
+
+            # shape (batch_size, c, h_i, w_i) -> (h_i * w_i, batch_size, c)
+            feat_projected = feat_projected.flatten(2).permute(2, 0, 1)
+            level_pos_embed = level_pos_embed.flatten(2).permute(2, 0, 1)
+            padding_mask_resized = padding_mask_resized.flatten(1)
+
+            encoder_input_list.append(feat_projected)
+            padding_mask_list.append(padding_mask_resized)
+            level_positional_encoding_list.append(level_pos_embed)
+            spatial_shapes.append(feat.shape[-2:])
+            reference_points_list.append(reference_points)
+        # shape (batch_size, total_num_query),
+        # total_num_query=sum([., h_i * w_i,.])
+        padding_masks = torch.cat(padding_mask_list, dim=1)
+        # shape (total_num_query, batch_size, c)
+        encoder_inputs = torch.cat(encoder_input_list, dim=0)
+        level_positional_encodings = torch.cat(
+            level_positional_encoding_list, dim=0)
+        device = encoder_inputs.device
+        # shape (num_encoder_levels, 2), from low
+        # resolution to high resolution
+        spatial_shapes = torch.as_tensor(
+            spatial_shapes, dtype=torch.long, device=device)
+        # shape (0, h_0*w_0, h_0*w_0+h_1*w_1, ...)
+        level_start_index = torch.cat((spatial_shapes.new_zeros(
+            (1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        reference_points = torch.cat(reference_points_list, dim=0)
+        reference_points = reference_points[None, :, None].repeat(
+            batch_size, 1, self.num_encoder_levels, 1)
+        valid_radios = reference_points.new_ones(
+            (batch_size, self.num_encoder_levels, 2))
+        # shape (num_total_query, batch_size, c)
+        memory = self.encoder(
+            query=encoder_inputs,
+            key=None,
+            value=None,
+            query_pos=level_positional_encodings,
+            key_pos=None,
+            attn_masks=None,
+            key_padding_mask=None,
+            query_key_padding_mask=padding_masks,
+            spatial_shapes=spatial_shapes,
+            reference_points=reference_points,
+            level_start_index=level_start_index,
+            valid_radios=valid_radios)
+        # (num_total_query, batch_size, c) -> (batch_size, c, num_total_query)
+        memory = memory.permute(1, 2, 0)
+
+        # from low resolution to high resolution
+        num_query_per_level = [e[0] * e[1] for e in spatial_shapes]
+        outs = torch.split(memory, num_query_per_level, dim=-1)
+        outs = [
+            x.reshape(batch_size, -1, spatial_shapes[i][0],
+                      spatial_shapes[i][1]) for i, x in enumerate(outs)
+        ]
+
+        # convs were appended top-down, so conv 0 belongs to the last FPN level
+        fpn_feats = feats[:len(self.lateral_convs)][::-1]
+        for j, x in enumerate(fpn_feats):
+            cur_feat = self.lateral_convs[j](x)
+            y = cur_feat + F.interpolate(
+                outs[-1],
+                size=cur_feat.shape[-2:],
+                mode='bilinear',
+                align_corners=False)
+            y = self.output_convs[j](y)
+            outs.append(y)
+        multi_scale_features = outs[:self.num_outs]
+
+        mask_feature = self.mask_feature(outs[-1])
+        return mask_feature, multi_scale_features
+        return mask_feature, multi_scale_features
diff --git a/mmdet/models/plugins/pixel_decoder.py b/mmdet/models/plugins/pixel_decoder.py
new file mode 100755
index 0000000..537a187
--- /dev/null
+++ b/mmdet/models/plugins/pixel_decoder.py
@@ -0,0 +1,243 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import PLUGIN_LAYERS, Conv2d, ConvModule, caffe2_xavier_init
+from mmcv.cnn.bricks.transformer import (build_positional_encoding,
+                                         build_transformer_layer_sequence)
+from mmcv.runner import BaseModule, ModuleList
+
+
+@PLUGIN_LAYERS.register_module()
+class PixelDecoder(BaseModule):
+    """Pixel decoder with a structure like fpn.
+
+    Args:
+        in_channels (list[int] | tuple[int]): Number of channels in the
+            input feature maps.
+        feat_channels (int): Number channels for feature.
+        out_channels (int): Number channels for output.
+        norm_cfg (:obj:`mmcv.ConfigDict` | dict): Config for normalization.
+            Defaults to dict(type='GN', num_groups=32).
+        act_cfg (:obj:`mmcv.ConfigDict` | dict): Config for activation.
+            Defaults to dict(type='ReLU').
+        encoder (:obj:`mmcv.ConfigDict` | dict): Config for transformer
+            encoder. Defaults to None.
+        positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for
+            transformer encoder position encoding. Defaults to
+            dict(type='SinePositionalEncoding', num_feats=128,
+            normalize=True).
+        init_cfg (:obj:`mmcv.ConfigDict` | dict):  Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 feat_channels,
+                 out_channels,
+                 norm_cfg=dict(type='GN', num_groups=32),
+                 act_cfg=dict(type='ReLU'),
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.num_inputs = len(in_channels)
+        self.lateral_convs = ModuleList()
+        self.output_convs = ModuleList()
+        # convs carry their own bias only when no norm layer is configured
+        self.use_bias = norm_cfg is None
+        # settings shared by every 3x3 ConvModule built below
+        conv3x3_cfg = dict(
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=self.use_bias,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+        # all levels but the last: 1x1 lateral conv + 3x3 output conv
+        for level_channels in in_channels[:-1]:
+            self.lateral_convs.append(
+                ConvModule(
+                    level_channels,
+                    feat_channels,
+                    kernel_size=1,
+                    bias=self.use_bias,
+                    norm_cfg=norm_cfg,
+                    act_cfg=None))
+            self.output_convs.append(
+                ConvModule(
+                    feat_channels,
+                    feat_channels,
+                    **conv3x3_cfg))
+
+        # the last level goes through a single 3x3 conv instead
+        self.last_feat_conv = ConvModule(
+            in_channels[-1], feat_channels, **conv3x3_cfg)
+        self.mask_feature = Conv2d(
+            feat_channels, out_channels, kernel_size=3, stride=1, padding=1)
+
+    def init_weights(self):
+        """Initialize every conv with Caffe2-style Xavier (bias set to 0)."""
+        for lateral, output in zip(self.lateral_convs, self.output_convs):
+            caffe2_xavier_init(lateral.conv, bias=0)
+            caffe2_xavier_init(output.conv, bias=0)
+
+        caffe2_xavier_init(self.mask_feature, bias=0)
+        caffe2_xavier_init(self.last_feat_conv.conv, bias=0)  # wrapped conv
+
+    def forward(self, feats, img_metas):
+        """
+        Args:
+            feats (list[Tensor]): Feature maps of each level. Each has
+                shape of (batch_size, c, h, w).
+            img_metas (list[dict]): List of image information. It is
+                not used by this class.
+
+        Returns:
+            tuple: a tuple containing the following:
+                - mask_feature (Tensor): Shape (batch_size, c, h, w).
+                - memory (Tensor): ``feats[-1]``, returned unchanged.\
+                        Shape (batch_size, c, h, w).
+        """
+        # start from the last level, then merge top-down into earlier levels
+        top_down = self.last_feat_conv(feats[-1])
+        for level in reversed(range(self.num_inputs - 1)):
+            lateral = self.lateral_convs[level](feats[level])
+            upsampled = F.interpolate(
+                top_down, size=lateral.shape[-2:], mode='nearest')
+            top_down = self.output_convs[level](lateral + upsampled)
+
+        # ``memory`` is simply the untouched last-level input feature
+        mask_feature = self.mask_feature(top_down)
+        return mask_feature, feats[-1]
+
+
+@PLUGIN_LAYERS.register_module()
+class TransformerEncoderPixelDecoder(PixelDecoder):
+    """Pixel decoder with transformer encoder inside.
+
+    Args:
+        in_channels (list[int] | tuple[int]): Number of channels in the
+            input feature maps.
+        feat_channels (int): Number channels for feature.
+        out_channels (int): Number channels for output.
+        norm_cfg (:obj:`mmcv.ConfigDict` | dict): Config for normalization.
+            Defaults to dict(type='GN', num_groups=32).
+        act_cfg (:obj:`mmcv.ConfigDict` | dict): Config for activation.
+            Defaults to dict(type='ReLU').
+        encoder (:obj:`mmcv.ConfigDict` | dict): Config for transformer
+            encoder. Defaults to None.
+        positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for
+            transformer encoder position encoding. Defaults to
+            dict(type='SinePositionalEncoding', num_feats=128,
+            normalize=True).
+        init_cfg (:obj:`mmcv.ConfigDict` | dict):  Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 feat_channels,
+                 out_channels,
+                 norm_cfg=dict(type='GN', num_groups=32),
+                 act_cfg=dict(type='ReLU'),
+                 encoder=None,
+                 positional_encoding=dict(
+                     type='SinePositionalEncoding',
+                     num_feats=128,
+                     normalize=True),
+                 init_cfg=None):
+        super(TransformerEncoderPixelDecoder, self).__init__(
+            in_channels,
+            feat_channels,
+            out_channels,
+            norm_cfg,
+            act_cfg,
+            init_cfg=init_cfg)
+        self.last_feat_conv = None  # forward() uses the encoder path instead
+
+        self.encoder = build_transformer_layer_sequence(encoder)
+        self.encoder_embed_dims = self.encoder.embed_dims
+        assert self.encoder_embed_dims == feat_channels, 'embed_dims({}) of ' \
+            'transformer encoder must equal to feat_channels({})'.format(
+                self.encoder_embed_dims, feat_channels)
+        self.positional_encoding = build_positional_encoding(
+            positional_encoding)
+        self.encoder_in_proj = Conv2d(
+            in_channels[-1], feat_channels, kernel_size=1)
+        self.encoder_out_proj = ConvModule(
+            feat_channels,
+            feat_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=self.use_bias,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+    def init_weights(self):
+        """Initialize all convs and the >1-dim parameters of the encoder."""
+        for lateral, output in zip(self.lateral_convs, self.output_convs):
+            caffe2_xavier_init(lateral.conv, bias=0)
+            caffe2_xavier_init(output.conv, bias=0)
+
+        caffe2_xavier_init(self.mask_feature, bias=0)
+        caffe2_xavier_init(self.encoder_in_proj, bias=0)
+        caffe2_xavier_init(self.encoder_out_proj.conv, bias=0)
+
+        for p in self.encoder.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+
+    def forward(self, feats, img_metas):
+        """
+        Args:
+            feats (list[Tensor]): Feature maps of each level. Each has
+                shape of (batch_size, c, h, w).
+            img_metas (list[dict]): Per-image meta; 'batch_input_shape' and
+                'img_shape' are read to build the padding mask.
+
+        Returns:
+            tuple: a tuple containing the following:
+                - mask_feature (Tensor): shape (batch_size, c, h, w).
+                - memory (Tensor): shape (batch_size, c, h, w).
+        """
+        feat_last = feats[-1]
+        bs, c, h, w = feat_last.shape
+        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
+        padding_mask = feat_last.new_ones((bs, input_img_h, input_img_w),
+                                          dtype=torch.float32)
+        for i in range(bs):
+            img_h, img_w, _ = img_metas[i]['img_shape']
+            padding_mask[i, :img_h, :img_w] = 0  # 0 = image, 1 = padding
+        padding_mask = F.interpolate(
+            padding_mask.unsqueeze(1),
+            size=feat_last.shape[-2:],
+            mode='nearest').to(torch.bool).squeeze(1)
+
+        pos_embed = self.positional_encoding(padding_mask)
+        feat_last = self.encoder_in_proj(feat_last)
+        # (batch_size, c, h, w) -> (num_queries, batch_size, c)
+        feat_last = feat_last.flatten(2).permute(2, 0, 1)
+        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
+        # (batch_size, h, w) -> (batch_size, h*w)
+        padding_mask = padding_mask.flatten(1)
+        memory = self.encoder(
+            query=feat_last,
+            key=None,
+            value=None,
+            query_pos=pos_embed,
+            query_key_padding_mask=padding_mask)
+        # (num_queries, batch_size, c) -> (batch_size, c, h, w)
+        memory = memory.permute(1, 2, 0).view(bs, self.encoder_embed_dims, h,
+                                              w)
+        y = self.encoder_out_proj(memory)
+        for i in range(self.num_inputs - 2, -1, -1):
+            x = feats[i]
+            cur_feat = self.lateral_convs[i](x)
+            y = cur_feat + \
+                F.interpolate(y, size=cur_feat.shape[-2:], mode='nearest')
+            y = self.output_convs[i](y)
+
+        mask_feature = self.mask_feature(y)
+        return mask_feature, memory
diff --git a/mmdet/models/roi_heads/__init__.py b/mmdet/models/roi_heads/__init__.py
new file mode 100755
index 0000000..baae2a0
--- /dev/null
+++ b/mmdet/models/roi_heads/__init__.py
@@ -0,0 +1,37 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_roi_head import BaseRoIHead
+from .bbox_heads import (BBoxHead, ConvFCBBoxHead, DIIHead,
+                         DoubleConvFCBBoxHead, SABLHead, SCNetBBoxHead,
+                         Shared2FCBBoxHead, Shared4Conv1FCBBoxHead)
+from .cascade_roi_head import CascadeRoIHead
+from .double_roi_head import DoubleHeadRoIHead
+from .dynamic_roi_head import DynamicRoIHead
+from .grid_roi_head import GridRoIHead
+from .htc_roi_head import HybridTaskCascadeRoIHead
+from .mask_heads import (CoarseMaskHead, FCNMaskHead, FeatureRelayHead,
+                         FusedSemanticHead, GlobalContextHead, GridHead,
+                         HTCMaskHead, MaskIoUHead, MaskPointHead,
+                         SCNetMaskHead, SCNetSemanticHead)
+from .mask_scoring_roi_head import MaskScoringRoIHead
+from .pisa_roi_head import PISARoIHead
+from .point_rend_roi_head import PointRendRoIHead
+from .roi_extractors import (BaseRoIExtractor, GenericRoIExtractor,
+                             SingleRoIExtractor)
+from .scnet_roi_head import SCNetRoIHead
+from .shared_heads import ResLayer
+from .sparse_roi_head import SparseRoIHead
+from .standard_roi_head import StandardRoIHead
+from .trident_roi_head import TridentRoIHead
+
+__all__ = [
+    'BaseRoIHead', 'CascadeRoIHead', 'DoubleHeadRoIHead', 'MaskScoringRoIHead',
+    'HybridTaskCascadeRoIHead', 'GridRoIHead', 'ResLayer', 'BBoxHead',
+    'ConvFCBBoxHead', 'DIIHead', 'SABLHead', 'Shared2FCBBoxHead',
+    'StandardRoIHead', 'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead',
+    'FCNMaskHead', 'HTCMaskHead', 'FusedSemanticHead', 'GridHead',
+    'MaskIoUHead', 'BaseRoIExtractor', 'GenericRoIExtractor',
+    'SingleRoIExtractor', 'PISARoIHead', 'PointRendRoIHead', 'MaskPointHead',
+    'CoarseMaskHead', 'DynamicRoIHead', 'SparseRoIHead', 'TridentRoIHead',
+    'SCNetRoIHead', 'SCNetMaskHead', 'SCNetSemanticHead', 'SCNetBBoxHead',
+    'FeatureRelayHead', 'GlobalContextHead'
+]
diff --git a/mmdet/models/roi_heads/base_roi_head.py b/mmdet/models/roi_heads/base_roi_head.py
new file mode 100755
index 0000000..4adbdef
--- /dev/null
+++ b/mmdet/models/roi_heads/base_roi_head.py
@@ -0,0 +1,103 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+from mmcv.runner import BaseModule
+
+from ..builder import build_shared_head
+
+
+class BaseRoIHead(BaseModule, metaclass=ABCMeta):
+    """Base class for RoIHeads."""
+
+    def __init__(self,
+                 bbox_roi_extractor=None,
+                 bbox_head=None,
+                 mask_roi_extractor=None,
+                 mask_head=None,
+                 shared_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super(BaseRoIHead, self).__init__(init_cfg)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        if shared_head is not None:
+            shared_head.pretrained = pretrained  # set on the cfg before build
+            self.shared_head = build_shared_head(shared_head)
+
+        if bbox_head is not None:
+            self.init_bbox_head(bbox_roi_extractor, bbox_head)
+
+        if mask_head is not None:
+            self.init_mask_head(mask_roi_extractor, mask_head)
+
+        self.init_assigner_sampler()  # runs unconditionally
+
+    @property
+    def with_bbox(self):
+        """bool: whether the RoI head contains a `bbox_head`"""
+        return hasattr(self, 'bbox_head') and self.bbox_head is not None
+
+    @property
+    def with_mask(self):
+        """bool: whether the RoI head contains a `mask_head`"""
+        return hasattr(self, 'mask_head') and self.mask_head is not None
+
+    @property
+    def with_shared_head(self):
+        """bool: whether the RoI head contains a `shared_head`"""
+        return hasattr(self, 'shared_head') and self.shared_head is not None
+
+    @abstractmethod
+    def init_bbox_head(self):  # invoked as (bbox_roi_extractor, bbox_head)
+        """Initialize ``bbox_head``"""
+        pass
+
+    @abstractmethod
+    def init_mask_head(self):  # invoked as (mask_roi_extractor, mask_head)
+        """Initialize ``mask_head``"""
+        pass
+
+    @abstractmethod
+    def init_assigner_sampler(self):
+        """Initialize assigner and sampler."""
+        pass
+
+    @abstractmethod
+    def forward_train(self,
+                      x,
+                      img_meta,
+                      proposal_list,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None,
+                      gt_masks=None,
+                      **kwargs):
+        """Forward function during training."""
+
+    async def async_simple_test(self,
+                                x,
+                                proposal_list,
+                                img_metas,
+                                proposals=None,
+                                rescale=False,
+                                **kwargs):
+        """Asynchronized test function."""
+        raise NotImplementedError
+
+    def simple_test(self,
+                    x,
+                    proposal_list,
+                    img_meta,
+                    proposals=None,
+                    rescale=False,
+                    **kwargs):
+        """Test without augmentation. The base implementation is empty."""
+
+    def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
+        """Test with augmentations.
+
+        If rescale is False, then returned bboxes and masks will fit the scale
+        of imgs[0]. The base implementation is empty.
+        """
diff --git a/mmdet/models/roi_heads/bbox_heads/__init__.py b/mmdet/models/roi_heads/bbox_heads/__init__.py
new file mode 100755
index 0000000..d1207db
--- /dev/null
+++ b/mmdet/models/roi_heads/bbox_heads/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .bbox_head import BBoxHead
+from .convfc_bbox_head import (ConvFCBBoxHead, Shared2FCBBoxHead,
+                               Shared4Conv1FCBBoxHead)
+from .dii_head import DIIHead
+from .double_bbox_head import DoubleConvFCBBoxHead
+from .sabl_head import SABLHead
+from .scnet_bbox_head import SCNetBBoxHead
+
+__all__ = [
+    'BBoxHead', 'ConvFCBBoxHead', 'Shared2FCBBoxHead',
+    'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead', 'SABLHead', 'DIIHead',
+    'SCNetBBoxHead'
+]
diff --git a/mmdet/models/roi_heads/bbox_heads/bbox_head.py b/mmdet/models/roi_heads/bbox_heads/bbox_head.py
new file mode 100755
index 0000000..461b18b
--- /dev/null
+++ b/mmdet/models/roi_heads/bbox_heads/bbox_head.py
@@ -0,0 +1,594 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.runner import BaseModule, auto_fp16, force_fp32
+from torch.nn.modules.utils import _pair
+
+from mmdet.core import build_bbox_coder, multi_apply, multiclass_nms
+from mmdet.models.builder import HEADS, build_loss
+from mmdet.models.losses import accuracy
+from mmdet.models.utils import build_linear_layer
+
+
+@HEADS.register_module()
+class BBoxHead(BaseModule):
+    """Simplest RoI head, with only two fc layers for classification and
+    regression respectively."""
+
+    def __init__(self,
+                 with_avg_pool=False,
+                 with_cls=True,
+                 with_reg=True,
+                 roi_feat_size=7,
+                 in_channels=256,
+                 num_classes=80,
+                 bbox_coder=dict(
+                     type='DeltaXYWHBBoxCoder',
+                     clip_border=True,
+                     target_means=[0., 0., 0., 0.],
+                     target_stds=[0.1, 0.1, 0.2, 0.2]),
+                 reg_class_agnostic=False,
+                 reg_decoded_bbox=False,
+                 reg_predictor_cfg=dict(type='Linear'),
+                 cls_predictor_cfg=dict(type='Linear'),
+                 loss_cls=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=1.0),
+                 loss_bbox=dict(
+                     type='SmoothL1Loss', beta=1.0, loss_weight=1.0),
+                 init_cfg=None):
+        super(BBoxHead, self).__init__(init_cfg)
+        assert with_cls or with_reg
+        self.with_avg_pool = with_avg_pool
+        self.with_cls = with_cls
+        self.with_reg = with_reg
+        self.roi_feat_size = _pair(roi_feat_size)
+        self.roi_feat_area = self.roi_feat_size[0] * self.roi_feat_size[1]
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self.reg_class_agnostic = reg_class_agnostic
+        self.reg_decoded_bbox = reg_decoded_bbox
+        self.reg_predictor_cfg = reg_predictor_cfg
+        self.cls_predictor_cfg = cls_predictor_cfg
+        self.fp16_enabled = False
+        # build the box coder and the two losses from their config dicts
+        self.bbox_coder = build_bbox_coder(bbox_coder)
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_bbox = build_loss(loss_bbox)
+        # without avg pooling the fc layers take channels * RoI area inputs
+        in_channels = self.in_channels
+        if self.with_avg_pool:
+            self.avg_pool = nn.AvgPool2d(self.roi_feat_size)
+        else:
+            in_channels *= self.roi_feat_area
+        if self.with_cls:
+            # background adds one channel unless the loss sets the count
+            if self.custom_cls_channels:
+                cls_channels = self.loss_cls.get_cls_channels(self.num_classes)
+            else:
+                cls_channels = num_classes + 1
+            self.fc_cls = build_linear_layer(
+                self.cls_predictor_cfg,
+                in_features=in_channels,
+                out_features=cls_channels)
+        if self.with_reg:
+            out_dim_reg = 4 if reg_class_agnostic else 4 * num_classes
+            self.fc_reg = build_linear_layer(
+                self.reg_predictor_cfg,
+                in_features=in_channels,
+                out_features=out_dim_reg)
+        self.debug_imgs = None
+        if init_cfg is None:
+            self.init_cfg = []
+            if self.with_cls:
+                self.init_cfg += [
+                    dict(
+                        type='Normal', std=0.01, override=dict(name='fc_cls'))
+                ]
+            if self.with_reg:
+                self.init_cfg += [
+                    dict(
+                        type='Normal', std=0.001, override=dict(name='fc_reg'))
+                ]
+
+    @property
+    def custom_cls_channels(self):  # loss_cls flag; False if unset
+        return getattr(self.loss_cls, 'custom_cls_channels', False)
+
+    @property
+    def custom_activation(self):  # loss_cls flag; False if unset
+        return getattr(self.loss_cls, 'custom_activation', False)
+
+    @property
+    def custom_accuracy(self):  # loss_cls flag; False if unset
+        return getattr(self.loss_cls, 'custom_accuracy', False)
+
+    @auto_fp16()
+    def forward(self, x):
+        if self.with_avg_pool:
+            if x.numel() > 0:
+                x = self.avg_pool(x)
+                x = x.view(x.size(0), -1)
+            else:
+                # avg_pool does not support empty tensor,
+                # so use torch.mean instead it
+                x = torch.mean(x, dim=(-1, -2))
+        cls_score = self.fc_cls(x) if self.with_cls else None
+        bbox_pred = self.fc_reg(x) if self.with_reg else None
+        return cls_score, bbox_pred
+
+    def _get_target_single(self, pos_bboxes, neg_bboxes, pos_gt_bboxes,
+                           pos_gt_labels, cfg):
+        """Calculate the ground truth for proposals in the single image
+        according to the sampling results.
+
+        Args:
+            pos_bboxes (Tensor): Contains all the positive boxes,
+                has shape (num_pos, 4), the last dimension 4
+                represents [tl_x, tl_y, br_x, br_y].
+            neg_bboxes (Tensor): Contains all the negative boxes,
+                has shape (num_neg, 4), the last dimension 4
+                represents [tl_x, tl_y, br_x, br_y].
+            pos_gt_bboxes (Tensor): Contains gt_boxes for
+                all positive samples, has shape (num_pos, 4),
+                the last dimension 4
+                represents [tl_x, tl_y, br_x, br_y].
+            pos_gt_labels (Tensor): Contains gt_labels for
+                all positive samples, has shape (num_pos, ).
+            cfg (obj:`ConfigDict`): `train_cfg` of R-CNN; reads `pos_weight`.
+
+        Returns:
+            Tuple[Tensor]: Ground truth for proposals
+            in a single image. Containing the following Tensors:
+
+                - labels(Tensor): Gt_labels for all proposals, has
+                  shape (num_proposals,).
+                - label_weights(Tensor): Labels_weights for all
+                  proposals, has shape (num_proposals,).
+                - bbox_targets(Tensor): Regression target for all
+                  proposals, has shape (num_proposals, 4); coder-encoded,
+                  or gt [tl_x, tl_y, br_x, br_y] if `reg_decoded_bbox`.
+                - bbox_weights(Tensor):Regression weights for all
+                  proposals, has shape (num_proposals, 4).
+        """
+        num_pos = pos_bboxes.size(0)
+        num_neg = neg_bboxes.size(0)
+        num_samples = num_pos + num_neg
+
+        # original implementation uses new_zeros since BG are set to be 0
+        # now use new_full because BG cat_id = num_classes,
+        # FG cat_id = [0, num_classes-1]
+        labels = pos_bboxes.new_full((num_samples, ),
+                                     self.num_classes,
+                                     dtype=torch.long)
+        label_weights = pos_bboxes.new_zeros(num_samples)
+        bbox_targets = pos_bboxes.new_zeros(num_samples, 4)
+        bbox_weights = pos_bboxes.new_zeros(num_samples, 4)
+        if num_pos > 0:
+            labels[:num_pos] = pos_gt_labels
+            pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
+            label_weights[:num_pos] = pos_weight
+            if not self.reg_decoded_bbox:
+                pos_bbox_targets = self.bbox_coder.encode(
+                    pos_bboxes, pos_gt_bboxes)
+            else:
+                # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
+                # is applied directly on the decoded bounding boxes, both
+                # the predicted boxes and regression targets should be with
+                # absolute coordinate format.
+                pos_bbox_targets = pos_gt_bboxes
+            bbox_targets[:num_pos, :] = pos_bbox_targets
+            bbox_weights[:num_pos, :] = 1
+        if num_neg > 0:
+            label_weights[-num_neg:] = 1.0
+
+        return labels, label_weights, bbox_targets, bbox_weights
+
+    def get_targets(self,
+                    sampling_results,
+                    gt_bboxes,
+                    gt_labels,
+                    rcnn_train_cfg,
+                    concat=True):
+        """Calculate the ground truth for all samples in a batch according to
+        the sampling_results.
+
+        Each image's sampled boxes and matched gts are passed to
+        `_get_target_single` through `multi_apply`; the per-image results
+        are concatenated along dim 0 when `concat` is True.
+
+        Args:
+            sampling_results (List[obj:SamplingResults]): Assign results of
+                all images in a batch after sampling.
+            gt_bboxes (list[Tensor]): Gt_bboxes of all images in a batch,
+                each tensor has shape (num_gt, 4). Not read in this method
+                body; targets come from `sampling_results`.
+            gt_labels (list[Tensor]): Gt_labels of all images in a batch,
+                each tensor has shape (num_gt,). Not read in this body.
+            rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN.
+            concat (bool): Whether to concatenate the results of all
+                the images in a single batch.
+
+        Returns:
+            Tuple[Tensor]: Ground truth for proposals in a batch.
+            Containing the following list of Tensors:
+
+                - labels (list[Tensor],Tensor): Gt_labels for all
+                  proposals in a batch, each tensor in list has
+                  shape (num_proposals,) when `concat=False`, otherwise
+                  just a single tensor has shape (num_all_proposals,).
+                - label_weights (list[Tensor]): Labels_weights for
+                  all proposals in a batch, each tensor in list has
+                  shape (num_proposals,) when `concat=False`, otherwise
+                  just a single tensor has shape (num_all_proposals,).
+                - bbox_targets (list[Tensor],Tensor): Regression target
+                  for all proposals in a batch, each tensor in list
+                  has shape (num_proposals, 4) when `concat=False`,
+                  otherwise just a single tensor has shape
+                  (num_all_proposals, 4), the last dimension 4 represents
+                  [tl_x, tl_y, br_x, br_y].
+                - bbox_weights (list[tensor],Tensor): Regression weights for
+                  all proposals in a batch, each tensor in list has shape
+                  (num_proposals, 4) when `concat=False`, otherwise just a
+                  single tensor has shape (num_all_proposals, 4).
+        """
+        pos_bboxes_list = [res.pos_bboxes for res in sampling_results]
+        neg_bboxes_list = [res.neg_bboxes for res in sampling_results]
+        pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results]
+        pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]
+        labels, label_weights, bbox_targets, bbox_weights = multi_apply(
+            self._get_target_single,
+            pos_bboxes_list,
+            neg_bboxes_list,
+            pos_gt_bboxes_list,
+            pos_gt_labels_list,
+            cfg=rcnn_train_cfg)
+
+        if concat:
+            labels = torch.cat(labels, 0)
+            label_weights = torch.cat(label_weights, 0)
+            bbox_targets = torch.cat(bbox_targets, 0)
+            bbox_weights = torch.cat(bbox_weights, 0)
+        return labels, label_weights, bbox_targets, bbox_weights
+
+    @force_fp32(apply_to=('cls_score', 'bbox_pred'))
+    def loss(self,
+             cls_score,
+             bbox_pred,
+             rois,
+             labels,
+             label_weights,
+             bbox_targets,
+             bbox_weights,
+             reduction_override=None):
+        losses = dict()
+        if cls_score is not None:
+            avg_factor = max(torch.sum(label_weights > 0).float().item(), 1.)
+            if cls_score.numel() > 0:
+                loss_cls_ = self.loss_cls(
+                    cls_score,
+                    labels,
+                    label_weights,
+                    avg_factor=avg_factor,
+                    reduction_override=reduction_override)
+                if isinstance(loss_cls_, dict):
+                    losses.update(loss_cls_)
+                else:
+                    losses['loss_cls'] = loss_cls_
+                if self.custom_activation:
+                    acc_ = self.loss_cls.get_accuracy(cls_score, labels)
+                    losses.update(acc_)
+                else:
+                    losses['acc'] = accuracy(cls_score, labels)
+        if bbox_pred is not None:
+            bg_class_ind = self.num_classes
+            # 0~self.num_classes-1 are FG, self.num_classes is BG
+            pos_inds = (labels >= 0) & (labels < bg_class_ind)
+            # do not perform bounding box regression for BG anymore.
+            if pos_inds.any():
+                if self.reg_decoded_bbox:
+                    # When the regression loss (e.g. `IouLoss`,
+                    # `GIouLoss`, `DIouLoss`) is applied directly on
+                    # the decoded bounding boxes, it decodes the
+                    # already encoded coordinates to absolute format.
+                    bbox_pred = self.bbox_coder.decode(rois[:, 1:], bbox_pred)
+                if self.reg_class_agnostic:
+                    pos_bbox_pred = bbox_pred.view(
+                        bbox_pred.size(0), 4)[pos_inds.type(torch.bool)]
+                else:
+                    pos_bbox_pred = bbox_pred.view(
+                        bbox_pred.size(0), -1,
+                        4)[pos_inds.type(torch.bool),
+                           labels[pos_inds.type(torch.bool)]]
+                losses['loss_bbox'] = self.loss_bbox(
+                    pos_bbox_pred,
+                    bbox_targets[pos_inds.type(torch.bool)],
+                    bbox_weights[pos_inds.type(torch.bool)],
+                    avg_factor=bbox_targets.size(0),
+                    reduction_override=reduction_override)
+            else:
+                losses['loss_bbox'] = bbox_pred[pos_inds].sum()
+        return losses
+
+    @force_fp32(apply_to=('cls_score', 'bbox_pred'))
+    def get_bboxes(self,
+                   rois,
+                   cls_score,
+                   bbox_pred,
+                   img_shape,
+                   scale_factor,
+                   rescale=False,
+                   cfg=None):
+        """Transform network output for a batch into bbox predictions.
+
+        Args:
+            rois (Tensor): Boxes to be transformed. Has shape (num_boxes, 5).
+                last dimension 5 arrange as (batch_index, x1, y1, x2, y2).
+            cls_score (Tensor): Box scores, has shape
+                (num_boxes, num_classes + 1).
+            bbox_pred (Tensor, optional): Box energies / deltas.
+                has shape (num_boxes, num_classes * 4).
+            img_shape (Sequence[int], optional): Maximum bounds for boxes,
+                specifies (H, W, C) or (H, W).
+            scale_factor (ndarray): Scale factor of the
+               image arrange as (w_scale, h_scale, w_scale, h_scale).
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. Default: None
+
+        Returns:
+            tuple[Tensor, Tensor]:
+                First tensor is `det_bboxes`, has the shape
+                (num_boxes, 5) and last
+                dimension 5 represent (tl_x, tl_y, br_x, br_y, score).
+                Second tensor is the labels with shape (num_boxes, ).
+        """
+
+        # some loss (Seesaw loss..) may have custom activation
+        if self.custom_cls_channels:
+            scores = self.loss_cls.get_activation(cls_score)
+        else:
+            scores = F.softmax(
+                cls_score, dim=-1) if cls_score is not None else None
+        # bbox_pred would be None in some detector when with_reg is False,
+        # e.g. Grid R-CNN.
+        if bbox_pred is not None:
+            bboxes = self.bbox_coder.decode(
+                rois[..., 1:], bbox_pred, max_shape=img_shape)
+        else:
+            bboxes = rois[:, 1:].clone()
+            if img_shape is not None:
+                bboxes[:, [0, 2]].clamp_(min=0, max=img_shape[1])
+                bboxes[:, [1, 3]].clamp_(min=0, max=img_shape[0])
+
+        if rescale and bboxes.size(0) > 0:
+            scale_factor = bboxes.new_tensor(scale_factor)
+            bboxes = (bboxes.view(bboxes.size(0), -1, 4) / scale_factor).view(
+                bboxes.size()[0], -1)
+
+        if cfg is None:
+            return bboxes, scores
+        else:
+            det_bboxes, det_labels = multiclass_nms(bboxes, scores,
+                                                    cfg.score_thr, cfg.nms,
+                                                    cfg.max_per_img)
+
+            return det_bboxes, det_labels
+
+    @force_fp32(apply_to=('bbox_preds', ))
+    def refine_bboxes(self, rois, labels, bbox_preds, pos_is_gts, img_metas):
+        """Refine bboxes during training.
+
+        Args:
+            rois (Tensor): Shape (n*bs, 5), where n is image number per GPU,
+                and bs is the sampled RoIs per image. The first column is
+                the image id and the next 4 columns are x1, y1, x2, y2.
+            labels (Tensor): Shape (n*bs, ).
+            bbox_preds (Tensor): Shape (n*bs, 4) or (n*bs, 4*#class).
+            pos_is_gts (list[Tensor]): Flags indicating if each positive bbox
+                is a gt bbox.
+            img_metas (list[dict]): Meta info of each image.
+
+        Returns:
+            list[Tensor]: Refined bboxes of each image in a mini-batch.
+
+        Example:
+            >>> # xdoctest: +REQUIRES(module:kwarray)
+            >>> import kwarray
+            >>> import numpy as np
+            >>> from mmdet.core.bbox.demodata import random_boxes
+            >>> self = BBoxHead(reg_class_agnostic=True)
+            >>> n_roi = 2
+            >>> n_img = 4
+            >>> scale = 512
+            >>> rng = np.random.RandomState(0)
+            >>> img_metas = [{'img_shape': (scale, scale)}
+            ...              for _ in range(n_img)]
+            >>> # Create rois in the expected format
+            >>> roi_boxes = random_boxes(n_roi, scale=scale, rng=rng)
+            >>> img_ids = torch.randint(0, n_img, (n_roi,))
+            >>> img_ids = img_ids.float()
+            >>> rois = torch.cat([img_ids[:, None], roi_boxes], dim=1)
+            >>> # Create other args
+            >>> labels = torch.randint(0, 2, (n_roi,)).long()
+            >>> bbox_preds = random_boxes(n_roi, scale=scale, rng=rng)
+            >>> # For each image, pretend random positive boxes are gts
+            >>> is_label_pos = (labels.numpy() > 0).astype(int)
+            >>> lbl_per_img = kwarray.group_items(is_label_pos,
+            ...                                   img_ids.numpy())
+            >>> pos_per_img = [sum(lbl_per_img.get(gid, []))
+            ...                for gid in range(n_img)]
+            >>> pos_is_gts = [
+            >>>     torch.randint(0, 2, (npos,)).byte().sort(
+            >>>         descending=True)[0]
+            >>>     for npos in pos_per_img
+            >>> ]
+            >>> bboxes_list = self.refine_bboxes(rois, labels, bbox_preds,
+            >>>                    pos_is_gts, img_metas)
+            >>> print(bboxes_list)
+        """
+        img_ids = rois[:, 0].long().unique(sorted=True)
+        assert img_ids.numel() <= len(img_metas)
+
+        bboxes_list = []
+        for i in range(len(img_metas)):
+            inds = torch.nonzero(
+                rois[:, 0] == i, as_tuple=False).squeeze(dim=1)
+            num_rois = inds.numel()
+
+            bboxes_ = rois[inds, 1:]
+            label_ = labels[inds]
+            bbox_pred_ = bbox_preds[inds]
+            img_meta_ = img_metas[i]
+            pos_is_gts_ = pos_is_gts[i]
+
+            bboxes = self.regress_by_class(bboxes_, label_, bbox_pred_,
+                                           img_meta_)
+
+            # drop RoIs flagged as gt; RoIs past len(pos_is_gts_) are kept
+            pos_keep = 1 - pos_is_gts_
+            keep_inds = pos_is_gts_.new_ones(num_rois)
+            keep_inds[:len(pos_is_gts_)] = pos_keep
+
+            bboxes_list.append(bboxes[keep_inds.type(torch.bool)])
+
+        return bboxes_list
+
+    @force_fp32(apply_to=('bbox_pred', ))
+    def regress_by_class(self, rois, label, bbox_pred, img_meta):
+        """Regress the bbox for the predicted class. Used in Cascade R-CNN.
+
+        Args:
+            rois (Tensor): Rois from `rpn_head` or last stage
+                `bbox_head`, has shape (num_proposals, 4) or
+                (num_proposals, 5).
+            label (Tensor): Only used when `self.reg_class_agnostic`
+                is False, has shape (num_proposals, ).
+            bbox_pred (Tensor): Regression prediction of
+                current stage `bbox_head`. When `self.reg_class_agnostic`
+                is False, it has shape (n, num_classes * 4), otherwise
+                it has shape (n, 4).
+            img_meta (dict): Image meta info.
+
+        Returns:
+            Tensor: Regressed bboxes, the same shape as input rois.
+        """
+
+        assert rois.size(1) == 4 or rois.size(1) == 5, repr(rois.shape)
+
+        if not self.reg_class_agnostic:
+            label = label * 4
+            inds = torch.stack((label, label + 1, label + 2, label + 3), 1)
+            bbox_pred = torch.gather(bbox_pred, 1, inds)
+        assert bbox_pred.size(1) == 4
+
+        max_shape = img_meta['img_shape']
+
+        if rois.size(1) == 4:
+            new_rois = self.bbox_coder.decode(
+                rois, bbox_pred, max_shape=max_shape)
+        else:
+            bboxes = self.bbox_coder.decode(
+                rois[:, 1:], bbox_pred, max_shape=max_shape)
+            new_rois = torch.cat((rois[:, [0]], bboxes), dim=1)
+
+        return new_rois
+
+    def onnx_export(self,
+                    rois,
+                    cls_score,
+                    bbox_pred,
+                    img_shape,
+                    cfg=None,
+                    **kwargs):
+        """Transform network output for a batch into bbox predictions.
+
+        Args:
+            rois (Tensor): Boxes to be transformed, shape (B, num_boxes, 5).
+            cls_score (Tensor): Box scores, has shape
+                (B, num_boxes, num_classes + 1), 1 represents the background.
+            bbox_pred (Tensor, optional): Box energies / deltas, has shape
+                (B, num_boxes, num_classes * 4). If None, the rois are used.
+            img_shape (torch.Tensor): Shape of image.
+            cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. Defaults to None,
+                but the body reads `cfg.nms` etc., so a config is needed.
+
+        Returns:
+            tuple[Tensor, Tensor]: dets of shape [N, num_det, 5]
+                and class labels of shape [N, num_det].
+        """
+
+        assert rois.ndim == 3, 'Only support export two stage ' \
+                               'model to ONNX ' \
+                               'with batch dimension. '
+        if self.custom_cls_channels:
+            scores = self.loss_cls.get_activation(cls_score)
+        else:
+            scores = F.softmax(
+                cls_score, dim=-1) if cls_score is not None else None
+
+        if bbox_pred is not None:
+            bboxes = self.bbox_coder.decode(
+                rois[..., 1:], bbox_pred, max_shape=img_shape)
+        else:
+            bboxes = rois[..., 1:].clone()
+            if img_shape is not None:
+                max_shape = bboxes.new_tensor(img_shape)[..., :2]
+                min_xy = bboxes.new_tensor(0)
+                max_xy = torch.cat(
+                    [max_shape] * 2, dim=-1).flip(-1).unsqueeze(-2)
+                bboxes = torch.where(bboxes < min_xy, min_xy, bboxes)
+                bboxes = torch.where(bboxes > max_xy, max_xy, bboxes)
+
+        # Replace multiclass_nms with ONNX::NonMaxSuppression in deployment
+        from mmdet.core.export import add_dummy_nms_for_onnx
+        max_output_boxes_per_class = cfg.nms.get('max_output_boxes_per_class',
+                                                 cfg.max_per_img)
+        iou_threshold = cfg.nms.get('iou_threshold', 0.5)
+        score_threshold = cfg.score_thr
+        nms_pre = cfg.get('deploy_nms_pre', -1)
+
+        scores = scores[..., :self.num_classes]
+        if self.reg_class_agnostic:
+            return add_dummy_nms_for_onnx(
+                bboxes,
+                scores,
+                max_output_boxes_per_class,
+                iou_threshold,
+                score_threshold,
+                pre_top_k=nms_pre,
+                after_top_k=cfg.max_per_img)
+        else:
+            batch_size = scores.shape[0]
+            labels = torch.arange(
+                self.num_classes, dtype=torch.long).to(scores.device)
+            labels = labels.view(1, 1, -1).expand_as(scores)
+            labels = labels.reshape(batch_size, -1)
+            scores = scores.reshape(batch_size, -1)
+            bboxes = bboxes.reshape(batch_size, -1, 4)
+
+            max_size = torch.max(img_shape)
+            # Offset bboxes of each class so that bboxes of different labels
+            #  do not overlap.
+            offsets = (labels * max_size + 1).unsqueeze(2)
+            bboxes_for_nms = bboxes + offsets
+
+            batch_dets, labels = add_dummy_nms_for_onnx(
+                bboxes_for_nms,
+                scores.unsqueeze(2),
+                max_output_boxes_per_class,
+                iou_threshold,
+                score_threshold,
+                pre_top_k=nms_pre,
+                after_top_k=cfg.max_per_img,
+                labels=labels)
+            # Offset the bboxes back after dummy nms.
+            offsets = (labels * max_size + 1).unsqueeze(2)
+            # Indexing + inplace operation fails with dynamic shape in ONNX
+            # original style: batch_dets[..., :4] -= offsets
+            bboxes, scores = batch_dets[..., 0:4], batch_dets[..., 4:5]
+            bboxes -= offsets
+            batch_dets = torch.cat([bboxes, scores], dim=2)
+            return batch_dets, labels
diff --git a/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py b/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py
new file mode 100755
index 0000000..21124b9
--- /dev/null
+++ b/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py
@@ -0,0 +1,229 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+
+from mmdet.models.builder import HEADS
+from mmdet.models.utils import build_linear_layer
+from .bbox_head import BBoxHead
+
+
+@HEADS.register_module()
+class ConvFCBBoxHead(BBoxHead):
+    r"""More general bbox head, with shared conv and fc layers and two optional
+    separated branches.
+
+    .. code-block:: none
+
+                                    /-> cls convs -> cls fcs -> cls
+        shared convs -> shared fcs
+                                    \-> reg convs -> reg fcs -> reg
+    """  # noqa: W605
+
+    def __init__(self,
+                 num_shared_convs=0,
+                 num_shared_fcs=0,
+                 num_cls_convs=0,
+                 num_cls_fcs=0,
+                 num_reg_convs=0,
+                 num_reg_fcs=0,
+                 conv_out_channels=256,
+                 fc_out_channels=1024,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 init_cfg=None,
+                 *args,
+                 **kwargs):
+        super(ConvFCBBoxHead, self).__init__(
+            *args, init_cfg=init_cfg, **kwargs)
+        assert (num_shared_convs + num_shared_fcs + num_cls_convs +
+                num_cls_fcs + num_reg_convs + num_reg_fcs > 0)
+        if num_cls_convs > 0 or num_reg_convs > 0:
+            assert num_shared_fcs == 0
+        if not self.with_cls:
+            assert num_cls_convs == 0 and num_cls_fcs == 0
+        if not self.with_reg:
+            assert num_reg_convs == 0 and num_reg_fcs == 0
+        self.num_shared_convs = num_shared_convs
+        self.num_shared_fcs = num_shared_fcs
+        self.num_cls_convs = num_cls_convs
+        self.num_cls_fcs = num_cls_fcs
+        self.num_reg_convs = num_reg_convs
+        self.num_reg_fcs = num_reg_fcs
+        self.conv_out_channels = conv_out_channels
+        self.fc_out_channels = fc_out_channels
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+
+        # add shared convs and fcs
+        self.shared_convs, self.shared_fcs, last_layer_dim = \
+            self._add_conv_fc_branch(
+                self.num_shared_convs, self.num_shared_fcs, self.in_channels,
+                True)
+        self.shared_out_channels = last_layer_dim
+
+        # add cls specific branch
+        self.cls_convs, self.cls_fcs, self.cls_last_dim = \
+            self._add_conv_fc_branch(
+                self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels)
+
+        # add reg specific branch
+        self.reg_convs, self.reg_fcs, self.reg_last_dim = \
+            self._add_conv_fc_branch(
+                self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels)
+        # no fc before a predictor: scale its input dim by the RoI area
+        if self.num_shared_fcs == 0 and not self.with_avg_pool:
+            if self.num_cls_fcs == 0:
+                self.cls_last_dim *= self.roi_feat_area
+            if self.num_reg_fcs == 0:
+                self.reg_last_dim *= self.roi_feat_area
+
+        self.relu = nn.ReLU(inplace=True)
+        # reconstruct fc_cls and fc_reg since input channels are changed
+        if self.with_cls:
+            if self.custom_cls_channels:
+                cls_channels = self.loss_cls.get_cls_channels(self.num_classes)
+            else:
+                cls_channels = self.num_classes + 1
+            self.fc_cls = build_linear_layer(
+                self.cls_predictor_cfg,
+                in_features=self.cls_last_dim,
+                out_features=cls_channels)
+        if self.with_reg:
+            out_dim_reg = (4 if self.reg_class_agnostic else 4 *
+                           self.num_classes)
+            self.fc_reg = build_linear_layer(
+                self.reg_predictor_cfg,
+                in_features=self.reg_last_dim,
+                out_features=out_dim_reg)
+
+        if init_cfg is None:
+            # when init_cfg is None, `BBoxHead.__init__` has already set
+            # `self.init_cfg` to a flat list holding (when enabled)
+            #  dict(type='Normal', std=0.01, override=dict(name='fc_cls')),
+            #  dict(type='Normal', std=0.001, override=dict(name='fc_reg'))
+            # after `super(ConvFCBBoxHead, self).__init__()`
+            # we only need to append additional configuration
+            # for `shared_fcs`, `cls_fcs` and `reg_fcs`
+            self.init_cfg += [
+                dict(
+                    type='Xavier',
+                    distribution='uniform',
+                    override=[
+                        dict(name='shared_fcs'),
+                        dict(name='cls_fcs'),
+                        dict(name='reg_fcs')
+                    ])
+            ]
+
+    def _add_conv_fc_branch(self,
+                            num_branch_convs,
+                            num_branch_fcs,
+                            in_channels,
+                            is_shared=False):
+        """Add shared or separable branch.
+
+        convs -> avg pool (optional) -> fcs
+        """
+        last_layer_dim = in_channels
+        # add branch specific conv layers
+        branch_convs = nn.ModuleList()
+        if num_branch_convs > 0:
+            for i in range(num_branch_convs):
+                conv_in_channels = (
+                    last_layer_dim if i == 0 else self.conv_out_channels)
+                branch_convs.append(
+                    ConvModule(
+                        conv_in_channels,
+                        self.conv_out_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg))
+            last_layer_dim = self.conv_out_channels
+        # add branch specific fc layers
+        branch_fcs = nn.ModuleList()
+        if num_branch_fcs > 0:
+            # for shared branch, only consider self.with_avg_pool
+            # for separated branches, also consider self.num_shared_fcs
+            if (is_shared
+                    or self.num_shared_fcs == 0) and not self.with_avg_pool:
+                last_layer_dim *= self.roi_feat_area
+            for i in range(num_branch_fcs):
+                fc_in_channels = (
+                    last_layer_dim if i == 0 else self.fc_out_channels)
+                branch_fcs.append(
+                    nn.Linear(fc_in_channels, self.fc_out_channels))
+            last_layer_dim = self.fc_out_channels
+        return branch_convs, branch_fcs, last_layer_dim
+
+    def forward(self, x):
+        # shared part
+        if self.num_shared_convs > 0:
+            for conv in self.shared_convs:
+                x = conv(x)
+
+        if self.num_shared_fcs > 0:
+            if self.with_avg_pool:
+                x = self.avg_pool(x)
+
+            x = x.flatten(1)
+
+            for fc in self.shared_fcs:
+                x = self.relu(fc(x))
+        # separate branches
+        x_cls = x
+        x_reg = x
+
+        for conv in self.cls_convs:
+            x_cls = conv(x_cls)
+        if x_cls.dim() > 2:
+            if self.with_avg_pool:
+                x_cls = self.avg_pool(x_cls)
+            x_cls = x_cls.flatten(1)
+        for fc in self.cls_fcs:
+            x_cls = self.relu(fc(x_cls))
+
+        for conv in self.reg_convs:
+            x_reg = conv(x_reg)
+        if x_reg.dim() > 2:
+            if self.with_avg_pool:
+                x_reg = self.avg_pool(x_reg)
+            x_reg = x_reg.flatten(1)
+        for fc in self.reg_fcs:
+            x_reg = self.relu(fc(x_reg))
+
+        cls_score = self.fc_cls(x_cls) if self.with_cls else None
+        bbox_pred = self.fc_reg(x_reg) if self.with_reg else None
+        return cls_score, bbox_pred
+
+
+@HEADS.register_module()
+class Shared2FCBBoxHead(ConvFCBBoxHead):
+    """ConvFCBBoxHead preset: two shared fc layers, no other branch layers."""
+    def __init__(self, fc_out_channels=1024, *args, **kwargs):
+        super(Shared2FCBBoxHead, self).__init__(
+            num_shared_convs=0,
+            num_shared_fcs=2,
+            num_cls_convs=0,
+            num_cls_fcs=0,
+            num_reg_convs=0,
+            num_reg_fcs=0,
+            fc_out_channels=fc_out_channels,
+            *args,
+            **kwargs)
+
+
+@HEADS.register_module()
+class Shared4Conv1FCBBoxHead(ConvFCBBoxHead):
+    """ConvFCBBoxHead preset: four shared convs then one shared fc layer."""
+    def __init__(self, fc_out_channels=1024, *args, **kwargs):
+        super(Shared4Conv1FCBBoxHead, self).__init__(
+            num_shared_convs=4,
+            num_shared_fcs=1,
+            num_cls_convs=0,
+            num_cls_fcs=0,
+            num_reg_convs=0,
+            num_reg_fcs=0,
+            fc_out_channels=fc_out_channels,
+            *args,
+            **kwargs)
diff --git a/mmdet/models/roi_heads/bbox_heads/dii_head.py b/mmdet/models/roi_heads/bbox_heads/dii_head.py
new file mode 100755
index 0000000..3777f52
--- /dev/null
+++ b/mmdet/models/roi_heads/bbox_heads/dii_head.py
@@ -0,0 +1,426 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import (bias_init_with_prob, build_activation_layer,
+                      build_norm_layer)
+from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention
+from mmcv.runner import auto_fp16, force_fp32
+
+from mmdet.core import multi_apply
+from mmdet.models.builder import HEADS, build_loss
+from mmdet.models.dense_heads.atss_head import reduce_mean
+from mmdet.models.losses import accuracy
+from mmdet.models.utils import build_transformer
+from .bbox_head import BBoxHead
+
+
+@HEADS.register_module()
+class DIIHead(BBoxHead):
+    r"""Dynamic Instance Interactive Head for `Sparse R-CNN: End-to-End Object
+    Detection with Learnable Proposals <https://arxiv.org/abs/2011.12450>`_
+
+    Args:
+        num_classes (int): Number of classes in dataset.
+            Defaults to 80.
+        num_ffn_fcs (int): The number of fully-connected
+            layers in FFNs. Defaults to 2.
+        num_heads (int): The number of attention heads in
+            MultiheadAttention. Defaults to 8.
+        num_cls_fcs (int): The number of fully-connected
+            layers in classification subnet. Defaults to 1.
+        num_reg_fcs (int): The number of fully-connected
+            layers in regression subnet. Defaults to 3.
+        feedforward_channels (int): The hidden dimension
+            of FFNs. Defaults to 2048
+        in_channels (int): Hidden_channels of MultiheadAttention.
+            Defaults to 256.
+        dropout (float): Dropout probability used by the attention,
+            the dynamic conv output and the FFN. Defaults to 0.0.
+        ffn_act_cfg (dict): The activation config for FFNs.
+        dynamic_conv_cfg (dict): The convolution config
+            for DynamicConv.
+        loss_iou (dict): The config for iou or giou loss.
+
+    """
+
+    def __init__(self,
+                 num_classes=80,
+                 num_ffn_fcs=2,
+                 num_heads=8,
+                 num_cls_fcs=1,
+                 num_reg_fcs=3,
+                 feedforward_channels=2048,
+                 in_channels=256,
+                 dropout=0.0,
+                 ffn_act_cfg=dict(type='ReLU', inplace=True),
+                 dynamic_conv_cfg=dict(
+                     type='DynamicConv',
+                     in_channels=256,
+                     feat_channels=64,
+                     out_channels=256,
+                     input_feat_shape=7,
+                     act_cfg=dict(type='ReLU', inplace=True),
+                     norm_cfg=dict(type='LN')),
+                 loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+                 init_cfg=None,
+                 **kwargs):
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super(DIIHead, self).__init__(
+            num_classes=num_classes,
+            reg_decoded_bbox=True,
+            reg_class_agnostic=True,
+            init_cfg=init_cfg,
+            **kwargs)
+        self.loss_iou = build_loss(loss_iou)
+        self.in_channels = in_channels
+        self.fp16_enabled = False
+        self.attention = MultiheadAttention(in_channels, num_heads, dropout)
+        self.attention_norm = build_norm_layer(dict(type='LN'), in_channels)[1]
+
+        self.instance_interactive_conv = build_transformer(dynamic_conv_cfg)
+        self.instance_interactive_conv_dropout = nn.Dropout(dropout)
+        self.instance_interactive_conv_norm = build_norm_layer(
+            dict(type='LN'), in_channels)[1]
+
+        self.ffn = FFN(
+            in_channels,
+            feedforward_channels,
+            num_ffn_fcs,
+            act_cfg=ffn_act_cfg,
+            dropout=dropout)
+        self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1]
+
+        self.cls_fcs = nn.ModuleList()
+        for _ in range(num_cls_fcs):
+            self.cls_fcs.append(
+                nn.Linear(in_channels, in_channels, bias=False))
+            self.cls_fcs.append(
+                build_norm_layer(dict(type='LN'), in_channels)[1])
+            self.cls_fcs.append(
+                build_activation_layer(dict(type='ReLU', inplace=True)))
+
+        # override the self.fc_cls in BBoxHead
+        if self.loss_cls.use_sigmoid:
+            self.fc_cls = nn.Linear(in_channels, self.num_classes)
+        else:
+            self.fc_cls = nn.Linear(in_channels, self.num_classes + 1)
+
+        self.reg_fcs = nn.ModuleList()
+        for _ in range(num_reg_fcs):
+            self.reg_fcs.append(
+                nn.Linear(in_channels, in_channels, bias=False))
+            self.reg_fcs.append(
+                build_norm_layer(dict(type='LN'), in_channels)[1])
+            self.reg_fcs.append(
+                build_activation_layer(dict(type='ReLU', inplace=True)))
+        # override the self.fc_reg in BBoxHead
+        self.fc_reg = nn.Linear(in_channels, 4)
+
+        assert self.reg_class_agnostic, 'DIIHead only ' \
+            'suppport `reg_class_agnostic=True` '
+        assert self.reg_decoded_bbox, 'DIIHead only ' \
+            'suppport `reg_decoded_bbox=True`'
+
+    def init_weights(self):
+        """Use xavier initialization for all weight parameters and set the
+        classification head bias to a specific value when using focal loss."""
+        super(DIIHead, self).init_weights()
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+            else:
+                # adopt the default initialization for
+                # the weight and bias of the layer norm
+                pass
+        if self.loss_cls.use_sigmoid:
+            bias_init = bias_init_with_prob(0.01)
+            nn.init.constant_(self.fc_cls.bias, bias_init)
+
+    @auto_fp16()
+    def forward(self, roi_feat, proposal_feat):
+        """Forward function of Dynamic Instance Interactive Head.
+
+        Args:
+            roi_feat (Tensor): Roi-pooling features with shape
+                (batch_size*num_proposals, feature_dimensions,
+                pooling_h , pooling_w).
+            proposal_feat (Tensor): Intermediate feature get from
+                diihead in last stage, has shape
+                (batch_size, num_proposals, feature_dimensions)
+
+        Returns:
+            tuple[Tensor]: A tuple of classification scores, bbox
+            predictions, an intermediate feature and attention features.
+
+                - cls_scores (Tensor): Classification scores for
+                  all proposals, has shape
+                  (batch_size, num_proposals, num_classes).
+                - bbox_preds (Tensor): Box energies / deltas for
+                  all proposals, has shape
+                  (batch_size, num_proposals, 4).
+                - obj_feat (Tensor): Object feature before the cls and reg
+                  subnets, (batch_size, num_proposals, feature_dimensions).
+                - attn_feats (Tensor): Self-attention output, same shape.
+        """
+        N, num_proposals = proposal_feat.shape[:2]
+
+        # Self attention
+        proposal_feat = proposal_feat.permute(1, 0, 2)
+        proposal_feat = self.attention_norm(self.attention(proposal_feat))
+        attn_feats = proposal_feat.permute(1, 0, 2)
+
+        # instance interactive
+        proposal_feat = attn_feats.reshape(-1, self.in_channels)
+        proposal_feat_iic = self.instance_interactive_conv(
+            proposal_feat, roi_feat)
+        proposal_feat = proposal_feat + self.instance_interactive_conv_dropout(
+            proposal_feat_iic)
+        obj_feat = self.instance_interactive_conv_norm(proposal_feat)
+
+        # FFN
+        obj_feat = self.ffn_norm(self.ffn(obj_feat))
+
+        cls_feat = obj_feat
+        reg_feat = obj_feat
+
+        for cls_layer in self.cls_fcs:
+            cls_feat = cls_layer(cls_feat)
+        for reg_layer in self.reg_fcs:
+            reg_feat = reg_layer(reg_feat)
+
+        cls_score = self.fc_cls(cls_feat).view(
+            N, num_proposals, self.num_classes
+            if self.loss_cls.use_sigmoid else self.num_classes + 1)
+        bbox_delta = self.fc_reg(reg_feat).view(N, num_proposals, 4)
+
+        return cls_score, bbox_delta, obj_feat.view(
+            N, num_proposals, self.in_channels), attn_feats
+
+    @force_fp32(apply_to=('cls_score', 'bbox_pred'))
+    def loss(self,
+             cls_score,
+             bbox_pred,
+             labels,
+             label_weights,
+             bbox_targets,
+             bbox_weights,
+             imgs_whwh=None,
+             reduction_override=None,
+             **kwargs):
+        """Loss function of DIIHead, get loss of all images.
+
+        Args:
+            cls_score (Tensor): Classification prediction
+                results of all class, has shape
+                (batch_size * num_proposals_single_image, num_classes)
+            bbox_pred (Tensor): Regression prediction results,
+                has shape
+                (batch_size * num_proposals_single_image, 4), the last
+                dimension 4 represents [tl_x, tl_y, br_x, br_y].
+            labels (Tensor): Label of each proposal, has shape
+                (batch_size * num_proposals_single_image).
+            label_weights (Tensor): Classification loss
+                weight of each proposal, has shape
+                (batch_size * num_proposals_single_image).
+            bbox_targets (Tensor): Regression targets of each
+                proposals, has shape
+                (batch_size * num_proposals_single_image, 4),
+                the last dimension 4 represents
+                [tl_x, tl_y, br_x, br_y].
+            bbox_weights (Tensor): Regression loss weight of each
+                proposal's coordinates, has shape
+                (batch_size * num_proposals_single_image, 4).
+            imgs_whwh (Tensor): Tensor with
+                shape (batch_size, num_proposals, 4), the last
+                dimension means
+                [img_width,img_height, img_width, img_height].
+            reduction_override (str, optional): The reduction
+                method used to override the original reduction
+                method of the loss. Options are "none",
+                "mean" and "sum". Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: Dictionary of loss components.
+        """
+        losses = dict()
+        bg_class_ind = self.num_classes
+        # note in Sparse R-CNN num_gt == num_pos
+        pos_inds = (labels >= 0) & (labels < bg_class_ind)
+        num_pos = pos_inds.sum().float()
+        avg_factor = reduce_mean(num_pos)
+        if cls_score is not None:
+            if cls_score.numel() > 0:
+                losses['loss_cls'] = self.loss_cls(
+                    cls_score,
+                    labels,
+                    label_weights,
+                    avg_factor=avg_factor,
+                    reduction_override=reduction_override)
+                losses['pos_acc'] = accuracy(cls_score[pos_inds],
+                                             labels[pos_inds])
+        if bbox_pred is not None:
+            # 0~self.num_classes-1 are FG, self.num_classes is BG
+            # do not perform bounding box regression for BG anymore.
+            if pos_inds.any():
+                pos_bbox_pred = bbox_pred.reshape(bbox_pred.size(0),
+                                                  4)[pos_inds.type(torch.bool)]
+                imgs_whwh = imgs_whwh.reshape(bbox_pred.size(0),
+                                              4)[pos_inds.type(torch.bool)]
+                losses['loss_bbox'] = self.loss_bbox(
+                    pos_bbox_pred / imgs_whwh,
+                    bbox_targets[pos_inds.type(torch.bool)] / imgs_whwh,
+                    bbox_weights[pos_inds.type(torch.bool)],
+                    avg_factor=avg_factor)
+                losses['loss_iou'] = self.loss_iou(
+                    pos_bbox_pred,
+                    bbox_targets[pos_inds.type(torch.bool)],
+                    bbox_weights[pos_inds.type(torch.bool)],
+                    avg_factor=avg_factor)
+            else:
+                losses['loss_bbox'] = bbox_pred.sum() * 0
+                losses['loss_iou'] = bbox_pred.sum() * 0
+        return losses
+
+    def _get_target_single(self, pos_inds, neg_inds, pos_bboxes, neg_bboxes,
+                           pos_gt_bboxes, pos_gt_labels, cfg):
+        """Calculate the ground truth for proposals in the single image
+        according to the sampling results.
+
+        Almost the same as the implementation in `bbox_head`,
+        we add pos_inds and neg_inds to select positive and
+        negative samples instead of selecting the first num_pos
+        as positive samples.
+
+        Args:
+            pos_inds (Tensor): The length is equal to the
+                positive sample numbers contain all index
+                of the positive sample in the origin proposal set.
+            neg_inds (Tensor): The length is equal to the
+                negative sample numbers contain all index
+                of the negative sample in the origin proposal set.
+            pos_bboxes (Tensor): Contains all the positive boxes,
+                has shape (num_pos, 4), the last dimension 4
+                represents [tl_x, tl_y, br_x, br_y].
+            neg_bboxes (Tensor): Contains all the negative boxes,
+                has shape (num_neg, 4), the last dimension 4
+                represents [tl_x, tl_y, br_x, br_y].
+            pos_gt_bboxes (Tensor): Contains gt_boxes for
+                all positive samples, has shape (num_pos, 4),
+                the last dimension 4
+                represents [tl_x, tl_y, br_x, br_y].
+            pos_gt_labels (Tensor): Contains gt_labels for
+                all positive samples, has shape (num_pos, ).
+            cfg (obj:`ConfigDict`): `train_cfg` of R-CNN.
+
+        Returns:
+            Tuple[Tensor]: Ground truth for proposals in a single image.
+            Containing the following Tensors:
+
+                - labels(Tensor): Gt_labels for all proposals, has
+                  shape (num_proposals,).
+                - label_weights(Tensor): Labels_weights for all proposals, has
+                  shape (num_proposals,).
+                - bbox_targets(Tensor):Regression target for all proposals, has
+                  shape (num_proposals, 4), the last dimension 4
+                  represents [tl_x, tl_y, br_x, br_y].
+                - bbox_weights(Tensor):Regression weights for all proposals,
+                  has shape (num_proposals, 4).
+        """
+        num_pos = pos_bboxes.size(0)
+        num_neg = neg_bboxes.size(0)
+        num_samples = num_pos + num_neg
+
+        # original implementation uses new_zeros since BG are set to be 0
+        # now use empty & fill because BG cat_id = num_classes,
+        # FG cat_id = [0, num_classes-1]
+        labels = pos_bboxes.new_full((num_samples, ),
+                                     self.num_classes,
+                                     dtype=torch.long)
+        label_weights = pos_bboxes.new_zeros(num_samples)
+        bbox_targets = pos_bboxes.new_zeros(num_samples, 4)
+        bbox_weights = pos_bboxes.new_zeros(num_samples, 4)
+        if num_pos > 0:
+            labels[pos_inds] = pos_gt_labels
+            pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
+            label_weights[pos_inds] = pos_weight
+            if not self.reg_decoded_bbox:
+                pos_bbox_targets = self.bbox_coder.encode(
+                    pos_bboxes, pos_gt_bboxes)
+            else:
+                pos_bbox_targets = pos_gt_bboxes
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1
+        if num_neg > 0:
+            label_weights[neg_inds] = 1.0
+
+        return labels, label_weights, bbox_targets, bbox_weights
+
+    def get_targets(self,
+                    sampling_results,
+                    gt_bboxes,
+                    gt_labels,
+                    rcnn_train_cfg,
+                    concat=True):
+        """Calculate the ground truth for all samples in a batch according to
+        the sampling_results.
+
+        Almost the same as the implementation in bbox_head, we passed
+        additional parameters pos_inds_list and neg_inds_list to
+        `_get_target_single` function.
+
+        Args:
+            sampling_results (List[obj:SamplingResults]): Assign results of
+                all images in a batch after sampling.
+            gt_bboxes (list[Tensor]): Gt_bboxes of all images in a batch,
+                each tensor has shape (num_gt, 4),  the last dimension 4
+                represents [tl_x, tl_y, br_x, br_y].
+            gt_labels (list[Tensor]): Gt_labels of all images in a batch,
+                each tensor has shape (num_gt,).
+            rcnn_train_cfg (obj:`ConfigDict`): `train_cfg` of RCNN.
+            concat (bool): Whether to concatenate the results of all
+                the images in a single batch.
+
+        Returns:
+            Tuple[Tensor]: Ground truth for proposals in a batch.
+            Containing the following list of Tensors:
+
+                - labels (list[Tensor],Tensor): Gt_labels for all
+                  proposals in a batch, each tensor in list has
+                  shape (num_proposals,) when `concat=False`, otherwise just
+                  a single tensor has shape (num_all_proposals,).
+                - label_weights (list[Tensor]): Labels_weights for
+                  all proposals in a batch, each tensor in list has shape
+                  (num_proposals,) when `concat=False`, otherwise just a
+                  single tensor has shape (num_all_proposals,).
+                - bbox_targets (list[Tensor],Tensor): Regression target
+                  for all proposals in a batch, each tensor in list has
+                  shape (num_proposals, 4) when `concat=False`, otherwise
+                  just a single tensor has shape (num_all_proposals, 4),
+                  the last dimension 4 represents [tl_x, tl_y, br_x, br_y].
+                - bbox_weights (list[tensor],Tensor): Regression weights for
+                  all proposals in a batch, each tensor in list has shape
+                  (num_proposals, 4) when `concat=False`, otherwise just a
+                  single tensor has shape (num_all_proposals, 4).
+        """
+        pos_inds_list = [res.pos_inds for res in sampling_results]
+        neg_inds_list = [res.neg_inds for res in sampling_results]
+        pos_bboxes_list = [res.pos_bboxes for res in sampling_results]
+        neg_bboxes_list = [res.neg_bboxes for res in sampling_results]
+        pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results]
+        pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]
+        labels, label_weights, bbox_targets, bbox_weights = multi_apply(
+            self._get_target_single,
+            pos_inds_list,
+            neg_inds_list,
+            pos_bboxes_list,
+            neg_bboxes_list,
+            pos_gt_bboxes_list,
+            pos_gt_labels_list,
+            cfg=rcnn_train_cfg)
+        if concat:
+            labels = torch.cat(labels, 0)
+            label_weights = torch.cat(label_weights, 0)
+            bbox_targets = torch.cat(bbox_targets, 0)
+            bbox_weights = torch.cat(bbox_weights, 0)
+        return labels, label_weights, bbox_targets, bbox_weights
diff --git a/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py b/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py
new file mode 100755
index 0000000..2a38d59
--- /dev/null
+++ b/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py
@@ -0,0 +1,178 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule, ModuleList
+
+from mmdet.models.backbones.resnet import Bottleneck
+from mmdet.models.builder import HEADS
+from .bbox_head import BBoxHead
+
+
+class BasicResBlock(BaseModule):
+    """Basic residual block.
+
+    This block is a little different from the block in the ResNet backbone.
+    The kernel size of conv2 is 1 in this block while 3 in ResNet BasicBlock.
+
+    Args:
+        in_channels (int): Channels of the input feature map.
+        out_channels (int): Channels of the output feature map.
+        conv_cfg (dict): The config dict for convolution layers.
+        norm_cfg (dict): The config dict for normalization layers.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 init_cfg=None):
+        super(BasicResBlock, self).__init__(init_cfg)
+
+        # main path
+        self.conv1 = ConvModule(
+            in_channels,
+            in_channels,
+            kernel_size=3,
+            padding=1,
+            bias=False,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg)
+        self.conv2 = ConvModule(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            bias=False,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=None)
+
+        # identity path
+        self.conv_identity = ConvModule(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=None)
+
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        identity = x
+
+        x = self.conv1(x)
+        x = self.conv2(x)
+
+        identity = self.conv_identity(identity)
+        out = x + identity
+
+        out = self.relu(out)
+        return out
+
+
+@HEADS.register_module()
+class DoubleConvFCBBoxHead(BBoxHead):
+    r"""Bbox head used in Double-Head R-CNN
+
+    .. code-block:: none
+
+                                          /-> cls
+                      /-> shared convs ->
+                                          \-> reg
+        roi features
+                                          /-> cls
+                      \-> shared fc    ->
+                                          \-> reg
+    """  # noqa: W605
+
+    def __init__(self,
+                 num_convs=0,
+                 num_fcs=0,
+                 conv_out_channels=1024,
+                 fc_out_channels=1024,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 init_cfg=dict(
+                     type='Normal',
+                     override=[
+                         dict(type='Normal', name='fc_cls', std=0.01),
+                         dict(type='Normal', name='fc_reg', std=0.001),
+                         dict(
+                             type='Xavier',
+                             name='fc_branch',
+                             distribution='uniform')
+                     ]),
+                 **kwargs):
+        kwargs.setdefault('with_avg_pool', True)
+        super(DoubleConvFCBBoxHead, self).__init__(init_cfg=init_cfg, **kwargs)
+        assert self.with_avg_pool
+        assert num_convs > 0
+        assert num_fcs > 0
+        self.num_convs = num_convs
+        self.num_fcs = num_fcs
+        self.conv_out_channels = conv_out_channels
+        self.fc_out_channels = fc_out_channels
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+
+        # increase the channel of input features
+        self.res_block = BasicResBlock(self.in_channels,
+                                       self.conv_out_channels)
+
+        # add conv heads
+        self.conv_branch = self._add_conv_branch()
+        # add fc heads
+        self.fc_branch = self._add_fc_branch()
+
+        out_dim_reg = 4 if self.reg_class_agnostic else 4 * self.num_classes
+        self.fc_reg = nn.Linear(self.conv_out_channels, out_dim_reg)
+
+        self.fc_cls = nn.Linear(self.fc_out_channels, self.num_classes + 1)
+        self.relu = nn.ReLU(inplace=True)
+
+    def _add_conv_branch(self):
+        """Add the conv branch which consists of a sequence of conv blocks."""
+        branch_convs = ModuleList()
+        for i in range(self.num_convs):
+            branch_convs.append(
+                Bottleneck(
+                    inplanes=self.conv_out_channels,
+                    planes=self.conv_out_channels // 4,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        return branch_convs
+
+    def _add_fc_branch(self):
+        """Add the fc branch which consists of a sequence of fc layers."""
+        branch_fcs = ModuleList()
+        for i in range(self.num_fcs):
+            fc_in_channels = (
+                self.in_channels *
+                self.roi_feat_area if i == 0 else self.fc_out_channels)
+            branch_fcs.append(nn.Linear(fc_in_channels, self.fc_out_channels))
+        return branch_fcs
+
+    def forward(self, x_cls, x_reg):
+        # conv head
+        x_conv = self.res_block(x_reg)
+
+        for conv in self.conv_branch:
+            x_conv = conv(x_conv)
+
+        if self.with_avg_pool:
+            x_conv = self.avg_pool(x_conv)
+
+        x_conv = x_conv.view(x_conv.size(0), -1)
+        bbox_pred = self.fc_reg(x_conv)
+
+        # fc head
+        x_fc = x_cls.view(x_cls.size(0), -1)
+        for fc in self.fc_branch:
+            x_fc = self.relu(fc(x_fc))
+
+        cls_score = self.fc_cls(x_fc)
+
+        return cls_score, bbox_pred
diff --git a/mmdet/models/roi_heads/bbox_heads/sabl_head.py b/mmdet/models/roi_heads/bbox_heads/sabl_head.py
new file mode 100755
index 0000000..0ce986b
--- /dev/null
+++ b/mmdet/models/roi_heads/bbox_heads/sabl_head.py
@@ -0,0 +1,596 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule, force_fp32
+
+from mmdet.core import build_bbox_coder, multi_apply, multiclass_nms
+from mmdet.models.builder import HEADS, build_loss
+from mmdet.models.losses import accuracy
+
+
+@HEADS.register_module()
+class SABLHead(BaseModule):
+    """Side-Aware Boundary Localization (SABL) for RoI-Head.
+
+    Side-Aware features are extracted by conv layers
+    with an attention mechanism.
+    Boundary Localization with Bucketing and Bucketing Guided Rescoring
+    are implemented in BucketingBBoxCoder.
+
+    Please refer to https://arxiv.org/abs/1912.04260 for more details.
+
+    Args:
+        cls_in_channels (int): Input channels of cls RoI feature. \
+            Defaults to 256.
+        reg_in_channels (int): Input channels of reg RoI feature. \
+            Defaults to 256.
+        roi_feat_size (int): Size of RoI features. Defaults to 7.
+        reg_feat_up_ratio (int): Upsample ratio of reg features. \
+            Defaults to 2.
+        reg_pre_kernel (int): Kernel of 2D conv layers before \
+            attention pooling. Defaults to 3.
+        reg_post_kernel (int): Kernel of 1D conv layers after \
+            attention pooling. Defaults to 3.
+        reg_pre_num (int): Number of pre convs. Defaults to 2.
+        reg_post_num (int): Number of post convs. Defaults to 1.
+        num_classes (int): Number of classes in dataset. Defaults to 80.
+        cls_out_channels (int): Hidden channels in cls fcs. Defaults to 1024.
+        reg_offset_out_channels (int): Hidden and output channel \
+            of reg offset branch. Defaults to 256.
+        reg_cls_out_channels (int): Hidden and output channel \
+            of reg cls branch. Defaults to 256.
+        num_cls_fcs (int): Number of fcs for cls branch. Defaults to 1.
+        num_reg_fcs (int): Number of fcs for reg branch.. Defaults to 0.
+        reg_class_agnostic (bool): Class agnostic regression or not. \
+            Defaults to True.
+        norm_cfg (dict): Config of norm layers. Defaults to None.
+        bbox_coder (dict): Config of bbox coder. Defaults 'BucketingBBoxCoder'.
+        loss_cls (dict): Config of classification loss.
+        loss_bbox_cls (dict): Config of classification loss for bbox branch.
+        loss_bbox_reg (dict): Config of regression loss for bbox branch.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 num_classes,
+                 cls_in_channels=256,
+                 reg_in_channels=256,
+                 roi_feat_size=7,
+                 reg_feat_up_ratio=2,
+                 reg_pre_kernel=3,
+                 reg_post_kernel=3,
+                 reg_pre_num=2,
+                 reg_post_num=1,
+                 cls_out_channels=1024,
+                 reg_offset_out_channels=256,
+                 reg_cls_out_channels=256,
+                 num_cls_fcs=1,
+                 num_reg_fcs=0,
+                 reg_class_agnostic=True,
+                 norm_cfg=None,
+                 bbox_coder=dict(
+                     type='BucketingBBoxCoder',
+                     num_buckets=14,
+                     scale_factor=1.7),
+                 loss_cls=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=1.0),
+                 loss_bbox_cls=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 loss_bbox_reg=dict(
+                     type='SmoothL1Loss', beta=0.1, loss_weight=1.0),
+                 init_cfg=None):
+        super(SABLHead, self).__init__(init_cfg)
+        self.cls_in_channels = cls_in_channels
+        self.reg_in_channels = reg_in_channels
+        self.roi_feat_size = roi_feat_size
+        self.reg_feat_up_ratio = int(reg_feat_up_ratio)
+        self.num_buckets = bbox_coder['num_buckets']
+        assert self.reg_feat_up_ratio // 2 >= 1
+        self.up_reg_feat_size = roi_feat_size * self.reg_feat_up_ratio
+        assert self.up_reg_feat_size == bbox_coder['num_buckets']
+        self.reg_pre_kernel = reg_pre_kernel
+        self.reg_post_kernel = reg_post_kernel
+        self.reg_pre_num = reg_pre_num
+        self.reg_post_num = reg_post_num
+        self.num_classes = num_classes
+        self.cls_out_channels = cls_out_channels
+        self.reg_offset_out_channels = reg_offset_out_channels
+        self.reg_cls_out_channels = reg_cls_out_channels
+        self.num_cls_fcs = num_cls_fcs
+        self.num_reg_fcs = num_reg_fcs
+        self.reg_class_agnostic = reg_class_agnostic
+        assert self.reg_class_agnostic
+        self.norm_cfg = norm_cfg
+
+        self.bbox_coder = build_bbox_coder(bbox_coder)
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_bbox_cls = build_loss(loss_bbox_cls)
+        self.loss_bbox_reg = build_loss(loss_bbox_reg)
+
+        self.cls_fcs = self._add_fc_branch(self.num_cls_fcs,
+                                           self.cls_in_channels,
+                                           self.roi_feat_size,
+                                           self.cls_out_channels)
+
+        self.side_num = int(np.ceil(self.num_buckets / 2))
+
+        if self.reg_feat_up_ratio > 1:
+            self.upsample_x = nn.ConvTranspose1d(
+                reg_in_channels,
+                reg_in_channels,
+                self.reg_feat_up_ratio,
+                stride=self.reg_feat_up_ratio)
+            self.upsample_y = nn.ConvTranspose1d(
+                reg_in_channels,
+                reg_in_channels,
+                self.reg_feat_up_ratio,
+                stride=self.reg_feat_up_ratio)
+
+        self.reg_pre_convs = nn.ModuleList()
+        for i in range(self.reg_pre_num):
+            reg_pre_conv = ConvModule(
+                reg_in_channels,
+                reg_in_channels,
+                kernel_size=reg_pre_kernel,
+                padding=reg_pre_kernel // 2,
+                norm_cfg=norm_cfg,
+                act_cfg=dict(type='ReLU'))
+            self.reg_pre_convs.append(reg_pre_conv)
+
+        self.reg_post_conv_xs = nn.ModuleList()
+        for i in range(self.reg_post_num):
+            reg_post_conv_x = ConvModule(
+                reg_in_channels,
+                reg_in_channels,
+                kernel_size=(1, reg_post_kernel),
+                padding=(0, reg_post_kernel // 2),
+                norm_cfg=norm_cfg,
+                act_cfg=dict(type='ReLU'))
+            self.reg_post_conv_xs.append(reg_post_conv_x)
+        self.reg_post_conv_ys = nn.ModuleList()
+        for i in range(self.reg_post_num):
+            reg_post_conv_y = ConvModule(
+                reg_in_channels,
+                reg_in_channels,
+                kernel_size=(reg_post_kernel, 1),
+                padding=(reg_post_kernel // 2, 0),
+                norm_cfg=norm_cfg,
+                act_cfg=dict(type='ReLU'))
+            self.reg_post_conv_ys.append(reg_post_conv_y)
+
+        self.reg_conv_att_x = nn.Conv2d(reg_in_channels, 1, 1)
+        self.reg_conv_att_y = nn.Conv2d(reg_in_channels, 1, 1)
+
+        self.fc_cls = nn.Linear(self.cls_out_channels, self.num_classes + 1)
+        self.relu = nn.ReLU(inplace=True)
+
+        self.reg_cls_fcs = self._add_fc_branch(self.num_reg_fcs,
+                                               self.reg_in_channels, 1,
+                                               self.reg_cls_out_channels)
+        self.reg_offset_fcs = self._add_fc_branch(self.num_reg_fcs,
+                                                  self.reg_in_channels, 1,
+                                                  self.reg_offset_out_channels)
+        self.fc_reg_cls = nn.Linear(self.reg_cls_out_channels, 1)
+        self.fc_reg_offset = nn.Linear(self.reg_offset_out_channels, 1)
+
+        if init_cfg is None:
+            self.init_cfg = [
+                dict(
+                    type='Xavier',
+                    layer='Linear',
+                    distribution='uniform',
+                    override=[
+                        dict(type='Normal', name='reg_conv_att_x', std=0.01),
+                        dict(type='Normal', name='reg_conv_att_y', std=0.01),
+                        dict(type='Normal', name='fc_reg_cls', std=0.01),
+                        dict(type='Normal', name='fc_cls', std=0.01),
+                        dict(type='Normal', name='fc_reg_offset', std=0.001)
+                    ])
+            ]
+            if self.reg_feat_up_ratio > 1:
+                self.init_cfg += [
+                    dict(
+                        type='Kaiming',
+                        distribution='normal',
+                        override=[
+                            dict(name='upsample_x'),
+                            dict(name='upsample_y')
+                        ])
+                ]
+
+    @property
+    def custom_cls_channels(self):
+        return getattr(self.loss_cls, 'custom_cls_channels', False)
+
+    @property
+    def custom_activation(self):
+        return getattr(self.loss_cls, 'custom_activation', False)
+
+    @property
+    def custom_accuracy(self):
+        return getattr(self.loss_cls, 'custom_accuracy', False)
+
+    def _add_fc_branch(self, num_branch_fcs, in_channels, roi_feat_size,
+                       fc_out_channels):
+        in_channels = in_channels * roi_feat_size * roi_feat_size
+        branch_fcs = nn.ModuleList()
+        for i in range(num_branch_fcs):
+            fc_in_channels = (in_channels if i == 0 else fc_out_channels)
+            branch_fcs.append(nn.Linear(fc_in_channels, fc_out_channels))
+        return branch_fcs
+
+    def cls_forward(self, cls_x):
+        cls_x = cls_x.view(cls_x.size(0), -1)
+        for fc in self.cls_fcs:
+            cls_x = self.relu(fc(cls_x))
+        cls_score = self.fc_cls(cls_x)
+        return cls_score
+
+    def attention_pool(self, reg_x):
+        """Extract direction-specific features fx and fy with attention
+        methanism."""
+        reg_fx = reg_x
+        reg_fy = reg_x
+        reg_fx_att = self.reg_conv_att_x(reg_fx).sigmoid()
+        reg_fy_att = self.reg_conv_att_y(reg_fy).sigmoid()
+        reg_fx_att = reg_fx_att / reg_fx_att.sum(dim=2).unsqueeze(2)
+        reg_fy_att = reg_fy_att / reg_fy_att.sum(dim=3).unsqueeze(3)
+        reg_fx = (reg_fx * reg_fx_att).sum(dim=2)
+        reg_fy = (reg_fy * reg_fy_att).sum(dim=3)
+        return reg_fx, reg_fy
+
+    def side_aware_feature_extractor(self, reg_x):
+        """Refine and extract side-aware features without split them."""
+        for reg_pre_conv in self.reg_pre_convs:
+            reg_x = reg_pre_conv(reg_x)
+        reg_fx, reg_fy = self.attention_pool(reg_x)
+
+        if self.reg_post_num > 0:
+            reg_fx = reg_fx.unsqueeze(2)
+            reg_fy = reg_fy.unsqueeze(3)
+            for i in range(self.reg_post_num):
+                reg_fx = self.reg_post_conv_xs[i](reg_fx)
+                reg_fy = self.reg_post_conv_ys[i](reg_fy)
+            reg_fx = reg_fx.squeeze(2)
+            reg_fy = reg_fy.squeeze(3)
+        if self.reg_feat_up_ratio > 1:
+            reg_fx = self.relu(self.upsample_x(reg_fx))
+            reg_fy = self.relu(self.upsample_y(reg_fy))
+        reg_fx = torch.transpose(reg_fx, 1, 2)
+        reg_fy = torch.transpose(reg_fy, 1, 2)
+        return reg_fx.contiguous(), reg_fy.contiguous()
+
+    def reg_pred(self, x, offset_fcs, cls_fcs):
+        """Predict bucketing estimation (cls_pred) and fine regression (offset
+        pred) with side-aware features."""
+        x_offset = x.view(-1, self.reg_in_channels)
+        x_cls = x.view(-1, self.reg_in_channels)
+
+        for fc in offset_fcs:
+            x_offset = self.relu(fc(x_offset))
+        for fc in cls_fcs:
+            x_cls = self.relu(fc(x_cls))
+        offset_pred = self.fc_reg_offset(x_offset)
+        cls_pred = self.fc_reg_cls(x_cls)
+
+        offset_pred = offset_pred.view(x.size(0), -1)
+        cls_pred = cls_pred.view(x.size(0), -1)
+
+        return offset_pred, cls_pred
+
+    def side_aware_split(self, feat):
+        """Split side-aware features aligned with orders of bucketing
+        targets."""
+        l_end = int(np.ceil(self.up_reg_feat_size / 2))
+        r_start = int(np.floor(self.up_reg_feat_size / 2))
+        feat_fl = feat[:, :l_end]
+        feat_fr = feat[:, r_start:].flip(dims=(1, ))
+        feat_fl = feat_fl.contiguous()
+        feat_fr = feat_fr.contiguous()
+        feat = torch.cat([feat_fl, feat_fr], dim=-1)
+        return feat
+
+    def bbox_pred_split(self, bbox_pred, num_proposals_per_img):
+        """Split batch bbox prediction back to each image."""
+        bucket_cls_preds, bucket_offset_preds = bbox_pred
+        bucket_cls_preds = bucket_cls_preds.split(num_proposals_per_img, 0)
+        bucket_offset_preds = bucket_offset_preds.split(
+            num_proposals_per_img, 0)
+        bbox_pred = tuple(zip(bucket_cls_preds, bucket_offset_preds))
+        return bbox_pred
+
+    def reg_forward(self, reg_x):
+        outs = self.side_aware_feature_extractor(reg_x)
+        edge_offset_preds = []
+        edge_cls_preds = []
+        reg_fx = outs[0]
+        reg_fy = outs[1]
+        offset_pred_x, cls_pred_x = self.reg_pred(reg_fx, self.reg_offset_fcs,
+                                                  self.reg_cls_fcs)
+        offset_pred_y, cls_pred_y = self.reg_pred(reg_fy, self.reg_offset_fcs,
+                                                  self.reg_cls_fcs)
+        offset_pred_x = self.side_aware_split(offset_pred_x)
+        offset_pred_y = self.side_aware_split(offset_pred_y)
+        cls_pred_x = self.side_aware_split(cls_pred_x)
+        cls_pred_y = self.side_aware_split(cls_pred_y)
+        edge_offset_preds = torch.cat([offset_pred_x, offset_pred_y], dim=-1)
+        edge_cls_preds = torch.cat([cls_pred_x, cls_pred_y], dim=-1)
+
+        return (edge_cls_preds, edge_offset_preds)
+
+    def forward(self, x):
+
+        bbox_pred = self.reg_forward(x)
+        cls_score = self.cls_forward(x)
+
+        return cls_score, bbox_pred
+
+    def get_targets(self, sampling_results, gt_bboxes, gt_labels,
+                    rcnn_train_cfg):
+        pos_proposals = [res.pos_bboxes for res in sampling_results]
+        neg_proposals = [res.neg_bboxes for res in sampling_results]
+        pos_gt_bboxes = [res.pos_gt_bboxes for res in sampling_results]
+        pos_gt_labels = [res.pos_gt_labels for res in sampling_results]
+        cls_reg_targets = self.bucket_target(pos_proposals, neg_proposals,
+                                             pos_gt_bboxes, pos_gt_labels,
+                                             rcnn_train_cfg)
+        (labels, label_weights, bucket_cls_targets, bucket_cls_weights,
+         bucket_offset_targets, bucket_offset_weights) = cls_reg_targets
+        return (labels, label_weights, (bucket_cls_targets,
+                                        bucket_offset_targets),
+                (bucket_cls_weights, bucket_offset_weights))
+
+    def bucket_target(self,
+                      pos_proposals_list,
+                      neg_proposals_list,
+                      pos_gt_bboxes_list,
+                      pos_gt_labels_list,
+                      rcnn_train_cfg,
+                      concat=True):
+        (labels, label_weights, bucket_cls_targets, bucket_cls_weights,
+         bucket_offset_targets, bucket_offset_weights) = multi_apply(
+             self._bucket_target_single,
+             pos_proposals_list,
+             neg_proposals_list,
+             pos_gt_bboxes_list,
+             pos_gt_labels_list,
+             cfg=rcnn_train_cfg)
+
+        if concat:
+            labels = torch.cat(labels, 0)
+            label_weights = torch.cat(label_weights, 0)
+            bucket_cls_targets = torch.cat(bucket_cls_targets, 0)
+            bucket_cls_weights = torch.cat(bucket_cls_weights, 0)
+            bucket_offset_targets = torch.cat(bucket_offset_targets, 0)
+            bucket_offset_weights = torch.cat(bucket_offset_weights, 0)
+        return (labels, label_weights, bucket_cls_targets, bucket_cls_weights,
+                bucket_offset_targets, bucket_offset_weights)
+
+    def _bucket_target_single(self, pos_proposals, neg_proposals,
+                              pos_gt_bboxes, pos_gt_labels, cfg):
+        """Compute bucketing estimation targets and fine regression targets for
+        a single image.
+
+        Args:
+            pos_proposals (Tensor): positive proposals of a single image,
+                 Shape (n_pos, 4)
+            neg_proposals (Tensor): negative proposals of a single image,
+                 Shape (n_neg, 4).
+            pos_gt_bboxes (Tensor): gt bboxes assigned to positive proposals
+                 of a single image, Shape (n_pos, 4).
+            pos_gt_labels (Tensor): gt labels assigned to positive proposals
+                 of a single image, Shape (n_pos, ).
+            cfg (dict): Config of calculating targets
+
+        Returns:
+            tuple:
+
+                - labels (Tensor): Labels in a single image. \
+                    Shape (n,).
+                - label_weights (Tensor): Label weights in a single image.\
+                    Shape (n,)
+                - bucket_cls_targets (Tensor): Bucket cls targets in \
+                    a single image. Shape (n, num_buckets*2).
+                - bucket_cls_weights (Tensor): Bucket cls weights in \
+                    a single image. Shape (n, num_buckets*2).
+                - bucket_offset_targets (Tensor): Bucket offset targets \
+                    in a single image. Shape (n, num_buckets*2).
+                - bucket_offset_targets (Tensor): Bucket offset weights \
+                    in a single image. Shape (n, num_buckets*2).
+        """
+        num_pos = pos_proposals.size(0)
+        num_neg = neg_proposals.size(0)
+        num_samples = num_pos + num_neg
+        labels = pos_gt_bboxes.new_full((num_samples, ),
+                                        self.num_classes,
+                                        dtype=torch.long)
+        label_weights = pos_proposals.new_zeros(num_samples)
+        bucket_cls_targets = pos_proposals.new_zeros(num_samples,
+                                                     4 * self.side_num)
+        bucket_cls_weights = pos_proposals.new_zeros(num_samples,
+                                                     4 * self.side_num)
+        bucket_offset_targets = pos_proposals.new_zeros(
+            num_samples, 4 * self.side_num)
+        bucket_offset_weights = pos_proposals.new_zeros(
+            num_samples, 4 * self.side_num)
+        if num_pos > 0:
+            labels[:num_pos] = pos_gt_labels
+            label_weights[:num_pos] = 1.0
+            (pos_bucket_offset_targets, pos_bucket_offset_weights,
+             pos_bucket_cls_targets,
+             pos_bucket_cls_weights) = self.bbox_coder.encode(
+                 pos_proposals, pos_gt_bboxes)
+            bucket_cls_targets[:num_pos, :] = pos_bucket_cls_targets
+            bucket_cls_weights[:num_pos, :] = pos_bucket_cls_weights
+            bucket_offset_targets[:num_pos, :] = pos_bucket_offset_targets
+            bucket_offset_weights[:num_pos, :] = pos_bucket_offset_weights
+        if num_neg > 0:
+            label_weights[-num_neg:] = 1.0
+        return (labels, label_weights, bucket_cls_targets, bucket_cls_weights,
+                bucket_offset_targets, bucket_offset_weights)
+
+    def loss(self,
+             cls_score,
+             bbox_pred,
+             rois,
+             labels,
+             label_weights,
+             bbox_targets,
+             bbox_weights,
+             reduction_override=None):
+        losses = dict()
+        if cls_score is not None:
+            avg_factor = max(torch.sum(label_weights > 0).float().item(), 1.)
+            losses['loss_cls'] = self.loss_cls(
+                cls_score,
+                labels,
+                label_weights,
+                avg_factor=avg_factor,
+                reduction_override=reduction_override)
+            losses['acc'] = accuracy(cls_score, labels)
+
+        if bbox_pred is not None:
+            bucket_cls_preds, bucket_offset_preds = bbox_pred
+            bucket_cls_targets, bucket_offset_targets = bbox_targets
+            bucket_cls_weights, bucket_offset_weights = bbox_weights
+            # edge cls
+            bucket_cls_preds = bucket_cls_preds.view(-1, self.side_num)
+            bucket_cls_targets = bucket_cls_targets.view(-1, self.side_num)
+            bucket_cls_weights = bucket_cls_weights.view(-1, self.side_num)
+            losses['loss_bbox_cls'] = self.loss_bbox_cls(
+                bucket_cls_preds,
+                bucket_cls_targets,
+                bucket_cls_weights,
+                avg_factor=bucket_cls_targets.size(0),
+                reduction_override=reduction_override)
+
+            losses['loss_bbox_reg'] = self.loss_bbox_reg(
+                bucket_offset_preds,
+                bucket_offset_targets,
+                bucket_offset_weights,
+                avg_factor=bucket_offset_targets.size(0),
+                reduction_override=reduction_override)
+
+        return losses
+
+    @force_fp32(apply_to=('cls_score', 'bbox_pred'))
+    def get_bboxes(self,
+                   rois,
+                   cls_score,
+                   bbox_pred,
+                   img_shape,
+                   scale_factor,
+                   rescale=False,
+                   cfg=None):
+        if isinstance(cls_score, list):
+            cls_score = sum(cls_score) / float(len(cls_score))
+        scores = F.softmax(cls_score, dim=1) if cls_score is not None else None
+
+        if bbox_pred is not None:
+            bboxes, confidences = self.bbox_coder.decode(
+                rois[:, 1:], bbox_pred, img_shape)
+        else:
+            bboxes = rois[:, 1:].clone()
+            confidences = None
+            if img_shape is not None:
+                bboxes[:, [0, 2]].clamp_(min=0, max=img_shape[1] - 1)
+                bboxes[:, [1, 3]].clamp_(min=0, max=img_shape[0] - 1)
+
+        if rescale and bboxes.size(0) > 0:
+            if isinstance(scale_factor, float):
+                bboxes /= scale_factor
+            else:
+                bboxes /= torch.from_numpy(scale_factor).to(bboxes.device)
+
+        if cfg is None:
+            return bboxes, scores
+        else:
+            det_bboxes, det_labels = multiclass_nms(
+                bboxes,
+                scores,
+                cfg.score_thr,
+                cfg.nms,
+                cfg.max_per_img,
+                score_factors=confidences)
+
+            return det_bboxes, det_labels
+
+    @force_fp32(apply_to=('bbox_preds', ))
+    def refine_bboxes(self, rois, labels, bbox_preds, pos_is_gts, img_metas):
+        """Refine bboxes during training.
+
+        Args:
+            rois (Tensor): Shape (n*bs, 5), where n is image number per GPU,
+                and bs is the sampled RoIs per image.
+            labels (Tensor): Shape (n*bs, ).
+            bbox_preds (list[Tensor]): Shape [(n*bs, num_buckets*2), \
+                (n*bs, num_buckets*2)].
+            pos_is_gts (list[Tensor]): Flags indicating if each positive bbox
+                is a gt bbox.
+            img_metas (list[dict]): Meta info of each image.
+
+        Returns:
+            list[Tensor]: Refined bboxes of each image in a mini-batch.
+        """
+        img_ids = rois[:, 0].long().unique(sorted=True)
+        assert img_ids.numel() == len(img_metas)
+
+        bboxes_list = []
+        for i in range(len(img_metas)):
+            inds = torch.nonzero(
+                rois[:, 0] == i, as_tuple=False).squeeze(dim=1)
+            num_rois = inds.numel()
+
+            bboxes_ = rois[inds, 1:]
+            label_ = labels[inds]
+            edge_cls_preds, edge_offset_preds = bbox_preds
+            edge_cls_preds_ = edge_cls_preds[inds]
+            edge_offset_preds_ = edge_offset_preds[inds]
+            bbox_pred_ = [edge_cls_preds_, edge_offset_preds_]
+            img_meta_ = img_metas[i]
+            pos_is_gts_ = pos_is_gts[i]
+
+            bboxes = self.regress_by_class(bboxes_, label_, bbox_pred_,
+                                           img_meta_)
+            # filter gt bboxes
+            pos_keep = 1 - pos_is_gts_
+            keep_inds = pos_is_gts_.new_ones(num_rois)
+            keep_inds[:len(pos_is_gts_)] = pos_keep
+
+            bboxes_list.append(bboxes[keep_inds.type(torch.bool)])
+
+        return bboxes_list
+
+    @force_fp32(apply_to=('bbox_pred', ))
+    def regress_by_class(self, rois, label, bbox_pred, img_meta):
+        """Regress the bbox for the predicted class. Used in Cascade R-CNN.
+
+        Args:
+            rois (Tensor): shape (n, 4) or (n, 5)
+            label (Tensor): shape (n, )
+            bbox_pred (list[Tensor]): shape [(n, num_buckets *2), \
+                (n, num_buckets *2)]
+            img_meta (dict): Image meta info.
+
+        Returns:
+            Tensor: Regressed bboxes, the same shape as input rois.
+        """
+        assert rois.size(1) == 4 or rois.size(1) == 5
+
+        if rois.size(1) == 4:
+            new_rois, _ = self.bbox_coder.decode(rois, bbox_pred,
+                                                 img_meta['img_shape'])
+        else:
+            bboxes, _ = self.bbox_coder.decode(rois[:, 1:], bbox_pred,
+                                               img_meta['img_shape'])
+            new_rois = torch.cat((rois[:, [0]], bboxes), dim=1)
+
+        return new_rois
diff --git a/mmdet/models/roi_heads/bbox_heads/scnet_bbox_head.py b/mmdet/models/roi_heads/bbox_heads/scnet_bbox_head.py
new file mode 100755
index 0000000..cf39ebe
--- /dev/null
+++ b/mmdet/models/roi_heads/bbox_heads/scnet_bbox_head.py
@@ -0,0 +1,77 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.models.builder import HEADS
+from .convfc_bbox_head import ConvFCBBoxHead
+
+
+@HEADS.register_module()
+class SCNetBBoxHead(ConvFCBBoxHead):
+    """BBox head for `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+    This inherits ``ConvFCBBoxHead`` with a modified forward() function that
+    can also return the intermediate shared feature.
+    """
+
+    def _forward_shared(self, x):
+        """Forward function for shared part."""
+        if self.num_shared_convs > 0:
+            for conv in self.shared_convs:
+                x = conv(x)
+
+        if self.num_shared_fcs > 0:
+            if self.with_avg_pool:
+                x = self.avg_pool(x)
+
+            x = x.flatten(1)  # keep dim 0, flatten the rest before the fcs
+
+            for fc in self.shared_fcs:
+                x = self.relu(fc(x))
+
+        return x
+
+    def _forward_cls_reg(self, x):
+        """Forward function for classification and regression parts."""
+        x_cls = x
+        x_reg = x
+        # classification branch
+        for conv in self.cls_convs:
+            x_cls = conv(x_cls)
+        if x_cls.dim() > 2:  # not flattened yet
+            if self.with_avg_pool:
+                x_cls = self.avg_pool(x_cls)
+            x_cls = x_cls.flatten(1)
+        for fc in self.cls_fcs:
+            x_cls = self.relu(fc(x_cls))
+        # regression branch, same structure as the classification branch
+        for conv in self.reg_convs:
+            x_reg = conv(x_reg)
+        if x_reg.dim() > 2:  # not flattened yet
+            if self.with_avg_pool:
+                x_reg = self.avg_pool(x_reg)
+            x_reg = x_reg.flatten(1)
+        for fc in self.reg_fcs:
+            x_reg = self.relu(fc(x_reg))
+
+        cls_score = self.fc_cls(x_cls) if self.with_cls else None
+        bbox_pred = self.fc_reg(x_reg) if self.with_reg else None
+
+        return cls_score, bbox_pred
+
+    def forward(self, x, return_shared_feat=False):
+        """Forward function.
+
+        Args:
+            x (Tensor): input features
+            return_shared_feat (bool): If True, return cls-reg-shared feature.
+
+        Returns:
+            out (tuple[Tensor]): contain ``cls_score`` and ``bbox_pred``,
+                if  ``return_shared_feat`` is True, append ``x_shared`` to the
+                returned tuple.
+        """
+        x_shared = self._forward_shared(x)
+        out = self._forward_cls_reg(x_shared)
+
+        if return_shared_feat:
+            out += (x_shared, )  # -> (cls_score, bbox_pred, x_shared)
+
+        return out
diff --git a/mmdet/models/roi_heads/cascade_roi_head.py b/mmdet/models/roi_heads/cascade_roi_head.py
new file mode 100755
index 0000000..e17313f
--- /dev/null
+++ b/mmdet/models/roi_heads/cascade_roi_head.py
@@ -0,0 +1,631 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.runner import ModuleList
+
+from mmdet.core import (bbox2result, bbox2roi, bbox_mapping, build_assigner,
+                        build_sampler, merge_aug_bboxes, merge_aug_masks,
+                        multiclass_nms)
+from ..builder import HEADS, build_head, build_roi_extractor
+from .base_roi_head import BaseRoIHead
+from .test_mixins import BBoxTestMixin, MaskTestMixin
+
+
+@HEADS.register_module()
+class CascadeRoIHead(BaseRoIHead, BBoxTestMixin, MaskTestMixin):
+    """Cascade roi head including one bbox head and one mask head.
+
+    https://arxiv.org/abs/1712.00726
+    """
+
+    def __init__(self,
+                 num_stages,
+                 stage_loss_weights,
+                 bbox_roi_extractor=None,
+                 bbox_head=None,
+                 mask_roi_extractor=None,
+                 mask_head=None,
+                 shared_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        # A box branch is mandatory and a shared head is rejected outright.
+        assert bbox_roi_extractor is not None
+        assert bbox_head is not None
+        assert shared_head is None, \
+            'Shared head is not supported in Cascade RCNN anymore'
+
+        # Stage settings are stored before the base initialiser runs because
+        # ``init_bbox_head`` / ``init_mask_head`` read ``self.num_stages``
+        # (presumably they are called from the base ``__init__`` -- confirm).
+        self.stage_loss_weights = stage_loss_weights
+        self.num_stages = num_stages
+        base_kwargs = dict(
+            bbox_roi_extractor=bbox_roi_extractor, bbox_head=bbox_head,
+            mask_roi_extractor=mask_roi_extractor, mask_head=mask_head,
+            shared_head=shared_head, train_cfg=train_cfg, test_cfg=test_cfg,
+            pretrained=pretrained, init_cfg=init_cfg)
+        super(CascadeRoIHead, self).__init__(**base_kwargs)
+
+    def init_bbox_head(self, bbox_roi_extractor, bbox_head):
+        """Build one box RoI extractor and one box head per cascade stage.
+
+        Args:
+            bbox_roi_extractor (dict | list[dict]): Box roi extractor config.
+            bbox_head (dict | list[dict]): Box head config. For both, a
+                non-list value is reused for every stage.
+        """
+        self.bbox_roi_extractor = ModuleList()
+        self.bbox_head = ModuleList()
+        extractor_cfgs, head_cfgs = bbox_roi_extractor, bbox_head
+        if not isinstance(extractor_cfgs, list):
+            extractor_cfgs = [extractor_cfgs] * self.num_stages
+        if not isinstance(head_cfgs, list):
+            head_cfgs = [head_cfgs] * self.num_stages
+        assert len(extractor_cfgs) == len(head_cfgs) == self.num_stages
+        for extractor_cfg, head_cfg in zip(extractor_cfgs, head_cfgs):
+            self.bbox_roi_extractor.append(build_roi_extractor(extractor_cfg))
+            self.bbox_head.append(build_head(head_cfg))
+
+    def init_mask_head(self, mask_roi_extractor, mask_head):
+        """Build one mask head per stage plus the mask RoI extractors.
+
+        Args:
+            mask_roi_extractor (dict | list[dict] | None): Mask roi extractor
+                config; ``None`` reuses ``self.bbox_roi_extractor``.
+            mask_head (dict | list[dict]): Mask head config. For both, a
+                non-list value is reused for every stage.
+        """
+        self.mask_head = nn.ModuleList()
+        head_cfgs = mask_head
+        if not isinstance(head_cfgs, list):
+            head_cfgs = [head_cfgs] * self.num_stages
+        assert len(head_cfgs) == self.num_stages
+        for head_cfg in head_cfgs:
+            self.mask_head.append(build_head(head_cfg))
+        self.share_roi_extractor = mask_roi_extractor is None
+        if mask_roi_extractor is None:
+            self.mask_roi_extractor = self.bbox_roi_extractor
+            return
+        self.mask_roi_extractor = ModuleList()
+        extractor_cfgs = mask_roi_extractor
+        if not isinstance(extractor_cfgs, list):
+            extractor_cfgs = [extractor_cfgs] * self.num_stages
+        assert len(extractor_cfgs) == self.num_stages
+        for extractor_cfg in extractor_cfgs:
+            self.mask_roi_extractor.append(build_roi_extractor(extractor_cfg))
+
+    def init_assigner_sampler(self):
+        """Build a bbox assigner and sampler from each stage's train cfg."""
+        self.bbox_assigner, self.bbox_sampler = [], []
+        if self.train_cfg is None:
+            return
+        for stage, stage_cfg in enumerate(self.train_cfg):
+            self.bbox_assigner.append(build_assigner(stage_cfg.assigner))
+            # set before building the sampler, which is handed ``self``
+            self.current_stage = stage
+            sampler = build_sampler(stage_cfg.sampler, context=self)
+            self.bbox_sampler.append(sampler)
+
+    def forward_dummy(self, x, proposals):
+        """Run every stage on ``proposals`` and return the raw outputs.
+
+        Mask heads, if present, only see the first 100 rois.
+        """
+        collected = []
+        rois = bbox2roi([proposals])
+        if self.with_bbox:
+            for stage in range(self.num_stages):
+                box_out = self._bbox_forward(stage, x, rois)
+                collected += [box_out['cls_score'], box_out['bbox_pred']]
+        if self.with_mask:
+            head_rois = rois[:100]
+            for stage in range(self.num_stages):
+                mask_out = self._mask_forward(stage, x, head_rois)
+                collected.append(mask_out['mask_pred'])
+        return tuple(collected)
+
+    def _bbox_forward(self, stage, x, rois):
+        """Extract RoI features and run the box head of stage ``stage``.
+
+        Returns a dict with ``cls_score``, ``bbox_pred`` and ``bbox_feats``.
+        """
+        extractor = self.bbox_roi_extractor[stage]
+        head = self.bbox_head[stage]
+        # only the first ``num_inputs`` feature levels go to the extractor
+        feats = extractor(x[:extractor.num_inputs], rois)
+        # do not support caffe_c4 model anymore
+        scores, deltas = head(feats)
+        return {'cls_score': scores, 'bbox_pred': deltas, 'bbox_feats': feats}
+
+    def _bbox_forward_train(self, stage, x, sampling_results, gt_bboxes,
+                            gt_labels, rcnn_train_cfg):
+        """Forward sampled boxes through one stage and compute its loss."""
+        rois = bbox2roi([sample.bboxes for sample in sampling_results])
+        results = self._bbox_forward(stage, x, rois)
+        # The stage's own box head supplies both the training targets and
+        # the loss computed against them.
+        head = self.bbox_head[stage]
+        targets = head.get_targets(sampling_results, gt_bboxes, gt_labels,
+                                   rcnn_train_cfg)
+        stage_loss = head.loss(results['cls_score'], results['bbox_pred'],
+                               rois, *targets)
+        results.update(loss_bbox=stage_loss, rois=rois, bbox_targets=targets)
+        return results
+
+    def _mask_forward(self, stage, x, rois):
+        """Extract RoI features and run the mask head of stage ``stage``.
+
+        Returns a dict with the single key ``mask_pred``.
+        """
+        extractor = self.mask_roi_extractor[stage]
+        head = self.mask_head[stage]
+        # only the first ``num_inputs`` feature levels go to the extractor
+        feats = extractor(x[:extractor.num_inputs], rois)
+        # do not support caffe_c4 model anymore
+        return {'mask_pred': head(feats)}
+
+    def _mask_forward_train(self, stage, x, sampling_results, gt_masks,
+                            rcnn_train_cfg, bbox_feats=None):
+        """Forward positive boxes through one mask stage and get its loss.
+
+        ``bbox_feats`` is not read by this method. The ``_mask_forward``
+        result is returned with ``loss_mask`` added.
+        """
+        positive_rois = bbox2roi(
+            [sample.pos_bboxes for sample in sampling_results])
+        results = self._mask_forward(stage, x, positive_rois)
+
+        head = self.mask_head[stage]
+        targets = head.get_targets(sampling_results, gt_masks, rcnn_train_cfg)
+        positive_labels = torch.cat(
+            [sample.pos_gt_labels for sample in sampling_results])
+        stage_loss = head.loss(results['mask_pred'], targets,
+                               positive_labels)
+
+        results.update(loss_mask=stage_loss)
+        return results
+
+    def forward_train(self,
+                      x,
+                      img_metas,
+                      proposal_list,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None,
+                      gt_masks=None):
+        """Train every cascade stage and collect the per-stage losses.
+
+        Args:
+            x (list[Tensor]): list of multi-level img features.
+            img_metas (list[dict]): list of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                See `mmdet/datasets/pipelines/formatting.py:Collect`.
+            proposal_list (list[Tensors]): list of region proposals.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss.
+            gt_masks (None | Tensor) : true segmentation masks for each box
+                used if the architecture supports a segmentation task.
+
+        Returns:
+            dict[str, Tensor]: loss components keyed ``s{stage}.{name}``.
+        """
+        losses = dict()
+        for i in range(self.num_stages):
+            self.current_stage = i
+            rcnn_train_cfg = self.train_cfg[i]
+            lw = self.stage_loss_weights[i]
+
+            # assign gts and sample proposals
+            sampling_results = []
+            if self.with_bbox or self.with_mask:
+                bbox_assigner = self.bbox_assigner[i]
+                bbox_sampler = self.bbox_sampler[i]
+                num_imgs = len(img_metas)
+                if gt_bboxes_ignore is None:
+                    gt_bboxes_ignore = [None for _ in range(num_imgs)]
+
+                for j in range(num_imgs):
+                    assign_result = bbox_assigner.assign(
+                        proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j],
+                        gt_labels[j])
+                    sampling_result = bbox_sampler.sample(
+                        assign_result,
+                        proposal_list[j],
+                        gt_bboxes[j],
+                        gt_labels[j],
+                        feats=[lvl_feat[j][None] for lvl_feat in x])
+                    sampling_results.append(sampling_result)
+
+            # bbox head forward and loss
+            bbox_results = self._bbox_forward_train(i, x, sampling_results,
+                                                    gt_bboxes, gt_labels,
+                                                    rcnn_train_cfg)
+            # entries whose name contains 'loss' are scaled by the stage weight
+            for name, value in bbox_results['loss_bbox'].items():
+                losses[f's{i}.{name}'] = (
+                    value * lw if 'loss' in name else value)
+
+            # mask head forward and loss
+            if self.with_mask:
+                mask_results = self._mask_forward_train(
+                    i, x, sampling_results, gt_masks, rcnn_train_cfg,
+                    bbox_results['bbox_feats'])
+                for name, value in mask_results['loss_mask'].items():
+                    losses[f's{i}.{name}'] = (
+                        value * lw if 'loss' in name else value)
+
+            # refine bboxes: they become the next stage's proposal_list
+            if i < self.num_stages - 1:
+                pos_is_gts = [res.pos_is_gt for res in sampling_results]
+                # bbox_targets is a tuple; element 0 is used as the roi labels
+                roi_labels = bbox_results['bbox_targets'][0]
+                with torch.no_grad():
+                    cls_score = bbox_results['cls_score']
+                    if self.bbox_head[i].custom_activation:
+                        cls_score = self.bbox_head[i].loss_cls.get_activation(
+                            cls_score)
+
+                    # Empty proposal: stop here, skipping later stages.
+                    if cls_score.numel() == 0:
+                        break
+                    # label == num_classes -> use the predicted class instead
+                    roi_labels = torch.where(
+                        roi_labels == self.bbox_head[i].num_classes,
+                        cls_score[:, :-1].argmax(1), roi_labels)
+                    proposal_list = self.bbox_head[i].refine_bboxes(
+                        bbox_results['rois'], roi_labels,
+                        bbox_results['bbox_pred'], pos_is_gts, img_metas)
+
+        return losses
+
+    def simple_test(self, x, proposal_list, img_metas, rescale=False):
+        """Test without augmentation.
+
+        Args:
+            x (tuple[Tensor]): Features from upstream network. Each
+                has shape (batch_size, c, h, w).
+            proposal_list (list(Tensor)): Proposals from rpn head.
+                Each has shape (num_proposals, 5), last dimension
+                5 represent (x1, y1, x2, y2, score).
+            img_metas (list[dict]): Meta information of images.
+            rescale (bool): Whether to rescale the results to
+                the original image. Default: False.
+
+        Returns:
+            list[list[np.ndarray]] or list[tuple]: When no mask branch,
+            it is bbox results of each image and classes with type
+            `list[list[np.ndarray]]`. The outer list
+            corresponds to each image. The inner list
+            corresponds to each class. When the model has mask branch,
+            it contains bbox results and mask results.
+            The outer list corresponds to each image, and first element
+            of tuple is bbox results, second element is mask results.
+        """
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        num_imgs = len(proposal_list)
+        img_shapes = tuple(meta['img_shape'] for meta in img_metas)
+        ori_shapes = tuple(meta['ori_shape'] for meta in img_metas)
+        scale_factors = tuple(meta['scale_factor'] for meta in img_metas)
+
+        # "ms" in variable names means multi-stage
+        ms_bbox_result = {}
+        ms_segm_result = {}
+        ms_scores = []
+        rcnn_test_cfg = self.test_cfg
+
+        rois = bbox2roi(proposal_list)
+
+        if rois.shape[0] == 0:
+            # No proposal in the batch: one separate empty result per image
+            bbox_results = [[
+                np.zeros((0, 5), dtype=np.float32)
+                for _ in range(self.bbox_head[-1].num_classes)
+            ] for _ in range(num_imgs)]
+
+            if self.with_mask:
+                mask_classes = self.mask_head[-1].num_classes
+                segm_results = [[[] for _ in range(mask_classes)]
+                                for _ in range(num_imgs)]
+                results = list(zip(bbox_results, segm_results))
+            else:
+                results = bbox_results
+
+            return results
+
+        # loop-invariant: proposal_list is not modified by the stage loop
+        num_proposals_per_img = tuple(len(p) for p in proposal_list)
+        for i in range(self.num_stages):
+            bbox_results = self._bbox_forward(i, x, rois)
+
+            # split batch bbox prediction back to each image
+            cls_score = bbox_results['cls_score']
+            bbox_pred = bbox_results['bbox_pred']
+            rois = rois.split(num_proposals_per_img, 0)
+            cls_score = cls_score.split(num_proposals_per_img, 0)
+            if isinstance(bbox_pred, torch.Tensor):
+                bbox_pred = bbox_pred.split(num_proposals_per_img, 0)
+            else:
+                bbox_pred = self.bbox_head[i].bbox_pred_split(
+                    bbox_pred, num_proposals_per_img)
+            ms_scores.append(cls_score)
+
+            if i < self.num_stages - 1:
+                if self.bbox_head[i].custom_activation:
+                    cls_score = [
+                        self.bbox_head[i].loss_cls.get_activation(s)
+                        for s in cls_score
+                    ]
+                refine_rois_list = []
+                for j in range(num_imgs):
+                    if rois[j].shape[0] > 0:
+                        bbox_label = cls_score[j][:, :-1].argmax(dim=1)
+                        refined_rois = self.bbox_head[i].regress_by_class(
+                            rois[j], bbox_label, bbox_pred[j], img_metas[j])
+                        refine_rois_list.append(refined_rois)
+                rois = torch.cat(refine_rois_list)
+
+        # average scores of each image by stages
+        cls_score = [
+            sum([score[i] for score in ms_scores]) / float(len(ms_scores))
+            for i in range(num_imgs)
+        ]
+
+        # apply bbox post-processing to each image individually
+        det_bboxes = []
+        det_labels = []
+        for i in range(num_imgs):
+            det_bbox, det_label = self.bbox_head[-1].get_bboxes(
+                rois[i],
+                cls_score[i],
+                bbox_pred[i],
+                img_shapes[i],
+                scale_factors[i],
+                rescale=rescale,
+                cfg=rcnn_test_cfg)
+            det_bboxes.append(det_bbox)
+            det_labels.append(det_label)
+
+        bbox_results = [
+            bbox2result(det_bboxes[i], det_labels[i],
+                        self.bbox_head[-1].num_classes)
+            for i in range(num_imgs)
+        ]
+        ms_bbox_result['ensemble'] = bbox_results
+
+        if self.with_mask:
+            if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes):
+                mask_classes = self.mask_head[-1].num_classes
+                segm_results = [[[] for _ in range(mask_classes)]
+                                for _ in range(num_imgs)]
+            else:
+                if rescale and not isinstance(scale_factors[0], float):
+                    scale_factors = [
+                        torch.from_numpy(scale_factor).to(det_bboxes[0].device)
+                        for scale_factor in scale_factors
+                    ]
+                _bboxes = [
+                    det_bboxes[i][:, :4] *
+                    scale_factors[i] if rescale else det_bboxes[i][:, :4]
+                    for i in range(len(det_bboxes))
+                ]
+                mask_rois = bbox2roi(_bboxes)
+                num_mask_rois_per_img = tuple(
+                    _bbox.size(0) for _bbox in _bboxes)
+                aug_masks = []
+                for i in range(self.num_stages):
+                    mask_results = self._mask_forward(i, x, mask_rois)
+                    mask_pred = mask_results['mask_pred']
+                    # split batch mask prediction back to each image
+                    mask_pred = mask_pred.split(num_mask_rois_per_img, 0)
+                    aug_masks.append([
+                        m.sigmoid().cpu().detach().numpy() for m in mask_pred
+                    ])
+
+                # apply mask post-processing to each image individually
+                segm_results = []
+                for i in range(num_imgs):
+                    if det_bboxes[i].shape[0] == 0:
+                        segm_results.append(
+                            [[]
+                             for _ in range(self.mask_head[-1].num_classes)])
+                    else:
+                        aug_mask = [mask[i] for mask in aug_masks]
+                        merged_masks = merge_aug_masks(
+                            aug_mask, [[img_metas[i]]] * self.num_stages,
+                            rcnn_test_cfg)
+                        segm_result = self.mask_head[-1].get_seg_masks(
+                            merged_masks, _bboxes[i], det_labels[i],
+                            rcnn_test_cfg, ori_shapes[i], scale_factors[i],
+                            rescale)
+                        segm_results.append(segm_result)
+            ms_segm_result['ensemble'] = segm_results
+
+        if self.with_mask:
+            results = list(
+                zip(ms_bbox_result['ensemble'], ms_segm_result['ensemble']))
+        else:
+            results = ms_bbox_result['ensemble']
+
+        return results
+
+    def aug_test(self, features, proposal_list, img_metas, rescale=False):
+        """Test with augmentations.
+
+        Only ``proposal_list[0]`` and ``img_meta[0]`` of every augmentation are
+        used. NOTE(review): ``rescale`` is never read in this method body.
+        """
+        rcnn_test_cfg = self.test_cfg
+        aug_bboxes = []
+        aug_scores = []
+        for x, img_meta in zip(features, img_metas):
+            # only one image in the batch
+            img_shape = img_meta[0]['img_shape']
+            scale_factor = img_meta[0]['scale_factor']
+            flip = img_meta[0]['flip']
+            flip_direction = img_meta[0]['flip_direction']
+
+            proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
+                                     scale_factor, flip, flip_direction)
+            # "ms" in variable names means multi-stage
+            ms_scores = []
+
+            rois = bbox2roi([proposals])
+
+            if rois.shape[0] == 0:
+                # There is no proposal in the single image
+                aug_bboxes.append(rois.new_zeros(0, 4))
+                aug_scores.append(rois.new_zeros(0, 1))
+                continue
+
+            for i in range(self.num_stages):
+                bbox_results = self._bbox_forward(i, x, rois)
+                ms_scores.append(bbox_results['cls_score'])
+
+                if i < self.num_stages - 1:
+                    cls_score = bbox_results['cls_score']
+                    if self.bbox_head[i].custom_activation:
+                        cls_score = self.bbox_head[i].loss_cls.get_activation(
+                            cls_score)
+                    bbox_label = cls_score[:, :-1].argmax(dim=1)
+                    rois = self.bbox_head[i].regress_by_class(
+                        rois, bbox_label, bbox_results['bbox_pred'],
+                        img_meta[0])
+            # average the class scores over all stages
+            cls_score = sum(ms_scores) / float(len(ms_scores))
+            bboxes, scores = self.bbox_head[-1].get_bboxes(
+                rois,
+                cls_score,
+                bbox_results['bbox_pred'],
+                img_shape,
+                scale_factor,
+                rescale=False,
+                cfg=None)
+            aug_bboxes.append(bboxes)
+            aug_scores.append(scores)
+
+        # after merging, bboxes will be rescaled to the original image size
+        merged_bboxes, merged_scores = merge_aug_bboxes(
+            aug_bboxes, aug_scores, img_metas, rcnn_test_cfg)
+        det_bboxes, det_labels = multiclass_nms(merged_bboxes, merged_scores,
+                                                rcnn_test_cfg.score_thr,
+                                                rcnn_test_cfg.nms,
+                                                rcnn_test_cfg.max_per_img)
+
+        bbox_result = bbox2result(det_bboxes, det_labels,
+                                  self.bbox_head[-1].num_classes)
+
+        if self.with_mask:
+            if det_bboxes.shape[0] == 0:
+                segm_result = [[]
+                               for _ in range(self.mask_head[-1].num_classes)]
+            else:
+                aug_masks = []
+                aug_img_metas = []
+                for x, img_meta in zip(features, img_metas):
+                    img_shape = img_meta[0]['img_shape']
+                    scale_factor = img_meta[0]['scale_factor']
+                    flip = img_meta[0]['flip']
+                    flip_direction = img_meta[0]['flip_direction']
+                    _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape,
+                                           scale_factor, flip, flip_direction)
+                    mask_rois = bbox2roi([_bboxes])
+                    for i in range(self.num_stages):
+                        mask_results = self._mask_forward(i, x, mask_rois)
+                        aug_masks.append(
+                            mask_results['mask_pred'].sigmoid().cpu().numpy())
+                        aug_img_metas.append(img_meta)
+                merged_masks = merge_aug_masks(aug_masks, aug_img_metas,
+                                               self.test_cfg)
+
+                ori_shape = img_metas[0][0]['ori_shape']
+                dummy_scale_factor = np.ones(4)
+                segm_result = self.mask_head[-1].get_seg_masks(
+                    merged_masks,
+                    det_bboxes,
+                    det_labels,
+                    rcnn_test_cfg,
+                    ori_shape,
+                    scale_factor=dummy_scale_factor,
+                    rescale=False)
+            return [(bbox_result, segm_result)]
+        else:
+            return [bbox_result]
+
+    def onnx_export(self, x, proposals, img_metas):
+        """Forward pass for ONNX export; accepts a single-image batch only."""
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        assert proposals.shape[0] == 1, 'Only support one input image ' \
+                                        'while in exporting to ONNX'
+        # remove the scores (last channel of every proposal)
+        rois = proposals[..., :-1]
+        batch_size = rois.shape[0]
+        num_proposals_per_img = rois.shape[1]
+        # Eliminate the batch dimension
+        rois = rois.view(-1, 4)
+
+        # add dummy batch index
+        rois = torch.cat([rois.new_zeros(rois.shape[0], 1), rois], dim=-1)
+
+        max_shape = img_metas[0]['img_shape_for_onnx']
+        ms_scores = []
+        rcnn_test_cfg = self.test_cfg
+
+        for i in range(self.num_stages):
+            bbox_results = self._bbox_forward(i, x, rois)
+
+            cls_score = bbox_results['cls_score']
+            bbox_pred = bbox_results['bbox_pred']
+            # Recover the batch dimension
+            rois = rois.reshape(batch_size, num_proposals_per_img,
+                                rois.size(-1))
+            cls_score = cls_score.reshape(batch_size, num_proposals_per_img,
+                                          cls_score.size(-1))
+            bbox_pred = bbox_pred.reshape(batch_size, num_proposals_per_img, 4)
+            ms_scores.append(cls_score)
+            if i < self.num_stages - 1:
+                assert self.bbox_head[i].reg_class_agnostic
+                new_rois = self.bbox_head[i].bbox_coder.decode(
+                    rois[..., 1:], bbox_pred, max_shape=max_shape)
+                rois = new_rois.reshape(-1, new_rois.shape[-1])
+                # add dummy batch index
+                rois = torch.cat([rois.new_zeros(rois.shape[0], 1), rois],
+                                 dim=-1)
+
+        cls_score = sum(ms_scores) / float(len(ms_scores))
+        bbox_pred = bbox_pred.reshape(batch_size, num_proposals_per_img, 4)
+        rois = rois.reshape(batch_size, num_proposals_per_img, -1)
+        det_bboxes, det_labels = self.bbox_head[-1].onnx_export(
+            rois, cls_score, bbox_pred, max_shape, cfg=rcnn_test_cfg)
+
+        if not self.with_mask:
+            return det_bboxes, det_labels
+        else:
+            batch_index = torch.arange(
+                det_bboxes.size(0),
+                device=det_bboxes.device).float().view(-1, 1, 1).expand(
+                    det_bboxes.size(0), det_bboxes.size(1), 1)
+            rois = det_bboxes[..., :4]
+            mask_rois = torch.cat([batch_index, rois], dim=-1)
+            mask_rois = mask_rois.view(-1, 5)
+            aug_masks = []
+            for i in range(self.num_stages):
+                mask_results = self._mask_forward(i, x, mask_rois)
+                mask_pred = mask_results['mask_pred']
+                aug_masks.append(mask_pred)
+            max_shape = img_metas[0]['img_shape_for_onnx']
+            # calculate the mean of the mask predictions over all stages
+            mask_pred = sum(aug_masks) / len(aug_masks)
+            segm_results = self.mask_head[-1].onnx_export(
+                mask_pred, rois.reshape(-1, 4), det_labels.reshape(-1),
+                self.test_cfg, max_shape)
+            segm_results = segm_results.reshape(batch_size,
+                                                det_bboxes.shape[1],
+                                                max_shape[0], max_shape[1])
+            return det_bboxes, det_labels, segm_results
diff --git a/mmdet/models/roi_heads/double_roi_head.py b/mmdet/models/roi_heads/double_roi_head.py
new file mode 100755
index 0000000..895b5d3
--- /dev/null
+++ b/mmdet/models/roi_heads/double_roi_head.py
@@ -0,0 +1,34 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..builder import HEADS
+from .standard_roi_head import StandardRoIHead
+
+
+@HEADS.register_module()
+class DoubleHeadRoIHead(StandardRoIHead):
+    """RoI head for Double Head RCNN.
+
+    The regression branch pools its RoI features with an extra
+    ``roi_scale_factor`` while the classification branch uses the plain rois.
+
+    https://arxiv.org/abs/1904.06493
+    """
+
+    def __init__(self, reg_roi_scale_factor, **kwargs):
+        super(DoubleHeadRoIHead, self).__init__(**kwargs)
+        self.reg_roi_scale_factor = reg_roi_scale_factor
+
+    def _bbox_forward(self, x, rois):
+        """Pool cls/reg RoI features and run the double box head."""
+        extractor = self.bbox_roi_extractor
+        cls_feats = extractor(x[:extractor.num_inputs], rois)
+        reg_feats = extractor(
+            x[:extractor.num_inputs],
+            rois,
+            roi_scale_factor=self.reg_roi_scale_factor)
+        if self.with_shared_head:
+            cls_feats = self.shared_head(cls_feats)
+            reg_feats = self.shared_head(reg_feats)
+        cls_score, bbox_pred = self.bbox_head(cls_feats, reg_feats)
+        # ``bbox_feats`` reports the classification-branch features
+        return dict(
+            cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=cls_feats)
diff --git a/mmdet/models/roi_heads/dynamic_roi_head.py b/mmdet/models/roi_heads/dynamic_roi_head.py
new file mode 100755
index 0000000..4c2b6cd
--- /dev/null
+++ b/mmdet/models/roi_heads/dynamic_roi_head.py
@@ -0,0 +1,155 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmdet.core import bbox2roi
+from mmdet.models.losses import SmoothL1Loss
+from ..builder import HEADS
+from .standard_roi_head import StandardRoIHead
+
+EPS = 1e-15
+
+
+@HEADS.register_module()
+class DynamicRoIHead(StandardRoIHead):
+    """RoI head for `Dynamic R-CNN <https://arxiv.org/abs/2004.06002>`_."""
+
+    def __init__(self, **kwargs):
+        super(DynamicRoIHead, self).__init__(**kwargs)
+        assert isinstance(self.bbox_head.loss_bbox, SmoothL1Loss)
+        # the IoU history of the past `update_iter_interval` iterations
+        self.iou_history = []
+        # the beta history of the past `update_iter_interval` iterations
+        self.beta_history = []
+
+    def forward_train(self,
+                      x,
+                      img_metas,
+                      proposal_list,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None,
+                      gt_masks=None):
+        """Forward function for training.
+
+        Args:
+            x (list[Tensor]): list of multi-level img features.
+
+            img_metas (list[dict]): list of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                `mmdet/datasets/pipelines/formatting.py:Collect`.
+
+            proposal_list (list[Tensors]): list of region proposals.
+
+            gt_bboxes (list[Tensor]): each item are the truth boxes for each
+                image in [tl_x, tl_y, br_x, br_y] format.
+
+            gt_labels (list[Tensor]): class indices corresponding to each box
+
+            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss.
+
+            gt_masks (None | Tensor) : true segmentation masks for each box
+                used if the architecture supports a segmentation task.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        # assign gts and sample proposals
+        if self.with_bbox or self.with_mask:
+            num_imgs = len(img_metas)
+            if gt_bboxes_ignore is None:
+                gt_bboxes_ignore = [None for _ in range(num_imgs)]
+            sampling_results = []
+            cur_iou = []
+            for i in range(num_imgs):
+                assign_result = self.bbox_assigner.assign(
+                    proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i],
+                    gt_labels[i])
+                sampling_result = self.bbox_sampler.sample(
+                    assign_result,
+                    proposal_list[i],
+                    gt_bboxes[i],
+                    gt_labels[i],
+                    feats=[lvl_feat[i][None] for lvl_feat in x])
+                # record the `iou_topk`-th largest IoU in an image
+                iou_topk = min(self.train_cfg.dynamic_rcnn.iou_topk,
+                               len(assign_result.max_overlaps))
+                ious, _ = torch.topk(assign_result.max_overlaps, iou_topk)
+                cur_iou.append(ious[-1].item())
+                sampling_results.append(sampling_result)
+            # average the current IoUs over images
+            self.iou_history.append(np.mean(cur_iou))
+
+        losses = dict()
+        # bbox head forward and loss
+        if self.with_bbox:
+            bbox_results = self._bbox_forward_train(
+                x, sampling_results, gt_bboxes, gt_labels, img_metas)
+            losses.update(bbox_results['loss_bbox'])
+
+        # mask head forward and loss
+        if self.with_mask:
+            mask_results = self._mask_forward_train(x, sampling_results,
+                                                    bbox_results['bbox_feats'],
+                                                    gt_masks, img_metas)
+            losses.update(mask_results['loss_mask'])
+
+        # update IoU threshold and SmoothL1 beta (both are set in place)
+        update_iter_interval = self.train_cfg.dynamic_rcnn.update_iter_interval
+        if len(self.iou_history) % update_iter_interval == 0:
+            self.update_hyperparameters()
+
+        return losses
+
+    def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels,
+                            img_metas):
+        """Box head forward and loss; also records the beta statistic."""
+        num_imgs = len(img_metas)
+        rois = bbox2roi([res.bboxes for res in sampling_results])
+        bbox_results = self._bbox_forward(x, rois)
+
+        bbox_targets = self.bbox_head.get_targets(sampling_results, gt_bboxes,
+                                                  gt_labels, self.train_cfg)
+        # record the `beta_topk`-th smallest target; `bbox_targets[2]` and
+        # `bbox_targets[3]` stand for bbox_targets and bbox_weights
+        pos_inds = bbox_targets[3][:, 0].nonzero().squeeze(1)
+        num_pos = len(pos_inds)
+        # `torch.kthvalue` needs k >= 1: skip batches without positive samples
+        if num_pos > 0:
+            cur_target = bbox_targets[2][pos_inds, :2].abs().mean(dim=1)
+            beta_topk = min(self.train_cfg.dynamic_rcnn.beta_topk * num_imgs,
+                            num_pos)
+            cur_target = torch.kthvalue(cur_target, beta_topk)[0].item()
+            self.beta_history.append(cur_target)
+        loss_bbox = self.bbox_head.loss(bbox_results['cls_score'],
+                                        bbox_results['bbox_pred'], rois,
+                                        *bbox_targets)
+
+        bbox_results.update(loss_bbox=loss_bbox)
+        return bbox_results
+
+    def update_hyperparameters(self):
+        """Update the assigner IoU thresholds and the SmoothL1 beta from the
+        recorded training statistics, then clear both histories.
+
+        Returns:
+            tuple[float]: the updated ``iou_thr`` and ``beta``.
+        """
+        new_iou_thr = max(self.train_cfg.dynamic_rcnn.initial_iou,
+                          np.mean(self.iou_history))
+        self.iou_history = []
+        self.bbox_assigner.pos_iou_thr = new_iou_thr
+        self.bbox_assigner.neg_iou_thr = new_iou_thr
+        self.bbox_assigner.min_pos_iou = new_iou_thr
+        if (not self.beta_history) or (np.median(self.beta_history) < EPS):
+            # nothing recorded, or a 0 / too small median: keep current beta
+            new_beta = self.bbox_head.loss_bbox.beta
+        else:
+            new_beta = min(self.train_cfg.dynamic_rcnn.initial_beta,
+                           np.median(self.beta_history))
+        self.beta_history = []
+        self.bbox_head.loss_bbox.beta = new_beta
+        return new_iou_thr, new_beta
diff --git a/mmdet/models/roi_heads/grid_roi_head.py b/mmdet/models/roi_heads/grid_roi_head.py
new file mode 100755
index 0000000..333f629
--- /dev/null
+++ b/mmdet/models/roi_heads/grid_roi_head.py
@@ -0,0 +1,170 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmdet.core import bbox2result, bbox2roi
+from ..builder import HEADS, build_head, build_roi_extractor
+from .standard_roi_head import StandardRoIHead
+
+
+@HEADS.register_module()
+class GridRoIHead(StandardRoIHead):
+    """Grid roi head for Grid R-CNN.
+
+    https://arxiv.org/abs/1811.12030
+    """
+
+    def __init__(self, grid_roi_extractor, grid_head, **kwargs):
+        assert grid_head is not None
+        super(GridRoIHead, self).__init__(**kwargs)
+        if grid_roi_extractor is not None:
+            self.grid_roi_extractor = build_roi_extractor(grid_roi_extractor)
+            self.share_roi_extractor = False
+        else:
+            # no dedicated extractor given: reuse the bbox RoI extractor
+            self.share_roi_extractor = True
+            self.grid_roi_extractor = self.bbox_roi_extractor
+        self.grid_head = build_head(grid_head)
+
+    def _random_jitter(self, sampling_results, img_metas, amplitude=0.15):
+        """Randomly jitter the positive proposals (in place) for training."""
+        for sampling_result, img_meta in zip(sampling_results, img_metas):
+            bboxes = sampling_result.pos_bboxes
+            random_offsets = bboxes.new_empty(bboxes.shape[0], 4).uniform_(
+                -amplitude, amplitude)
+            # before jittering
+            cxcy = (bboxes[:, 2:4] + bboxes[:, :2]) / 2
+            wh = (bboxes[:, 2:4] - bboxes[:, :2]).abs()
+            # after jittering
+            new_cxcy = cxcy + wh * random_offsets[:, :2]
+            new_wh = wh * (1 + random_offsets[:, 2:])
+            # xywh to xyxy
+            new_x1y1 = (new_cxcy - new_wh / 2)
+            new_x2y2 = (new_cxcy + new_wh / 2)
+            new_bboxes = torch.cat([new_x1y1, new_x2y2], dim=1)
+            # clip bboxes
+            max_shape = img_meta['img_shape']
+            if max_shape is not None:
+                new_bboxes[:, 0::2].clamp_(min=0, max=max_shape[1] - 1)
+                new_bboxes[:, 1::2].clamp_(min=0, max=max_shape[0] - 1)
+
+            sampling_result.pos_bboxes = new_bboxes
+        return sampling_results
+
+    def forward_dummy(self, x, proposals):
+        """Dummy forward function."""
+        # bbox head
+        outs = ()
+        rois = bbox2roi([proposals])
+        if self.with_bbox:
+            bbox_results = self._bbox_forward(x, rois)
+            outs = outs + (bbox_results['cls_score'],
+                           bbox_results['bbox_pred'])
+
+        # grid head
+        grid_rois = rois[:100]
+        grid_feats = self.grid_roi_extractor(
+            x[:self.grid_roi_extractor.num_inputs], grid_rois)
+        if self.with_shared_head:
+            grid_feats = self.shared_head(grid_feats)
+        grid_pred = self.grid_head(grid_feats)
+        outs = outs + (grid_pred, )
+
+        # mask head
+        if self.with_mask:
+            mask_rois = rois[:100]
+            mask_results = self._mask_forward(x, mask_rois)
+            outs = outs + (mask_results['mask_pred'], )
+        return outs
+
+    def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels,
+                            img_metas):
+        """Run forward function and calculate loss for box head in training."""
+        bbox_results = super(GridRoIHead, self)._bbox_forward_train(
+            x, sampling_results, gt_bboxes, gt_labels, img_metas)
+
+        # Grid head forward and loss
+        sampling_results = self._random_jitter(sampling_results, img_metas)
+        pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
+
+        # GN in head does not support zero shape input
+        if pos_rois.shape[0] == 0:
+            return bbox_results
+
+        grid_feats = self.grid_roi_extractor(
+            x[:self.grid_roi_extractor.num_inputs], pos_rois)
+        if self.with_shared_head:
+            grid_feats = self.shared_head(grid_feats)
+        # Accelerate training by randomly keeping at most `max_num_grid` RoIs
+        max_sample_num_grid = self.train_cfg.get('max_num_grid', 192)
+        num_sampled = min(grid_feats.shape[0], max_sample_num_grid)
+        sample_idx = torch.randperm(grid_feats.shape[0])[:num_sampled]
+        grid_feats = grid_feats[sample_idx]
+
+        grid_pred = self.grid_head(grid_feats)
+
+        grid_targets = self.grid_head.get_targets(sampling_results,
+                                                  self.train_cfg)
+        grid_targets = grid_targets[sample_idx]
+
+        loss_grid = self.grid_head.loss(grid_pred, grid_targets)
+
+        bbox_results['loss_bbox'].update(loss_grid)
+        return bbox_results
+
+    def simple_test(self, x, proposal_list, img_metas, proposals=None,
+                    rescale=False):
+        """Test without augmentation."""
+        assert self.with_bbox, 'Bbox head must be implemented.'
+
+        det_bboxes, det_labels = self.simple_test_bboxes(
+            x, img_metas, proposal_list, self.test_cfg, rescale=False)
+        # pack rois into bboxes
+        grid_rois = bbox2roi([det_bbox[:, :4] for det_bbox in det_bboxes])
+        if grid_rois.shape[0] != 0:
+            grid_feats = self.grid_roi_extractor(
+                x[:len(self.grid_roi_extractor.featmap_strides)], grid_rois)
+            # keep the test path consistent with training, where the shared
+            # head (if any) is applied before the grid head
+            if self.with_shared_head:
+                grid_feats = self.shared_head(grid_feats)
+            self.grid_head.test_mode = True
+            grid_pred = self.grid_head(grid_feats)
+            # split batch grid head prediction back to each image
+            num_roi_per_img = tuple(len(det_bbox) for det_bbox in det_bboxes)
+            grid_pred = {
+                k: v.split(num_roi_per_img, 0)
+                for k, v in grid_pred.items()
+            }
+
+            # apply bbox post-processing to each image individually
+            bbox_results = []
+            num_imgs = len(det_bboxes)
+            for i in range(num_imgs):
+                if det_bboxes[i].shape[0] == 0:
+                    bbox_results.append([
+                        np.zeros((0, 5), dtype=np.float32)
+                        for _ in range(self.bbox_head.num_classes)
+                    ])
+                else:
+                    det_bbox = self.grid_head.get_bboxes(
+                        det_bboxes[i], grid_pred['fused'][i], [img_metas[i]])
+                    if rescale:
+                        # `scale_factor` may be a numpy array; convert it so
+                        # the in-place tensor division is well defined
+                        det_bbox[:, :4] /= det_bbox.new_tensor(
+                            img_metas[i]['scale_factor'])
+                    bbox_results.append(
+                        bbox2result(det_bbox, det_labels[i],
+                                    self.bbox_head.num_classes))
+        else:
+            bbox_results = [[
+                np.zeros((0, 5), dtype=np.float32)
+                for _ in range(self.bbox_head.num_classes)
+            ] for _ in range(len(det_bboxes))]
+
+        if not self.with_mask:
+            return bbox_results
+        segm_results = self.simple_test_mask(
+            x, img_metas, det_bboxes, det_labels, rescale=rescale)
+        return list(zip(bbox_results, segm_results))
diff --git a/mmdet/models/roi_heads/htc_roi_head.py b/mmdet/models/roi_heads/htc_roi_head.py
new file mode 100755
index 0000000..86a6db1
--- /dev/null
+++ b/mmdet/models/roi_heads/htc_roi_head.py
@@ -0,0 +1,628 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from mmdet.core import (bbox2result, bbox2roi, bbox_mapping, merge_aug_bboxes,
+                        merge_aug_masks, multiclass_nms)
+from ..builder import HEADS, build_head, build_roi_extractor
+from ..utils.brick_wrappers import adaptive_avg_pool2d
+from .cascade_roi_head import CascadeRoIHead
+
+
+@HEADS.register_module()
+class HybridTaskCascadeRoIHead(CascadeRoIHead):
+    """Hybrid task cascade roi head including one bbox head and one mask head.
+
+    https://arxiv.org/abs/1901.07518
+    """
+
+    def __init__(self,
+                 num_stages,
+                 stage_loss_weights,
+                 semantic_roi_extractor=None,
+                 semantic_head=None,
+                 semantic_fusion=('bbox', 'mask'),
+                 interleaved=True,
+                 mask_info_flow=True,
+                 **kwargs):
+        super().__init__(num_stages, stage_loss_weights, **kwargs)
+        assert self.with_bbox
+        assert not self.with_shared_head  # shared head is not supported
+
+        self.semantic_fusion = semantic_fusion
+        self.interleaved = interleaved
+        self.mask_info_flow = mask_info_flow
+
+        # the semantic branch is optional and only built when configured
+        if semantic_head is not None:
+            self.semantic_roi_extractor = build_roi_extractor(
+                semantic_roi_extractor)
+            self.semantic_head = build_head(semantic_head)
+
+    @property
+    def with_semantic(self):
+        """bool: whether a semantic head is available on this RoI head."""
+        # a missing attribute is treated the same as an explicit ``None``
+        semantic_head = getattr(self, 'semantic_head', None)
+        has_semantic = semantic_head is not None
+        return has_semantic
+
+    def forward_dummy(self, x, proposals):
+        """Dummy forward function."""
+        outs = ()
+        # semantic head
+        semantic_feat = None
+        if self.with_semantic:
+            _, semantic_feat = self.semantic_head(x)
+        # bbox heads
+        rois = bbox2roi([proposals])
+        for i in range(self.num_stages):
+            res = self._bbox_forward(i, x, rois, semantic_feat=semantic_feat)
+            outs += (res['cls_score'], res['bbox_pred'])
+        # mask heads (semantic features are pooled to the mask feature size)
+        if self.with_mask:
+            mask_rois = rois[:100]
+            mask_roi_extractor = self.mask_roi_extractor[-1]
+            mask_feats = mask_roi_extractor(
+                x[:len(mask_roi_extractor.featmap_strides)], mask_rois)
+            if self.with_semantic and 'mask' in self.semantic_fusion:
+                mask_semantic_feat = self.semantic_roi_extractor(
+                    [semantic_feat], mask_rois)
+                if mask_semantic_feat.shape[-2:] != mask_feats.shape[-2:]:
+                    mask_semantic_feat = F.adaptive_avg_pool2d(
+                        mask_semantic_feat, mask_feats.shape[-2:])
+                mask_feats = mask_feats + mask_semantic_feat
+            last_feat = None
+            for i in range(self.num_stages):
+                mask_head = self.mask_head[i]
+                if self.mask_info_flow:
+                    mask_pred, last_feat = mask_head(mask_feats, last_feat)
+                else:
+                    mask_pred = mask_head(mask_feats)
+                outs = outs + (mask_pred, )
+        return outs
+
+    def _bbox_forward_train(self,
+                            stage,
+                            x,
+                            sampling_results,
+                            gt_bboxes,
+                            gt_labels,
+                            rcnn_train_cfg,
+                            semantic_feat=None):
+        """Forward the bbox head of ``stage`` and compute its training loss.
+
+        The returned dict extends the ``_bbox_forward`` output with the keys
+        ``loss_bbox``, ``rois`` and ``bbox_targets``.
+        """
+        head = self.bbox_head[stage]
+        sampled_boxes = [res.bboxes for res in sampling_results]
+        rois = bbox2roi(sampled_boxes)
+        outputs = self._bbox_forward(
+            stage, x, rois, semantic_feat=semantic_feat)
+        targets = head.get_targets(
+            sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg)
+        outputs['loss_bbox'] = head.loss(
+            outputs['cls_score'], outputs['bbox_pred'], rois, *targets)
+        # the RoIs and targets are handed back to the caller as well
+        outputs['rois'] = rois
+        outputs['bbox_targets'] = targets
+        return outputs
+
+    def _mask_forward_train(self,
+                            stage,
+                            x,
+                            sampling_results,
+                            gt_masks,
+                            rcnn_train_cfg,
+                            semantic_feat=None):
+        """Forward the mask head of ``stage`` on the positive RoIs and
+        compute its training loss.
+
+        Returns:
+            dict: holds the single key ``loss_mask``.
+        """
+        extractor = self.mask_roi_extractor[stage]
+        cur_head = self.mask_head[stage]
+        pos_boxes = [res.pos_bboxes for res in sampling_results]
+        pos_rois = bbox2roi(pos_boxes)
+        feats = extractor(x[:extractor.num_inputs], pos_rois)
+
+        # semantic feature fusion: the RoI-pooled semantic features are
+        # resized when needed and added element-wise to the mask features
+        if self.with_semantic and 'mask' in self.semantic_fusion:
+            sem_feats = self.semantic_roi_extractor([semantic_feat], pos_rois)
+            feat_size = feats.shape[-2:]
+            if sem_feats.shape[-2:] != feat_size:
+                sem_feats = F.adaptive_avg_pool2d(sem_feats, feat_size)
+            feats = feats + sem_feats
+
+        if self.mask_info_flow:
+            # mask information flow: run every earlier mask head to build up
+            # `prev_feat`, which is passed to the current head with `feats`
+            prev_feat = None
+            for earlier in range(stage):
+                prev_feat = self.mask_head[earlier](
+                    feats, prev_feat, return_logits=False)
+            mask_pred = cur_head(feats, prev_feat, return_feat=False)
+        else:
+            mask_pred = cur_head(feats, return_feat=False)
+
+        targets = cur_head.get_targets(
+            sampling_results, gt_masks, rcnn_train_cfg)
+        pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
+        loss_mask = cur_head.loss(mask_pred, targets, pos_labels)
+        return dict(loss_mask=loss_mask)
+
+    def _bbox_forward(self, stage, x, rois, semantic_feat=None):
+        """Run the bbox head of ``stage`` on ``rois`` (train and test).
+
+        Returns a dict with the keys ``cls_score`` and ``bbox_pred``.
+        """
+        extractor = self.bbox_roi_extractor[stage]
+        head = self.bbox_head[stage]
+        feats = extractor(x[:len(extractor.featmap_strides)], rois)
+        if self.with_semantic and 'bbox' in self.semantic_fusion:
+            # fuse RoI-pooled semantic features by element-wise sum,
+            # pooling them first if the spatial size differs
+            sem_feats = self.semantic_roi_extractor([semantic_feat], rois)
+            if sem_feats.shape[-2:] != feats.shape[-2:]:
+                sem_feats = adaptive_avg_pool2d(sem_feats, feats.shape[-2:])
+            feats = feats + sem_feats
+        cls_score, bbox_pred = head(feats)
+        return dict(cls_score=cls_score, bbox_pred=bbox_pred)
+
+    def _mask_forward_test(self, stage, x, bboxes, semantic_feat=None):
+        """Predict masks for ``bboxes`` with the mask head of ``stage``.
+
+        With ``mask_info_flow`` the predictions of all earlier stages are
+        added to the prediction of the requested stage.
+        """
+        extractor = self.mask_roi_extractor[stage]
+        cur_head = self.mask_head[stage]
+        rois = bbox2roi([bboxes])
+        feats = extractor(x[:len(extractor.featmap_strides)], rois)
+        if self.with_semantic and 'mask' in self.semantic_fusion:
+            # fuse RoI-pooled semantic features by element-wise sum
+            sem_feats = self.semantic_roi_extractor([semantic_feat], rois)
+            if sem_feats.shape[-2:] != feats.shape[-2:]:
+                sem_feats = F.adaptive_avg_pool2d(sem_feats, feats.shape[-2:])
+            feats = feats + sem_feats
+        if not self.mask_info_flow:
+            return cur_head(feats)
+        # chain the features of earlier stages and keep a running sum
+        prev_feat = None
+        pred_sum = None
+        for i in range(stage):
+            pred, prev_feat = self.mask_head[i](feats, prev_feat)
+            pred_sum = pred if pred_sum is None else pred + pred_sum
+        final_pred = cur_head(feats, prev_feat, return_feat=False)
+        if pred_sum is not None:
+            final_pred = final_pred + pred_sum
+        return final_pred
+
+    def forward_train(self,
+                      x,
+                      img_metas,
+                      proposal_list,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None,
+                      gt_masks=None,
+                      gt_semantic_seg=None):
+        """Forward function for training.
+
+        Runs the optional semantic head once, then all stages in order.
+
+        Args:
+            x (list[Tensor]): list of multi-level img features.
+            img_metas (list[dict]): list of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                `mmdet/datasets/pipelines/formatting.py:Collect`.
+            proposal_list (list[Tensors]): list of region proposals.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            gt_bboxes_ignore (None, list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss.
+            gt_masks (None, Tensor) : true segmentation masks for each box
+                used if the architecture supports a segmentation task.
+            gt_semantic_seg (None, list[Tensor]): semantic segmentation masks
+                used if the architecture supports semantic segmentation task.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        # semantic segmentation part
+        # 2 outputs: segmentation prediction and embedded features
+        losses = dict()
+        if self.with_semantic:
+            semantic_pred, semantic_feat = self.semantic_head(x)
+            loss_seg = self.semantic_head.loss(semantic_pred, gt_semantic_seg)
+            losses['loss_semantic_seg'] = loss_seg
+        else:
+            semantic_feat = None
+
+        for i in range(self.num_stages):
+            self.current_stage = i
+            rcnn_train_cfg = self.train_cfg[i]
+            lw = self.stage_loss_weights[i]
+
+            # assign gts and sample proposals
+            sampling_results = []
+            bbox_assigner = self.bbox_assigner[i]
+            bbox_sampler = self.bbox_sampler[i]
+            num_imgs = len(img_metas)
+            if gt_bboxes_ignore is None:
+                gt_bboxes_ignore = [None for _ in range(num_imgs)]
+
+            for j in range(num_imgs):
+                assign_result = bbox_assigner.assign(proposal_list[j],
+                                                     gt_bboxes[j],
+                                                     gt_bboxes_ignore[j],
+                                                     gt_labels[j])
+                sampling_result = bbox_sampler.sample(
+                    assign_result,
+                    proposal_list[j],
+                    gt_bboxes[j],
+                    gt_labels[j],
+                    feats=[lvl_feat[j][None] for lvl_feat in x])
+                sampling_results.append(sampling_result)
+
+            # bbox head forward and loss
+            bbox_results = \
+                self._bbox_forward_train(
+                    i, x, sampling_results, gt_bboxes, gt_labels,
+                    rcnn_train_cfg, semantic_feat)
+            # element 0 of the bbox targets: labels passed to `refine_bboxes`
+            roi_labels = bbox_results['bbox_targets'][0]
+
+            # only entries named '*loss*' are scaled by the stage weight
+            for name, value in bbox_results['loss_bbox'].items():
+                losses[f's{i}.{name}'] = (
+                    value * lw if 'loss' in name else value)
+
+            # mask head forward and loss
+            if self.with_mask:
+                # interleaved execution: use regressed bboxes by the box branch
+                # to train the mask branch
+                # (`proposal_list` is rebound, so later stages see them too)
+                if self.interleaved:
+                    pos_is_gts = [res.pos_is_gt for res in sampling_results]
+                    with torch.no_grad():
+                        proposal_list = self.bbox_head[i].refine_bboxes(
+                            bbox_results['rois'], roi_labels,
+                            bbox_results['bbox_pred'], pos_is_gts, img_metas)
+                        # re-assign and re-sample on the refined proposals
+                        sampling_results = []
+                        for j in range(num_imgs):
+                            assign_result = bbox_assigner.assign(
+                                proposal_list[j], gt_bboxes[j],
+                                gt_bboxes_ignore[j], gt_labels[j])
+                            sampling_result = bbox_sampler.sample(
+                                assign_result,
+                                proposal_list[j],
+                                gt_bboxes[j],
+                                gt_labels[j],
+                                feats=[lvl_feat[j][None] for lvl_feat in x])
+                            sampling_results.append(sampling_result)
+                mask_results = self._mask_forward_train(
+                    i, x, sampling_results, gt_masks, rcnn_train_cfg,
+                    semantic_feat)
+                for name, value in mask_results['loss_mask'].items():
+                    losses[f's{i}.{name}'] = (
+                        value * lw if 'loss' in name else value)
+
+            # refine bboxes (same as Cascade R-CNN)
+            # (interleaved mode refines them inside the mask branch instead)
+            if i < self.num_stages - 1 and not self.interleaved:
+                pos_is_gts = [res.pos_is_gt for res in sampling_results]
+                with torch.no_grad():
+                    proposal_list = self.bbox_head[i].refine_bboxes(
+                        bbox_results['rois'], roi_labels,
+                        bbox_results['bbox_pred'], pos_is_gts, img_metas)
+
+        return losses
+
+    def simple_test(self, x, proposal_list, img_metas, rescale=False):
+        """Test without augmentation.
+
+        Args:
+            x (tuple[Tensor]): Features from upstream network. Each
+                has shape (batch_size, c, h, w).
+            proposal_list (list(Tensor)): Proposals from rpn head.
+                Each has shape (num_proposals, 5), last dimension
+                5 represent (x1, y1, x2, y2, score).
+            img_metas (list[dict]): Meta information of images.
+            rescale (bool): Whether to rescale the results to
+                the original image. Default: True.
+
+        Returns:
+            list[list[np.ndarray]] or list[tuple]: When no mask branch,
+            it is bbox results of each image and classes with type
+            `list[list[np.ndarray]]`. The outer list
+            corresponds to each image. The inner list
+            corresponds to each class. When the model has mask branch,
+            it contains bbox results and mask results.
+            The outer list corresponds to each image, and first element
+            of tuple is bbox results, second element is mask results.
+        """
+        if self.with_semantic:
+            _, semantic_feat = self.semantic_head(x)
+        else:
+            semantic_feat = None
+
+        num_imgs = len(proposal_list)
+        img_shapes = tuple(meta['img_shape'] for meta in img_metas)
+        ori_shapes = tuple(meta['ori_shape'] for meta in img_metas)
+        scale_factors = tuple(meta['scale_factor'] for meta in img_metas)
+
+        # "ms" in variable names means multi-stage
+        ms_bbox_result = {}
+        ms_segm_result = {}
+        ms_scores = []
+        rcnn_test_cfg = self.test_cfg
+
+        rois = bbox2roi(proposal_list)
+
+        if rois.shape[0] == 0:
+            # There is no proposal in the whole batch
+            bbox_results = [[
+                np.zeros((0, 5), dtype=np.float32)
+                for _ in range(self.bbox_head[-1].num_classes)
+            ]] * num_imgs
+
+            if self.with_mask:
+                mask_classes = self.mask_head[-1].num_classes
+                segm_results = [[[] for _ in range(mask_classes)]
+                                for _ in range(num_imgs)]
+                results = list(zip(bbox_results, segm_results))
+            else:
+                results = bbox_results
+
+            return results
+
+        for i in range(self.num_stages):
+            bbox_head = self.bbox_head[i]
+            bbox_results = self._bbox_forward(
+                i, x, rois, semantic_feat=semantic_feat)
+            # split batch bbox prediction back to each image
+            cls_score = bbox_results['cls_score']
+            bbox_pred = bbox_results['bbox_pred']
+            num_proposals_per_img = tuple(len(p) for p in proposal_list)
+            rois = rois.split(num_proposals_per_img, 0)
+            cls_score = cls_score.split(num_proposals_per_img, 0)
+            bbox_pred = bbox_pred.split(num_proposals_per_img, 0)
+            ms_scores.append(cls_score)
+
+            if i < self.num_stages - 1:
+                refine_rois_list = []
+                for j in range(num_imgs):
+                    if rois[j].shape[0] > 0:
+                        bbox_label = cls_score[j][:, :-1].argmax(dim=1)
+                        refine_rois = bbox_head.regress_by_class(
+                            rois[j], bbox_label, bbox_pred[j], img_metas[j])
+                        refine_rois_list.append(refine_rois)
+                rois = torch.cat(refine_rois_list)
+
+        # average scores of each image by stages
+        cls_score = [
+            sum([score[i] for score in ms_scores]) / float(len(ms_scores))
+            for i in range(num_imgs)
+        ]
+
+        # apply bbox post-processing to each image individually
+        det_bboxes = []
+        det_labels = []
+        for i in range(num_imgs):
+            det_bbox, det_label = self.bbox_head[-1].get_bboxes(
+                rois[i],
+                cls_score[i],
+                bbox_pred[i],
+                img_shapes[i],
+                scale_factors[i],
+                rescale=rescale,
+                cfg=rcnn_test_cfg)
+            det_bboxes.append(det_bbox)
+            det_labels.append(det_label)
+        bbox_result = [
+            bbox2result(det_bboxes[i], det_labels[i],
+                        self.bbox_head[-1].num_classes)
+            for i in range(num_imgs)
+        ]
+        ms_bbox_result['ensemble'] = bbox_result
+
+        if self.with_mask:
+            if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes):
+                mask_classes = self.mask_head[-1].num_classes
+                segm_results = [[[] for _ in range(mask_classes)]
+                                for _ in range(num_imgs)]
+            else:
+                if rescale and not isinstance(scale_factors[0], float):
+                    scale_factors = [
+                        torch.from_numpy(scale_factor).to(det_bboxes[0].device)
+                        for scale_factor in scale_factors
+                    ]
+                _bboxes = [
+                    det_bboxes[i][:, :4] *
+                    scale_factors[i] if rescale else det_bboxes[i]
+                    for i in range(num_imgs)
+                ]
+                mask_rois = bbox2roi(_bboxes)
+                aug_masks = []
+                mask_roi_extractor = self.mask_roi_extractor[-1]
+                mask_feats = mask_roi_extractor(
+                    x[:len(mask_roi_extractor.featmap_strides)], mask_rois)
+                if self.with_semantic and 'mask' in self.semantic_fusion:
+                    mask_semantic_feat = self.semantic_roi_extractor(
+                        [semantic_feat], mask_rois)
+                    mask_feats = mask_feats + mask_semantic_feat
+                last_feat = None
+
+                num_bbox_per_img = tuple(len(_bbox) for _bbox in _bboxes)
+                for i in range(self.num_stages):
+                    mask_head = self.mask_head[i]
+                    if self.mask_info_flow:
+                        mask_pred, last_feat = mask_head(mask_feats, last_feat)
+                    else:
+                        mask_pred = mask_head(mask_feats)
+
+                    # split batch mask prediction back to each image
+                    mask_pred = mask_pred.split(num_bbox_per_img, 0)
+                    aug_masks.append(
+                        [mask.sigmoid().cpu().numpy() for mask in mask_pred])
+
+                # apply mask post-processing to each image individually
+                segm_results = []
+                for i in range(num_imgs):
+                    if det_bboxes[i].shape[0] == 0:
+                        segm_results.append(
+                            [[]
+                             for _ in range(self.mask_head[-1].num_classes)])
+                    else:
+                        aug_mask = [mask[i] for mask in aug_masks]
+                        merged_mask = merge_aug_masks(
+                            aug_mask, [[img_metas[i]]] * self.num_stages,
+                            rcnn_test_cfg)
+                        segm_result = self.mask_head[-1].get_seg_masks(
+                            merged_mask, _bboxes[i], det_labels[i],
+                            rcnn_test_cfg, ori_shapes[i], scale_factors[i],
+                            rescale)
+                        segm_results.append(segm_result)
+            ms_segm_result['ensemble'] = segm_results
+
+        if self.with_mask:
+            results = list(
+                zip(ms_bbox_result['ensemble'], ms_segm_result['ensemble']))
+        else:
+            results = ms_bbox_result['ensemble']
+
+        return results
+
+    def aug_test(self, img_feats, proposal_list, img_metas, rescale=False):
+        """Test with augmentations.
+
+        Boxes are predicted on every augmented view, merged and filtered by
+        NMS; masks are then predicted on every view for the surviving boxes
+        and merged as well.
+
+        Args:
+            img_feats (list): Features of each augmented view.
+            proposal_list (list[Tensor]): Only ``proposal_list[0]`` is read;
+                its boxes are re-mapped onto each augmented view.
+            img_metas (list[list[dict]]): Meta info of each augmented view;
+                this method indexes only the first dict of each inner list.
+            rescale (bool): Kept for interface compatibility; this method
+                never reads it. Default: False.
+
+        Returns:
+            list: ``[(bbox_result, segm_result)]`` when the head has a mask
+                branch, otherwise ``[bbox_result]``.
+        """
+        if self.with_semantic:
+            semantic_feats = [self.semantic_head(f)[1] for f in img_feats]
+        else:
+            semantic_feats = [None] * len(img_metas)
+
+        rcnn_test_cfg = self.test_cfg
+        aug_bboxes = []
+        aug_scores = []
+        for x, img_meta, semantic in zip(img_feats, img_metas, semantic_feats):
+            # only one image in the batch
+            meta = img_meta[0]
+            img_shape = meta['img_shape']
+            scale_factor = meta['scale_factor']
+            proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
+                                     scale_factor, meta['flip'],
+                                     meta['flip_direction'])
+            rois = bbox2roi([proposals])
+
+            if rois.shape[0] == 0:
+                # There is no proposal in the single image
+                aug_bboxes.append(rois.new_zeros(0, 4))
+                aug_scores.append(rois.new_zeros(0, 1))
+                continue
+
+            # "ms" in variable names means multi-stage
+            ms_scores = []
+            for i in range(self.num_stages):
+                bbox_results = self._bbox_forward(
+                    i, x, rois, semantic_feat=semantic)
+                cls_score = bbox_results['cls_score']
+                ms_scores.append(cls_score)
+
+                if i < self.num_stages - 1:
+                    # Drop the last score column before the argmax, exactly
+                    # as simple_test does (in mmdet 2.x that column is the
+                    # background class), so the label is a foreground class.
+                    bbox_label = cls_score[:, :-1].argmax(dim=1)
+                    rois = self.bbox_head[i].regress_by_class(
+                        rois, bbox_label, bbox_results['bbox_pred'],
+                        img_meta[0])
+
+            cls_score = sum(ms_scores) / float(len(ms_scores))
+            bboxes, scores = self.bbox_head[-1].get_bboxes(
+                rois, cls_score, bbox_results['bbox_pred'], img_shape,
+                scale_factor, rescale=False, cfg=None)
+            aug_bboxes.append(bboxes)
+            aug_scores.append(scores)
+
+        # after merging, bboxes will be rescaled to the original image size
+        merged_bboxes, merged_scores = merge_aug_bboxes(
+            aug_bboxes, aug_scores, img_metas, rcnn_test_cfg)
+        det_bboxes, det_labels = multiclass_nms(merged_bboxes, merged_scores,
+                                                rcnn_test_cfg.score_thr,
+                                                rcnn_test_cfg.nms,
+                                                rcnn_test_cfg.max_per_img)
+
+        bbox_result = bbox2result(det_bboxes, det_labels,
+                                  self.bbox_head[-1].num_classes)
+        if not self.with_mask:
+            return [bbox_result]
+
+        if det_bboxes.shape[0] == 0:
+            segm_result = [[] for _ in range(self.mask_head[-1].num_classes)]
+            return [(bbox_result, segm_result)]
+
+        # Same fusion rule as simple_test: semantic features are only added
+        # to the mask features when 'mask' is listed in semantic_fusion.
+        fuse_semantic = self.with_semantic and 'mask' in self.semantic_fusion
+        mask_roi_extractor = self.mask_roi_extractor[-1]
+        num_mask_feats = len(mask_roi_extractor.featmap_strides)
+        aug_masks = []
+        aug_img_metas = []
+        for x, img_meta, semantic in zip(img_feats, img_metas, semantic_feats):
+            meta = img_meta[0]
+            _bboxes = bbox_mapping(det_bboxes[:, :4], meta['img_shape'],
+                                   meta['scale_factor'], meta['flip'],
+                                   meta['flip_direction'])
+            mask_rois = bbox2roi([_bboxes])
+            mask_feats = mask_roi_extractor(x[:num_mask_feats], mask_rois)
+            if fuse_semantic:
+                mask_semantic_feat = self.semantic_roi_extractor(
+                    [semantic], mask_rois)
+                roi_size = mask_feats.shape[-2:]
+                if mask_semantic_feat.shape[-2:] != roi_size:
+                    mask_semantic_feat = F.adaptive_avg_pool2d(
+                        mask_semantic_feat, roi_size)
+                mask_feats = mask_feats + mask_semantic_feat
+            last_feat = None
+            for i in range(self.num_stages):
+                mask_head = self.mask_head[i]
+                if self.mask_info_flow:
+                    mask_pred, last_feat = mask_head(mask_feats, last_feat)
+                else:
+                    mask_pred = mask_head(mask_feats)
+                aug_masks.append(mask_pred.sigmoid().cpu().numpy())
+                aug_img_metas.append(img_meta)
+        merged_masks = merge_aug_masks(aug_masks, aug_img_metas,
+                                       rcnn_test_cfg)
+
+        ori_shape = img_metas[0][0]['ori_shape']
+        segm_result = self.mask_head[-1].get_seg_masks(
+            merged_masks, det_bboxes, det_labels, rcnn_test_cfg, ori_shape,
+            scale_factor=1.0, rescale=False)
+        return [(bbox_result, segm_result)]
diff --git a/mmdet/models/roi_heads/mask_heads/__init__.py b/mmdet/models/roi_heads/mask_heads/__init__.py
new file mode 100755
index 0000000..48a5d42
--- /dev/null
+++ b/mmdet/models/roi_heads/mask_heads/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .coarse_mask_head import CoarseMaskHead
+from .dynamic_mask_head import DynamicMaskHead
+from .fcn_mask_head import FCNMaskHead
+from .feature_relay_head import FeatureRelayHead
+from .fused_semantic_head import FusedSemanticHead
+from .global_context_head import GlobalContextHead
+from .grid_head import GridHead
+from .htc_mask_head import HTCMaskHead
+from .mask_point_head import MaskPointHead
+from .maskiou_head import MaskIoUHead
+from .scnet_mask_head import SCNetMaskHead
+from .scnet_semantic_head import SCNetSemanticHead
+
+__all__ = [
+    'FCNMaskHead', 'HTCMaskHead', 'FusedSemanticHead', 'GridHead',
+    'MaskIoUHead', 'CoarseMaskHead', 'MaskPointHead', 'SCNetMaskHead',
+    'SCNetSemanticHead', 'GlobalContextHead', 'FeatureRelayHead',
+    'DynamicMaskHead'
+]
diff --git a/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py b/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py
new file mode 100755
index 0000000..946254c
--- /dev/null
+++ b/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py
@@ -0,0 +1,100 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.cnn import ConvModule, Linear
+from mmcv.runner import ModuleList, auto_fp16
+
+from mmdet.models.builder import HEADS
+from .fcn_mask_head import FCNMaskHead
+
+
+@HEADS.register_module()
+class CoarseMaskHead(FCNMaskHead):
+    """Coarse mask head used in PointRend.
+
+    Compared with standard ``FCNMaskHead``, ``CoarseMaskHead`` will downsample
+    the input feature map instead of upsample it.
+
+    Args:
+        num_convs (int): Number of conv layers in the head. Default: 0.
+        num_fcs (int): Number of fc layers in the head. Default: 2.
+        fc_out_channels (int): Number of output channels of fc layer.
+            Default: 1024.
+        downsample_factor (int): The factor that feature map is downsampled by.
+            Default: 2.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+        *arg, **kwarg: Forwarded to ``FCNMaskHead.__init__``.
+    """
+
+    def __init__(self,
+                 num_convs=0,
+                 num_fcs=2,
+                 fc_out_channels=1024,
+                 downsample_factor=2,
+                 init_cfg=dict(
+                     type='Xavier',
+                     override=[
+                         dict(name='fcs'),
+                         dict(type='Constant', val=0.001, name='fc_logits')
+                     ]),
+                 *arg,
+                 **kwarg):
+        # FCNMaskHead asserts init_cfg is None; the real one is set afterwards
+        super(CoarseMaskHead, self).__init__(
+            *arg,
+            num_convs=num_convs,
+            upsample_cfg=dict(type=None),
+            init_cfg=None,
+            **kwarg)
+        self.init_cfg = init_cfg
+        self.num_fcs = num_fcs
+        assert self.num_fcs > 0
+        self.fc_out_channels = fc_out_channels
+        self.downsample_factor = downsample_factor
+        assert self.downsample_factor >= 1
+        # remove conv_logit
+        delattr(self, 'conv_logits')
+
+        # channel count of the map reaching the fcs (in_channels if no conv)
+        feat_channels = (
+            self.conv_out_channels if self.num_convs > 0 else self.in_channels)
+        if downsample_factor > 1:
+            self.downsample_conv = ConvModule(
+                feat_channels,
+                self.conv_out_channels,
+                kernel_size=downsample_factor,
+                stride=downsample_factor,
+                padding=0,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg)
+            feat_channels = self.conv_out_channels
+        else:
+            self.downsample_conv = None
+
+        self.output_size = (self.roi_feat_size[0] // downsample_factor,
+                            self.roi_feat_size[1] // downsample_factor)
+        self.output_area = self.output_size[0] * self.output_size[1]
+
+        # the first fc consumes the flattened (channels * area) feature map
+        fc_in_channels = feat_channels * self.output_area
+        self.fcs = ModuleList()
+        for _ in range(num_fcs):
+            self.fcs.append(Linear(fc_in_channels, self.fc_out_channels))
+            fc_in_channels = self.fc_out_channels
+        output_channels = self.num_classes * self.output_area
+        self.fc_logits = Linear(self.fc_out_channels, output_channels)
+
+    def init_weights(self):
+        """Skip ``FCNMaskHead.init_weights``; it needs ``conv_logits``."""
+        super(FCNMaskHead, self).init_weights()
+
+    @auto_fp16()
+    def forward(self, x):
+        """Return mask logits of shape (n, num_classes, *output_size)."""
+        for conv in self.convs:
+            x = conv(x)
+        if self.downsample_conv is not None:
+            x = self.downsample_conv(x)
+        x = x.flatten(1)
+        for fc in self.fcs:
+            x = self.relu(fc(x))
+        return self.fc_logits(x).view(
+            x.size(0), self.num_classes, *self.output_size)
diff --git a/mmdet/models/roi_heads/mask_heads/dynamic_mask_head.py b/mmdet/models/roi_heads/mask_heads/dynamic_mask_head.py
new file mode 100755
index 0000000..5bbe7ee
--- /dev/null
+++ b/mmdet/models/roi_heads/mask_heads/dynamic_mask_head.py
@@ -0,0 +1,147 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.runner import auto_fp16, force_fp32
+
+from mmdet.core import mask_target
+from mmdet.models.builder import HEADS
+from mmdet.models.dense_heads.atss_head import reduce_mean
+from mmdet.models.utils import build_transformer
+from .fcn_mask_head import FCNMaskHead
+
+
+@HEADS.register_module()
+class DynamicMaskHead(FCNMaskHead):
+    r"""Dynamic Mask Head for
+    `Instances as Queries <http://arxiv.org/abs/2105.01928>`_
+
+    Args:
+        num_convs (int): Number of convolution layer.
+            Defaults to 4.
+        roi_feat_size (int): The output size of RoI extractor,
+            Defaults to 14.
+        in_channels (int): Input feature channels.
+            Defaults to 256.
+        conv_kernel_size (int): Kernel size of convolution layers.
+            Defaults to 3.
+        conv_out_channels (int): Output channels of convolution layers.
+            Defaults to 256.
+        num_classes (int): Number of classes.
+            Defaults to 80
+        class_agnostic (bool): Whether generate class agnostic prediction.
+            Only False is supported. Defaults to False.
+        upsample_cfg (dict): The config for upsample layer.
+        conv_cfg (dict): The convolution layer config.
+        norm_cfg (dict): The norm layer config.
+        dynamic_conv_cfg (dict): The dynamic convolution layer config.
+        loss_mask (dict): The config for mask loss.
+    """
+
+    def __init__(self,
+                 num_convs=4,
+                 roi_feat_size=14,
+                 in_channels=256,
+                 conv_kernel_size=3,
+                 conv_out_channels=256,
+                 num_classes=80,
+                 class_agnostic=False,
+                 upsample_cfg=dict(type='deconv', scale_factor=2),
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 dynamic_conv_cfg=dict(
+                     type='DynamicConv',
+                     in_channels=256,
+                     feat_channels=64,
+                     out_channels=256,
+                     input_feat_shape=14,
+                     with_proj=False,
+                     act_cfg=dict(type='ReLU', inplace=True),
+                     norm_cfg=dict(type='LN')),
+                 loss_mask=dict(type='DiceLoss', loss_weight=8.0),
+                 **kwargs):
+        super(DynamicMaskHead, self).__init__(
+            num_convs=num_convs, roi_feat_size=roi_feat_size,
+            in_channels=in_channels, conv_kernel_size=conv_kernel_size,
+            conv_out_channels=conv_out_channels, num_classes=num_classes,
+            class_agnostic=class_agnostic, upsample_cfg=upsample_cfg,
+            conv_cfg=conv_cfg, norm_cfg=norm_cfg, loss_mask=loss_mask,
+            **kwargs)
+        assert class_agnostic is False, \
+            'DynamicMaskHead only support class_agnostic=False'
+        self.fp16_enabled = False
+
+        self.instance_interactive_conv = build_transformer(dynamic_conv_cfg)
+
+    def init_weights(self):
+        """Use xavier initialization for every parameter with more than one
+        dimension and set the bias of ``conv_logits`` to zero."""
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+        # set once, after the loop; the value does not depend on ``p``
+        nn.init.constant_(self.conv_logits.bias, 0.)
+
+    @auto_fp16()
+    def forward(self, roi_feat, proposal_feat):
+        """Forward function of DynamicMaskHead.
+
+        Args:
+            roi_feat (Tensor): Roi-pooling features with shape
+                (batch_size*num_proposals, feature_dimensions,
+                pooling_h , pooling_w).
+            proposal_feat (Tensor): Intermediate feature get from
+                diihead in last stage, has shape
+                (batch_size*num_proposals, feature_dimensions)
+
+        Returns:
+            mask_pred (Tensor): Predicted foreground masks with shape
+                (batch_size*num_proposals, num_classes,
+                                        pooling_h*2, pooling_w*2).
+        """
+
+        proposal_feat = proposal_feat.reshape(-1, self.in_channels)
+        proposal_feat_iic = self.instance_interactive_conv(
+            proposal_feat, roi_feat)
+
+        x = proposal_feat_iic.permute(0, 2, 1).reshape(roi_feat.size())
+
+        for conv in self.convs:
+            x = conv(x)
+        if self.upsample is not None:
+            x = self.upsample(x)
+            if self.upsample_method == 'deconv':
+                x = self.relu(x)
+        mask_pred = self.conv_logits(x)
+        return mask_pred
+
+    @force_fp32(apply_to=('mask_pred', ))
+    def loss(self, mask_pred, mask_targets, labels):
+        """Compute the mask loss and return it as ``{'loss_mask': ...}``.
+
+        Only ``mask_pred[i, labels[i]]``, the channel of the labelled class,
+        is passed through a sigmoid and handed to ``self.loss_mask``.
+        """
+        num_pos = labels.new_ones(labels.size()).float().sum()
+        avg_factor = torch.clamp(reduce_mean(num_pos), min=1.).item()
+        loss = dict()
+        if mask_pred.size(0) == 0:
+            loss_mask = mask_pred.sum()
+        else:
+            # row indices created directly as integers on the labels' device
+            inds = torch.arange(labels.numel(), device=labels.device)
+            loss_mask = self.loss_mask(
+                mask_pred[inds, labels, ...].sigmoid(),
+                mask_targets,
+                avg_factor=avg_factor)
+        loss['loss_mask'] = loss_mask
+        return loss
+
+    def get_targets(self, sampling_results, gt_masks, rcnn_train_cfg):
+        """Build the mask targets of the sampled positive proposals."""
+        pos_proposals = [res.pos_bboxes for res in sampling_results]
+        pos_assigned_gt_inds = [
+            res.pos_assigned_gt_inds for res in sampling_results
+        ]
+        mask_targets = mask_target(pos_proposals, pos_assigned_gt_inds,
+                                   gt_masks, rcnn_train_cfg)
+        return mask_targets
diff --git a/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py b/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py
new file mode 100755
index 0000000..355d882
--- /dev/null
+++ b/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py
@@ -0,0 +1,412 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from warnings import warn
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, build_conv_layer, build_upsample_layer
+from mmcv.ops.carafe import CARAFEPack
+from mmcv.runner import BaseModule, ModuleList, auto_fp16, force_fp32
+from torch.nn.modules.utils import _pair
+
+from mmdet.core import mask_target
+from mmdet.models.builder import HEADS, build_loss
+
+BYTES_PER_FLOAT = 4
+# TODO: This memory limit may be too much or too little. It would be better to
+# determine it based on available resources.
+GPU_MEM_LIMIT = 1024**3  # 1 GB memory limit
+
+
+@HEADS.register_module()
+class FCNMaskHead(BaseModule):
+
+    def __init__(self,
+                 num_convs=4,
+                 roi_feat_size=14,
+                 in_channels=256,
+                 conv_kernel_size=3,
+                 conv_out_channels=256,
+                 num_classes=80,
+                 class_agnostic=False,
+                 upsample_cfg=dict(type='deconv', scale_factor=2),
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 predictor_cfg=dict(type='Conv'),
+                 loss_mask=dict(
+                     type='CrossEntropyLoss', use_mask=True, loss_weight=1.0),
+                 init_cfg=None):
+        """Build the layers of the head.
+
+        The conv stack, the optional upsample layer and the ``conv_logits``
+        predictor are created here, in that order, from the given configs.
+        """
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super(FCNMaskHead, self).__init__(init_cfg)
+        # copy first: ``scale_factor`` is popped from this dict further down
+        self.upsample_cfg = upsample_cfg.copy()
+        if self.upsample_cfg['type'] not in (None, 'deconv', 'nearest',
+                                             'bilinear', 'carafe'):
+            raise ValueError(
+                f'Invalid upsample method {self.upsample_cfg["type"]}, '
+                'accepted methods are "deconv", "nearest", "bilinear", '
+                '"carafe"')
+        self.num_convs = num_convs
+        # WARN: roi_feat_size is reserved and not used
+        self.roi_feat_size = _pair(roi_feat_size)
+        self.in_channels = in_channels
+        self.conv_kernel_size = conv_kernel_size
+        self.conv_out_channels = conv_out_channels
+        self.upsample_method = self.upsample_cfg.get('type')
+        self.scale_factor = self.upsample_cfg.pop('scale_factor', None)
+        self.num_classes = num_classes
+        self.class_agnostic = class_agnostic
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.predictor_cfg = predictor_cfg
+        self.fp16_enabled = False
+        self.loss_mask = build_loss(loss_mask)
+
+        self.convs = ModuleList()
+        for idx in range(self.num_convs):
+            conv_in = self.conv_out_channels if idx else self.in_channels
+            padding = (self.conv_kernel_size - 1) // 2
+            self.convs.append(
+                ConvModule(
+                    conv_in,
+                    self.conv_out_channels,
+                    self.conv_kernel_size,
+                    padding=padding,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg))
+        upsample_in_channels = (
+            self.conv_out_channels if self.num_convs > 0 else in_channels)
+        upsample_cfg_ = self.upsample_cfg.copy()
+        if self.upsample_method == 'deconv':
+            upsample_cfg_.update(
+                in_channels=upsample_in_channels,
+                out_channels=self.conv_out_channels,
+                kernel_size=self.scale_factor,
+                stride=self.scale_factor)
+        elif self.upsample_method == 'carafe':
+            upsample_cfg_.update(
+                channels=upsample_in_channels, scale_factor=self.scale_factor)
+        elif self.upsample_method is not None:
+            # 'nearest' or 'bilinear'; suppress warnings
+            upsample_cfg_.update(
+                scale_factor=self.scale_factor,
+                mode=self.upsample_method,
+                align_corners=(None if self.upsample_method == 'nearest'
+                               else False))
+        self.upsample = (None if self.upsample_method is None else
+                         build_upsample_layer(upsample_cfg_))
+
+        deconv = self.upsample_method == 'deconv'
+        logits_in_channel = (
+            self.conv_out_channels if deconv else upsample_in_channels)
+        out_channels = 1 if self.class_agnostic else self.num_classes
+        self.conv_logits = build_conv_layer(self.predictor_cfg,
+                                            logits_in_channel, out_channels, 1)
+        self.relu = nn.ReLU(inplace=True)
+        self.debug_imgs = None
+
+    def init_weights(self):
+        """Run the base init, then init the upsample and logits layers."""
+        super(FCNMaskHead, self).init_weights()
+        for layer in (self.upsample, self.conv_logits):
+            if isinstance(layer, CARAFEPack):
+                layer.init_weights()
+            elif (layer is not None and hasattr(layer, 'weight')
+                  and hasattr(layer, 'bias')):
+                nn.init.kaiming_normal_(
+                    layer.weight, mode='fan_out', nonlinearity='relu')
+                nn.init.constant_(layer.bias, 0)
+
+    @auto_fp16()
+    def forward(self, x):
+        """Run convs, the optional upsample layer and ``conv_logits``."""
+        for conv in self.convs:
+            x = conv(x)
+        if self.upsample is not None:
+            x = self.upsample(x)
+            if self.upsample_method == 'deconv':
+                x = self.relu(x)
+        return self.conv_logits(x)
+
+    def get_targets(self, sampling_results, gt_masks, rcnn_train_cfg):
+        """Build the mask targets of the sampled positive proposals."""
+        pos_proposals, pos_assigned_gt_inds = [], []
+        for res in sampling_results:
+            pos_proposals.append(res.pos_bboxes)
+            pos_assigned_gt_inds.append(res.pos_assigned_gt_inds)
+        return mask_target(pos_proposals, pos_assigned_gt_inds, gt_masks,
+                           rcnn_train_cfg)
+
+    @force_fp32(apply_to=('mask_pred', ))
+    def loss(self, mask_pred, mask_targets, labels):
+        """Compute the mask loss of the given predictions.
+
+        ``mask_pred``, ``mask_targets`` and ``labels`` are handed to
+        ``self.loss_mask``; ``labels`` is replaced by zeros when
+        ``class_agnostic`` is set. Returns ``{'loss_mask': loss}``.
+
+        Example:
+            >>> from mmdet.models.roi_heads.mask_heads.fcn_mask_head import *  # NOQA
+            >>> N = 7  # N = number of extracted ROIs
+            >>> C, H, W = 11, 32, 32
+            >>> # Create example instance of FCN Mask Head.
+            >>> # There are lots of variations depending on the configuration
+            >>> self = FCNMaskHead(num_classes=C, num_convs=1)
+            >>> inputs = torch.rand(N, self.in_channels, H, W)
+            >>> mask_pred = self.forward(inputs)
+            >>> sf = self.scale_factor
+            >>> labels = torch.randint(0, C, size=(N,))
+            >>> # With the default properties the mask targets should indicate
+            >>> # a (potentially soft) single-class label
+            >>> mask_targets = torch.rand(N, H * sf, W * sf)
+            >>> loss = self.loss(mask_pred, mask_targets, labels)
+            >>> print('loss = {!r}'.format(loss))
+        """
+        if mask_pred.size(0) == 0:
+            # nothing to supervise; summing the empty tensor gives zero
+            return dict(loss_mask=mask_pred.sum())
+        if self.class_agnostic:
+            labels = torch.zeros_like(labels)
+        return dict(loss_mask=self.loss_mask(mask_pred, mask_targets, labels))
+
+    def get_seg_masks(self, mask_pred, det_bboxes, det_labels, rcnn_test_cfg,
+                      ori_shape, scale_factor, rescale):
+        """Get segmentation masks from mask_pred and bboxes.
+
+        Args:
+            mask_pred (Tensor or ndarray): shape (n, #class, h, w).
+                For single-scale testing, mask_pred is the direct output of
+                model, whose type is Tensor, while for multi-scale testing,
+                it will be converted to numpy array outside of this method.
+            det_bboxes (Tensor): shape (n, 4/5)
+            det_labels (Tensor): shape (n, )
+            rcnn_test_cfg (dict): rcnn testing config
+            ori_shape (Tuple): original image height and width, shape (2,)
+            scale_factor(ndarray | Tensor): If ``rescale is True``, box
+                coordinates are divided by this scale factor to fit
+                ``ori_shape``.
+            rescale (bool): If True, the resulting masks will be rescaled to
+                ``ori_shape``.
+
+        Returns:
+            list[list]: encoded masks. The c-th item in the outer list
+                corresponds to the c-th class. Given the c-th outer list, the
+                i-th item in that inner list is the mask for the i-th box with
+                class label c.
+
+        Example:
+            >>> import mmcv
+            >>> from mmdet.models.roi_heads.mask_heads.fcn_mask_head import *  # NOQA
+            >>> N = 7  # N = number of extracted ROIs
+            >>> C, H, W = 11, 32, 32
+            >>> # Create example instance of FCN Mask Head.
+            >>> self = FCNMaskHead(num_classes=C, num_convs=0)
+            >>> inputs = torch.rand(N, self.in_channels, H, W)
+            >>> mask_pred = self.forward(inputs)
+            >>> # Each input is associated with some bounding box
+            >>> det_bboxes = torch.Tensor([[1, 1, 42, 42 ]] * N)
+            >>> det_labels = torch.randint(0, C, size=(N,))
+            >>> rcnn_test_cfg = mmcv.Config({'mask_thr_binary': 0, })
+            >>> ori_shape = (H * 4, W * 4)
+            >>> scale_factor = torch.FloatTensor((1, 1))
+            >>> rescale = False
+            >>> # Encoded masks are a list for each category.
+            >>> encoded_masks = self.get_seg_masks(
+            >>>     mask_pred, det_bboxes, det_labels, rcnn_test_cfg, ori_shape,
+            >>>     scale_factor, rescale
+            >>> )
+            >>> assert len(encoded_masks) == C
+            >>> assert sum(list(map(len, encoded_masks))) == N
+        """
+        if isinstance(mask_pred, torch.Tensor):
+            mask_pred = mask_pred.sigmoid()
+        else:
+            # In AugTest, has been activated before
+            mask_pred = det_bboxes.new_tensor(mask_pred)
+
+        device = mask_pred.device
+        cls_segms = [[] for _ in range(self.num_classes)
+                     ]  # BG is not included in num_classes
+        bboxes = det_bboxes[:, :4]
+        labels = det_labels
+
+        # In most cases, scale_factor should have been
+        # converted to Tensor when rescale the bbox
+        if not isinstance(scale_factor, torch.Tensor):
+            if isinstance(scale_factor, float):
+                scale_factor = np.array([scale_factor] * 4)
+                warn('Scale_factor should be a Tensor or ndarray '
+                     'with shape (4,), float would be deprecated. ')
+            assert isinstance(scale_factor, np.ndarray)
+            scale_factor = torch.Tensor(scale_factor)
+
+        if rescale:
+            img_h, img_w = ori_shape[:2]
+            bboxes = bboxes / scale_factor.to(bboxes)
+        else:
+            w_scale, h_scale = scale_factor[0], scale_factor[1]
+            img_h = np.round(ori_shape[0] * h_scale.item()).astype(np.int32)
+            img_w = np.round(ori_shape[1] * w_scale.item()).astype(np.int32)
+
+        N = len(mask_pred)
+        # The actual implementation split the input into chunks,
+        # and paste them chunk by chunk.
+        if device.type == 'cpu':
+            # CPU is most efficient when they are pasted one by one with
+            # skip_empty=True, so that it performs minimal number of
+            # operations.
+            num_chunks = N
+        else:
+            # GPU benefits from parallelism for larger chunks,
+            # but may have memory issue
+            # the types of img_w and img_h are np.int32,
+            # when the image resolution is large,
+            # the calculation of num_chunks will overflow.
+            # so we need to change the types of img_w and img_h to int.
+            # See https://github.com/open-mmlab/mmdetection/pull/5191
+            num_chunks = int(
+                np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT /
+                        GPU_MEM_LIMIT))
+            assert (num_chunks <=
+                    N), 'Default GPU_MEM_LIMIT is too small; try increasing it'
+        chunks = torch.chunk(torch.arange(N, device=device), num_chunks)
+
+        threshold = rcnn_test_cfg.mask_thr_binary
+        im_mask = torch.zeros(
+            N,
+            img_h,
+            img_w,
+            device=device,
+            dtype=torch.bool if threshold >= 0 else torch.uint8)
+
+        if not self.class_agnostic:
+            mask_pred = mask_pred[range(N), labels][:, None]
+
+        for inds in chunks:
+            masks_chunk, spatial_inds = _do_paste_mask(
+                mask_pred[inds],
+                bboxes[inds],
+                img_h,
+                img_w,
+                skip_empty=device.type == 'cpu')
+
+            if threshold >= 0:
+                masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool)
+            else:
+                # for visualization and debugging
+                masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8)
+
+            im_mask[(inds, ) + spatial_inds] = masks_chunk
+
+        for i in range(N):
+            cls_segms[labels[i]].append(im_mask[i].detach().cpu().numpy())
+        return cls_segms
+
+    def onnx_export(self, mask_pred, det_bboxes, det_labels, rcnn_test_cfg,
+                    ori_shape, **kwargs):
+        """Get segmentation masks from mask_pred and bboxes for ONNX export.
+
+        Args:
+            mask_pred (Tensor): shape (n, #class, h, w), before sigmoid.
+            det_bboxes (Tensor): shape (n, 4/5), only columns 0-3 are read
+            det_labels (Tensor): shape (n, ), unused if ``class_agnostic``
+            rcnn_test_cfg (dict): rcnn testing config with ``mask_thr_binary``
+            ori_shape (Tuple): original image height and width, shape (2,)
+
+        Returns:
+            Tensor: shape (N, img_h, img_w); float 0/1 if threshold >= 0.
+        """
+        # NOTE: ``**kwargs`` is accepted but never read in this method.
+        mask_pred = mask_pred.sigmoid()
+        bboxes = det_bboxes[:, :4]
+        labels = det_labels
+        # No need to consider rescale and scale_factor while exporting to ONNX
+        img_h, img_w = ori_shape[:2]
+        threshold = rcnn_test_cfg.mask_thr_binary
+        if not self.class_agnostic:
+            box_inds = torch.arange(mask_pred.shape[0])
+            mask_pred = mask_pred[box_inds, labels][:, None]
+        masks, _ = _do_paste_mask(
+            mask_pred, bboxes, img_h, img_w, skip_empty=False)
+        if threshold >= 0:
+            # should convert to float to avoid problems in TRT
+            masks = (masks >= threshold).to(dtype=torch.float)
+        return masks
+
+
+def _do_paste_mask(masks, boxes, img_h, img_w, skip_empty=True):
+    """Paste instance masks according to boxes.
+
+    This implementation is modified from
+    https://github.com/facebookresearch/detectron2/
+
+    Args:
+        masks (Tensor): N, 1, H, W
+        boxes (Tensor): N, 4
+        img_h (int): Height of the image to be pasted.
+        img_w (int): Width of the image to be pasted.
+        skip_empty (bool): Only paste masks within the region that
+            tightly bounds all boxes, and return the results for this region
+            only. An important optimization for CPU.
+
+    Returns:
+        tuple: (Tensor, tuple). The first item is the mask tensor, the second
+            one is a tuple of slice objects (empty if skip_empty is False).
+        If skip_empty == False, the whole image will be pasted. It will
+            return a mask of shape (N, img_h, img_w) and an empty tuple.
+        If skip_empty == True, only area around the mask will be pasted.
+            A mask of shape (N, h', w') and its start and end coordinates
+            in the original image will be returned.
+    """
+    # On GPU, paste all masks together (up to chunk size)
+    # by using the entire image to sample the masks
+    # Compared to pasting them one by one,
+    # this has more operations but is faster on COCO-scale dataset.
+    device = masks.device
+    if skip_empty:
+        x0_int, y0_int = torch.clamp(
+            boxes.min(dim=0).values.floor()[:2] - 1,
+            min=0).to(dtype=torch.int32)
+        x1_int = torch.clamp(
+            boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32)
+        y1_int = torch.clamp(
+            boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32)
+    else:
+        x0_int, y0_int = 0, 0
+        x1_int, y1_int = img_w, img_h
+    x0, y0, x1, y1 = torch.split(boxes, 1, dim=1)  # each is Nx1
+
+    N = masks.shape[0]
+    # pixel-center coordinates, rescaled so that each box spans [-1, 1]
+    img_y = torch.arange(y0_int, y1_int, device=device).to(torch.float32) + 0.5
+    img_x = torch.arange(x0_int, x1_int, device=device).to(torch.float32) + 0.5
+    img_y = (img_y - y0) / (y1 - y0) * 2 - 1
+    img_x = (img_x - x0) / (x1 - x0) * 2 - 1
+    # img_x, img_y have shapes (N, w), (N, h)
+    # IsInf op is not supported with ONNX<=1.7.0
+    if not torch.onnx.is_in_onnx_export():
+        if torch.isinf(img_x).any():
+            inds = torch.where(torch.isinf(img_x))
+            img_x[inds] = 0
+        if torch.isinf(img_y).any():
+            inds = torch.where(torch.isinf(img_y))
+            img_y[inds] = 0
+    # grid has shape (N, h, w, 2) holding the (x, y) sampling locations
+    gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1))
+    gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1))
+    grid = torch.stack([gx, gy], dim=3)
+
+    img_masks = F.grid_sample(
+        masks.to(dtype=torch.float32), grid, align_corners=False)
+
+    if skip_empty:
+        return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int))
+    else:
+        return img_masks[:, 0], ()
diff --git a/mmdet/models/roi_heads/mask_heads/feature_relay_head.py b/mmdet/models/roi_heads/mask_heads/feature_relay_head.py
new file mode 100755
index 0000000..452f37a
--- /dev/null
+++ b/mmdet/models/roi_heads/mask_heads/feature_relay_head.py
@@ -0,0 +1,53 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.runner import BaseModule, auto_fp16
+
+from mmdet.models.builder import HEADS
+
+
+@HEADS.register_module()
+class FeatureRelayHead(BaseModule):
+    """Feature Relay Head used in `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+    Args:
+        in_channels (int, optional): number of input channels. Default: 1024.
+        out_conv_channels (int, optional): number of channels of the output
+            feature map. Default: 256.
+        roi_feat_size (int, optional): roi feat size at box head. Default: 7.
+        scale_factor (int, optional): scale factor to match roi feat size
+            at mask head. Default: 2.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels=1024,
+                 out_conv_channels=256,
+                 roi_feat_size=7,
+                 scale_factor=2,
+                 init_cfg=dict(type='Kaiming', layer='Linear')):
+        super(FeatureRelayHead, self).__init__(init_cfg)
+        assert isinstance(roi_feat_size, int)
+
+        self.in_channels = in_channels
+        self.out_conv_channels = out_conv_channels
+        self.roi_feat_size = roi_feat_size
+        self.out_channels = (roi_feat_size**2) * out_conv_channels
+        self.scale_factor = scale_factor
+        self.fp16_enabled = False
+
+        self.fc = nn.Linear(self.in_channels, self.out_channels)
+        self.upsample = nn.Upsample(
+            scale_factor=scale_factor, mode='bilinear', align_corners=True)
+
+    @auto_fp16()
+    def forward(self, x):
+        """Relay (N, in_C) features as an upsampled map, or None if N == 0."""
+        N, in_C = x.shape
+        if N > 0:
+            out_C = self.out_conv_channels
+            out_HW = self.roi_feat_size
+            x = self.fc(x)
+            x = x.reshape(N, out_C, out_HW, out_HW)
+            x = self.upsample(x)
+            return x
+        return None
diff --git a/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py b/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py
new file mode 100755
index 0000000..c6eaa54
--- /dev/null
+++ b/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py
@@ -0,0 +1,118 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule, auto_fp16, force_fp32
+
+from mmdet.models.builder import HEADS, build_loss
+
+
+@HEADS.register_module()
+class FusedSemanticHead(BaseModule):
+    r"""Multi-level fused semantic segmentation head.
+
+    .. code-block:: none
+
+        in_1 -> 1x1 conv ---
+                            |
+        in_2 -> 1x1 conv -- |
+                           ||
+        in_3 -> 1x1 conv - ||
+                          |||                  /-> 1x1 conv (mask prediction)
+        in_4 -> 1x1 conv -----> 3x3 convs (*4)
+                            |                  \-> 1x1 conv (feature)
+        in_5 -> 1x1 conv ---
+    """  # noqa: W605
+
+    def __init__(self,
+                 num_ins,
+                 fusion_level,
+                 num_convs=4,
+                 in_channels=256,
+                 conv_out_channels=256,
+                 num_classes=183,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 ignore_label=None,
+                 loss_weight=None,
+                 loss_seg=dict(
+                     type='CrossEntropyLoss',
+                     ignore_index=255,
+                     loss_weight=0.2),
+                 init_cfg=dict(
+                     type='Kaiming', override=dict(name='conv_logits'))):
+        super(FusedSemanticHead, self).__init__(init_cfg)
+        self.num_ins = num_ins
+        self.fusion_level = fusion_level
+        self.num_convs = num_convs
+        self.in_channels = in_channels
+        self.conv_out_channels = conv_out_channels
+        self.num_classes = num_classes
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.fp16_enabled = False
+
+        self.lateral_convs = nn.ModuleList()
+        for i in range(self.num_ins):
+            self.lateral_convs.append(
+                ConvModule(
+                    self.in_channels,
+                    self.in_channels,
+                    1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    inplace=False))
+
+        self.convs = nn.ModuleList()
+        for i in range(self.num_convs):
+            in_channels = self.in_channels if i == 0 else conv_out_channels
+            self.convs.append(
+                ConvModule(
+                    in_channels,
+                    conv_out_channels,
+                    3,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        self.conv_embedding = ConvModule(
+            conv_out_channels, conv_out_channels, 1,
+            conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg)
+        self.conv_logits = nn.Conv2d(conv_out_channels, self.num_classes, 1)
+        if ignore_label or loss_weight:
+            # copy first: ``loss_seg`` may be the shared default argument
+            loss_seg = dict(loss_seg)
+            if ignore_label:
+                loss_seg['ignore_index'] = ignore_label
+            if loss_weight:
+                loss_seg['loss_weight'] = loss_weight
+            warnings.warn('``ignore_label`` and ``loss_weight`` would be '
+                          'deprecated soon. Please set ``ignore_index`` and '
+                          '``loss_weight`` in ``loss_seg`` instead.')
+        self.criterion = build_loss(loss_seg)
+
+    @auto_fp16()
+    def forward(self, feats):
+        """Fuse all levels at the ``fusion_level`` scale and predict masks."""
+        x = self.lateral_convs[self.fusion_level](feats[self.fusion_level])
+        fused_size = tuple(x.shape[-2:])
+        for i, feat in enumerate(feats):
+            if i != self.fusion_level:
+                feat = F.interpolate(
+                    feat, size=fused_size, mode='bilinear', align_corners=True)
+                # fix runtime error of "+=" inplace operation in PyTorch 1.10
+                x = x + self.lateral_convs[i](feat)
+
+        for i in range(self.num_convs):
+            x = self.convs[i](x)
+
+        mask_pred = self.conv_logits(x)
+        x = self.conv_embedding(x)
+        return mask_pred, x
+
+    @force_fp32(apply_to=('mask_pred', ))
+    def loss(self, mask_pred, labels):
+        """Segmentation loss; dim 1 of ``labels`` is squeezed first."""
+        labels = labels.squeeze(1).long()
+        return self.criterion(mask_pred, labels)
diff --git a/mmdet/models/roi_heads/mask_heads/global_context_head.py b/mmdet/models/roi_heads/mask_heads/global_context_head.py
new file mode 100755
index 0000000..af76a17
--- /dev/null
+++ b/mmdet/models/roi_heads/mask_heads/global_context_head.py
@@ -0,0 +1,101 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule, auto_fp16, force_fp32
+
+from mmdet.models.builder import HEADS
+from mmdet.models.utils import ResLayer, SimplifiedBasicBlock
+
+
+@HEADS.register_module()
+class GlobalContextHead(BaseModule):
+    """Global context head used in `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+    Args:
+        num_convs (int, optional): number of convolutional layer in GlbCtxHead.
+            Default: 4.
+        in_channels (int, optional): number of input channels. Default: 256.
+        conv_out_channels (int, optional): number of output channels before
+            classification layer. Default: 256.
+        num_classes (int, optional): number of classes. Default: 80.
+        loss_weight (float, optional): global context loss weight. Default: 1.
+        conv_cfg (dict, optional): config to init conv layer. Default: None.
+        norm_cfg (dict, optional): config to init norm layer. Default: None.
+        conv_to_res (bool, optional): if True, 2 convs will be grouped into
+            1 `SimplifiedBasicBlock` using a skip connection. Default: False.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_convs=4,
+                 in_channels=256,
+                 conv_out_channels=256,
+                 num_classes=80,
+                 loss_weight=1.0,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 conv_to_res=False,
+                 init_cfg=dict(
+                     type='Normal', std=0.01, override=dict(name='fc'))):
+        super(GlobalContextHead, self).__init__(init_cfg)
+        self.num_convs = num_convs
+        self.in_channels = in_channels
+        self.conv_out_channels = conv_out_channels
+        self.num_classes = num_classes
+        self.loss_weight = loss_weight
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.conv_to_res = conv_to_res
+        self.fp16_enabled = False
+
+        if self.conv_to_res:
+            num_res_blocks = num_convs // 2
+            self.convs = ResLayer(
+                SimplifiedBasicBlock,
+                in_channels,
+                self.conv_out_channels,
+                num_res_blocks,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg)
+            self.num_convs = num_res_blocks  # forward() loops over this
+        else:
+            self.convs = nn.ModuleList()
+            for i in range(self.num_convs):
+                in_channels = self.in_channels if i == 0 else conv_out_channels
+                self.convs.append(
+                    ConvModule(
+                        in_channels,
+                        conv_out_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg))
+
+        self.pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Linear(conv_out_channels, num_classes)
+
+        self.criterion = nn.BCEWithLogitsLoss()
+
+    @auto_fp16()
+    def forward(self, feats):
+        """Return (class logits, pooled feature) from ``feats[-1]``."""
+        x = feats[-1]
+        for i in range(self.num_convs):
+            x = self.convs[i](x)
+        x = self.pool(x)
+
+        # multi-class prediction
+        mc_pred = x.reshape(x.size(0), -1)
+        mc_pred = self.fc(mc_pred)
+
+        return mc_pred, x
+
+    @force_fp32(apply_to=('pred', ))
+    def loss(self, pred, labels):
+        """BCE loss against multi-hot targets built from ``labels``."""
+        labels = [lbl.unique() for lbl in labels]
+        targets = pred.new_zeros(pred.size())
+        for i, label in enumerate(labels):
+            targets[i, label] = 1.0
+        loss = self.loss_weight * self.criterion(pred, targets)
+        return loss
diff --git a/mmdet/models/roi_heads/mask_heads/grid_head.py b/mmdet/models/roi_heads/mask_heads/grid_head.py
new file mode 100755
index 0000000..0c0702d
--- /dev/null
+++ b/mmdet/models/roi_heads/mask_heads/grid_head.py
@@ -0,0 +1,363 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule
+
+from mmdet.models.builder import HEADS, build_loss
+
+
+@HEADS.register_module()
+class GridHead(BaseModule):
+
+    def __init__(self,
+                 grid_points=9,
+                 num_convs=8,
+                 roi_feat_size=14,
+                 in_channels=256,
+                 conv_kernel_size=3,
+                 point_feat_channels=64,
+                 deconv_kernel_size=4,
+                 class_agnostic=False,
+                 loss_grid=dict(
+                     type='CrossEntropyLoss', use_sigmoid=True,
+                     loss_weight=15),
+                 conv_cfg=None,
+                 norm_cfg=dict(type='GN', num_groups=36),
+                 init_cfg=[
+                     dict(type='Kaiming', layer=['Conv2d', 'Linear']),
+                     dict(
+                         type='Normal',
+                         layer='ConvTranspose2d',
+                         std=0.001,
+                         override=dict(
+                             type='Normal',
+                             name='deconv2',
+                             std=0.001,
+                             bias=-np.log(0.99 / 0.01)))
+                 ]):
+        super(GridHead, self).__init__(init_cfg)
+        self.grid_points = grid_points
+        self.num_convs = num_convs
+        self.roi_feat_size = roi_feat_size
+        self.in_channels = in_channels
+        self.conv_kernel_size = conv_kernel_size
+        self.point_feat_channels = point_feat_channels
+        self.conv_out_channels = self.point_feat_channels * self.grid_points
+        self.class_agnostic = class_agnostic
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        if isinstance(norm_cfg, dict) and norm_cfg['type'] == 'GN':
+            assert self.conv_out_channels % norm_cfg['num_groups'] == 0
+
+        assert self.grid_points >= 4
+        self.grid_size = int(np.sqrt(self.grid_points))
+        if self.grid_size * self.grid_size != self.grid_points:
+            raise ValueError('grid_points must be a square number')
+
+        # the predicted heatmap is half of whole_map_size
+        if not isinstance(self.roi_feat_size, int):
+            raise ValueError('Only square RoIs are supporeted in Grid R-CNN')
+        self.whole_map_size = self.roi_feat_size * 4
+
+        # compute point-wise sub-regions
+        self.sub_regions = self.calc_sub_regions()
+        # conv tower: only the first conv uses stride 2, the rest stride 1
+        self.convs = []
+        for i in range(self.num_convs):
+            in_channels = (
+                self.in_channels if i == 0 else self.conv_out_channels)
+            stride = 2 if i == 0 else 1
+            padding = (self.conv_kernel_size - 1) // 2
+            self.convs.append(
+                ConvModule(
+                    in_channels,
+                    self.conv_out_channels,
+                    self.conv_kernel_size,
+                    stride=stride,
+                    padding=padding,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    bias=True))
+        self.convs = nn.Sequential(*self.convs)
+        # two stride-2 deconvs, grouped per grid point (groups=grid_points)
+        self.deconv1 = nn.ConvTranspose2d(
+            self.conv_out_channels,
+            self.conv_out_channels,
+            kernel_size=deconv_kernel_size,
+            stride=2,
+            padding=(deconv_kernel_size - 2) // 2,
+            groups=grid_points)
+        self.norm1 = nn.GroupNorm(grid_points, self.conv_out_channels)
+        self.deconv2 = nn.ConvTranspose2d(
+            self.conv_out_channels,
+            grid_points,
+            kernel_size=deconv_kernel_size,
+            stride=2,
+            padding=(deconv_kernel_size - 2) // 2,
+            groups=grid_points)
+
+        # find the 4-neighbor of each grid point
+        self.neighbor_points = []
+        grid_size = self.grid_size
+        for i in range(grid_size):  # i-th column
+            for j in range(grid_size):  # j-th row
+                neighbors = []
+                if i > 0:  # left: (i - 1, j)
+                    neighbors.append((i - 1) * grid_size + j)
+                if j > 0:  # up: (i, j - 1)
+                    neighbors.append(i * grid_size + j - 1)
+                if j < grid_size - 1:  # down: (i, j + 1)
+                    neighbors.append(i * grid_size + j + 1)
+                if i < grid_size - 1:  # right: (i + 1, j)
+                    neighbors.append((i + 1) * grid_size + j)
+                self.neighbor_points.append(tuple(neighbors))
+        # total neighbor links; each pair of adjacent points is counted twice
+        self.num_edges = sum([len(p) for p in self.neighbor_points])
+
+        self.forder_trans = nn.ModuleList()  # first-order feature transition
+        self.sorder_trans = nn.ModuleList()  # second-order feature transition
+        for neighbors in self.neighbor_points:
+            fo_trans = nn.ModuleList()
+            so_trans = nn.ModuleList()
+            for _ in range(len(neighbors)):
+                # each transition module consists of a 5x5 depth-wise conv and
+                # 1x1 conv.
+                fo_trans.append(
+                    nn.Sequential(
+                        nn.Conv2d(
+                            self.point_feat_channels,
+                            self.point_feat_channels,
+                            5,
+                            stride=1,
+                            padding=2,
+                            groups=self.point_feat_channels),
+                        nn.Conv2d(self.point_feat_channels,
+                                  self.point_feat_channels, 1)))
+                so_trans.append(
+                    nn.Sequential(
+                        nn.Conv2d(
+                            self.point_feat_channels,
+                            self.point_feat_channels,
+                            5,
+                            1,
+                            2,
+                            groups=self.point_feat_channels),
+                        nn.Conv2d(self.point_feat_channels,
+                                  self.point_feat_channels, 1)))
+            self.forder_trans.append(fo_trans)
+            self.sorder_trans.append(so_trans)
+
+        self.loss_grid = build_loss(loss_grid)
+
+    def forward(self, x):
+        """Predict the fused and unfused grid heatmaps from RoI features."""
+        assert x.shape[-1] == x.shape[-2] == self.roi_feat_size
+        # RoI feature transformation, downsample 2x
+        x = self.convs(x)
+        width = self.point_feat_channels
+
+        def group(k):
+            # channels of ``x`` that belong to grid point ``k``
+            return x[:, k * width:(k + 1) * width]
+
+        # first-order fusion: each point adds its transformed neighbors
+        first_order = []
+        for i, neighbors in enumerate(self.neighbor_points):
+            fused = group(i)
+            for trans, nbr in zip(self.forder_trans[i], neighbors):
+                fused = fused + trans(group(nbr))
+            first_order.append(fused)
+
+        # second-order fusion: neighbors contribute first-order features
+        second_order = []
+        for i, neighbors in enumerate(self.neighbor_points):
+            fused = group(i)
+            for trans, nbr in zip(self.sorder_trans[i], neighbors):
+                fused = fused + trans(first_order[nbr])
+            second_order.append(fused)
+
+        def to_heatmap(feat):
+            # shared decoder: deconv1 -> norm1 -> ReLU -> deconv2
+            feat = F.relu(self.norm1(self.deconv1(feat)), inplace=True)
+            return self.deconv2(feat)
+
+        # predicted heatmap with fused features
+        heatmap = to_heatmap(torch.cat(second_order, dim=1))
+        # predicted heatmap with original features (applicable during training)
+        heatmap_unfused = to_heatmap(x) if self.training else heatmap
+        return dict(fused=heatmap, unfused=heatmap_unfused)
+
+    def calc_sub_regions(self):
+        """Compute point specific representation regions.
+
+        See Grid R-CNN Plus (https://arxiv.org/abs/1906.05688) for details.
+
+        Returns:
+            list[tuple]: one ``(x1, y1, x2, y2)`` window per grid point, each
+                ``half_size`` wide and high.
+        """
+        last = self.grid_size - 1
+        # to make it consistent with the original implementation, half_size
+        # is computed as 2 * quarter_size, which is smaller
+        half_size = self.whole_map_size // 4 * 2
+
+        def window_start(idx):
+            # origin of the window along one axis for grid index ``idx``
+            if idx == 0:
+                return 0
+            if idx == last:
+                return half_size
+            ratio = idx / last - 0.25
+            return max(int(ratio * self.whole_map_size), 0)
+
+        sub_regions = []
+        for point in range(self.grid_points):
+            x_idx, y_idx = divmod(point, self.grid_size)
+            sub_x1, sub_y1 = window_start(x_idx), window_start(y_idx)
+            sub_regions.append(
+                (sub_x1, sub_y1, sub_x1 + half_size, sub_y1 + half_size))
+        return sub_regions
+
+    def get_targets(self, sampling_results, rcnn_train_cfg):
+        """Build heatmap targets for the positive RoIs of all images."""
+        pos_bboxes = torch.cat([res.pos_bboxes for res in sampling_results],
+                               dim=0).cpu()
+        pos_gt_bboxes = torch.cat(
+            [res.pos_gt_bboxes for res in sampling_results], dim=0).cpu()
+        assert pos_bboxes.shape == pos_gt_bboxes.shape
+
+        # expand pos_bboxes to 2x of original size
+        x1 = pos_bboxes[:, 0] - (pos_bboxes[:, 2] - pos_bboxes[:, 0]) / 2
+        y1 = pos_bboxes[:, 1] - (pos_bboxes[:, 3] - pos_bboxes[:, 1]) / 2
+        x2 = pos_bboxes[:, 2] + (pos_bboxes[:, 2] - pos_bboxes[:, 0]) / 2
+        y2 = pos_bboxes[:, 3] + (pos_bboxes[:, 3] - pos_bboxes[:, 1]) / 2
+        pos_bboxes = torch.stack([x1, y1, x2, y2], dim=-1)
+        pos_bbox_ws = (pos_bboxes[:, 2] - pos_bboxes[:, 0]).unsqueeze(-1)
+        pos_bbox_hs = (pos_bboxes[:, 3] - pos_bboxes[:, 1]).unsqueeze(-1)
+
+        num_rois = pos_bboxes.shape[0]
+        map_size = self.whole_map_size
+        # this is not the final target shape
+        targets = torch.zeros((num_rois, self.grid_points, map_size, map_size),
+                              dtype=torch.float)
+
+        # pre-compute interpolation factors for all grid points.
+        # the first item is the factor of x-dim, and the second is y-dim.
+        # for a 9-point grid, factors are like (1, 0), (0.5, 0.5), (0, 1)
+        factors = []
+        for j in range(self.grid_points):
+            x_idx = j // self.grid_size
+            y_idx = j % self.grid_size
+            factors.append((1 - x_idx / (self.grid_size - 1),
+                            1 - y_idx / (self.grid_size - 1)))
+
+        radius = rcnn_train_cfg.pos_radius
+        radius2 = radius**2
+        for i in range(num_rois):
+            # ignore small bboxes (their targets stay all zero)
+            if (pos_bbox_ws[i] <= self.grid_size
+                    or pos_bbox_hs[i] <= self.grid_size):
+                continue
+            # for each grid point, mark a small circle as positive
+            for j in range(self.grid_points):
+                factor_x, factor_y = factors[j]
+                gridpoint_x = factor_x * pos_gt_bboxes[i, 0] + (
+                    1 - factor_x) * pos_gt_bboxes[i, 2]
+                gridpoint_y = factor_y * pos_gt_bboxes[i, 1] + (
+                    1 - factor_y) * pos_gt_bboxes[i, 3]
+
+                cx = int((gridpoint_x - pos_bboxes[i, 0]) / pos_bbox_ws[i] *
+                         map_size)
+                cy = int((gridpoint_y - pos_bboxes[i, 1]) / pos_bbox_hs[i] *
+                         map_size)
+
+                for x in range(cx - radius, cx + radius + 1):
+                    for y in range(cy - radius, cy + radius + 1):
+                        if x >= 0 and x < map_size and y >= 0 and y < map_size:
+                            if (x - cx)**2 + (y - cy)**2 <= radius2:
+                                targets[i, j, y, x] = 1
+        # reduce the target heatmap size by a half
+        # proposed in Grid R-CNN Plus (https://arxiv.org/abs/1906.05688).
+        sub_targets = []
+        for i in range(self.grid_points):
+            sub_x1, sub_y1, sub_x2, sub_y2 = self.sub_regions[i]
+            sub_targets.append(targets[:, [i], sub_y1:sub_y2, sub_x1:sub_x2])
+        sub_targets = torch.cat(sub_targets, dim=1)
+        sub_targets = sub_targets.to(sampling_results[0].pos_bboxes.device)
+        return sub_targets
+
+    def loss(self, grid_pred, grid_targets):
+        """Return the summed grid loss of the fused and unfused heatmaps."""
+        fused, unfused = grid_pred['fused'], grid_pred['unfused']
+        losses = [self.loss_grid(p, grid_targets) for p in (fused, unfused)]
+        return dict(loss_grid=losses[0] + losses[1])
+
+    def get_bboxes(self, det_bboxes, grid_pred, img_metas):
+        """Decode grid heatmaps into (R, 5) boxes ``[x1, y1, x2, y2, score]``.
+
+        Boxes are returned on CPU, clamped to ``img_metas[0]['img_shape']``.
+        """
+        # TODO: refactoring
+        assert det_bboxes.shape[0] == grid_pred.shape[0]
+        det_bboxes = det_bboxes.cpu()
+        cls_scores = det_bboxes[:, [4]]
+        det_bboxes = det_bboxes[:, :4]
+        grid_pred = grid_pred.sigmoid().cpu()
+
+        R, c, h, w = grid_pred.shape
+        half_size = self.whole_map_size // 4 * 2
+        assert h == w == half_size
+        assert c == self.grid_points
+
+        # find the point with max scores in the half-sized heatmap
+        grid_pred = grid_pred.view(R * c, h * w)
+        pred_scores, pred_position = grid_pred.max(dim=1)
+        xs = pred_position % w
+        ys = pred_position // w
+
+        # get the position in the whole heatmap instead of half-sized heatmap
+        for i in range(self.grid_points):
+            xs[i::self.grid_points] += self.sub_regions[i][0]
+            ys[i::self.grid_points] += self.sub_regions[i][1]
+
+        # reshape to (num_rois, grid_points)
+        pred_scores, xs, ys = tuple(
+            map(lambda x: x.view(R, c), [pred_scores, xs, ys]))
+
+        # get expanded pos_bboxes
+        widths = (det_bboxes[:, 2] - det_bboxes[:, 0]).unsqueeze(-1)
+        heights = (det_bboxes[:, 3] - det_bboxes[:, 1]).unsqueeze(-1)
+        x1 = (det_bboxes[:, 0, None] - widths / 2)
+        y1 = (det_bboxes[:, 1, None] - heights / 2)
+        # map the grid point to the absolute coordinates
+        abs_xs = (xs.float() + 0.5) / w * widths + x1
+        abs_ys = (ys.float() + 0.5) / h * heights + y1
+
+        # get the grid points indices that fall on the bbox boundaries
+        x1_inds = [i for i in range(self.grid_size)]
+        y1_inds = [i * self.grid_size for i in range(self.grid_size)]
+        x2_inds = [self.grid_points - self.grid_size + i
+                   for i in range(self.grid_size)]
+        y2_inds = [(i + 1) * self.grid_size - 1 for i in range(self.grid_size)]
+
+        # voting of all grid points on some boundary
+        def vote(coords, inds):
+            # score-weighted mean of the grid points lying on one box edge
+            scores = pred_scores[:, inds]
+            return (coords[:, inds] * scores).sum(
+                dim=1, keepdim=True) / scores.sum(dim=1, keepdim=True)
+        bboxes_x1 = vote(abs_xs, x1_inds)
+        bboxes_y1 = vote(abs_ys, y1_inds)
+        bboxes_x2 = vote(abs_xs, x2_inds)
+        bboxes_y2 = vote(abs_ys, y2_inds)
+
+        bbox_res = torch.cat(
+            [bboxes_x1, bboxes_y1, bboxes_x2, bboxes_y2, cls_scores], dim=1)
+        # strided slices are views; list indexing would clamp a copy instead
+        bbox_res[:, 0:4:2].clamp_(min=0, max=img_metas[0]['img_shape'][1])
+        bbox_res[:, 1:4:2].clamp_(min=0, max=img_metas[0]['img_shape'][0])
+
+        return bbox_res
diff --git a/mmdet/models/roi_heads/mask_heads/htc_mask_head.py b/mmdet/models/roi_heads/mask_heads/htc_mask_head.py
new file mode 100755
index 0000000..7ad8592
--- /dev/null
+++ b/mmdet/models/roi_heads/mask_heads/htc_mask_head.py
@@ -0,0 +1,39 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.cnn import ConvModule
+
+from mmdet.models.builder import HEADS
+from .fcn_mask_head import FCNMaskHead
+
+
+@HEADS.register_module()
+class HTCMaskHead(FCNMaskHead):
+
+    def __init__(self, with_conv_res=True, *args, **kwargs):
+        super(HTCMaskHead, self).__init__(*args, **kwargs)
+        self.with_conv_res = with_conv_res
+        if self.with_conv_res:
+            self.conv_res = ConvModule(
+                self.conv_out_channels,
+                self.conv_out_channels,
+                1,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg)
+
+    def forward(self, x, res_feat=None, return_logits=True, return_feat=True):
+        if res_feat is not None:
+            assert self.with_conv_res
+            res_feat = self.conv_res(res_feat)
+            x = x + res_feat
+        for conv in self.convs:
+            x = conv(x)
+        res_feat = x
+        outs = []
+        if return_logits:
+            x = self.upsample(x)
+            if self.upsample_method == 'deconv':
+                x = self.relu(x)
+            mask_pred = self.conv_logits(x)
+            outs.append(mask_pred)
+        if return_feat:
+            outs.append(res_feat)
+        return outs if len(outs) > 1 else outs[0]
diff --git a/mmdet/models/roi_heads/mask_heads/mask_point_head.py b/mmdet/models/roi_heads/mask_heads/mask_point_head.py
new file mode 100755
index 0000000..c77c46d
--- /dev/null
+++ b/mmdet/models/roi_heads/mask_heads/mask_point_head.py
@@ -0,0 +1,253 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend/point_head/point_head.py  # noqa
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.ops import point_sample, rel_roi_point_to_rel_img_point
+from mmcv.runner import BaseModule
+
+from mmdet.models.builder import HEADS, build_loss
+from mmdet.models.utils import (get_uncertain_point_coords_with_randomness,
+                                get_uncertainty)
+
+
+@HEADS.register_module()
+class MaskPointHead(BaseModule):
+    """A mask point head used in PointRend.
+
+    ``MaskPointHead`` uses a shared multi-layer perceptron (equivalent to
+    nn.Conv1d) to predict the logit of input points. The fine-grained feature
+    and coarse feature are concatenated together for prediction.
+
+    Args:
+        num_classes (int): Number of classes for logits (required).
+        num_fcs (int): Number of fc layers in the head. Default: 3.
+        in_channels (int): Number of input channels. Default: 256.
+        fc_channels (int): Number of fc channels. Default: 256.
+        class_agnostic (bool): Whether use class agnostic classification.
+            If so, the output channels of logits will be 1. Default: False.
+        coarse_pred_each_layer (bool): Whether concatenate coarse feature with
+            the output of each fc layer. Default: True.
+        conv_cfg (dict | None): Config of the conv layers.
+            Default: dict(type='Conv1d').
+        norm_cfg (dict | None): Config of the norm layers. Default: None.
+        act_cfg (dict): Config of the activation used by each fc layer.
+            Default: dict(type='ReLU').
+        loss_point (dict): Config of the point loss. Default:
+            dict(type='CrossEntropyLoss', use_mask=True, loss_weight=1.0).
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 num_fcs=3,
+                 in_channels=256,
+                 fc_channels=256,
+                 class_agnostic=False,
+                 coarse_pred_each_layer=True,
+                 conv_cfg=dict(type='Conv1d'),
+                 norm_cfg=None,
+                 act_cfg=dict(type='ReLU'),
+                 loss_point=dict(
+                     type='CrossEntropyLoss', use_mask=True, loss_weight=1.0),
+                 init_cfg=dict(
+                     type='Normal', std=0.001,
+                     override=dict(name='fc_logits'))):
+        super().__init__(init_cfg)
+        self.num_fcs = num_fcs
+        self.in_channels = in_channels
+        self.fc_channels = fc_channels
+        self.num_classes = num_classes
+        self.class_agnostic = class_agnostic
+        self.coarse_pred_each_layer = coarse_pred_each_layer
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.loss_point = build_loss(loss_point)
+
+        fc_in_channels = in_channels + num_classes  # fine + coarse channels
+        self.fcs = nn.ModuleList()
+        for _ in range(num_fcs):
+            fc = ConvModule(
+                fc_in_channels,
+                fc_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+            self.fcs.append(fc)
+            fc_in_channels = fc_channels
+            fc_in_channels += num_classes if self.coarse_pred_each_layer else 0
+
+        out_channels = 1 if self.class_agnostic else self.num_classes
+        self.fc_logits = nn.Conv1d(
+            fc_in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, fine_grained_feats, coarse_feats):
+        """Classify each point based on fine grained and coarse feats.
+
+        Args:
+            fine_grained_feats (Tensor): Fine grained feature sampled from FPN,
+                shape (num_rois, in_channels, num_points).
+            coarse_feats (Tensor): Coarse feature sampled from CoarseMaskHead,
+                shape (num_rois, num_classes, num_points).
+
+        Returns:
+            Tensor: Point classification results, shape (num_rois, C,
+                num_points) where C is 1 if class agnostic else num_classes.
+        """
+        # stack fine-grained and coarse features along the channel axis
+        x = torch.cat([fine_grained_feats, coarse_feats], dim=1)
+        for fc in self.fcs:
+            x = fc(x)
+            if self.coarse_pred_each_layer:  # re-append the coarse feature
+                x = torch.cat((x, coarse_feats), dim=1)
+        return self.fc_logits(x)
+
+    def get_targets(self, rois, rel_roi_points, sampling_results, gt_masks,
+                    cfg):
+        """Get training targets of MaskPointHead for all images.
+
+        Args:
+            rois (Tensor): Region of Interest, shape (num_rois, 5).
+            rel_roi_points: Points coordinates relative to RoI, shape
+                (num_rois, num_points, 2).
+            sampling_results (list[:obj:`SamplingResult`]): Per-image
+                sampling results after sampling and assignment.
+            gt_masks (list): Per-image ground truth masks; each item must
+                provide ``to_tensor(dtype, device)``.
+            cfg (dict): Training cfg; ``cfg.num_points`` is read.
+
+        Returns:
+            Tensor: Point target, shape (num_rois, num_points).
+        """
+
+        num_imgs = len(sampling_results)
+        rois_list = []
+        rel_roi_points_list = []
+        for batch_ind in range(num_imgs):
+            inds = (rois[:, 0] == batch_ind)  # column 0 is the image index
+            rois_list.append(rois[inds])
+            rel_roi_points_list.append(rel_roi_points[inds])
+        pos_assigned_gt_inds_list = [
+            res.pos_assigned_gt_inds for res in sampling_results
+        ]
+        cfg_list = [cfg for _ in range(num_imgs)]
+        # one target tensor per image, each computed independently
+        point_targets = map(self._get_target_single, rois_list,
+                            rel_roi_points_list, pos_assigned_gt_inds_list,
+                            gt_masks, cfg_list)
+        point_targets = list(point_targets)
+
+        if len(point_targets) > 0:  # with no images the empty list is kept
+            point_targets = torch.cat(point_targets)
+
+        return point_targets
+
+    def _get_target_single(self, rois, rel_roi_points, pos_assigned_gt_inds,
+                           gt_masks, cfg):
+        """Sample the assigned gt masks of one image at the RoI points."""
+        num_pos = rois.size(0)
+        num_points = cfg.num_points
+        if num_pos > 0:
+            gt_masks_th = (
+                gt_masks.to_tensor(rois.dtype, rois.device).index_select(
+                    0, pos_assigned_gt_inds))
+            gt_masks_th = gt_masks_th.unsqueeze(1)  # add a channel dim
+            rel_img_points = rel_roi_point_to_rel_img_point(
+                rois, rel_roi_points, gt_masks_th)
+            point_targets = point_sample(gt_masks_th,
+                                         rel_img_points).squeeze(1)
+        else:
+            point_targets = rois.new_zeros((0, num_points))  # no RoIs
+        return point_targets
+
+    def loss(self, point_pred, point_targets, labels):
+        """Calculate loss for MaskPointHead.
+
+        Args:
+            point_pred (Tensor): Point prediction result, shape
+                (num_rois, num_classes, num_points).
+            point_targets (Tensor): Point targets, shape (num_roi, num_points).
+            labels (Tensor): Class label of corresponding boxes,
+                shape (num_rois, )
+
+        Returns:
+            dict[str, Tensor]: a dictionary of point loss components
+        """
+
+        loss = dict()
+        if self.class_agnostic:  # labels zeroed; presumably picks channel 0
+            loss_point = self.loss_point(point_pred, point_targets,
+                                         torch.zeros_like(labels))
+        else:
+            loss_point = self.loss_point(point_pred, point_targets, labels)
+        loss['loss_point'] = loss_point
+        return loss
+
+    def get_roi_rel_points_train(self, mask_pred, labels, cfg):
+        """Get ``num_points`` most uncertain points with random points during
+        train.
+
+        Sample points in [0, 1] x [0, 1] coordinate space based on their
+        uncertainty. The sampling itself is delegated to
+        ``get_uncertain_point_coords_with_randomness``, driven by
+        ``cfg.num_points``, ``cfg.oversample_ratio`` and
+        ``cfg.importance_sample_ratio``.
+        Args:
+            mask_pred (Tensor): A tensor of shape (num_rois, num_classes,
+                mask_height, mask_width) for class-specific or class-agnostic
+                prediction.
+            labels (list): The ground truth class for each instance.
+            cfg (dict): Training config of point head.
+
+        Returns:
+            point_coords (Tensor): A tensor of shape (num_rois, num_points, 2)
+                that contains the coordinates of the sampled points.
+        """
+        point_coords = get_uncertain_point_coords_with_randomness(
+            mask_pred, labels, cfg.num_points, cfg.oversample_ratio,
+            cfg.importance_sample_ratio)
+        return point_coords
+
+    def get_roi_rel_points_test(self, mask_pred, pred_label, cfg):
+        """Get ``num_points`` most uncertain points during test.
+
+        Args:
+            mask_pred (Tensor): A tensor of shape (num_rois, num_classes,
+                mask_height, mask_width) for class-specific or class-agnostic
+                prediction.
+            pred_label (list): The predicted class for each instance.
+            cfg (dict): Testing config; ``subdivision_num_points`` is read.
+
+        Returns:
+            point_indices (Tensor): A tensor of shape (num_rois, num_points)
+                that contains indices from [0, mask_height x mask_width) of the
+                most uncertain points (num_points is capped at the map size).
+            point_coords (Tensor): A tensor of shape (num_rois, num_points, 2)
+                that contains [0, 1] x [0, 1] normalized (x, y) coordinates of
+                the most uncertain points, taken at grid cell centers.
+        """
+        num_points = cfg.subdivision_num_points
+        uncertainty_map = get_uncertainty(mask_pred, pred_label)
+        num_rois, _, mask_height, mask_width = uncertainty_map.shape
+
+        # During ONNX exporting, the type of each element of 'shape' is
+        # `Tensor(float)`, while it is `float` during PyTorch inference.
+        if isinstance(mask_height, torch.Tensor):
+            h_step = 1.0 / mask_height.float()
+            w_step = 1.0 / mask_width.float()
+        else:
+            h_step = 1.0 / mask_height
+            w_step = 1.0 / mask_width
+        # cast to int to avoid dynamic K for TopK op in ONNX
+        mask_size = int(mask_height * mask_width)
+        uncertainty_map = uncertainty_map.view(num_rois, mask_size)
+        num_points = min(mask_size, num_points)  # k cannot exceed map size
+        point_indices = uncertainty_map.topk(num_points, dim=1)[1]
+        xs = w_step / 2.0 + (point_indices % mask_width).float() * w_step
+        ys = h_step / 2.0 + (point_indices // mask_width).float() * h_step
+        point_coords = torch.stack([xs, ys], dim=2)
+        return point_indices, point_coords
diff --git a/mmdet/models/roi_heads/mask_heads/maskiou_head.py b/mmdet/models/roi_heads/mask_heads/maskiou_head.py
new file mode 100755
index 0000000..a7ff7c7
--- /dev/null
+++ b/mmdet/models/roi_heads/mask_heads/maskiou_head.py
@@ -0,0 +1,183 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import Conv2d, Linear, MaxPool2d
+from mmcv.runner import BaseModule, force_fp32
+from torch.nn.modules.utils import _pair
+
+from mmdet.models.builder import HEADS, build_loss
+
+
+@HEADS.register_module()
+class MaskIoUHead(BaseModule):
+    """Mask IoU Head.
+
+    This head predicts the IoU of predicted masks and corresponding gt masks.
+    """
+
+    def __init__(self,
+                 num_convs=4,
+                 num_fcs=2,
+                 roi_feat_size=14,
+                 in_channels=256,
+                 conv_out_channels=256,
+                 fc_out_channels=1024,
+                 num_classes=80,
+                 loss_iou=dict(type='MSELoss', loss_weight=0.5),
+                 init_cfg=[
+                     dict(type='Kaiming', override=dict(name='convs')),
+                     dict(type='Caffe2Xavier', override=dict(name='fcs')),
+                     dict(
+                         type='Normal',
+                         std=0.01,
+                         override=dict(name='fc_mask_iou'))
+                 ]):
+        super(MaskIoUHead, self).__init__(init_cfg)
+        self.in_channels = in_channels
+        self.conv_out_channels = conv_out_channels
+        self.fc_out_channels = fc_out_channels
+        self.num_classes = num_classes
+        self.fp16_enabled = False
+
+        self.convs = nn.ModuleList()
+        for i in range(num_convs):
+            if i == 0:
+                # concatenation of mask feature and mask prediction
+                in_channels = self.in_channels + 1
+            else:
+                in_channels = self.conv_out_channels
+            stride = 2 if i == num_convs - 1 else 1  # last conv downsamples
+            self.convs.append(
+                Conv2d(
+                    in_channels,
+                    self.conv_out_channels,
+                    3,
+                    stride=stride,
+                    padding=1))
+        # the stride-2 conv halves each spatial side before the fc layers
+        roi_feat_size = _pair(roi_feat_size)
+        pooled_area = (roi_feat_size[0] // 2) * (roi_feat_size[1] // 2)
+        self.fcs = nn.ModuleList()
+        for i in range(num_fcs):
+            in_channels = (
+                self.conv_out_channels *
+                pooled_area if i == 0 else self.fc_out_channels)
+            self.fcs.append(Linear(in_channels, self.fc_out_channels))
+        # final layer emits one IoU estimate per class
+        self.fc_mask_iou = Linear(self.fc_out_channels, self.num_classes)
+        self.relu = nn.ReLU()
+        self.max_pool = MaxPool2d(2, 2)
+        self.loss_iou = build_loss(loss_iou)
+
+    def forward(self, mask_feat, mask_pred):  # returns per-class IoU preds
+        mask_pred = mask_pred.sigmoid()  # logits -> probabilities
+        mask_pred_pooled = self.max_pool(mask_pred.unsqueeze(1))
+        # the 2x2-pooled mask is appended to mask_feat as one extra channel
+        x = torch.cat((mask_feat, mask_pred_pooled), 1)
+
+        for conv in self.convs:
+            x = self.relu(conv(x))
+        x = x.flatten(1)  # keep dim 0, flatten the rest for the fc layers
+        for fc in self.fcs:
+            x = self.relu(fc(x))
+        mask_iou = self.fc_mask_iou(x)
+        return mask_iou
+
+    @force_fp32(apply_to=('mask_iou_pred', ))
+    def loss(self, mask_iou_pred, mask_iou_targets):
+        pos_inds = mask_iou_targets > 0  # only targets with IoU > 0 count
+        if pos_inds.sum() > 0:
+            loss_mask_iou = self.loss_iou(mask_iou_pred[pos_inds],
+                                          mask_iou_targets[pos_inds])
+        else:
+            loss_mask_iou = mask_iou_pred.sum() * 0  # zero, still tied to pred
+        return dict(loss_mask_iou=loss_mask_iou)
+
+    @force_fp32(apply_to=('mask_pred', ))
+    def get_targets(self, sampling_results, gt_masks, mask_pred, mask_targets,
+                    rcnn_train_cfg):
+        """Compute target of mask IoU.
+
+        Mask IoU target is the IoU of the predicted mask (inside a bbox) and
+        the gt mask of the corresponding instance (the whole instance).
+        The intersection area is computed inside the bbox, and the gt mask area
+        is computed with two steps, firstly we compute the gt area inside the
+        bbox, then divide it by the area ratio of gt area inside the bbox and
+        the gt area of the whole instance.
+
+        Args:
+            sampling_results (list[:obj:`SamplingResult`]): sampling results.
+            gt_masks (BitmapMask | PolygonMask): Gt masks (the whole instance)
+                of each image, with the same shape of the input image.
+            mask_pred (Tensor): Predicted masks of each positive proposal,
+                shape (num_pos, h, w).
+            mask_targets (Tensor): Gt mask of each positive proposal,
+                binary map of the shape (num_pos, h, w).
+            rcnn_train_cfg (dict): Training config for R-CNN part;
+                ``mask_thr_binary`` is read to binarize ``mask_pred``.
+        Returns:
+            Tensor: mask iou target (length == num positive).
+        """
+        pos_proposals = [res.pos_bboxes for res in sampling_results]
+        pos_assigned_gt_inds = [
+            res.pos_assigned_gt_inds for res in sampling_results
+        ]
+
+        # compute the area ratio of gt areas inside the proposals and
+        # the whole instance
+        area_ratios = map(self._get_area_ratio, pos_proposals,
+                          pos_assigned_gt_inds, gt_masks)
+        area_ratios = torch.cat(list(area_ratios))
+        assert mask_targets.size(0) == area_ratios.size(0)
+
+        mask_pred = (mask_pred > rcnn_train_cfg.mask_thr_binary).float()
+        mask_pred_areas = mask_pred.sum((-1, -2))
+
+        # mask_pred and mask_targets are binary maps
+        overlap_areas = (mask_pred * mask_targets).sum((-1, -2))
+
+        # compute the mask area of the whole instance
+        gt_full_areas = mask_targets.sum((-1, -2)) / (area_ratios + 1e-7)
+        # IoU = intersection / union, with union = pred + gt - intersection
+        mask_iou_targets = overlap_areas / (
+            mask_pred_areas + gt_full_areas - overlap_areas)
+        return mask_iou_targets
+
+    def _get_area_ratio(self, pos_proposals, pos_assigned_gt_inds, gt_masks):
+        """Compute area ratio of the gt mask inside the proposal and the gt
+        mask of the corresponding instance."""
+        num_pos = pos_proposals.size(0)
+        if num_pos > 0:
+            area_ratios = []
+            proposals_np = pos_proposals.cpu().numpy()
+            pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy()
+            # compute mask areas of gt instances (batch processing for speedup)
+            gt_instance_mask_area = gt_masks.areas
+            for i in range(num_pos):
+                gt_mask = gt_masks[pos_assigned_gt_inds[i]]
+
+                # crop the gt mask inside the proposal
+                bbox = proposals_np[i, :].astype(np.int32)  # integer coords
+                gt_mask_in_proposal = gt_mask.crop(bbox)
+                # 1e-7 guards against division by a zero instance area
+                ratio = gt_mask_in_proposal.areas[0] / (
+                    gt_instance_mask_area[pos_assigned_gt_inds[i]] + 1e-7)
+                area_ratios.append(ratio)
+            area_ratios = torch.from_numpy(np.stack(area_ratios)).float().to(
+                pos_proposals.device)
+        else:
+            area_ratios = pos_proposals.new_zeros((0, ))  # no positives
+        return area_ratios
+
+    @force_fp32(apply_to=('mask_iou_pred', ))
+    def get_mask_scores(self, mask_iou_pred, det_bboxes, det_labels):
+        """Get per-class mask scores, mask_score = bbox_score * mask_iou.
+
+        Returns a list with one numpy array per class (``num_classes`` items).
+        """
+        inds = range(det_labels.size(0))
+        mask_scores = mask_iou_pred[inds, det_labels] * det_bboxes[inds, -1]
+        mask_scores = mask_scores.cpu().numpy()
+        det_labels = det_labels.cpu().numpy()
+        return [mask_scores[det_labels == i] for i in range(self.num_classes)]
diff --git a/mmdet/models/roi_heads/mask_heads/scnet_mask_head.py b/mmdet/models/roi_heads/mask_heads/scnet_mask_head.py
new file mode 100755
index 0000000..ca62486
--- /dev/null
+++ b/mmdet/models/roi_heads/mask_heads/scnet_mask_head.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.models.builder import HEADS
+from mmdet.models.utils import ResLayer, SimplifiedBasicBlock
+from .fcn_mask_head import FCNMaskHead
+
+
+@HEADS.register_module()
+class SCNetMaskHead(FCNMaskHead):
+    """Mask head for `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+    Args:
+        conv_to_res (bool, optional): if True, change the conv layers to
+            ``SimplifiedBasicBlock``.
+    """
+
+    def __init__(self, conv_to_res=True, **kwargs):
+        super(SCNetMaskHead, self).__init__(**kwargs)
+        self.conv_to_res = conv_to_res
+        if conv_to_res:
+            assert self.conv_kernel_size == 3
+            self.num_res_blocks = self.num_convs // 2
+            self.convs = ResLayer(
+                SimplifiedBasicBlock,
+                self.in_channels,
+                self.conv_out_channels,
+                self.num_res_blocks,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg)
diff --git a/mmdet/models/roi_heads/mask_heads/scnet_semantic_head.py b/mmdet/models/roi_heads/mask_heads/scnet_semantic_head.py
new file mode 100755
index 0000000..2b8c5c3
--- /dev/null
+++ b/mmdet/models/roi_heads/mask_heads/scnet_semantic_head.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.models.builder import HEADS
+from mmdet.models.utils import ResLayer, SimplifiedBasicBlock
+from .fused_semantic_head import FusedSemanticHead
+
+
+@HEADS.register_module()
+class SCNetSemanticHead(FusedSemanticHead):
+    """Mask head for `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+    Args:
+        conv_to_res (bool, optional): if True, change the conv layers to
+            ``SimplifiedBasicBlock``.
+    """
+
+    def __init__(self, conv_to_res=True, **kwargs):
+        super(SCNetSemanticHead, self).__init__(**kwargs)
+        self.conv_to_res = conv_to_res
+        if self.conv_to_res:
+            num_res_blocks = self.num_convs // 2
+            self.convs = ResLayer(
+                SimplifiedBasicBlock,
+                self.in_channels,
+                self.conv_out_channels,
+                num_res_blocks,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg)
+            self.num_convs = num_res_blocks
diff --git a/mmdet/models/roi_heads/mask_scoring_roi_head.py b/mmdet/models/roi_heads/mask_scoring_roi_head.py
new file mode 100755
index 0000000..4617988
--- /dev/null
+++ b/mmdet/models/roi_heads/mask_scoring_roi_head.py
@@ -0,0 +1,113 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet.core import bbox2roi
+from ..builder import HEADS, build_head
+from .standard_roi_head import StandardRoIHead
+
+
+@HEADS.register_module()
+class MaskScoringRoIHead(StandardRoIHead):
+    """Mask Scoring RoIHead for Mask Scoring RCNN.
+
+    https://arxiv.org/abs/1903.00241
+    """
+
+    def __init__(self, mask_iou_head, **kwargs):
+        assert mask_iou_head is not None  # the IoU head config is mandatory
+        super(MaskScoringRoIHead, self).__init__(**kwargs)
+        self.mask_iou_head = build_head(mask_iou_head)
+
+    def _mask_forward_train(self, x, sampling_results, bbox_feats, gt_masks,
+                            img_metas):
+        """Run the parent mask training step, then add the mask IoU loss to
+        its ``loss_mask`` dict."""
+        pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
+        mask_results = super(MaskScoringRoIHead,
+                             self)._mask_forward_train(x, sampling_results,
+                                                       bbox_feats, gt_masks,
+                                                       img_metas)
+        if mask_results['loss_mask'] is None:  # nothing to score
+            return mask_results
+
+        # mask iou head forward and loss, using each RoI's gt-class mask
+        pos_mask_pred = mask_results['mask_pred'][
+            range(mask_results['mask_pred'].size(0)), pos_labels]
+        mask_iou_pred = self.mask_iou_head(mask_results['mask_feats'],
+                                           pos_mask_pred)
+        pos_mask_iou_pred = mask_iou_pred[range(mask_iou_pred.size(0)),
+                                          pos_labels]
+
+        mask_iou_targets = self.mask_iou_head.get_targets(
+            sampling_results, gt_masks, pos_mask_pred,
+            mask_results['mask_targets'], self.train_cfg)
+        loss_mask_iou = self.mask_iou_head.loss(pos_mask_iou_pred,
+                                                mask_iou_targets)
+        mask_results['loss_mask'].update(loss_mask_iou)
+        return mask_results
+
+    def simple_test_mask(self,
+                         x,
+                         img_metas,
+                         det_bboxes,
+                         det_labels,
+                         rescale=False):
+        """Predict masks and mask scores per image, without augmentation."""
+        # image shapes of images in the batch
+        ori_shapes = tuple(meta['ori_shape'] for meta in img_metas)
+        scale_factors = tuple(meta['scale_factor'] for meta in img_metas)
+
+        num_imgs = len(det_bboxes)
+        if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes):
+            num_classes = self.mask_head.num_classes
+            segm_results = [[[] for _ in range(num_classes)]
+                            for _ in range(num_imgs)]
+            mask_scores = [[[] for _ in range(num_classes)]
+                           for _ in range(num_imgs)]
+        else:
+            # if det_bboxes is rescaled to the original image size, we need to
+            # rescale it back to the testing scale to obtain RoIs.
+            if rescale and not isinstance(scale_factors[0], float):
+                scale_factors = [
+                    torch.from_numpy(scale_factor).to(det_bboxes[0].device)
+                    for scale_factor in scale_factors
+                ]
+            _bboxes = [
+                det_bboxes[i][:, :4] *
+                scale_factors[i] if rescale else det_bboxes[i]
+                for i in range(num_imgs)
+            ]
+            mask_rois = bbox2roi(_bboxes)
+            mask_results = self._mask_forward(x, mask_rois)
+            concat_det_labels = torch.cat(det_labels)
+            # get mask scores with mask iou head
+            mask_feats = mask_results['mask_feats']
+            mask_pred = mask_results['mask_pred']
+            mask_iou_pred = self.mask_iou_head(
+                mask_feats, mask_pred[range(concat_det_labels.size(0)),
+                                      concat_det_labels])
+            # split batch mask prediction back to each image
+            num_bboxes_per_img = tuple(len(_bbox) for _bbox in _bboxes)
+            mask_preds = mask_pred.split(num_bboxes_per_img, 0)
+            mask_iou_preds = mask_iou_pred.split(num_bboxes_per_img, 0)
+
+            # apply mask post-processing to each image individually
+            segm_results = []
+            mask_scores = []
+            for i in range(num_imgs):
+                if det_bboxes[i].shape[0] == 0:  # image without detections
+                    segm_results.append(
+                        [[] for _ in range(self.mask_head.num_classes)])
+                    mask_scores.append(
+                        [[] for _ in range(self.mask_head.num_classes)])
+                else:
+                    segm_result = self.mask_head.get_seg_masks(
+                        mask_preds[i], _bboxes[i], det_labels[i],
+                        self.test_cfg, ori_shapes[i], scale_factors[i],
+                        rescale)
+                    # get mask scores with mask iou head
+                    mask_score = self.mask_iou_head.get_mask_scores(
+                        mask_iou_preds[i], det_bboxes[i], det_labels[i])
+                    segm_results.append(segm_result)
+                    mask_scores.append(mask_score)
+        return list(zip(segm_results, mask_scores))  # (segm, scores) per img
diff --git a/mmdet/models/roi_heads/pisa_roi_head.py b/mmdet/models/roi_heads/pisa_roi_head.py
new file mode 100755
index 0000000..92a5118
--- /dev/null
+++ b/mmdet/models/roi_heads/pisa_roi_head.py
@@ -0,0 +1,160 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.core import bbox2roi
+from ..builder import HEADS
+from ..losses.pisa_loss import carl_loss, isr_p
+from .standard_roi_head import StandardRoIHead
+
+
+@HEADS.register_module()
+class PISARoIHead(StandardRoIHead):
+    r"""The RoI head for `Prime Sample Attention in Object Detection
+    <https://arxiv.org/abs/1904.04821>`_."""
+
+    def forward_train(self,
+                      x,
+                      img_metas,
+                      proposal_list,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None,
+                      gt_masks=None):
+        """Forward function for training.
+
+        Args:
+            x (list[Tensor]): List of multi-level img features.
+            img_metas (list[dict]): List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                `mmdet/datasets/pipelines/formatting.py:Collect`.
+            proposal_list (list[Tensor]): Region proposals, one per image.
+            gt_bboxes (list[Tensor]): Each item holds the ground-truth boxes
+                of one image in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box.
+            gt_bboxes_ignore (list[Tensor], optional): Specify which bounding
+                boxes can be ignored when computing the loss.
+            gt_masks (None | Tensor): True segmentation masks for each box,
+                used if the architecture supports a segmentation task.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        # assign gts and sample proposals
+        if self.with_bbox or self.with_mask:
+            num_imgs = len(img_metas)
+            if gt_bboxes_ignore is None:
+                gt_bboxes_ignore = [None for _ in range(num_imgs)]
+            sampling_results = []
+            neg_label_weights = []
+            for i in range(num_imgs):
+                assign_result = self.bbox_assigner.assign(
+                    proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i],
+                    gt_labels[i])
+                sampling_result = self.bbox_sampler.sample(
+                    assign_result,
+                    proposal_list[i],
+                    gt_bboxes[i],
+                    gt_labels[i],
+                    feats=[lvl_feat[i][None] for lvl_feat in x])
+                # ISR-N samplers return (sampling_result, neg_label_weight)
+                neg_label_weight = None
+                if isinstance(sampling_result, tuple):
+                    sampling_result, neg_label_weight = sampling_result
+                sampling_results.append(sampling_result)
+                neg_label_weights.append(neg_label_weight)
+
+        losses = dict()
+        # bbox head forward and loss
+        if self.with_bbox:
+            bbox_results = self._bbox_forward_train(
+                x,
+                sampling_results,
+                gt_bboxes,
+                gt_labels,
+                img_metas,
+                neg_label_weights=neg_label_weights)
+            losses.update(bbox_results['loss_bbox'])
+
+        # mask head forward and loss; reuses bbox_feats of the box branch
+        if self.with_mask:
+            mask_results = self._mask_forward_train(x, sampling_results,
+                                                    bbox_results['bbox_feats'],
+                                                    gt_masks, img_metas)
+            losses.update(mask_results['loss_mask'])
+
+        return losses
+
+    def _bbox_forward(self, x, rois):
+        """Run the box branch on ``rois``; used in training and testing."""
+        # TODO: a more flexible way to decide which feature maps to use
+        num_levels = self.bbox_roi_extractor.num_inputs
+        bbox_feats = self.bbox_roi_extractor(x[:num_levels], rois)
+        if self.with_shared_head:
+            bbox_feats = self.shared_head(bbox_feats)
+        cls_score, bbox_pred = self.bbox_head(bbox_feats)
+
+        # bundle the predictions with the pooled features they came from
+        return dict(
+            cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats)
+
+    def _bbox_forward_train(self,
+                            x,
+                            sampling_results,
+                            gt_bboxes,
+                            gt_labels,
+                            img_metas,
+                            neg_label_weights=None):
+        """Run the box head forward pass and compute its training losses.
+
+        ``neg_label_weights`` is per-image; the ISR-N re-weighting is skipped
+        when it is ``None``/empty or when its first entry is ``None``.
+        """
+        rois = bbox2roi([res.bboxes for res in sampling_results])
+        bbox_results = self._bbox_forward(x, rois)
+        bbox_targets = self.bbox_head.get_targets(sampling_results, gt_bboxes,
+                                                  gt_labels, self.train_cfg)
+
+        # the sampler's neg_label_weights are image-wise; write them back to
+        # the negative slots of label_weights (per image: positives, negatives)
+        if neg_label_weights and neg_label_weights[0] is not None:
+            label_weights = bbox_targets[1]
+            cur_num_rois = 0
+            for i, res in enumerate(sampling_results):
+                neg_start = cur_num_rois + res.pos_inds.size(0)
+                cur_num_rois = neg_start + res.neg_inds.size(0)
+                label_weights[neg_start:cur_num_rois] = neg_label_weights[i]
+
+        cls_score = bbox_results['cls_score']
+        bbox_pred = bbox_results['bbox_pred']
+
+        # Apply ISR-P
+        isr_cfg = self.train_cfg.get('isr', None)
+        if isr_cfg is not None:
+            bbox_targets = isr_p(
+                cls_score,
+                bbox_pred,
+                bbox_targets,
+                rois,
+                sampling_results,
+                self.bbox_head.loss_cls,
+                self.bbox_head.bbox_coder,
+                **isr_cfg,
+                num_class=self.bbox_head.num_classes)
+        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, rois,
+                                        *bbox_targets)
+
+        # Add CARL Loss
+        carl_cfg = self.train_cfg.get('carl', None)
+        if carl_cfg is not None:
+            loss_carl = carl_loss(
+                cls_score,
+                bbox_targets[0],
+                bbox_pred,
+                bbox_targets[2],
+                self.bbox_head.loss_bbox,
+                **carl_cfg,
+                num_class=self.bbox_head.num_classes)
+            loss_bbox.update(loss_carl)
+
+        bbox_results.update(loss_bbox=loss_bbox)
+        return bbox_results
diff --git a/mmdet/models/roi_heads/point_rend_roi_head.py b/mmdet/models/roi_heads/point_rend_roi_head.py
new file mode 100755
index 0000000..9f66779
--- /dev/null
+++ b/mmdet/models/roi_heads/point_rend_roi_head.py
@@ -0,0 +1,393 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend  # noqa
+import os
+import warnings
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from mmcv.ops import point_sample, rel_roi_point_to_rel_img_point
+
+from mmdet.core import bbox2roi, bbox_mapping, merge_aug_masks
+from .. import builder
+from ..builder import HEADS
+from .standard_roi_head import StandardRoIHead
+
+
+@HEADS.register_module()
+class PointRendRoIHead(StandardRoIHead):
+    """`PointRend <https://arxiv.org/abs/1912.08193>`_."""
+
+    def __init__(self, point_head, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.with_bbox and self.with_mask  # both branches required
+        self.init_point_head(point_head)
+
+    def init_point_head(self, point_head):
+        """Build ``self.point_head`` from ``point_head`` via the builder."""
+        self.point_head = builder.build_head(point_head)
+
+    def _mask_forward_train(self, x, sampling_results, bbox_feats, gt_masks,
+                            img_metas):
+        """Compute the mask head loss and, when it exists, merge the point
+        head loss into it."""
+        results = super()._mask_forward_train(x, sampling_results, bbox_feats,
+                                              gt_masks, img_metas)
+        if results['loss_mask'] is None:
+            return results
+        # the point head is only trained when a mask loss was produced
+        loss_point = self._mask_point_forward_train(
+            x, sampling_results, results['mask_pred'], gt_masks,
+            img_metas)
+        results['loss_mask'].update(loss_point)
+        return results
+
+    def _mask_point_forward_train(self, x, sampling_results, mask_pred,
+                                  gt_masks, img_metas):
+        """Sample points on the positive RoIs, run the point head on them and
+        return the point head loss computed in training."""
+        pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
+        rel_roi_points = self.point_head.get_roi_rel_points_train(
+            mask_pred, pos_labels, cfg=self.train_cfg)
+        rois = bbox2roi([res.pos_bboxes for res in sampling_results])
+        # fine feats are sampled from ``x``, coarse feats from ``mask_pred``
+        fine_grained_point_feats = self._get_fine_grained_point_feats(
+            x, rois, rel_roi_points, img_metas)
+        coarse_point_feats = point_sample(mask_pred, rel_roi_points)
+        mask_point_pred = self.point_head(fine_grained_point_feats,
+                                          coarse_point_feats)
+        mask_point_target = self.point_head.get_targets(
+            rois, rel_roi_points, sampling_results, gt_masks, self.train_cfg)
+        loss_mask_point = self.point_head.loss(mask_point_pred,
+                                               mask_point_target, pos_labels)
+
+        return loss_mask_point
+
+    def _get_fine_grained_point_feats(self, x, rois, rel_roi_points,
+                                      img_metas):
+        """Sample fine grained feats from each level feature map and
+        concatenate them together.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale levels.
+            rois (Tensor): shape (num_rois, 5), column 0 is the image index.
+            rel_roi_points (Tensor): A tensor of shape (num_rois, num_points,
+                2) that contains [0, 1] x [0, 1] normalized coordinates of the
+                most uncertain points from the [mask_height, mask_width] grid.
+            img_metas (list[dict]): Image meta info; only its length is used.
+
+        Returns:
+            Tensor: The fine grained features of each point,
+                has shape (num_rois, feats_channels, num_points).
+        """
+        num_imgs = len(img_metas)
+        fine_grained_feats = []
+        for idx in range(self.mask_roi_extractor.num_inputs):
+            feats = x[idx]
+            spatial_scale = 1. / float(
+                self.mask_roi_extractor.featmap_strides[idx])
+            point_feats = []
+            for batch_ind in range(num_imgs):
+                # take this image's feature map, keeping a batch dim of 1
+                feat = feats[batch_ind].unsqueeze(0)
+                inds = (rois[:, 0].long() == batch_ind)
+                if inds.any():
+                    rel_img_points = rel_roi_point_to_rel_img_point(
+                        rois[inds], rel_roi_points[inds], feat.shape[2:],
+                        spatial_scale).unsqueeze(0)
+                    point_feat = point_sample(feat, rel_img_points)
+                    point_feat = point_feat.squeeze(0).transpose(0, 1)
+                    point_feats.append(point_feat)
+            fine_grained_feats.append(torch.cat(point_feats, dim=0))
+        return torch.cat(fine_grained_feats, dim=1)
+
+    def _mask_point_forward_test(self, x, rois, label_pred, mask_pred,
+                                 img_metas):
+        """Mask refining process with point head in testing.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale levels.
+            rois (Tensor): shape (num_rois, 5).
+            label_pred (Tensor): The predicted class of each RoI.
+            mask_pred (Tensor): The predicted coarse masks of
+                shape (num_rois, num_classes, small_size, small_size).
+            img_metas (list[dict]): Image meta info.
+
+        Returns:
+            Tensor: The refined masks of shape (num_rois, num_classes,
+                large_size, large_size).
+        """
+        refined_mask_pred = mask_pred.clone()
+        for subdivision_step in range(self.test_cfg.subdivision_steps):
+            refined_mask_pred = F.interpolate(
+                refined_mask_pred,
+                scale_factor=self.test_cfg.scale_factor,
+                mode='bilinear',
+                align_corners=False)
+            # If `subdivision_num_points` is larger or equal to the
+            # resolution of the next step, then we can skip this step
+            num_rois, channels, mask_height, mask_width = \
+                refined_mask_pred.shape
+            if (self.test_cfg.subdivision_num_points >=
+                    self.test_cfg.scale_factor**2 * mask_height * mask_width
+                    and
+                    subdivision_step < self.test_cfg.subdivision_steps - 1):
+                continue
+            point_indices, rel_roi_points = \
+                self.point_head.get_roi_rel_points_test(
+                    refined_mask_pred, label_pred, cfg=self.test_cfg)
+            fine_grained_point_feats = self._get_fine_grained_point_feats(
+                x, rois, rel_roi_points, img_metas)
+            coarse_point_feats = point_sample(mask_pred, rel_roi_points)
+            mask_point_pred = self.point_head(fine_grained_point_feats,
+                                              coarse_point_feats)
+            # write the point predictions into the flattened refined mask
+            point_indices = point_indices.unsqueeze(1).expand(-1, channels, -1)
+            refined_mask_pred = refined_mask_pred.reshape(
+                num_rois, channels, mask_height * mask_width)
+            refined_mask_pred = refined_mask_pred.scatter_(
+                2, point_indices, mask_point_pred)
+            refined_mask_pred = refined_mask_pred.view(num_rois, channels,
+                                                       mask_height, mask_width)
+
+        return refined_mask_pred
+
+    def simple_test_mask(self,
+                         x,
+                         img_metas,
+                         det_bboxes,
+                         det_labels,
+                         rescale=False):
+        """Obtain mask prediction without augmentation.
+
+        Returns a list with one segmentation result per image."""
+        ori_shapes = tuple(meta['ori_shape'] for meta in img_metas)
+        scale_factors = tuple(meta['scale_factor'] for meta in img_metas)
+        if isinstance(scale_factors[0], float):
+            warnings.warn(
+                'Scale factor in img_metas should be a '
+                'ndarray with shape (4,) '
+                'arrange as (factor_w, factor_h, factor_w, factor_h), '
+                'The scale_factor with float type has been deprecated. ')
+            # one (4,) array per image so that scale_factors[i] is per-image
+            scale_factors = [
+                np.array([sf] * 4, dtype=np.float32) for sf in scale_factors
+            ]
+
+        num_imgs = len(det_bboxes)
+        if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes):
+            segm_results = [[[] for _ in range(self.mask_head.num_classes)]
+                            for _ in range(num_imgs)]
+        else:
+            # if det_bboxes is rescaled to the original image size, we need to
+            # rescale it back to the testing scale to obtain RoIs.
+            _bboxes = [det_bbox[:, :4] for det_bbox in det_bboxes]
+            if rescale:
+                scale_factors = [
+                    torch.from_numpy(scale_factor).to(det_bboxes[0].device)
+                    for scale_factor in scale_factors
+                ]
+                _bboxes = [b * s for b, s in zip(_bboxes, scale_factors)]
+            mask_rois = bbox2roi(_bboxes)
+            mask_results = self._mask_forward(x, mask_rois)
+            # split batch mask prediction back to each image
+            mask_pred = mask_results['mask_pred']
+            num_mask_roi_per_img = [len(det_bbox) for det_bbox in det_bboxes]
+            mask_preds = mask_pred.split(num_mask_roi_per_img, 0)
+            mask_rois = mask_rois.split(num_mask_roi_per_img, 0)
+            # apply mask post-processing to each image individually
+            segm_results = []
+            for i in range(num_imgs):
+                if det_bboxes[i].shape[0] == 0:
+                    segm_results.append(
+                        [[] for _ in range(self.mask_head.num_classes)])
+                else:
+                    x_i = [xx[[i]] for xx in x]  # keep batch dim of size 1
+                    mask_rois_i = mask_rois[i]
+                    mask_rois_i[:, 0] = 0  # TODO: remove this hack
+                    mask_pred_i = self._mask_point_forward_test(
+                        x_i, mask_rois_i, det_labels[i], mask_preds[i],
+                        [img_metas])
+                    segm_result = self.mask_head.get_seg_masks(
+                        mask_pred_i, _bboxes[i], det_labels[i], self.test_cfg,
+                        ori_shapes[i], scale_factors[i], rescale)
+                    segm_results.append(segm_result)
+        return segm_results
+
+    def aug_test_mask(self, feats, img_metas, det_bboxes, det_labels):
+        """Predict masks with test-time augmentation and merge them."""
+        if det_bboxes.shape[0] == 0:
+            segm_result = [[] for _ in range(self.mask_head.num_classes)]
+        else:
+            aug_masks = []
+            for x, img_meta in zip(feats, img_metas):
+                img_shape = img_meta[0]['img_shape']
+                scale_factor = img_meta[0]['scale_factor']
+                flip = img_meta[0]['flip']
+                _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape,
+                                       scale_factor, flip)
+                mask_rois = bbox2roi([_bboxes])
+                mask_results = self._mask_forward(x, mask_rois)
+                mask_results['mask_pred'] = self._mask_point_forward_test(
+                    x, mask_rois, det_labels, mask_results['mask_pred'],
+                    img_meta)
+                # convert to numpy array to save memory
+                aug_masks.append(
+                    mask_results['mask_pred'].sigmoid().cpu().numpy())
+            merged_masks = merge_aug_masks(aug_masks, img_metas, self.test_cfg)
+            # the merged masks are pasted without any further rescaling
+            ori_shape = img_metas[0][0]['ori_shape']
+            segm_result = self.mask_head.get_seg_masks(
+                merged_masks,
+                det_bboxes,
+                det_labels,
+                self.test_cfg,
+                ori_shape,
+                scale_factor=1.0,
+                rescale=False)
+        return segm_result
+
+    def _onnx_get_fine_grained_point_feats(self, x, rois, rel_roi_points):
+        """Export the process of sampling fine grained feats to onnx.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale levels.
+            rois (Tensor): shape (num_rois, 5).
+            rel_roi_points (Tensor): A tensor of shape (num_rois, num_points,
+                2) that contains [0, 1] x [0, 1] normalized coordinates of the
+                most uncertain points from the [mask_height, mask_width] grid.
+
+        Returns:
+            Tensor: The fine grained features of each point,
+                has shape (num_rois, feats_channels, num_points).
+        """
+        batch_size = x[0].shape[0]
+        num_rois = rois.shape[0]
+        fine_grained_feats = []
+        for idx in range(self.mask_roi_extractor.num_inputs):
+            feats = x[idx]
+            spatial_scale = 1. / float(
+                self.mask_roi_extractor.featmap_strides[idx])
+
+            rel_img_points = rel_roi_point_to_rel_img_point(
+                rois, rel_roi_points, feats, spatial_scale)
+            channels = feats.shape[1]
+            num_points = rel_img_points.shape[1]
+            rel_img_points = rel_img_points.reshape(batch_size, -1, num_points,
+                                                    2)
+            point_feats = point_sample(feats, rel_img_points)
+            point_feats = point_feats.transpose(1, 2).reshape(
+                num_rois, channels, num_points)
+            fine_grained_feats.append(point_feats)
+        return torch.cat(fine_grained_feats, dim=1)
+
+    def _mask_point_onnx_export(self, x, rois, label_pred, mask_pred):
+        """Export mask refining process with point head to onnx.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale levels.
+            rois (Tensor): shape (num_rois, 5).
+            label_pred (Tensor): The predicted class of each RoI.
+            mask_pred (Tensor): The predicted coarse masks of
+                shape (num_rois, num_classes, small_size, small_size).
+
+        Returns:
+            Tensor: The refined masks of shape (num_rois, num_classes,
+                large_size, large_size).
+        """
+        refined_mask_pred = mask_pred.clone()
+        for subdivision_step in range(self.test_cfg.subdivision_steps):
+            refined_mask_pred = F.interpolate(
+                refined_mask_pred,
+                scale_factor=self.test_cfg.scale_factor,
+                mode='bilinear',
+                align_corners=False)
+            # If `subdivision_num_points` is larger or equal to the
+            # resolution of the next step, then we can skip this step
+            num_rois, channels, mask_height, mask_width = \
+                refined_mask_pred.shape
+            if (self.test_cfg.subdivision_num_points >=
+                    self.test_cfg.scale_factor**2 * mask_height * mask_width
+                    and
+                    subdivision_step < self.test_cfg.subdivision_steps - 1):
+                continue
+            point_indices, rel_roi_points = \
+                self.point_head.get_roi_rel_points_test(
+                    refined_mask_pred, label_pred, cfg=self.test_cfg)
+            fine_grained_point_feats = self._onnx_get_fine_grained_point_feats(
+                x, rois, rel_roi_points)
+            coarse_point_feats = point_sample(mask_pred, rel_roi_points)
+            mask_point_pred = self.point_head(fine_grained_point_feats,
+                                              coarse_point_feats)
+
+            point_indices = point_indices.unsqueeze(1).expand(-1, channels, -1)
+            refined_mask_pred = refined_mask_pred.reshape(
+                num_rois, channels, mask_height * mask_width)
+            # the backend is selected through the ONNX_BACKEND env variable
+            is_trt_backend = os.environ.get('ONNX_BACKEND') == 'MMCVTensorRT'
+            # avoid ScatterElements op in ONNX for TensorRT: write through
+            # flat indices (roi * C * HW + channel * HW + point) instead
+            if is_trt_backend:
+                mask_shape = refined_mask_pred.shape
+                point_shape = point_indices.shape
+                inds_dim0 = torch.arange(point_shape[0]).reshape(
+                    point_shape[0], 1, 1).expand_as(point_indices)
+                inds_dim1 = torch.arange(point_shape[1]).reshape(
+                    1, point_shape[1], 1).expand_as(point_indices)
+                inds_1d = inds_dim0.reshape(
+                    -1) * mask_shape[1] * mask_shape[2] + inds_dim1.reshape(
+                        -1) * mask_shape[2] + point_indices.reshape(-1)
+                refined_mask_pred = refined_mask_pred.reshape(-1)
+                refined_mask_pred[inds_1d] = mask_point_pred.reshape(-1)
+                refined_mask_pred = refined_mask_pred.reshape(*mask_shape)
+            else:
+                refined_mask_pred = refined_mask_pred.scatter_(
+                    2, point_indices, mask_point_pred)
+
+            refined_mask_pred = refined_mask_pred.view(num_rois, channels,
+                                                       mask_height, mask_width)
+
+        return refined_mask_pred
+
+    def mask_onnx_export(self, x, img_metas, det_bboxes, det_labels, **kwargs):
+        """Export mask branch to onnx which supports batch inference.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale levels.
+            img_metas (list[dict]): Image meta info.
+            det_bboxes (Tensor): Bboxes and corresponding scores.
+                has shape [N, num_bboxes, 5].
+            det_labels (Tensor): class labels of
+                shape [N, num_bboxes].
+
+        Returns:
+            Tensor: The segmentation results of shape [N, num_bboxes,
+                image_height, image_width].
+        """
+        if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes):
+            raise RuntimeError('[ONNX Error] Can not record MaskHead '
+                               'as it has not been executed this time')
+        batch_size = det_bboxes.size(0)
+        # keep only the 4 box coordinates and prepend each box's batch index
+        # to build flat (batch_ind, x1, y1, x2, y2) RoIs
+        det_bboxes = det_bboxes[..., :4]
+        batch_index = torch.arange(
+            det_bboxes.size(0), device=det_bboxes.device).float().view(
+                -1, 1, 1).expand(det_bboxes.size(0), det_bboxes.size(1), 1)
+        mask_rois = torch.cat([batch_index, det_bboxes], dim=-1)
+        mask_rois = mask_rois.view(-1, 5)
+        mask_results = self._mask_forward(x, mask_rois)
+        mask_pred = mask_results['mask_pred']
+        max_shape = img_metas[0]['img_shape_for_onnx']
+        num_det = det_bboxes.shape[1]
+        det_bboxes = det_bboxes.reshape(-1, 4)
+        det_labels = det_labels.reshape(-1)
+
+        mask_pred = self._mask_point_onnx_export(x, mask_rois, det_labels,
+                                                 mask_pred)
+
+        segm_results = self.mask_head.onnx_export(mask_pred, det_bboxes,
+                                                  det_labels, self.test_cfg,
+                                                  max_shape)
+        segm_results = segm_results.reshape(batch_size, num_det, max_shape[0],
+                                            max_shape[1])
+        return segm_results
diff --git a/mmdet/models/roi_heads/roi_extractors/__init__.py b/mmdet/models/roi_heads/roi_extractors/__init__.py
new file mode 100755
index 0000000..0f60214
--- /dev/null
+++ b/mmdet/models/roi_heads/roi_extractors/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_roi_extractor import BaseRoIExtractor
+from .generic_roi_extractor import GenericRoIExtractor
+from .single_level_roi_extractor import SingleRoIExtractor
+
+__all__ = ['BaseRoIExtractor', 'SingleRoIExtractor', 'GenericRoIExtractor']
diff --git a/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py b/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py
new file mode 100755
index 0000000..8262975
--- /dev/null
+++ b/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+import torch
+import torch.nn as nn
+from mmcv import ops
+from mmcv.runner import BaseModule
+
+
+class BaseRoIExtractor(BaseModule, metaclass=ABCMeta):
+    """Base class for RoI extractor.
+
+    Args:
+        roi_layer (dict): Specify RoI layer type and arguments.
+        out_channels (int): Output channels of RoI layers.
+        featmap_strides (List[int]): Strides of input feature maps.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 roi_layer,
+                 out_channels,
+                 featmap_strides,
+                 init_cfg=None):
+        super().__init__(init_cfg)
+        self.fp16_enabled = False
+        self.out_channels = out_channels
+        self.featmap_strides = featmap_strides
+        self.roi_layers = self.build_roi_layers(roi_layer, featmap_strides)
+
+    @property
+    def num_inputs(self):
+        """int: Number of input feature maps (one per entry of the strides)."""
+        return len(self.featmap_strides)
+
+    def build_roi_layers(self, layer_cfg, featmap_strides):
+        """Create one RoI operator per feature level.
+
+        Args:
+            layer_cfg (dict): Config of the RoI operator. Its ``type`` entry
+                names a class exposed by ``mmcv.ops`` (e.g. ``RoIAlign``);
+                the remaining entries are passed to that class as keyword
+                arguments.
+            featmap_strides (List[int]): Stride of each input feature map
+                w.r.t. the original image. Every operator is built with
+                ``spatial_scale=1 / stride``.
+
+        Returns:
+            nn.ModuleList: The RoI operators, in the same order as
+                ``featmap_strides``.
+        """
+
+        layer_kwargs = layer_cfg.copy()
+        op_name = layer_kwargs.pop('type')
+        assert hasattr(ops, op_name)
+        op_cls = getattr(ops, op_name)
+        scales = [1 / stride for stride in featmap_strides]
+        layers = [op_cls(spatial_scale=sc, **layer_kwargs) for sc in scales]
+        return nn.ModuleList(layers)
+
+    def roi_rescale(self, rois, scale_factor):
+        """Resize every RoI about its own centre.
+
+        Args:
+            rois (torch.Tensor): RoIs of shape (n, 5); column 0 is copied
+                unchanged and columns 1-4 are read as x1, y1, x2, y2.
+            scale_factor (float): Factor applied to RoI width and height.
+
+        Returns:
+            torch.Tensor: The rescaled RoIs.
+        """
+
+        x_min, y_min = rois[:, 1], rois[:, 2]
+        x_max, y_max = rois[:, 3], rois[:, 4]
+        # the centre of each box is preserved ...
+        center_x = (x_min + x_max) * 0.5
+        center_y = (y_min + y_max) * 0.5
+        # ... while width and height are multiplied by ``scale_factor``
+        half_w = (x_max - x_min) * scale_factor * 0.5
+        half_h = (y_max - y_min) * scale_factor * 0.5
+        scaled = (rois[:, 0], center_x - half_w, center_y - half_h,
+                  center_x + half_w, center_y + half_h)
+        return torch.stack(scaled, dim=-1)
+
+    @abstractmethod
+    def forward(self, feats, rois, roi_scale_factor=None):
+        pass  # abstract: concrete extractors provide the implementation
diff --git a/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py b/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py
new file mode 100755
index 0000000..89a9f89
--- /dev/null
+++ b/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py
@@ -0,0 +1,84 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.cnn.bricks import build_plugin_layer
+from mmcv.runner import force_fp32
+
+from mmdet.models.builder import ROI_EXTRACTORS
+from .base_roi_extractor import BaseRoIExtractor
+
+
+@ROI_EXTRACTORS.register_module()
+class GenericRoIExtractor(BaseRoIExtractor):
+    """RoI extractor that pools each RoI from every feature level.
+
+    Implementation of `A novel Region of Interest Extraction Layer for
+    Instance Segmentation <https://arxiv.org/abs/2004.13665>`_.
+
+    Args:
+        aggregation (str): How per-level RoI features are merged, either
+            'sum' or 'concat'. Default: 'sum'.
+        pre_cfg (dict | None): Plugin layer config applied to each level's
+            RoI features before merging. Default: None.
+        post_cfg (dict | None): Plugin layer config applied to the merged
+            RoI features. Default: None.
+        kwargs (keyword arguments): Passed through unchanged to
+            :class:`BaseRoIExtractor`.
+    """
+
+    def __init__(self,
+                 aggregation='sum',
+                 pre_cfg=None,
+                 post_cfg=None,
+                 **kwargs):
+        super().__init__(**kwargs)
+        assert aggregation in ['sum', 'concat']
+        self.aggregation = aggregation
+
+        # optional plugin layers; the post module is built before the pre one
+        self.with_post = post_cfg is not None
+        self.with_pre = pre_cfg is not None
+        if self.with_post:
+            self.post_module = build_plugin_layer(post_cfg, '_post_module')[1]
+        if self.with_pre:
+            self.pre_module = build_plugin_layer(pre_cfg, '_pre_module')[1]
+
+    @force_fp32(apply_to=('feats', ), out_fp16=True)
+    def forward(self, feats, rois, roi_scale_factor=None):
+        """Pool ``rois`` on every level of ``feats`` and merge the results.
+
+        A single-level input returns the first RoI layer's output directly,
+        without RoI rescaling or the pre/post modules.
+        """
+        num_levels = len(feats)
+        if num_levels == 1:
+            return self.roi_layers[0](feats[0], rois)
+
+        # zero buffer of shape (num_rois, out_channels, *output_size)
+        roi_feats = feats[0].new_zeros(
+            rois.size(0), self.out_channels, *self.roi_layers[0].output_size)
+        # rois can be an empty tensor, in which case nothing is pooled
+        if roi_feats.shape[0] == 0:
+            return roi_feats
+
+        if roi_scale_factor is not None:
+            rois = self.roi_rescale(rois, roi_scale_factor)
+
+        offset = 0  # first channel written by the current level (concat)
+        for lvl, feat in enumerate(feats):
+            lvl_feats = self.roi_layers[lvl](feat, rois)
+            # the channel span is measured before any pre-processing
+            stop = offset + lvl_feats.size(1)
+            if self.with_pre:
+                lvl_feats = self.pre_module(lvl_feats)
+            if self.aggregation == 'sum':
+                roi_feats = roi_feats + lvl_feats
+            else:
+                # concat: each level fills its own slice of channels
+                roi_feats[:, offset:stop] = lvl_feats
+            offset = stop
+        # in concat mode the levels must fill out_channels exactly
+        if self.aggregation == 'concat':
+            assert offset == self.out_channels
+
+        if self.with_post:
+            roi_feats = self.post_module(roi_feats)
+        return roi_feats
diff --git a/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py b/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py
new file mode 100755
index 0000000..dbc5aef
--- /dev/null
+++ b/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py
@@ -0,0 +1,112 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.runner import force_fp32
+
+from mmdet.models.builder import ROI_EXTRACTORS
+from .base_roi_extractor import BaseRoIExtractor
+
+
+@ROI_EXTRACTORS.register_module()
+class SingleRoIExtractor(BaseRoIExtractor):
+    """RoI extractor that pools each RoI from exactly one feature level.
+
+    With several input levels, every RoI is assigned to one level by its
+    scale, following the rule proposed in
+    `FPN <https://arxiv.org/abs/1612.03144>`_.
+
+    Args:
+        roi_layer (dict): Type and arguments of the RoI layer.
+        out_channels (int): Number of channels of the RoI features.
+        featmap_strides (List[int]): Strides of the input feature maps.
+        finest_scale (int): RoIs with a scale below ``finest_scale * 2`` are
+            mapped to level 0. Default: 56.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 roi_layer,
+                 out_channels,
+                 featmap_strides,
+                 finest_scale=56,
+                 init_cfg=None):
+        super().__init__(roi_layer, out_channels, featmap_strides, init_cfg)
+        self.finest_scale = finest_scale
+
+    def map_roi_levels(self, rois, num_levels):
+        """Assign every RoI to a feature level according to its scale.
+
+        The scale is the square root of the RoI area:
+
+        - scale < finest_scale * 2: level 0
+        - finest_scale * 2 <= scale < finest_scale * 4: level 1
+        - finest_scale * 4 <= scale < finest_scale * 8: level 2
+        - scale >= finest_scale * 8: level 3
+
+        Args:
+            rois (Tensor): Input RoIs, shape (k, 5).
+            num_levels (int): Total number of levels; the result is clamped
+                to ``[0, num_levels - 1]``.
+
+        Returns:
+            Tensor: 0-based level index of each RoI, shape (k, ).
+        """
+        widths = rois[:, 3] - rois[:, 1]
+        heights = rois[:, 4] - rois[:, 2]
+        scale = torch.sqrt(widths * heights)
+        lvls = torch.floor(torch.log2(scale / self.finest_scale + 1e-6))
+        return lvls.clamp(min=0, max=num_levels - 1).long()
+
+    @force_fp32(apply_to=('feats', ), out_fp16=True)
+    def forward(self, feats, rois, roi_scale_factor=None):
+        """Pool every RoI from the feature level it is mapped to."""
+        out_size = self.roi_layers[0].output_size
+        num_levels = len(feats)
+        flat_dims = (-1, self.out_channels * out_size[0] * out_size[1])
+        if torch.onnx.is_in_onnx_export():
+            # Workaround for exporting mask-rcnn to onnx: derive the zero
+            # output buffer from rois instead of allocating it
+            roi_feats = rois[:, :1].clone().detach()
+            roi_feats = roi_feats.expand(*flat_dims)
+            roi_feats = roi_feats.reshape(-1, self.out_channels, *out_size)
+            roi_feats = roi_feats * 0
+        else:
+            roi_feats = feats[0].new_zeros(
+                rois.size(0), self.out_channels, *out_size)
+
+        if num_levels == 1:
+            if len(rois) == 0:
+                return roi_feats
+            return self.roi_layers[0](feats[0], rois)
+
+        # levels are chosen from the RoIs as given, before any rescaling
+        target_lvls = self.map_roi_levels(rois, num_levels)
+        if roi_scale_factor is not None:
+            rois = self.roi_rescale(rois, roi_scale_factor)
+
+        for lvl in range(num_levels):
+            mask = target_lvls == lvl
+            if torch.onnx.is_in_onnx_export():
+                # Keep every roi_align node in the exported graph and
+                # avoid the nonzero op: RoIs of other levels are zeroed
+                # going in and their features are zeroed coming out.
+                mask = mask.float().unsqueeze(-1)
+                rois_i = rois.clone().detach()
+                rois_i = rois_i * mask
+                mask_exp = mask.expand(*flat_dims).reshape(roi_feats.shape)
+                roi_feats_t = self.roi_layers[lvl](feats[lvl], rois_i)
+                roi_feats_t = roi_feats_t * mask_exp
+                roi_feats = roi_feats + roi_feats_t
+                continue
+            inds = mask.nonzero(as_tuple=False).squeeze(1)
+            if inds.numel() > 0:
+                roi_feats[inds] = self.roi_layers[lvl](feats[lvl], rois[inds])
+            else:
+                # An unused pyramid level would leave this GPU with an
+                # incomplete computation graph that differs from the other
+                # GPUs and causes a hang. Adding zero-weighted terms keeps
+                # all parameters and this level's features in the graph.
+                roi_feats = roi_feats + sum(
+                    x.view(-1)[0]
+                    for x in self.parameters()) * 0. + feats[lvl].sum() * 0.
+        return roi_feats
diff --git a/mmdet/models/roi_heads/scnet_roi_head.py b/mmdet/models/roi_heads/scnet_roi_head.py
new file mode 100755
index 0000000..32f56aa
--- /dev/null
+++ b/mmdet/models/roi_heads/scnet_roi_head.py
@@ -0,0 +1,605 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from mmdet.core import (bbox2result, bbox2roi, bbox_mapping, merge_aug_bboxes,
+                        merge_aug_masks, multiclass_nms)
+from ..builder import HEADS, build_head, build_roi_extractor
+from ..utils.brick_wrappers import adaptive_avg_pool2d
+from .cascade_roi_head import CascadeRoIHead
+
+
+@HEADS.register_module()
+class SCNetRoIHead(CascadeRoIHead):
+    """RoIHead for `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+    Args:
+        num_stages (int): number of cascade stages.
+        stage_loss_weights (list): loss weight of cascade stages.
+        semantic_roi_extractor (dict): config to init semantic roi extractor.
+        semantic_head (dict): config to init semantic head.
+        feat_relay_head (dict): config to init feature_relay_head.
+        glbctx_head (dict): config to init global context head.
+    """
+
+    def __init__(self,
+                 num_stages,
+                 stage_loss_weights,
+                 semantic_roi_extractor=None,
+                 semantic_head=None,
+                 feat_relay_head=None,
+                 glbctx_head=None,
+                 **kwargs):
+        """Build the cascade stages plus the optional SCNet branches.
+
+        A branch is created only when its head config is not None.
+        """
+        super().__init__(num_stages, stage_loss_weights, **kwargs)
+        assert self.with_bbox and self.with_mask
+        assert not self.with_shared_head  # shared head is not supported
+        if semantic_head is not None:
+            self.semantic_roi_extractor = build_roi_extractor(
+                semantic_roi_extractor)
+            self.semantic_head = build_head(semantic_head)
+        if feat_relay_head is not None:
+            self.feat_relay_head = build_head(feat_relay_head)
+        if glbctx_head is not None:
+            self.glbctx_head = build_head(glbctx_head)
+
+    def init_mask_head(self, mask_roi_extractor, mask_head):
+        """Build the mask RoI extractor and head, if an extractor is given."""
+        if mask_roi_extractor is not None:
+            self.mask_roi_extractor = build_roi_extractor(mask_roi_extractor)
+            self.mask_head = build_head(mask_head)
+
+    @property
+    def with_semantic(self):
+        """bool: whether a semantic head is attached and not None."""
+        # the default covers the case where the attribute was never set
+        return getattr(self, 'semantic_head', None) is not None
+
+    @property
+    def with_feat_relay(self):
+        """bool: whether a feature relay head is attached and not None."""
+        # the default covers the case where the attribute was never set
+        return getattr(self, 'feat_relay_head', None) is not None
+
+    @property
+    def with_glbctx(self):
+        """bool: whether a global context head is attached and not None."""
+        return getattr(self, 'glbctx_head', None) is not None
+
+    def _fuse_glbctx(self, roi_feats, glbctx_feat, rois):
+        """Add each image's global context feature to that image's RoIs."""
+        assert roi_feats.size(0) == rois.size(0)
+        batch_col = rois[:, 0]  # column 0 selects the glbctx_feat entry
+        fused = torch.zeros_like(roi_feats)
+        for img_idx in torch.unique(batch_col.cpu(), sorted=True).long():
+            sel = batch_col == img_idx.item()
+            fused[sel] = roi_feats[sel] + glbctx_feat[img_idx]
+        return fused
+
+    def _slice_pos_feats(self, feats, sampling_results):
+        """Return the rows of ``feats`` that belong to positive RoIs.
+
+        Keeps the first pos_bboxes.size(0) rows of every result's row block.
+        """
+        total = sum(res.bboxes.size(0) for res in sampling_results)
+        keep = feats.new_zeros(total, dtype=torch.bool)  # on feats' device
+        offset = 0
+        for res in sampling_results:
+            keep[offset:offset + res.pos_bboxes.size(0)] = True
+            offset += res.bboxes.size(0)
+        return feats[keep]
+
+    def _bbox_forward(self,
+                      stage,
+                      x,
+                      rois,
+                      semantic_feat=None,
+                      glbctx_feat=None):
+        """Run the box head of ``stage`` on ``rois``; used in train and test.
+
+        Returns a dict with ``cls_score``, ``bbox_pred`` and ``relayed_feat``.
+        """
+        roi_extractor = self.bbox_roi_extractor[stage]
+        num_lvls = len(roi_extractor.featmap_strides)
+        bbox_feats = roi_extractor(x[:num_lvls], rois)
+        if self.with_semantic and semantic_feat is not None:
+            sem_feats = self.semantic_roi_extractor([semantic_feat], rois)
+            # pool the semantic RoI features to match the box feature size
+            if sem_feats.shape[-2:] != bbox_feats.shape[-2:]:
+                sem_feats = adaptive_avg_pool2d(sem_feats,
+                                                bbox_feats.shape[-2:])
+            bbox_feats = bbox_feats + sem_feats
+        if self.with_glbctx and glbctx_feat is not None:
+            bbox_feats = self._fuse_glbctx(bbox_feats, glbctx_feat, rois)
+        cls_score, bbox_pred, relayed_feat = self.bbox_head[stage](
+            bbox_feats, return_shared_feat=True)
+        return dict(
+            cls_score=cls_score,
+            bbox_pred=bbox_pred,
+            relayed_feat=relayed_feat)
+
+    def _mask_forward(self,
+                      x,
+                      rois,
+                      semantic_feat=None,
+                      glbctx_feat=None,
+                      relayed_feat=None):
+        """Run the mask head on ``rois``; used in both training and testing.
+
+        Returns a dict with the single key ``mask_pred``.
+        """
+        mask_feats = self.mask_roi_extractor(
+            x[:self.mask_roi_extractor.num_inputs], rois)
+        if self.with_semantic and semantic_feat is not None:
+            sem_feats = self.semantic_roi_extractor([semantic_feat], rois)
+            if sem_feats.shape[-2:] != mask_feats.shape[-2:]:
+                # same pooling wrapper as ``_bbox_forward`` for consistency
+                sem_feats = adaptive_avg_pool2d(sem_feats,
+                                                mask_feats.shape[-2:])
+            mask_feats = mask_feats + sem_feats
+        if self.with_glbctx and glbctx_feat is not None:
+            mask_feats = self._fuse_glbctx(mask_feats, glbctx_feat, rois)
+        if self.with_feat_relay and relayed_feat is not None:
+            mask_feats = mask_feats + relayed_feat
+        return dict(mask_pred=self.mask_head(mask_feats))
+
+    def _bbox_forward_train(self,
+                            stage,
+                            x,
+                            sampling_results,
+                            gt_bboxes,
+                            gt_labels,
+                            rcnn_train_cfg,
+                            semantic_feat=None,
+                            glbctx_feat=None):
+        """Forward the box head of ``stage`` and compute its training loss.
+
+        Returns the ``_bbox_forward`` outputs extended with ``loss_bbox``,
+        ``rois`` and ``bbox_targets``.
+        """
+        rois = bbox2roi([res.bboxes for res in sampling_results])
+        bbox_results = self._bbox_forward(
+            stage, x, rois, semantic_feat=semantic_feat,
+            glbctx_feat=glbctx_feat)
+        bbox_head = self.bbox_head[stage]
+        bbox_targets = bbox_head.get_targets(sampling_results, gt_bboxes,
+                                             gt_labels, rcnn_train_cfg)
+        loss_bbox = bbox_head.loss(bbox_results['cls_score'],
+                                   bbox_results['bbox_pred'], rois,
+                                   *bbox_targets)
+        bbox_results['loss_bbox'] = loss_bbox
+        bbox_results['rois'] = rois
+        bbox_results['bbox_targets'] = bbox_targets
+        return bbox_results
+
+    def _mask_forward_train(self,
+                            x,
+                            sampling_results,
+                            gt_masks,
+                            rcnn_train_cfg,
+                            semantic_feat=None,
+                            glbctx_feat=None,
+                            relayed_feat=None):
+        """Forward the mask head on positive RoIs and compute its loss.
+
+        Returns the output of ``self.mask_head.loss`` as-is.
+        """
+        pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
+        mask_pred = self._mask_forward(
+            x,
+            pos_rois,
+            semantic_feat=semantic_feat,
+            glbctx_feat=glbctx_feat,
+            relayed_feat=relayed_feat)['mask_pred']
+
+        mask_targets = self.mask_head.get_targets(sampling_results, gt_masks,
+                                                  rcnn_train_cfg)
+        pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
+
+        # the loss output itself is returned, not wrapped in another dict
+        return self.mask_head.loss(mask_pred, mask_targets, pos_labels)
+
+    def forward_train(self,
+                      x,
+                      img_metas,
+                      proposal_list,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None,
+                      gt_masks=None,
+                      gt_semantic_seg=None):
+        """Compute the training losses of every stage and branch.
+
+        Args:
+            x (list[Tensor]): list of multi-level img features.
+            img_metas (list[dict]): list of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. See
+                `mmdet/datasets/pipelines/formatting.py:Collect` for details.
+            proposal_list (list[Tensors]): list of region proposals.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box.
+            gt_bboxes_ignore (None, list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss.
+            gt_masks (None, Tensor): true segmentation masks for each box
+                used if the architecture supports a segmentation task.
+            gt_semantic_seg (None, list[Tensor]): semantic segmentation masks
+                used if the architecture supports semantic segmentation task.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        losses = dict()
+
+        # semantic segmentation branch
+        semantic_feat = None
+        if self.with_semantic:
+            semantic_pred, semantic_feat = self.semantic_head(x)
+            losses['loss_semantic_seg'] = self.semantic_head.loss(
+                semantic_pred, gt_semantic_seg)
+
+        # global context branch
+        glbctx_feat = None
+        if self.with_glbctx:
+            mc_pred, glbctx_feat = self.glbctx_head(x)
+            losses['loss_glbctx'] = self.glbctx_head.loss(mc_pred, gt_labels)
+
+        # with no ignore boxes given, the assigner gets None for every image
+        num_imgs = len(img_metas)
+        if gt_bboxes_ignore is None:
+            gt_bboxes_ignore = [None for _ in range(num_imgs)]
+
+        for i in range(self.num_stages):
+            self.current_stage = i
+            rcnn_train_cfg = self.train_cfg[i]
+            stage_lw = self.stage_loss_weights[i]
+            assigner = self.bbox_assigner[i]
+            sampler = self.bbox_sampler[i]
+
+            # assign gts and sample proposals image by image
+            sampling_results = []
+            for j in range(num_imgs):
+                assign_result = assigner.assign(proposal_list[j],
+                                                gt_bboxes[j],
+                                                gt_bboxes_ignore[j],
+                                                gt_labels[j])
+                sampling_results.append(
+                    sampler.sample(
+                        assign_result,
+                        proposal_list[j],
+                        gt_bboxes[j],
+                        gt_labels[j],
+                        feats=[lvl_feat[j][None] for lvl_feat in x]))
+
+            bbox_results = self._bbox_forward_train(
+                i, x, sampling_results, gt_bboxes, gt_labels,
+                rcnn_train_cfg, semantic_feat, glbctx_feat)
+            roi_labels = bbox_results['bbox_targets'][0]
+
+            # only entries whose name contains 'loss' get the stage weight
+            for name, value in bbox_results['loss_bbox'].items():
+                if 'loss' in name:
+                    value = value * stage_lw
+                losses[f's{i}.{name}'] = value
+
+            # refine boxes for the next stage, without tracking gradients
+            if i < self.num_stages - 1:
+                pos_is_gts = [res.pos_is_gt for res in sampling_results]
+                with torch.no_grad():
+                    proposal_list = self.bbox_head[i].refine_bboxes(
+                        bbox_results['rois'], roi_labels,
+                        bbox_results['bbox_pred'], pos_is_gts, img_metas)
+
+        # the mask branch uses the last stage's samples and box features
+        relayed_feat = None
+        if self.with_feat_relay:
+            relayed_feat = self._slice_pos_feats(bbox_results['relayed_feat'],
+                                                 sampling_results)
+            relayed_feat = self.feat_relay_head(relayed_feat)
+
+        mask_results = self._mask_forward_train(x, sampling_results, gt_masks,
+                                                rcnn_train_cfg, semantic_feat,
+                                                glbctx_feat, relayed_feat)
+        # the mask loss is weighted by the sum of all stage loss weights
+        mask_lw = sum(self.stage_loss_weights)
+        losses['loss_mask'] = mask_lw * mask_results['loss_mask']
+
+        return losses
+
+    def simple_test(self, x, proposal_list, img_metas, rescale=False):
+        """Test without augmentation.
+
+        Args:
+            x (tuple[Tensor]): Features from upstream network. Each
+                has shape (batch_size, c, h, w).
+            proposal_list (list(Tensor)): Proposals from rpn head.
+                Each has shape (num_proposals, 5), last dimension
+                5 represent (x1, y1, x2, y2, score).
+            img_metas (list[dict]): Meta information of images.
+            rescale (bool): Whether to rescale the results to
+                the original image. Default: False.
+
+        Returns:
+            list[list[np.ndarray]] or list[tuple]: One entry per image.
+            Without a mask branch an entry is the bbox results, a list
+            with one array per class. With a mask branch an entry is a
+            tuple of (bbox results, mask results).
+        """
+        if self.with_semantic:
+            _, semantic_feat = self.semantic_head(x)
+        else:
+            semantic_feat = None
+
+        if self.with_glbctx:
+            _, glbctx_feat = self.glbctx_head(x)
+        else:
+            glbctx_feat = None
+
+        num_imgs = len(proposal_list)
+        img_shapes = tuple(meta['img_shape'] for meta in img_metas)
+        ori_shapes = tuple(meta['ori_shape'] for meta in img_metas)
+        scale_factors = tuple(meta['scale_factor'] for meta in img_metas)
+
+        # "ms" in variable names means multi-stage
+        ms_scores = []
+        rcnn_test_cfg = self.test_cfg
+
+        rois = bbox2roi(proposal_list)
+
+        if rois.shape[0] == 0:
+            # There is no proposal in the whole batch. Each image gets its
+            # own list of empty arrays instead of one list shared by all.
+            num_classes = self.bbox_head[-1].num_classes
+            bbox_results = [[
+                np.zeros((0, 5), dtype=np.float32) for _ in range(num_classes)
+            ] for _ in range(num_imgs)]
+
+            if self.with_mask:
+                mask_classes = self.mask_head.num_classes
+                segm_results = [[[] for _ in range(mask_classes)]
+                                for _ in range(num_imgs)]
+                results = list(zip(bbox_results, segm_results))
+            else:
+                results = bbox_results
+
+            return results
+
+        for i in range(self.num_stages):
+            bbox_head = self.bbox_head[i]
+            bbox_results = self._bbox_forward(
+                i,
+                x,
+                rois,
+                semantic_feat=semantic_feat,
+                glbctx_feat=glbctx_feat)
+            # split batch bbox prediction back to each image
+            cls_score = bbox_results['cls_score']
+            bbox_pred = bbox_results['bbox_pred']
+            num_proposals_per_img = tuple(len(p) for p in proposal_list)
+            rois = rois.split(num_proposals_per_img, 0)
+            cls_score = cls_score.split(num_proposals_per_img, 0)
+            bbox_pred = bbox_pred.split(num_proposals_per_img, 0)
+            ms_scores.append(cls_score)
+
+            if i < self.num_stages - 1:
+                refine_rois_list = []
+                for j in range(num_imgs):
+                    if rois[j].shape[0] > 0:
+                        # the last score column is left out of the argmax
+                        bbox_label = cls_score[j][:, :-1].argmax(dim=1)
+                        refine_rois = bbox_head.regress_by_class(
+                            rois[j], bbox_label, bbox_pred[j], img_metas[j])
+                        refine_rois_list.append(refine_rois)
+                rois = torch.cat(refine_rois_list)
+
+        # average scores of each image by stages
+        cls_score = [
+            sum([score[i] for score in ms_scores]) / float(len(ms_scores))
+            for i in range(num_imgs)
+        ]
+
+        # apply bbox post-processing to each image individually
+        det_bboxes = []
+        det_labels = []
+        for i in range(num_imgs):
+            det_bbox, det_label = self.bbox_head[-1].get_bboxes(
+                rois[i],
+                cls_score[i],
+                bbox_pred[i],
+                img_shapes[i],
+                scale_factors[i],
+                rescale=rescale,
+                cfg=rcnn_test_cfg)
+            det_bboxes.append(det_bbox)
+            det_labels.append(det_label)
+        det_bbox_results = [
+            bbox2result(det_bboxes[i], det_labels[i],
+                        self.bbox_head[-1].num_classes)
+            for i in range(num_imgs)
+        ]
+
+        if self.with_mask:
+            if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes):
+                mask_classes = self.mask_head.num_classes
+                det_segm_results = [[[] for _ in range(mask_classes)]
+                                    for _ in range(num_imgs)]
+            else:
+                if rescale and not isinstance(scale_factors[0], float):
+                    scale_factors = [
+                        torch.from_numpy(scale_factor).to(det_bboxes[0].device)
+                        for scale_factor in scale_factors
+                    ]
+                _bboxes = [
+                    det_bboxes[i][:, :4] *
+                    scale_factors[i] if rescale else det_bboxes[i]
+                    for i in range(num_imgs)
+                ]
+                mask_rois = bbox2roi(_bboxes)
+
+                # the relay branch is optional: run it only when configured
+                relayed_feat = None
+                if self.with_feat_relay:
+                    # get relay feature on mask_rois
+                    bbox_results = self._bbox_forward(
+                        -1,
+                        x,
+                        mask_rois,
+                        semantic_feat=semantic_feat,
+                        glbctx_feat=glbctx_feat)
+                    relayed_feat = self.feat_relay_head(
+                        bbox_results['relayed_feat'])
+
+                mask_results = self._mask_forward(
+                    x,
+                    mask_rois,
+                    semantic_feat=semantic_feat,
+                    glbctx_feat=glbctx_feat,
+                    relayed_feat=relayed_feat)
+                mask_pred = mask_results['mask_pred']
+
+                # split batch mask prediction back to each image
+                num_bbox_per_img = tuple(len(_bbox) for _bbox in _bboxes)
+                mask_preds = mask_pred.split(num_bbox_per_img, 0)
+
+                # apply mask post-processing to each image individually
+                det_segm_results = []
+                for i in range(num_imgs):
+                    if det_bboxes[i].shape[0] == 0:
+                        det_segm_results.append(
+                            [[] for _ in range(self.mask_head.num_classes)])
+                    else:
+                        segm_result = self.mask_head.get_seg_masks(
+                            mask_preds[i], _bboxes[i], det_labels[i],
+                            self.test_cfg, ori_shapes[i], scale_factors[i],
+                            rescale)
+                        det_segm_results.append(segm_result)
+
+        # return results
+        if self.with_mask:
+            return list(zip(det_bbox_results, det_segm_results))
+        return det_bbox_results
+
+    def aug_test(self, img_feats, proposal_list, img_metas, rescale=False):
+        """Test with augmentations.
+
+        Boxes are predicted on every augmented view, merged and passed
+        through NMS; masks are then predicted for the merged boxes on every
+        view and merged as well.
+
+        Args:
+            img_feats (list): Features of each augmented view.
+            proposal_list (list[Tensor]): Proposals; only the first entry is
+                used and its first four columns are mapped onto every view.
+            img_metas (list[list[dict]]): Meta information of each view; only
+                the first dict of a view is read (one image per batch).
+            rescale (bool): Not used by this method. Default: False.
+
+        Returns:
+            list: One element, ``(bbox_results, segm_results)`` with a mask
+            branch and ``bbox_results`` otherwise.
+        """
+        if self.with_semantic:
+            semantic_feats = [self.semantic_head(f)[1] for f in img_feats]
+        else:
+            semantic_feats = [None] * len(img_metas)
+
+        if self.with_glbctx:
+            glbctx_feats = [self.glbctx_head(feat)[1] for feat in img_feats]
+        else:
+            glbctx_feats = [None] * len(img_metas)
+
+        rcnn_test_cfg = self.test_cfg
+        aug_bboxes = []
+        aug_scores = []
+        for x, img_meta, semantic_feat, glbctx_feat in zip(
+                img_feats, img_metas, semantic_feats, glbctx_feats):
+            # only one image in the batch
+            img_shape = img_meta[0]['img_shape']
+            scale_factor = img_meta[0]['scale_factor']
+            flip = img_meta[0]['flip']
+
+            proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
+                                     scale_factor, flip)
+            # "ms" in variable names means multi-stage
+            ms_scores = []
+
+            rois = bbox2roi([proposals])
+
+            if rois.shape[0] == 0:
+                # There is no proposal in the single image
+                aug_bboxes.append(rois.new_zeros(0, 4))
+                aug_scores.append(rois.new_zeros(0, 1))
+                continue
+
+            for i in range(self.num_stages):
+                bbox_head = self.bbox_head[i]
+                bbox_results = self._bbox_forward(
+                    i, x, rois, semantic_feat=semantic_feat,
+                    glbctx_feat=glbctx_feat)
+                ms_scores.append(bbox_results['cls_score'])
+                if i < self.num_stages - 1:
+                    # leave out the last score column, as ``simple_test``
+                    # does, so the refinement label is never that column
+                    bbox_label = ms_scores[-1][:, :-1].argmax(dim=1)
+                    rois = bbox_head.regress_by_class(
+                        rois, bbox_label, bbox_results['bbox_pred'],
+                        img_meta[0])
+
+            cls_score = sum(ms_scores) / float(len(ms_scores))
+            bboxes, scores = self.bbox_head[-1].get_bboxes(
+                rois, cls_score, bbox_results['bbox_pred'], img_shape,
+                scale_factor, rescale=False, cfg=None)
+            aug_bboxes.append(bboxes)
+            aug_scores.append(scores)
+
+        # after merging, bboxes will be rescaled to the original image size
+        merged_bboxes, merged_scores = merge_aug_bboxes(
+            aug_bboxes, aug_scores, img_metas, rcnn_test_cfg)
+        det_bboxes, det_labels = multiclass_nms(
+            merged_bboxes, merged_scores, rcnn_test_cfg.score_thr,
+            rcnn_test_cfg.nms, rcnn_test_cfg.max_per_img)
+
+        det_bbox_results = bbox2result(det_bboxes, det_labels,
+                                       self.bbox_head[-1].num_classes)
+
+        if self.with_mask:
+            if det_bboxes.shape[0] == 0:
+                det_segm_results = [[]
+                                    for _ in range(self.mask_head.num_classes)]
+            else:
+                aug_masks = []
+                for x, img_meta, semantic_feat, glbctx_feat in zip(
+                        img_feats, img_metas, semantic_feats, glbctx_feats):
+                    img_shape = img_meta[0]['img_shape']
+                    scale_factor = img_meta[0]['scale_factor']
+                    flip = img_meta[0]['flip']
+                    _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape,
+                                           scale_factor, flip)
+                    mask_rois = bbox2roi([_bboxes])
+                    # the relay branch is optional: run it only if configured
+                    relayed_feat = None
+                    if self.with_feat_relay:
+                        # get relay feature on mask_rois
+                        bbox_results = self._bbox_forward(
+                            -1, x, mask_rois, semantic_feat=semantic_feat,
+                            glbctx_feat=glbctx_feat)
+                        relayed_feat = self.feat_relay_head(
+                            bbox_results['relayed_feat'])
+                    mask_results = self._mask_forward(
+                        x, mask_rois, semantic_feat=semantic_feat,
+                        glbctx_feat=glbctx_feat, relayed_feat=relayed_feat)
+                    mask_pred = mask_results['mask_pred']
+                    aug_masks.append(mask_pred.sigmoid().cpu().numpy())
+                merged_masks = merge_aug_masks(aug_masks, img_metas,
+                                               self.test_cfg)
+                ori_shape = img_metas[0][0]['ori_shape']
+                det_segm_results = self.mask_head.get_seg_masks(
+                    merged_masks, det_bboxes, det_labels, rcnn_test_cfg,
+                    ori_shape, scale_factor=1.0, rescale=False)
+            return [(det_bbox_results, det_segm_results)]
+        return [det_bbox_results]
diff --git a/mmdet/models/roi_heads/shared_heads/__init__.py b/mmdet/models/roi_heads/shared_heads/__init__.py
new file mode 100755
index 0000000..d56636a
--- /dev/null
+++ b/mmdet/models/roi_heads/shared_heads/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .res_layer import ResLayer
+
+__all__ = ['ResLayer']
diff --git a/mmdet/models/roi_heads/shared_heads/res_layer.py b/mmdet/models/roi_heads/shared_heads/res_layer.py
new file mode 100755
index 0000000..bef00a0
--- /dev/null
+++ b/mmdet/models/roi_heads/shared_heads/res_layer.py
@@ -0,0 +1,80 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch.nn as nn
+from mmcv.runner import BaseModule, auto_fp16
+
+from mmdet.models.backbones import ResNet
+from mmdet.models.builder import SHARED_HEADS
+from mmdet.models.utils import ResLayer as _ResLayer
+
+
+@SHARED_HEADS.register_module()
+class ResLayer(BaseModule):
+    """Shared head wrapping one ResNet stage, stored as ``layer{stage+1}``."""
+
+    def __init__(self,
+                 depth,
+                 stage=3,
+                 stride=2,
+                 dilation=1,
+                 style='pytorch',
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 norm_eval=True,
+                 with_cp=False,
+                 dcn=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super(ResLayer, self).__init__(init_cfg)
+
+        self.norm_eval = norm_eval  # when True, train() keeps BN in eval mode
+        self.norm_cfg = norm_cfg
+        self.stage = stage
+        self.fp16_enabled = False
+        block, stage_blocks = ResNet.arch_settings[depth]
+        stage_block = stage_blocks[stage]
+        planes = 64 * 2**stage
+        inplanes = 64 * 2**(stage - 1) * block.expansion
+
+        res_layer = _ResLayer(
+            block,
+            inplanes,
+            planes,
+            stage_block,
+            stride=stride,
+            dilation=dilation,
+            style=style,
+            with_cp=with_cp,
+            norm_cfg=self.norm_cfg,
+            dcn=dcn)
+        self.add_module(f'layer{stage + 1}', res_layer)
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is a deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(type='Constant', val=1,
+                         layer=['_BatchNorm', 'GroupNorm'])
+                ]
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+    @auto_fp16()
+    def forward(self, x):
+        """Apply the wrapped ResNet stage to ``x``."""
+        return getattr(self, f'layer{self.stage + 1}')(x)
+
+    def train(self, mode=True):
+        """Set the mode; BatchNorm2d stays in eval mode when ``norm_eval``."""
+        super(ResLayer, self).train(mode)
+        if self.norm_eval:
+            for m in self.modules():
+                if isinstance(m, nn.BatchNorm2d):
+                    m.eval()
+        return self  # nn.Module.train() returns the module; keep chaining
diff --git a/mmdet/models/roi_heads/sparse_roi_head.py b/mmdet/models/roi_heads/sparse_roi_head.py
new file mode 100755
index 0000000..2613469
--- /dev/null
+++ b/mmdet/models/roi_heads/sparse_roi_head.py
@@ -0,0 +1,424 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmdet.core import bbox2result, bbox2roi, bbox_xyxy_to_cxcywh
+from mmdet.core.bbox.samplers import PseudoSampler
+from ..builder import HEADS
+from .cascade_roi_head import CascadeRoIHead
+
+
+@HEADS.register_module()
+class SparseRoIHead(CascadeRoIHead):
+    r"""The RoIHead for `Sparse R-CNN: End-to-End Object Detection with
+    Learnable Proposals <https://arxiv.org/abs/2011.12450>`_
+    and `Instances as Queries <http://arxiv.org/abs/2105.01928>`_
+
+    Args:
+        num_stages (int): Number of stages in the whole iterative process.
+            Defaults to 6.
+        stage_loss_weights (Tuple[float]): The loss
+            weight of each stage. By default all stages have
+            the same weight 1.
+        bbox_roi_extractor (dict): Config of box roi extractor.
+        mask_roi_extractor (dict): Config of mask roi extractor.
+        bbox_head (dict): Config of box head.
+        mask_head (dict): Config of mask head.
+        train_cfg (dict, optional): Configuration information in train stage.
+            Defaults to None.
+        test_cfg (dict, optional): Configuration information in test stage.
+            Defaults to None.
+        pretrained (str, optional): model pretrained path. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+
+    """
+
+    def __init__(self,
+                 num_stages=6,
+                 stage_loss_weights=(1, 1, 1, 1, 1, 1),
+                 proposal_feature_channel=256,
+                 bbox_roi_extractor=dict(
+                     type='SingleRoIExtractor',
+                     roi_layer=dict(
+                         type='RoIAlign', output_size=7, sampling_ratio=2),
+                     out_channels=256,
+                     featmap_strides=[4, 8, 16, 32]),
+                 mask_roi_extractor=None,
+                 bbox_head=dict(
+                     type='DIIHead',
+                     num_classes=80,
+                     num_fcs=2,
+                     num_heads=8,
+                     num_cls_fcs=1,
+                     num_reg_fcs=3,
+                     feedforward_channels=2048,
+                     hidden_channels=256,
+                     dropout=0.0,
+                     roi_feat_size=7,
+                     ffn_act_cfg=dict(type='ReLU', inplace=True)),
+                 mask_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        assert bbox_roi_extractor is not None
+        assert bbox_head is not None
+        assert len(stage_loss_weights) == num_stages  # one weight per stage
+        self.num_stages = num_stages
+        self.stage_loss_weights = stage_loss_weights
+        self.proposal_feature_channel = proposal_feature_channel
+        super(SparseRoIHead, self).__init__(
+            num_stages,
+            stage_loss_weights,
+            bbox_roi_extractor=bbox_roi_extractor,
+            mask_roi_extractor=mask_roi_extractor,
+            bbox_head=bbox_head,
+            mask_head=mask_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            pretrained=pretrained,
+            init_cfg=init_cfg)
+        # train_cfg is None when running test.py, so the check is skipped then
+        if train_cfg is not None:
+            for stage in range(num_stages):
+                assert isinstance(self.bbox_sampler[stage], PseudoSampler), \
+                    'Sparse R-CNN and QueryInst only support `PseudoSampler`'
+
+    def _bbox_forward(self, stage, x, rois, object_feats, img_metas):
+        """Box head forward function used in both training and testing. Returns
+        all regression, classification results and an intermediate feature.
+
+        Args:
+            stage (int): The index of current stage in
+                iterative process.
+            x (List[Tensor]): List of FPN features
+            rois (Tensor): Rois in total batch. With shape (num_proposal, 5).
+                the last dimension 5 represents (img_index, x1, y1, x2, y2).
+            object_feats (Tensor): The object feature extracted from
+                the previous stage.
+            img_metas (list[dict]): meta information of images, one per image.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of bbox head outputs,
+                Containing the following results:
+
+                    - cls_score (Tensor): The score of each class, has
+                      shape (batch_size, num_proposals, num_classes)
+                      when use focal loss or
+                      (batch_size, num_proposals, num_classes+1)
+                      otherwise.
+                    - decode_bbox_pred (Tensor): The regression results
+                      with shape (batch_size, num_proposal, 4).
+                      The last dimension 4 represents
+                      [tl_x, tl_y, br_x, br_y].
+                    - object_feats (Tensor): Object feature of current stage.
+                    - attn_feats: fourth output of the bbox head, as returned.
+                    - detach_cls_score_list (list[Tensor]): The detached
+                      classification results, length is batch_size, and
+                      each tensor has shape (num_proposal, num_classes).
+                    - detach_proposal_list (list[tensor]): The detached
+                      regression results, length is batch_size, and each
+                      tensor has shape (num_proposal, 4). The last
+                      dimension 4 represents [tl_x, tl_y, br_x, br_y].
+        """
+        num_imgs = len(img_metas)
+        bbox_roi_extractor = self.bbox_roi_extractor[stage]
+        bbox_head = self.bbox_head[stage]
+        bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
+                                        rois)
+        cls_score, bbox_pred, object_feats, attn_feats = bbox_head(
+            bbox_feats, object_feats)
+        proposal_list = self.bbox_head[stage].refine_bboxes(
+            rois,
+            rois.new_zeros(len(rois)),  # dummy arg
+            bbox_pred.view(-1, bbox_pred.size(-1)),
+            [rois.new_zeros(object_feats.size(1)) for _ in range(num_imgs)],
+            img_metas)
+        bbox_results = dict(
+            cls_score=cls_score,
+            decode_bbox_pred=torch.cat(proposal_list),
+            object_feats=object_feats,
+            attn_feats=attn_feats,
+            # detach then use it in label assign
+            detach_cls_score_list=[
+                cls_score[i].detach() for i in range(num_imgs)
+            ],
+            detach_proposal_list=[item.detach() for item in proposal_list])
+
+        return bbox_results
+
+    def _mask_forward(self, stage, x, rois, attn_feats):
+        """Run the ``stage``-th mask RoI extractor and mask head on ``rois``.
+
+        Returns a dict holding the head output under ``mask_pred``.
+        """
+        extractor, head = self.mask_roi_extractor[stage], self.mask_head[stage]
+        # Only the first ``num_inputs`` feature levels feed the extractor.
+        roi_feats = extractor(x[:extractor.num_inputs], rois)
+        # do not support caffe_c4 model anymore
+        pred = head(roi_feats, attn_feats)
+        return {'mask_pred': pred}
+
+    def _mask_forward_train(self, stage, x, attn_feats, sampling_results,
+                            gt_masks, rcnn_train_cfg):
+        """Run the mask head of one stage on the positive samples and
+        compute its training loss."""
+        pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
+        attn_feats = torch.cat([
+            feats[res.pos_inds]  # per-image features of positive samples
+            for (feats, res) in zip(attn_feats, sampling_results)
+        ])
+        mask_results = self._mask_forward(stage, x, pos_rois, attn_feats)
+
+        mask_targets = self.mask_head[stage].get_targets(
+            sampling_results, gt_masks, rcnn_train_cfg)
+
+        pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
+
+        loss_mask = self.mask_head[stage].loss(mask_results['mask_pred'],
+                                               mask_targets, pos_labels)
+        mask_results.update(loss_mask)  # merge the loss entries into the dict
+        return mask_results
+
+    def forward_train(self,
+                      x,
+                      proposal_boxes,
+                      proposal_features,
+                      img_metas,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None,
+                      imgs_whwh=None,
+                      gt_masks=None):
+        """Forward function in training stage.
+
+        Args:
+            x (list[Tensor]): list of multi-level img features.
+            proposal_boxes (Tensor): Decoded proposal bboxes, has shape
+                (batch_size, num_proposals, 4)
+            proposal_features (Tensor): Expanded proposal
+                features, has shape
+                (batch_size, num_proposals, proposal_feature_channel)
+            img_metas (list[dict]): list of image info dict where
+                each dict has: 'img_shape', 'scale_factor', 'flip',
+                and may also contain 'filename', 'ori_shape',
+                'pad_shape', and 'img_norm_cfg'. For details on the
+                values of these keys see
+                `mmdet/datasets/pipelines/formatting.py:Collect`.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss.
+            imgs_whwh (Tensor): Required although it defaults to None. Each
+                    entry is [img_width, img_height, img_width, img_height].
+                    NOTE(review): documented as (batch_size, 4); confirm shape.
+            gt_masks (None | Tensor) : true segmentation masks for each box
+                used if the architecture supports a segmentation task.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components of all stage.
+        """
+
+        num_imgs = len(img_metas)
+        num_proposals = proposal_boxes.size(1)
+        imgs_whwh = imgs_whwh.repeat(1, num_proposals, 1)
+        all_stage_bbox_results = []  # NOTE(review): filled but never read
+        proposal_list = [proposal_boxes[i] for i in range(len(proposal_boxes))]
+        object_feats = proposal_features
+        all_stage_loss = {}
+        for stage in range(self.num_stages):
+            rois = bbox2roi(proposal_list)
+            bbox_results = self._bbox_forward(stage, x, rois, object_feats,
+                                              img_metas)
+            all_stage_bbox_results.append(bbox_results)
+            if gt_bboxes_ignore is None:
+                # TODO support ignore
+                gt_bboxes_ignore = [None for _ in range(num_imgs)]
+            sampling_results = []
+            cls_pred_list = bbox_results['detach_cls_score_list']
+            proposal_list = bbox_results['detach_proposal_list']
+            for i in range(num_imgs):
+                normalize_bbox_ccwh = bbox_xyxy_to_cxcywh(proposal_list[i] /
+                                                          imgs_whwh[i])
+                assign_result = self.bbox_assigner[stage].assign(
+                    normalize_bbox_ccwh, cls_pred_list[i], gt_bboxes[i],
+                    gt_labels[i], img_metas[i])
+                sampling_result = self.bbox_sampler[stage].sample(
+                    assign_result, proposal_list[i], gt_bboxes[i])
+                sampling_results.append(sampling_result)
+            bbox_targets = self.bbox_head[stage].get_targets(
+                sampling_results, gt_bboxes, gt_labels, self.train_cfg[stage],
+                True)  # positional flag; presumably ``concat`` - confirm
+            cls_score = bbox_results['cls_score']
+            decode_bbox_pred = bbox_results['decode_bbox_pred']
+
+            single_stage_loss = self.bbox_head[stage].loss(
+                cls_score.view(-1, cls_score.size(-1)),
+                decode_bbox_pred.view(-1, 4),
+                *bbox_targets,
+                imgs_whwh=imgs_whwh)
+
+            if self.with_mask:
+                mask_results = self._mask_forward_train(
+                    stage, x, bbox_results['attn_feats'], sampling_results,
+                    gt_masks, self.train_cfg[stage])
+                single_stage_loss['loss_mask'] = mask_results['loss_mask']
+
+            for key, value in single_stage_loss.items():
+                all_stage_loss[f'stage{stage}_{key}'] = value * \
+                                    self.stage_loss_weights[stage]
+            object_feats = bbox_results['object_feats']  # feeds the next stage
+
+        return all_stage_loss
+
+    def simple_test(self,
+                    x,
+                    proposal_boxes,
+                    proposal_features,
+                    img_metas,
+                    imgs_whwh,
+                    rescale=False):
+        """Test without augmentation.
+
+        Args:
+            x (list[Tensor]): list of multi-level img features.
+            proposal_boxes (Tensor): Decoded proposal bboxes, has shape
+                (batch_size, num_proposals, 4)
+            proposal_features (Tensor): Expanded proposal
+                features, has shape
+                (batch_size, num_proposals, proposal_feature_channel)
+            img_metas (list[dict]): meta information of images.
+            imgs_whwh (Tensor): Tensor with shape (batch_size, 4),
+                    the dimension means
+                    [img_width,img_height, img_width, img_height].
+            rescale (bool): If True, return boxes in original image
+                space. Defaults to False.
+
+        Returns:
+            list[list[np.ndarray]] or list[tuple]: When no mask branch,
+            it is bbox results of each image and classes with type
+            `list[list[np.ndarray]]`. The outer list
+            corresponds to each image. The inner list
+            corresponds to each class. When the model has a mask branch,
+            it is a list[tuple] that contains bbox results and mask results.
+            The outer list corresponds to each image, and first element
+            of tuple is bbox results, second element is mask results.
+        """
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        # Decode initial proposals
+        num_imgs = len(img_metas)
+        proposal_list = [proposal_boxes[i] for i in range(num_imgs)]
+        ori_shapes = tuple(meta['ori_shape'] for meta in img_metas)
+        scale_factors = tuple(meta['scale_factor'] for meta in img_metas)
+
+        object_feats = proposal_features
+        if all([proposal.shape[0] == 0 for proposal in proposal_list]):
+            # There is no proposal in the whole batch
+            bbox_results = [[
+                np.zeros((0, 5), dtype=np.float32)
+                for _ in range(self.bbox_head[-1].num_classes)
+            ] for _ in range(num_imgs)]
+            return bbox_results
+
+        for stage in range(self.num_stages):
+            rois = bbox2roi(proposal_list)
+            bbox_results = self._bbox_forward(stage, x, rois, object_feats,
+                                              img_metas)
+            object_feats = bbox_results['object_feats']
+            cls_score = bbox_results['cls_score']
+            proposal_list = bbox_results['detach_proposal_list']
+
+        if self.with_mask:
+            rois = bbox2roi(proposal_list)
+            mask_results = self._mask_forward(stage, x, rois,
+                                              bbox_results['attn_feats'])
+            mask_results['mask_pred'] = mask_results['mask_pred'].reshape(
+                num_imgs, -1, *mask_results['mask_pred'].size()[1:])
+
+        num_classes = self.bbox_head[-1].num_classes
+        # Keep every image's top-k indices; the mask branch needs them later.
+        det_bboxes, det_labels, topk_inds = [], [], []
+
+        if self.bbox_head[-1].loss_cls.use_sigmoid:
+            cls_score = cls_score.sigmoid()
+        else:
+            cls_score = cls_score.softmax(-1)[..., :-1]
+
+        for img_id in range(num_imgs):
+            cls_score_per_img = cls_score[img_id]
+            scores_per_img, topk_indices = cls_score_per_img.flatten(
+                0, 1).topk(self.test_cfg.max_per_img, sorted=False)
+            topk_inds.append(topk_indices)
+            labels_per_img = topk_indices % num_classes
+            bbox_pred_per_img = proposal_list[img_id][topk_indices //
+                                                      num_classes]
+            if rescale:
+                scale_factor = img_metas[img_id]['scale_factor']
+                bbox_pred_per_img /= bbox_pred_per_img.new_tensor(scale_factor)
+            det_bboxes.append(
+                torch.cat([bbox_pred_per_img, scores_per_img[:, None]], dim=1))
+            det_labels.append(labels_per_img)
+
+        bbox_results = [
+            bbox2result(det_bboxes[i], det_labels[i], num_classes)
+            for i in range(num_imgs)
+        ]
+
+        if self.with_mask:
+            if rescale and not isinstance(scale_factors[0], float):
+                scale_factors = [
+                    torch.from_numpy(scale_factor).to(det_bboxes[0].device)
+                    for scale_factor in scale_factors
+                ]
+            _bboxes = [
+                det_bboxes[i][:, :4] *
+                scale_factors[i] if rescale else det_bboxes[i][:, :4]
+                for i in range(len(det_bboxes))
+            ]
+            segm_results = []
+            mask_pred = mask_results['mask_pred']
+            for img_id in range(num_imgs):
+                mask_pred_per_img = mask_pred[img_id].flatten(
+                    0, 1)[topk_inds[img_id]]
+                mask_pred_per_img = mask_pred_per_img[:, None, ...].repeat(
+                    1, num_classes, 1, 1)
+                segm_result = self.mask_head[-1].get_seg_masks(
+                    mask_pred_per_img, _bboxes[img_id], det_labels[img_id],
+                    self.test_cfg, ori_shapes[img_id], scale_factors[img_id],
+                    rescale)
+                segm_results.append(segm_result)
+
+        if self.with_mask:
+            results = list(zip(bbox_results, segm_results))
+        else:
+            results = bbox_results
+
+        return results
+
+    def aug_test(self, features, proposal_list, img_metas, rescale=False):
+        raise NotImplementedError(  # test-time augmentation is unsupported
+            'Sparse R-CNN and QueryInst does not support `aug_test`')
+
+    def forward_dummy(self, x, proposal_boxes, proposal_features, img_metas):
+        """Dummy forward function used when computing FLOPs."""
+        all_stage_bbox_results = []
+        proposal_list = [proposal_boxes[i] for i in range(len(proposal_boxes))]
+        object_feats = proposal_features
+        if self.with_bbox:
+            for stage in range(self.num_stages):
+                rois = bbox2roi(proposal_list)
+                bbox_results = self._bbox_forward(stage, x, rois, object_feats,
+                                                  img_metas)
+                # one tuple per stage; mask results are appended below if any
+                all_stage_bbox_results.append((bbox_results, ))
+                proposal_list = bbox_results['detach_proposal_list']
+                object_feats = bbox_results['object_feats']
+                # the mask head runs on the boxes produced by this stage
+                if self.with_mask:
+                    rois = bbox2roi(proposal_list)
+                    mask_results = self._mask_forward(
+                        stage, x, rois, bbox_results['attn_feats'])
+                    all_stage_bbox_results[-1] += (mask_results, )
+        return all_stage_bbox_results
diff --git a/mmdet/models/roi_heads/standard_roi_head.py b/mmdet/models/roi_heads/standard_roi_head.py
new file mode 100755
index 0000000..3fdd82a
--- /dev/null
+++ b/mmdet/models/roi_heads/standard_roi_head.py
@@ -0,0 +1,397 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet.core import bbox2result, bbox2roi, build_assigner, build_sampler
+from ..builder import HEADS, build_head, build_roi_extractor
+from .base_roi_head import BaseRoIHead
+from .test_mixins import BBoxTestMixin, MaskTestMixin
+
+
+@HEADS.register_module()
+class StandardRoIHead(BaseRoIHead, BBoxTestMixin, MaskTestMixin):
+    """Simplest base roi head including one bbox head and one mask head."""
+
+    def init_assigner_sampler(self):
+        """Build the bbox assigner and sampler when a train config is set."""
+        self.bbox_assigner = self.bbox_sampler = None
+        cfg = self.train_cfg
+        if not cfg:  # no train config: leave both as ``None``
+            return
+        self.bbox_assigner = build_assigner(cfg.assigner)
+        self.bbox_sampler = build_sampler(cfg.sampler, context=self)
+
+    def init_bbox_head(self, bbox_roi_extractor, bbox_head):
+        """Build the bbox RoI extractor and bbox head from their configs."""
+        self.bbox_roi_extractor = build_roi_extractor(bbox_roi_extractor)
+        self.bbox_head = build_head(bbox_head)
+
+    def init_mask_head(self, mask_roi_extractor, mask_head):
+        """Build the mask head and pick the RoI extractor it will use."""
+        # Without a dedicated config the bbox RoI extractor is reused.
+        self.share_roi_extractor = mask_roi_extractor is None
+        if self.share_roi_extractor:
+            self.mask_roi_extractor = self.bbox_roi_extractor
+        else:
+            self.mask_roi_extractor = build_roi_extractor(mask_roi_extractor)
+        self.mask_head = build_head(mask_head)
+
+    def forward_dummy(self, x, proposals):
+        """Dummy forward returning a tuple of raw head outputs."""
+        # bbox head
+        outs = ()
+        rois = bbox2roi([proposals])
+        if self.with_bbox:
+            bbox_results = self._bbox_forward(x, rois)
+            outs = outs + (bbox_results['cls_score'],
+                           bbox_results['bbox_pred'])
+        # mask head
+        if self.with_mask:
+            mask_rois = rois[:100]  # mask head only sees the first 100 RoIs
+            mask_results = self._mask_forward(x, mask_rois)
+            outs = outs + (mask_results['mask_pred'], )
+        return outs
+
+    def forward_train(self,
+                      x,
+                      img_metas,
+                      proposal_list,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None,
+                      gt_masks=None,
+                      **kwargs):
+        """
+        Args:
+            x (list[Tensor]): list of multi-level img features.
+            img_metas (list[dict]): list of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                `mmdet/datasets/pipelines/formatting.py:Collect`.
+            proposal_list (list[Tensor]): list of region proposals.
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box
+            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+                boxes can be ignored when computing the loss.
+            gt_masks (None | Tensor) : true segmentation masks for each box
+                used if the architecture supports a segmentation task.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        # assign gts and sample proposals
+        if self.with_bbox or self.with_mask:
+            num_imgs = len(img_metas)
+            if gt_bboxes_ignore is None:
+                gt_bboxes_ignore = [None for _ in range(num_imgs)]
+            sampling_results = []
+            for i in range(num_imgs):
+                assign_result = self.bbox_assigner.assign(
+                    proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i],
+                    gt_labels[i])
+                sampling_result = self.bbox_sampler.sample(
+                    assign_result,
+                    proposal_list[i],
+                    gt_bboxes[i],
+                    gt_labels[i],
+                    feats=[lvl_feat[i][None] for lvl_feat in x])
+                sampling_results.append(sampling_result)
+
+        losses = dict()
+        # bbox head forward and loss
+        if self.with_bbox:
+            bbox_results = self._bbox_forward_train(x, sampling_results,
+                                                    gt_bboxes, gt_labels,
+                                                    img_metas)
+            losses.update(bbox_results['loss_bbox'])
+
+        # mask head forward and loss (reuses bbox_results from the bbox branch)
+        if self.with_mask:
+            mask_results = self._mask_forward_train(x, sampling_results,
+                                                    bbox_results['bbox_feats'],
+                                                    gt_masks, img_metas)
+            losses.update(mask_results['loss_mask'])
+
+        return losses
+
+    def _bbox_forward(self, x, rois):
+        """Extract RoI features and run the bbox head (train and test)."""
+        # TODO: a more flexible way to decide which feature maps to use
+        bbox_feats = self.bbox_roi_extractor(
+            x[:self.bbox_roi_extractor.num_inputs], rois)
+        if self.with_shared_head:
+            bbox_feats = self.shared_head(bbox_feats)
+        cls_score, bbox_pred = self.bbox_head(bbox_feats)
+        # bbox_feats is returned too so the mask branch can reuse it
+        bbox_results = dict(
+            cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats)
+        return bbox_results
+
+    def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels,
+                            img_metas):
+        """Run forward function and calculate loss for box head in training."""
+        rois = bbox2roi([res.bboxes for res in sampling_results])
+        bbox_results = self._bbox_forward(x, rois)
+        # NOTE: ``img_metas`` is accepted but not used in this method.
+        bbox_targets = self.bbox_head.get_targets(sampling_results, gt_bboxes,
+                                                  gt_labels, self.train_cfg)
+        loss_bbox = self.bbox_head.loss(bbox_results['cls_score'],
+                                        bbox_results['bbox_pred'], rois,
+                                        *bbox_targets)
+
+        bbox_results.update(loss_bbox=loss_bbox)
+        return bbox_results
+
+    def _mask_forward_train(self, x, sampling_results, bbox_feats, gt_masks,
+                            img_metas):
+        """Run forward function and calculate loss for mask head in
+        training."""
+        if not self.share_roi_extractor:
+            pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
+            mask_results = self._mask_forward(x, pos_rois)
+        else:
+            pos_inds = []
+            device = bbox_feats.device
+            for res in sampling_results:
+                pos_inds.append(
+                    torch.ones(
+                        res.pos_bboxes.shape[0],
+                        device=device,
+                        dtype=torch.bool))  # uint8 mask indexing is deprecated
+                pos_inds.append(
+                    torch.zeros(
+                        res.neg_bboxes.shape[0],
+                        device=device,
+                        dtype=torch.bool))
+            pos_inds = torch.cat(pos_inds)
+            # row mask for bbox_feats; assumes positives come first per image
+            mask_results = self._mask_forward(
+                x, pos_inds=pos_inds, bbox_feats=bbox_feats)
+
+        mask_targets = self.mask_head.get_targets(sampling_results, gt_masks,
+                                                  self.train_cfg)
+        pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
+        loss_mask = self.mask_head.loss(mask_results['mask_pred'],
+                                        mask_targets, pos_labels)
+
+        mask_results.update(loss_mask=loss_mask, mask_targets=mask_targets)
+        return mask_results
+
+    def _mask_forward(self, x, rois=None, pos_inds=None, bbox_feats=None):
+        """Run the mask head on freshly extracted or reused bbox features."""
+        assert ((rois is not None) ^  # exactly one input mode
+                (pos_inds is not None and bbox_feats is not None))
+        if rois is not None:
+            mask_feats = self.mask_roi_extractor(
+                x[:self.mask_roi_extractor.num_inputs], rois)
+            if self.with_shared_head:
+                mask_feats = self.shared_head(mask_feats)
+        else:
+            assert bbox_feats is not None
+            mask_feats = bbox_feats[pos_inds]  # rows selected by ``pos_inds``
+
+        mask_pred = self.mask_head(mask_feats)
+        mask_results = dict(mask_pred=mask_pred, mask_feats=mask_feats)
+        return mask_results
+
+    async def async_simple_test(self,
+                                x,
+                                proposal_list,
+                                img_metas,
+                                proposals=None,
+                                rescale=False):
+        """Async test without augmentation."""
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        # NOTE: ``proposals`` is accepted but not used in this method.
+        det_bboxes, det_labels = await self.async_test_bboxes(
+            x, img_metas, proposal_list, self.test_cfg, rescale=rescale)
+        bbox_results = bbox2result(det_bboxes, det_labels,
+                                   self.bbox_head.num_classes)
+        if not self.with_mask:
+            return bbox_results
+        else:
+            segm_results = await self.async_test_mask(
+                x,
+                img_metas,
+                det_bboxes,
+                det_labels,
+                rescale=rescale,
+                mask_test_cfg=self.test_cfg.get('mask'))
+            return bbox_results, segm_results
+
+    def simple_test(self,
+                    x,
+                    proposal_list,
+                    img_metas,
+                    proposals=None,
+                    rescale=False):
+        """Run single-scale inference for a batch of images.
+
+        Boxes are predicted first; when the head has a mask branch, masks
+        are then predicted for the detected boxes.
+
+        Args:
+            x (tuple[Tensor]): Features from upstream network. Each
+                has shape (batch_size, c, h, w).
+            proposal_list (list(Tensor)): Proposals from rpn head.
+                Each has shape (num_proposals, 5), last dimension
+                5 represent (x1, y1, x2, y2, score).
+            img_metas (list[dict]): Meta information of images.
+            proposals: Not used by this method.
+            rescale (bool): Whether to rescale the results to
+                the original image. Default: False.
+
+        Returns:
+            list[list[np.ndarray]] or list[tuple]: Without a mask branch,
+            one entry per image holding the per-class bbox arrays. With a
+            mask branch, one ``(bbox_results, segm_results)`` tuple per
+            image.
+        """
+        assert self.with_bbox, 'Bbox head must be implemented.'
+
+        det_bboxes, det_labels = self.simple_test_bboxes(
+            x, img_metas, proposal_list, self.test_cfg, rescale=rescale)
+
+        # One bbox2result call per image in the batch.
+        bbox_results = [
+            bbox2result(boxes, det_labels[idx], self.bbox_head.num_classes)
+            for idx, boxes in enumerate(det_bboxes)
+        ]
+
+        if self.with_mask:
+            segm_results = self.simple_test_mask(
+                x, img_metas, det_bboxes, det_labels, rescale=rescale)
+            # Pair each image's bbox results with its mask results.
+            return list(zip(bbox_results, segm_results))
+        return bbox_results
+
+    def aug_test(self, x, proposal_list, img_metas, rescale=False):
+        """Run inference with test-time augmentation.
+
+        With ``rescale=False`` the reported boxes are multiplied by the
+        ``scale_factor`` of the first augmented image; otherwise they stay
+        in the scale produced by ``aug_test_bboxes``.
+        """
+        det_bboxes, det_labels = self.aug_test_bboxes(
+            x, img_metas, proposal_list, self.test_cfg)
+
+        out_bboxes = det_bboxes
+        if not rescale:
+            # Work on a copy: det_bboxes itself is reused for the masks.
+            factor = det_bboxes.new_tensor(img_metas[0][0]['scale_factor'])
+            out_bboxes = det_bboxes.clone()
+            out_bboxes[:, :4] *= factor
+        bbox_results = bbox2result(out_bboxes, det_labels,
+                                   self.bbox_head.num_classes)
+
+        # det_bboxes always keep the original scale
+        if not self.with_mask:
+            return [bbox_results]
+        segm_results = self.aug_test_mask(x, img_metas, det_bboxes,
+                                          det_labels)
+        return [(bbox_results, segm_results)]
+
+    def onnx_export(self, x, proposals, img_metas, rescale=False):
+        """Run the ONNX-export bbox path and, if present, the mask path."""
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        boxes, labels = self.bbox_onnx_export(
+            x, img_metas, proposals, self.test_cfg, rescale=rescale)
+        outputs = (boxes, labels)
+        if self.with_mask:
+            # Masks are predicted for the boxes returned above.
+            masks = self.mask_onnx_export(
+                x, img_metas, boxes, labels, rescale=rescale)
+            outputs += (masks, )
+        return outputs
+
+    def mask_onnx_export(self, x, img_metas, det_bboxes, det_labels, **kwargs):
+        """Export mask branch to onnx which supports batch inference.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            img_metas (list[dict]): Image meta info; the first entry must
+                provide ``img_shape_for_onnx``.
+            det_bboxes (Tensor): Bboxes and corresponding scores.
+                has shape [N, num_bboxes, 5].
+            det_labels (Tensor): class labels of
+                shape [N, num_bboxes].
+            **kwargs: Ignored by this method.
+
+        Returns:
+            Tensor: The segmentation results of shape [N, num_bboxes,
+                image_height, image_width].
+
+        Raises:
+            RuntimeError: If no image in the batch has any detection.
+        """
+        if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes):
+            raise RuntimeError('[ONNX Error] Can not record MaskHead '
+                               'as it has not been executed this time')
+        batch_size, num_det = det_bboxes.size(0), det_bboxes.size(1)
+
+        # Build flat RoIs: each box's batch index followed by its coords.
+        boxes = det_bboxes[..., :4]
+        batch_index = torch.arange(batch_size, device=boxes.device)
+        batch_index = batch_index.float().view(-1, 1, 1)
+        batch_index = batch_index.expand(batch_size, num_det, 1)
+        mask_rois = torch.cat([batch_index, boxes], dim=-1).view(-1, 5)
+        mask_pred = self._mask_forward(x, mask_rois)['mask_pred']
+
+        # Flatten the batch dimension for the mask head, then restore it.
+        max_shape = img_metas[0]['img_shape_for_onnx']
+        segm_results = self.mask_head.onnx_export(
+            mask_pred, boxes.reshape(-1, 4), det_labels.reshape(-1),
+            self.test_cfg, max_shape)
+        segm_results = segm_results.reshape(batch_size, num_det, max_shape[0],
+                                            max_shape[1])
+        return segm_results
+
+    def bbox_onnx_export(self, x, img_metas, proposals, rcnn_test_cfg,
+                         **kwargs):
+        """Export bbox branch to onnx which supports batch inference.
+
+        Proposals are flattened across the batch for ``_bbox_forward`` and
+        its outputs are reshaped back to a batched layout afterwards.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            img_metas (list[dict]): Image meta info. Must hold exactly one
+                entry, which provides ``img_shape_for_onnx``.
+            proposals (Tensor): Region proposals with
+                batch dimension, has shape [N, num_bboxes, 5].
+            rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN.
+            **kwargs: Ignored by this method.
+
+        Returns:
+            tuple[Tensor, Tensor]: bboxes of shape [N, num_bboxes, 5]
+                and class labels of shape [N, num_bboxes].
+        """
+        # get origin input shape to support onnx dynamic input shape
+        assert len(img_metas) == 1, (
+            'Only support one input image while in exporting to ONNX')
+        img_shapes = img_metas[0]['img_shape_for_onnx']
+
+        # Prepend each proposal's batch index to its four box coordinates.
+        num_imgs, num_per_img = proposals.size(0), proposals.size(1)
+        batch_index = torch.arange(num_imgs, device=proposals.device)
+        batch_index = batch_index.float().view(-1, 1, 1)
+        batch_index = batch_index.expand(num_imgs, num_per_img, 1)
+        rois = torch.cat([batch_index, proposals[..., :4]], dim=-1)
+
+        # Run `_bbox_forward` on the flattened (batch * proposals) RoIs.
+        flat_rois = rois.view(-1, 5)
+        bbox_results = self._bbox_forward(x, flat_rois)
+        cls_score = bbox_results['cls_score']
+        bbox_pred = bbox_results['bbox_pred']
+
+        # Recover the batch dimension
+        rois = flat_rois.reshape(num_imgs, num_per_img, flat_rois.size(-1))
+        cls_score = cls_score.reshape(num_imgs, num_per_img,
+                                      cls_score.size(-1))
+        bbox_pred = bbox_pred.reshape(num_imgs, num_per_img,
+                                      bbox_pred.size(-1))
+
+        det_bboxes, det_labels = self.bbox_head.onnx_export(
+            rois, cls_score, bbox_pred, img_shapes, cfg=rcnn_test_cfg)
+        return det_bboxes, det_labels
diff --git a/mmdet/models/roi_heads/test_mixins.py b/mmdet/models/roi_heads/test_mixins.py
new file mode 100755
index 0000000..ae6e79a
--- /dev/null
+++ b/mmdet/models/roi_heads/test_mixins.py
@@ -0,0 +1,311 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import sys
+import warnings
+
+import numpy as np
+import torch
+
+from mmdet.core import (bbox2roi, bbox_mapping, merge_aug_bboxes,
+                        merge_aug_masks, multiclass_nms)
+
+if sys.version_info >= (3, 7):
+    from mmdet.utils.contextmanagers import completed
+
+
+class BBoxTestMixin:
+    """Bbox test-time helpers; the host class provides the heads used."""
+
+    if sys.version_info >= (3, 7):
+        async def async_test_bboxes(self,
+                                    x,
+                                    img_metas,
+                                    proposals,
+                                    rcnn_test_cfg,
+                                    rescale=False,
+                                    **kwargs):
+            """Asynchronously test the box head without augmentation."""
+            rois = bbox2roi(proposals)
+            roi_feats = self.bbox_roi_extractor(
+                x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
+            if self.with_shared_head:
+                roi_feats = self.shared_head(roi_feats)
+            sleep_interval = rcnn_test_cfg.get('async_sleep_interval', 0.017)
+
+            async with completed(
+                    __name__, 'bbox_head_forward',
+                    sleep_interval=sleep_interval):
+                cls_score, bbox_pred = self.bbox_head(roi_feats)
+
+            img_shape = img_metas[0]['img_shape']
+            scale_factor = img_metas[0]['scale_factor']
+            det_bboxes, det_labels = self.bbox_head.get_bboxes(
+                rois,
+                cls_score,
+                bbox_pred,
+                img_shape,
+                scale_factor,
+                rescale=rescale,
+                cfg=rcnn_test_cfg)
+            return det_bboxes, det_labels
+
+    def simple_test_bboxes(self,
+                           x,
+                           img_metas,
+                           proposals,
+                           rcnn_test_cfg,
+                           rescale=False):
+        """Test only det bboxes without augmentation.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            img_metas (list[dict]): Image meta info.
+            proposals (List[Tensor]): Region proposals.
+            rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+
+        Returns:
+            tuple[list[Tensor], list[Tensor]]: Per-image boxes of shape
+                (num_boxes, 5), last dimension being (tl_x, tl_y, br_x,
+                br_y, score), and per-image labels of shape (num_boxes, ).
+                Both lists have batch_size entries. With no proposals and
+                ``rcnn_test_cfg=None`` the empty placeholders are instead
+                shaped (0, 4) and (0, fc_cls.out_features).
+        """
+
+        rois = bbox2roi(proposals)
+
+        if rois.shape[0] == 0:
+            batch_size = len(proposals)
+            det_bbox = rois.new_zeros(0, 5)
+            det_label = rois.new_zeros((0, ), dtype=torch.long)
+            if rcnn_test_cfg is None:
+                det_bbox = det_bbox[:, :4]
+                det_label = rois.new_zeros(
+                    (0, self.bbox_head.fc_cls.out_features))
+            # No proposal in the whole batch; all images share these tensors
+            return [det_bbox] * batch_size, [det_label] * batch_size
+
+        bbox_results = self._bbox_forward(x, rois)
+        img_shapes = tuple(meta['img_shape'] for meta in img_metas)
+        scale_factors = tuple(meta['scale_factor'] for meta in img_metas)
+
+        # split batch bbox prediction back to each image
+        cls_score = bbox_results['cls_score']
+        bbox_pred = bbox_results['bbox_pred']
+        num_proposals_per_img = tuple(len(p) for p in proposals)
+        rois = rois.split(num_proposals_per_img, 0)
+        cls_score = cls_score.split(num_proposals_per_img, 0)
+
+        # bbox_pred may be None (original note: detectors with with_reg False)
+        if bbox_pred is not None:
+            # TODO move this to a sabl_roi_head
+            # the bbox prediction of some detectors like SABL is not Tensor
+            if isinstance(bbox_pred, torch.Tensor):
+                bbox_pred = bbox_pred.split(num_proposals_per_img, 0)
+            else:
+                bbox_pred = self.bbox_head.bbox_pred_split(
+                    bbox_pred, num_proposals_per_img)
+        else:
+            bbox_pred = (None, ) * len(proposals)
+
+        # apply bbox post-processing to each image individually
+        det_bboxes = []
+        det_labels = []
+        for i in range(len(proposals)):
+            if rois[i].shape[0] == 0:
+                # No proposal in this image: emit empty placeholders
+                det_bbox = rois[i].new_zeros(0, 5)
+                det_label = rois[i].new_zeros((0, ), dtype=torch.long)
+                if rcnn_test_cfg is None:
+                    det_bbox = det_bbox[:, :4]
+                    det_label = rois[i].new_zeros(
+                        (0, self.bbox_head.fc_cls.out_features))
+
+            else:
+                det_bbox, det_label = self.bbox_head.get_bboxes(
+                    rois[i],
+                    cls_score[i],
+                    bbox_pred[i],
+                    img_shapes[i],
+                    scale_factors[i],
+                    rescale=rescale,
+                    cfg=rcnn_test_cfg)
+            det_bboxes.append(det_bbox)
+            det_labels.append(det_label)
+        return det_bboxes, det_labels
+
+    def aug_test_bboxes(self, feats, img_metas, proposal_list, rcnn_test_cfg):
+        """Test det bboxes with test time augmentation."""
+        aug_bboxes = []
+        aug_scores = []
+        for x, img_meta in zip(feats, img_metas):
+            # only one image in the batch
+            img_shape = img_meta[0]['img_shape']
+            scale_factor = img_meta[0]['scale_factor']
+            flip = img_meta[0]['flip']
+            flip_direction = img_meta[0]['flip_direction']
+            # TODO more flexible
+            proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
+                                     scale_factor, flip, flip_direction)
+            rois = bbox2roi([proposals])
+            bbox_results = self._bbox_forward(x, rois)
+            bboxes, scores = self.bbox_head.get_bboxes(
+                rois,
+                bbox_results['cls_score'],
+                bbox_results['bbox_pred'],
+                img_shape,
+                scale_factor,
+                rescale=False,
+                cfg=None)
+            aug_bboxes.append(bboxes)
+            aug_scores.append(scores)
+        # after merging, bboxes will be rescaled to the original image size
+        merged_bboxes, merged_scores = merge_aug_bboxes(
+            aug_bboxes, aug_scores, img_metas, rcnn_test_cfg)
+        if merged_bboxes.shape[0] == 0:
+            # Nothing to run NMS on: return empty detections
+            det_bboxes = merged_bboxes.new_zeros(0, 5)
+            det_labels = merged_bboxes.new_zeros((0, ), dtype=torch.long)
+        else:
+            det_bboxes, det_labels = multiclass_nms(merged_bboxes,
+                                                    merged_scores,
+                                                    rcnn_test_cfg.score_thr,
+                                                    rcnn_test_cfg.nms,
+                                                    rcnn_test_cfg.max_per_img)
+        return det_bboxes, det_labels
+
+
+class MaskTestMixin:
+    """Mask test-time helpers; the host class provides the heads used."""
+
+    if sys.version_info >= (3, 7):
+        async def async_test_mask(self,
+                                  x,
+                                  img_metas,
+                                  det_bboxes,
+                                  det_labels,
+                                  rescale=False,
+                                  mask_test_cfg=None):
+            """Asynchronously test the mask head without augmentation."""
+            # image shape of the first image in the batch (only one)
+            ori_shape = img_metas[0]['ori_shape']
+            scale_factor = img_metas[0]['scale_factor']
+            if det_bboxes.shape[0] == 0:
+                segm_result = [[] for _ in range(self.mask_head.num_classes)]
+            else:
+                if rescale and not isinstance(scale_factor,
+                                              (float, torch.Tensor)):
+                    scale_factor = det_bboxes.new_tensor(scale_factor)
+                _bboxes = (
+                    det_bboxes[:, :4] *
+                    scale_factor if rescale else det_bboxes)
+                mask_rois = bbox2roi([_bboxes])
+                mask_feats = self.mask_roi_extractor(
+                    x[:len(self.mask_roi_extractor.featmap_strides)],
+                    mask_rois)
+
+                if self.with_shared_head:
+                    mask_feats = self.shared_head(mask_feats)
+                if mask_test_cfg and mask_test_cfg.get('async_sleep_interval'):
+                    sleep_interval = mask_test_cfg['async_sleep_interval']
+                else:
+                    sleep_interval = 0.035
+                async with completed(
+                        __name__,
+                        'mask_head_forward',
+                        sleep_interval=sleep_interval):
+                    mask_pred = self.mask_head(mask_feats)
+                segm_result = self.mask_head.get_seg_masks(
+                    mask_pred, _bboxes, det_labels, self.test_cfg, ori_shape,
+                    scale_factor, rescale)
+            return segm_result
+
+    def simple_test_mask(self,
+                         x,
+                         img_metas,
+                         det_bboxes,
+                         det_labels,
+                         rescale=False):
+        """Predict masks for each image's detected boxes (no augmentation)."""
+        ori_shapes = tuple(meta['ori_shape'] for meta in img_metas)
+        scale_factors = tuple(meta['scale_factor'] for meta in img_metas)
+
+        if isinstance(scale_factors[0], float):
+            warnings.warn(
+                'Scale factor in img_metas should be a '
+                'ndarray with shape (4,) '
+                'arrange as (factor_w, factor_h, factor_w, factor_h), '
+                'The scale_factor with float type has been deprecated. ')
+            # One (4,) array per image keeps scale_factors indexable by image.
+            scale_factors = tuple(
+                np.array([factor] * 4, dtype=np.float32)
+                for factor in scale_factors)
+
+        num_imgs = len(det_bboxes)
+        if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes):
+            return [[[] for _ in range(self.mask_head.num_classes)]
+                    for _ in range(num_imgs)]
+        # if det_bboxes is rescaled to the original image size, we need to
+        # rescale it back to the testing scale to obtain RoIs.
+        if rescale:
+            scale_factors = [
+                torch.from_numpy(scale_factor).to(det_bboxes[0].device)
+                for scale_factor in scale_factors
+            ]
+        _bboxes = [
+            det_bboxes[i][:, :4] *
+            scale_factors[i] if rescale else det_bboxes[i][:, :4]
+            for i in range(len(det_bboxes))
+        ]
+        mask_rois = bbox2roi(_bboxes)
+        mask_pred = self._mask_forward(x, mask_rois)['mask_pred']
+        # split batch mask prediction back to each image
+        num_mask_roi_per_img = [len(det_bbox) for det_bbox in det_bboxes]
+        mask_preds = mask_pred.split(num_mask_roi_per_img, 0)
+
+        # apply mask post-processing to each image individually
+        segm_results = []
+        for i in range(num_imgs):
+            if det_bboxes[i].shape[0] == 0:
+                segm_results.append(
+                    [[] for _ in range(self.mask_head.num_classes)])
+            else:
+                segm_result = self.mask_head.get_seg_masks(
+                    mask_preds[i], _bboxes[i], det_labels[i],
+                    self.test_cfg, ori_shapes[i], scale_factors[i],
+                    rescale)
+                segm_results.append(segm_result)
+        return segm_results
+
+    def aug_test_mask(self, feats, img_metas, det_bboxes, det_labels):
+        """Test the mask head with test time augmentation."""
+        if det_bboxes.shape[0] == 0:
+            segm_result = [[] for _ in range(self.mask_head.num_classes)]
+        else:
+            aug_masks = []
+            for x, img_meta in zip(feats, img_metas):
+                img_shape = img_meta[0]['img_shape']
+                scale_factor = img_meta[0]['scale_factor']
+                flip = img_meta[0]['flip']
+                flip_direction = img_meta[0]['flip_direction']
+                _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape,
+                                       scale_factor, flip, flip_direction)
+                mask_rois = bbox2roi([_bboxes])
+                mask_results = self._mask_forward(x, mask_rois)
+                # convert to numpy array to save memory
+                aug_masks.append(
+                    mask_results['mask_pred'].sigmoid().cpu().numpy())
+            merged_masks = merge_aug_masks(aug_masks, img_metas, self.test_cfg)
+
+            ori_shape = img_metas[0][0]['ori_shape']
+            scale_factor = det_bboxes.new_ones(4)
+            segm_result = self.mask_head.get_seg_masks(
+                merged_masks,
+                det_bboxes,
+                det_labels,
+                self.test_cfg,
+                ori_shape,
+                scale_factor=scale_factor,
+                rescale=False)
+        return segm_result
diff --git a/mmdet/models/roi_heads/trident_roi_head.py b/mmdet/models/roi_heads/trident_roi_head.py
new file mode 100755
index 0000000..0975879
--- /dev/null
+++ b/mmdet/models/roi_heads/trident_roi_head.py
@@ -0,0 +1,120 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.ops import batched_nms
+
+from mmdet.core import (bbox2result, bbox2roi, bbox_mapping, merge_aug_bboxes,
+                        multiclass_nms)
+from mmdet.models.roi_heads.standard_roi_head import StandardRoIHead
+from ..builder import HEADS
+
+
+@HEADS.register_module()
+class TridentRoIHead(StandardRoIHead):
+    """Trident roi head.
+
+    Args:
+        num_branch (int): Number of branches in TridentNet.
+        test_branch_idx (int): In inference, all branches will be used
+            if `test_branch_idx==-1`, otherwise only branch with index
+            `test_branch_idx` will be used.
+    """
+
+    def __init__(self, num_branch, test_branch_idx, **kwargs):
+        self.num_branch = num_branch
+        self.test_branch_idx = test_branch_idx
+        super(TridentRoIHead, self).__init__(**kwargs)
+
+    def merge_trident_bboxes(self, trident_det_bboxes, trident_det_labels):
+        """Merge per-branch predictions via batched NMS and max_per_img cap."""
+        if trident_det_bboxes.numel() == 0:
+            det_bboxes = trident_det_bboxes.new_zeros((0, 5))
+            det_labels = trident_det_bboxes.new_zeros((0, ), dtype=torch.long)
+        else:
+            nms_bboxes = trident_det_bboxes[:, :4]
+            nms_scores = trident_det_bboxes[:, 4].contiguous()
+            nms_inds = trident_det_labels
+            nms_cfg = self.test_cfg['nms']
+            det_bboxes, keep = batched_nms(nms_bboxes, nms_scores, nms_inds,
+                                           nms_cfg)
+            det_labels = trident_det_labels[keep]
+            if self.test_cfg['max_per_img'] > 0:
+                det_labels = det_labels[:self.test_cfg['max_per_img']]
+                det_bboxes = det_bboxes[:self.test_cfg['max_per_img']]
+
+        return det_bboxes, det_labels
+
+    def simple_test(self,
+                    x,
+                    proposal_list,
+                    img_metas,
+                    proposals=None,
+                    rescale=False):
+        """Test without augmentation as follows:
+
+        1. Compute prediction bbox and label per branch.
+        2. Merge every ``num_branch`` consecutive predictions with
+           ``merge_trident_bboxes``. Only bbox results are returned;
+           ``proposals`` is not used.
+        """
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        det_bboxes_list, det_labels_list = self.simple_test_bboxes(
+            x, img_metas, proposal_list, self.test_cfg, rescale=rescale)
+        num_branch = self.num_branch if self.test_branch_idx == -1 else 1
+        for _ in range(len(det_bboxes_list)):
+            if det_bboxes_list[_].shape[0] == 0:
+                det_bboxes_list[_] = det_bboxes_list[_].new_empty((0, 5))
+        det_bboxes, det_labels = [], []
+        for i in range(len(img_metas) // num_branch):
+            det_result = self.merge_trident_bboxes(
+                torch.cat(det_bboxes_list[i * num_branch:(i + 1) *
+                                          num_branch]),
+                torch.cat(det_labels_list[i * num_branch:(i + 1) *
+                                          num_branch]))
+            det_bboxes.append(det_result[0])
+            det_labels.append(det_result[1])
+
+        bbox_results = [
+            bbox2result(det_bboxes[i], det_labels[i],
+                        self.bbox_head.num_classes)
+            for i in range(len(det_bboxes))
+        ]
+        return bbox_results
+
+    def aug_test_bboxes(self, feats, img_metas, proposal_list, rcnn_test_cfg):
+        """Test det bboxes with test time augmentation."""
+        aug_bboxes = []
+        aug_scores = []
+        for x, img_meta in zip(feats, img_metas):
+            # only one image in the batch
+            img_shape = img_meta[0]['img_shape']
+            scale_factor = img_meta[0]['scale_factor']
+            flip = img_meta[0]['flip']
+            flip_direction = img_meta[0]['flip_direction']
+            # NOTE(review): branch_idx unused; each pass reads proposal_list[0]
+            trident_bboxes, trident_scores = [], []
+            for branch_idx in range(len(proposal_list)):
+                proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
+                                         scale_factor, flip, flip_direction)
+                rois = bbox2roi([proposals])
+                bbox_results = self._bbox_forward(x, rois)
+                bboxes, scores = self.bbox_head.get_bboxes(
+                    rois,
+                    bbox_results['cls_score'],
+                    bbox_results['bbox_pred'],
+                    img_shape,
+                    scale_factor,
+                    rescale=False,
+                    cfg=None)
+                trident_bboxes.append(bboxes)
+                trident_scores.append(scores)
+
+            aug_bboxes.append(torch.cat(trident_bboxes, 0))
+            aug_scores.append(torch.cat(trident_scores, 0))
+        # after merging, bboxes will be rescaled to the original image size
+        merged_bboxes, merged_scores = merge_aug_bboxes(
+            aug_bboxes, aug_scores, img_metas, rcnn_test_cfg)
+        det_bboxes, det_labels = multiclass_nms(merged_bboxes, merged_scores,
+                                                rcnn_test_cfg.score_thr,
+                                                rcnn_test_cfg.nms,
+                                                rcnn_test_cfg.max_per_img)
+        return det_bboxes, det_labels
diff --git a/mmdet/models/seg_heads/__init__.py b/mmdet/models/seg_heads/__init__.py
new file mode 100755
index 0000000..b489a90
--- /dev/null
+++ b/mmdet/models/seg_heads/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .panoptic_fpn_head import PanopticFPNHead  # noqa: F401,F403
+from .panoptic_fusion_heads import *  # noqa: F401,F403
diff --git a/mmdet/models/seg_heads/base_semantic_head.py b/mmdet/models/seg_heads/base_semantic_head.py
new file mode 100755
index 0000000..2b6ca14
--- /dev/null
+++ b/mmdet/models/seg_heads/base_semantic_head.py
@@ -0,0 +1,86 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+import torch.nn.functional as F
+from mmcv.runner import BaseModule, force_fp32
+
+from ..builder import build_loss
+from ..utils import interpolate_as
+
+
+class BaseSemanticHead(BaseModule, metaclass=ABCMeta):
+    """Abstract base class for semantic segmentation heads.
+
+    Args:
+        num_classes (int): the number of classes.
+        init_cfg (dict): the initialization config.
+        loss_seg (dict): the loss of the semantic head.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 init_cfg=None,
+                 loss_seg=dict(
+                     type='CrossEntropyLoss',
+                     ignore_index=255,
+                     loss_weight=1.0)):
+        super().__init__(init_cfg)
+        self.loss_seg = build_loss(loss_seg)
+        self.num_classes = num_classes
+
+    @force_fp32(apply_to=('seg_preds', ))
+    def loss(self, seg_preds, gt_semantic_seg):
+        """Compute the semantic segmentation loss.
+
+        Args:
+            seg_preds (Tensor): The input logits with the shape (N, C, H, W).
+            gt_semantic_seg: The ground truth of semantic segmentation with
+                the shape (N, H, W).
+
+        Returns:
+            dict: the loss of semantic head, stored under ``loss_seg``.
+        """
+        # Resize the logits when their spatial size differs from the target.
+        same_size = seg_preds.shape[-2:] == gt_semantic_seg.shape[-2:]
+        if not same_size:
+            seg_preds = interpolate_as(seg_preds, gt_semantic_seg)
+
+        # => [NxHxW, C]
+        logits = seg_preds.permute((0, 2, 3, 1)).reshape(-1, self.num_classes)
+        targets = gt_semantic_seg.reshape(-1).long()
+        return dict(loss_seg=self.loss_seg(logits, targets))
+
+    @abstractmethod
+    def forward(self, x):
+        """Placeholder of forward function.
+
+        Returns:
+            dict[str, Tensor]: A dictionary, including features
+                and predicted scores. Required keys: 'seg_preds'
+                and 'feats'.
+        """
+
+    def forward_train(self, x, gt_semantic_seg):
+        """Run ``forward`` and return the loss of its ``seg_preds``."""
+        return self.loss(self.forward(x)['seg_preds'], gt_semantic_seg)
+
+    def simple_test(self, x, img_metas, rescale=False):
+        """Predict segmentation maps sized by the first image's meta info.
+
+        The output is interpolated to ``pad_shape``; with ``rescale`` it is
+        additionally cropped to ``img_shape`` and resized to ``ori_shape``.
+        """
+        seg_preds = F.interpolate(
+            self.forward(x)['seg_preds'],
+            size=img_metas[0]['pad_shape'][:2],
+            mode='bilinear',
+            align_corners=False)
+        if not rescale:
+            return seg_preds
+
+        img_h, img_w, _ = img_metas[0]['img_shape']
+        seg_preds = seg_preds[:, :, :img_h, :img_w]
+        ori_h, ori_w, _ = img_metas[0]['ori_shape']
+        return F.interpolate(
+            seg_preds, size=(ori_h, ori_w), mode='bilinear',
+            align_corners=False)
diff --git a/mmdet/models/seg_heads/panoptic_fpn_head.py b/mmdet/models/seg_heads/panoptic_fpn_head.py
new file mode 100755
index 0000000..f1df297
--- /dev/null
+++ b/mmdet/models/seg_heads/panoptic_fpn_head.py
@@ -0,0 +1,155 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+import torch.nn as nn
+from mmcv.runner import ModuleList
+
+from ..builder import HEADS
+from ..utils import ConvUpsample
+from .base_semantic_head import BaseSemanticHead
+
+
+@HEADS.register_module()
+class PanopticFPNHead(BaseSemanticHead):
+    """PanopticFPNHead used in Panoptic FPN.
+
+    In this head, the number of output channels is ``num_stuff_classes
+    + 1``, including all stuff classes and one thing class. The stuff
+    classes will be reset from ``0`` to ``num_stuff_classes - 1``, the
+    thing classes will be merged to ``num_stuff_classes``-th channel.
+
+    Args:
+        num_things_classes (int): Number of thing classes. Default: 80.
+        num_stuff_classes (int): Number of stuff classes. Default: 53.
+        num_classes (int): Number of classes, including all stuff
+            classes and one thing class. This argument is deprecated,
+            please use ``num_things_classes`` and ``num_stuff_classes``.
+            The module will automatically infer the num_classes by
+            ``num_stuff_classes + 1``.
+        in_channels (int): Number of channels in the input feature
+            map.
+        inner_channels (int): Number of channels in inner features.
+        start_level (int): The start level of the input features
+            used in PanopticFPN.
+        end_level (int): The end level of the used features, the
+            ``end_level``-th layer will not be used.
+        fg_range (tuple): Range of the foreground classes. It starts
+            from ``0`` to ``num_things_classes-1``. Deprecated, please use
+             ``num_things_classes`` directly.
+        bg_range (tuple): Range of the background classes. It starts
+            from ``num_things_classes`` to ``num_things_classes +
+            num_stuff_classes - 1``. Deprecated, please use
+            ``num_stuff_classes`` and ``num_things_classes`` directly.
+        conv_cfg (dict): Dictionary to construct and config
+            conv layer. Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Use ``GN`` by default.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+        loss_seg (dict): the loss of the semantic head.
+    """
+
+    def __init__(self,
+                 num_things_classes=80,
+                 num_stuff_classes=53,
+                 num_classes=None,
+                 in_channels=256,
+                 inner_channels=128,
+                 start_level=0,
+                 end_level=4,
+                 fg_range=None,
+                 bg_range=None,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+                 init_cfg=None,
+                 loss_seg=dict(
+                     type='CrossEntropyLoss', ignore_index=-1,
+                     loss_weight=1.0)):
+        if fg_range is not None and bg_range is not None:
+            # Resolve the deprecated ranges before the parent is built, so
+            # the number of output channels matches the label remapping.
+            num_things_classes = fg_range[1] - fg_range[0] + 1
+            num_stuff_classes = bg_range[1] - bg_range[0] + 1
+            warnings.warn(
+                '`fg_range` and `bg_range` are deprecated now, '
+                f'please use `num_things_classes`={num_things_classes} '
+                f'and `num_stuff_classes`={num_stuff_classes} instead.')
+        if num_classes is not None:
+            warnings.warn(
+                '`num_classes` is deprecated now, please set '
+                '`num_stuff_classes` directly, the `num_classes` will be '
+                'set to `num_stuff_classes + 1`')
+            # num_classes = num_stuff_classes + 1 for PanopticFPN.
+            assert num_classes == num_stuff_classes + 1
+        super(PanopticFPNHead, self).__init__(num_stuff_classes + 1, init_cfg,
+                                              loss_seg)
+        self.num_things_classes = num_things_classes
+        self.num_stuff_classes = num_stuff_classes
+        if fg_range is not None and bg_range is not None:
+            self.fg_range = fg_range
+            self.bg_range = bg_range
+
+        # Used feature layers are [start_level, end_level)
+        self.start_level = start_level
+        self.end_level = end_level
+        self.num_stages = end_level - start_level
+        self.inner_channels = inner_channels
+
+        self.conv_upsample_layers = ModuleList()
+        for i in range(start_level, end_level):
+            self.conv_upsample_layers.append(
+                ConvUpsample(
+                    in_channels, inner_channels,
+                    num_layers=i if i > 0 else 1,
+                    num_upsample=i if i > 0 else 0,
+                    conv_cfg=conv_cfg, norm_cfg=norm_cfg))
+        self.conv_logits = nn.Conv2d(inner_channels, self.num_classes, 1)
+
+    def _set_things_to_void(self, gt_semantic_seg):
+        """Merge thing classes to one class.
+
+        Stuff labels are shifted down to start at ``0`` and every thing
+        label becomes ``self.num_stuff_classes``. Labels at or above
+        ``num_things_classes + num_stuff_classes`` are left unchanged.
+        The result is an int32 tensor.
+        """
+        gt = gt_semantic_seg.int()
+        num_things = self.num_things_classes
+        num_stuff = self.num_stuff_classes
+        is_thing = gt < num_things
+        is_stuff = (~is_thing) & (gt < num_things + num_stuff)
+
+        # Shift the stuff labels so that they start at zero.
+        remapped = torch.where(is_stuff, gt - num_things, gt)
+
+        # Collapse all thing labels onto the class right after stuff.
+        merged = torch.full_like(gt, num_stuff)
+        return torch.where(is_thing, merged, remapped)
+
+    def loss(self, seg_preds, gt_semantic_seg):
+        """Compute the semantic loss of the PanopticFPN head.
+
+        Thing labels are merged into one class before the parent loss runs.
+        """
+        merged_gt = self._set_things_to_void(gt_semantic_seg)
+        return super().loss(seg_preds, merged_gt)
+
+    def init_weights(self):
+        super().init_weights()  # parent init first, then the logits conv
+        nn.init.normal_(self.conv_logits.weight.data, mean=0, std=0.01)
+        nn.init.constant_(self.conv_logits.bias.data, 0)
+
+    def forward(self, x):
+        """Sum the per-level branch outputs and predict semantic logits.
+
+        Returns a dict with 'seg_preds' (logits) and 'feats' (summed features).
+        """
+        # Levels [start_level, end_level) of ``x`` are indexed below.
+        assert self.end_level <= len(x)
+        feats = [
+            layer(x[self.start_level + i])
+            for i, layer in enumerate(self.conv_upsample_layers)
+        ]
+        feats = torch.sum(torch.stack(feats, dim=0), dim=0)
+        seg_preds = self.conv_logits(feats)
+        return dict(seg_preds=seg_preds, feats=feats)
diff --git a/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py b/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py
new file mode 100755
index 0000000..41625a6
--- /dev/null
+++ b/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_panoptic_fusion_head import \
+    BasePanopticFusionHead  # noqa: F401,F403
+from .heuristic_fusion_head import HeuristicFusionHead  # noqa: F401,F403
+from .maskformer_fusion_head import MaskFormerFusionHead  # noqa: F401,F403
diff --git a/mmdet/models/seg_heads/panoptic_fusion_heads/base_panoptic_fusion_head.py b/mmdet/models/seg_heads/panoptic_fusion_heads/base_panoptic_fusion_head.py
new file mode 100755
index 0000000..a38ac1c
--- /dev/null
+++ b/mmdet/models/seg_heads/panoptic_fusion_heads/base_panoptic_fusion_head.py
@@ -0,0 +1,48 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+from mmcv.runner import BaseModule
+
+from ...builder import build_loss
+
+
+class BasePanopticFusionHead(BaseModule, metaclass=ABCMeta):
+    """Abstract base class of the panoptic fusion heads.
+
+    Subclasses must implement ``forward_train`` and ``simple_test``.
+    """
+
+    def __init__(self,
+                 num_things_classes=80,
+                 num_stuff_classes=53,
+                 test_cfg=None,
+                 loss_panoptic=None,
+                 init_cfg=None,
+                 **kwargs):
+        super().__init__(init_cfg)
+        self.test_cfg = test_cfg
+        self.num_things_classes = num_things_classes
+        self.num_stuff_classes = num_stuff_classes
+        self.num_classes = num_things_classes + num_stuff_classes
+        self.loss_panoptic = (
+            build_loss(loss_panoptic) if loss_panoptic else None)
+
+    @property
+    def with_loss(self):
+        """bool: Whether a panoptic loss was built for this head."""
+        return self.loss_panoptic is not None
+
+    @abstractmethod
+    def forward_train(self, gt_masks=None, gt_semantic_seg=None, **kwargs):
+        """Forward function during training."""
+
+    @abstractmethod
+    def simple_test(self,
+                    img_metas,
+                    det_labels,
+                    mask_preds,
+                    seg_preds,
+                    det_bboxes,
+                    cfg=None,
+                    **kwargs):
+        """Test without augmentation."""
diff --git a/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py b/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py
new file mode 100755
index 0000000..06c1de2
--- /dev/null
+++ b/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py
@@ -0,0 +1,126 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet.core.evaluation.panoptic_utils import INSTANCE_OFFSET
+from mmdet.models.builder import HEADS
+from .base_panoptic_fusion_head import BasePanopticFusionHead
+
+
+@HEADS.register_module()
+class HeuristicFusionHead(BasePanopticFusionHead):
+    """Fusion Head with Heuristic method."""
+
+    def __init__(self,
+                 num_things_classes=80,
+                 num_stuff_classes=53,
+                 test_cfg=None,
+                 init_cfg=None,
+                 **kwargs):
+        # ``loss_panoptic`` is passed as None: this head builds no loss.
+        super().__init__(num_things_classes, num_stuff_classes, test_cfg,
+                         None, init_cfg, **kwargs)
+
+    def forward_train(self, gt_masks=None, gt_semantic_seg=None, **kwargs):
+        """Return an empty loss dict; this head has no training loss."""
+        return {}
+
+    def _lay_masks(self, bboxes, labels, masks, overlap_thr=0.5):
+        """Lay instance masks to a result map.
+
+        Args:
+            bboxes: The bboxes with the score in the last column, (K, 5).
+            labels: The labels of bboxes, (K, ).
+            masks: The binary instance masks, (K, H, W).
+            overlap_thr: A mask is dropped when more than this fraction of
+                it is already covered by pasted masks. default: 0.5.
+
+        Returns:
+            tuple[Tensor, Tensor]: The id map (H, W), labels of kept masks.
+        """
+        num_insts = bboxes.shape[0]
+        id_map = torch.zeros(
+            masks.shape[-2:], device=bboxes.device, dtype=torch.long)
+        if num_insts == 0:
+            return id_map, labels
+
+        scores, bboxes = bboxes[:, -1], bboxes[:, :4]
+
+        # Sort by score to use heuristic fusion
+        order = torch.argsort(-scores)
+        bboxes = bboxes[order]
+        labels = labels[order]
+        segm_masks = masks[order]
+        # Ids start at 1; 0 in ``id_map`` means that no mask was pasted.
+        instance_id = 1
+        left_labels = []
+        for idx in range(bboxes.shape[0]):
+            _cls = labels[idx]
+            _mask = segm_masks[idx]
+            instance_id_map = torch.ones_like(
+                _mask, dtype=torch.long) * instance_id
+            area = _mask.sum()
+            if area == 0:
+                continue
+
+            pasted = id_map > 0
+            intersect = (_mask * pasted).sum()
+            if (intersect / (area + 1e-5)) > overlap_thr:
+                continue
+
+            _part = _mask * (~pasted)
+            id_map = torch.where(_part, instance_id_map, id_map)
+            left_labels.append(_cls)
+            instance_id += 1
+
+        if len(left_labels) > 0:
+            instance_labels = torch.stack(left_labels)
+        else:
+            instance_labels = bboxes.new_zeros((0, ), dtype=torch.long)
+        assert instance_id == (len(instance_labels) + 1)
+        return id_map, instance_labels
+
+    def simple_test(self, det_bboxes, det_labels, mask_preds, seg_preds,
+                    **kwargs):
+        """Fuse the results of instance and semantic segmentations.
+
+        Args:
+            det_bboxes: The bboxes with the score in the last column, (K, 5).
+            det_labels: The labels of bboxes, (K,).
+            mask_preds: The masks results, (K, H, W).
+            seg_preds: The semantic segmentation results of one image,
+                (num_stuff + 1, H, W).
+
+        Returns:
+            Tensor : The panoptic segmentation result, (H, W).
+        """
+        mask_preds = mask_preds >= self.test_cfg.mask_thr_binary
+        id_map, labels = self._lay_masks(det_bboxes, det_labels, mask_preds,
+                                         self.test_cfg.mask_overlap)
+
+        seg_results = seg_preds.argmax(dim=0)
+        seg_results = seg_results + self.num_things_classes
+
+        pan_results = seg_results
+        instance_id = 1
+        for idx in range(det_labels.shape[0]):
+            _mask = id_map == (idx + 1)
+            if _mask.sum() == 0:
+                continue
+            _cls = labels[idx]
+            # simply trust detection
+            segment_id = _cls + instance_id * INSTANCE_OFFSET
+            pan_results[_mask] = segment_id
+            instance_id += 1
+        # Stuff classes covering too few pixels are relabelled as num_classes.
+        ids, counts = torch.unique(
+            pan_results % INSTANCE_OFFSET, return_counts=True)
+        stuff_ids = ids[ids >= self.num_things_classes]
+        stuff_counts = counts[ids >= self.num_things_classes]
+        ignore_stuff_ids = stuff_ids[
+            stuff_counts < self.test_cfg.stuff_area_limit]
+
+        assert pan_results.ndim == 2
+        pan_results[(pan_results.unsqueeze(2) == ignore_stuff_ids.reshape(
+            1, 1, -1)).any(dim=2)] = self.num_classes
+
+        return pan_results
diff --git a/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py b/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py
new file mode 100755
index 0000000..5b59ce4
--- /dev/null
+++ b/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py
@@ -0,0 +1,241 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn.functional as F
+
+from mmdet.core.evaluation.panoptic_utils import INSTANCE_OFFSET
+from mmdet.core.mask import mask2bbox
+from mmdet.models.builder import HEADS
+from .base_panoptic_fusion_head import BasePanopticFusionHead
+
+
+@HEADS.register_module()
+class MaskFormerFusionHead(BasePanopticFusionHead):
+
+    def __init__(self,
+                 num_things_classes=80,
+                 num_stuff_classes=53,
+                 test_cfg=None,
+                 loss_panoptic=None,
+                 init_cfg=None,
+                 **kwargs):  # extra keywords go to the base class
+        super().__init__(num_things_classes, num_stuff_classes, test_cfg,
+                         loss_panoptic, init_cfg, **kwargs)
+
+    def forward_train(self, **kwargs):
+        """Return an empty loss dict; this head has no training loss."""
+        return {}
+
+    def panoptic_postprocess(self, mask_cls, mask_pred):
+        """Panoptic segmentation inference.
+
+        Args:
+            mask_cls (Tensor): Classification outputs of shape
+                (num_queries, cls_out_channels) for an image.
+                Note `cls_out_channels` should include
+                background.
+            mask_pred (Tensor): Mask outputs of shape
+                (num_queries, h, w) for an image.
+
+        Returns:
+            Tensor: Panoptic segment result of shape \
+                (h, w), each element in Tensor means: \
+                ``segment_id = _cls + instance_id * INSTANCE_OFFSET``.
+        """
+        object_mask_thr = self.test_cfg.get('object_mask_thr', 0.8)
+        iou_thr = self.test_cfg.get('iou_thr', 0.8)
+        filter_low_score = self.test_cfg.get('filter_low_score', False)
+
+        scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
+        mask_pred = mask_pred.sigmoid()
+        # Drop queries labelled ``num_classes`` or scored too low.
+        keep = labels.ne(self.num_classes) & (scores > object_mask_thr)
+        cur_scores = scores[keep]
+        cur_classes = labels[keep]
+        cur_masks = mask_pred[keep]
+
+        cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks
+        # Pixels that no kept query claims stay at ``num_classes``.
+        h, w = cur_masks.shape[-2:]
+        panoptic_seg = torch.full((h, w),
+                                  self.num_classes,
+                                  dtype=torch.int32,
+                                  device=cur_masks.device)
+        if cur_masks.shape[0] == 0:
+            # We didn't detect any mask :(
+            pass
+        else:
+            cur_mask_ids = cur_prob_masks.argmax(0)
+            instance_id = 1
+            for k in range(cur_classes.shape[0]):
+                pred_class = int(cur_classes[k].item())
+                isthing = pred_class < self.num_things_classes
+                mask = cur_mask_ids == k
+                mask_area = mask.sum().item()
+                original_area = (cur_masks[k] >= 0.5).sum().item()
+
+                if filter_low_score:
+                    mask = mask & (cur_masks[k] >= 0.5)
+
+                if mask_area > 0 and original_area > 0:
+                    if mask_area / original_area < iou_thr:
+                        continue
+
+                    if not isthing:
+                        # different stuff regions of same class will be
+                        # merged here, and stuff share the instance_id 0.
+                        panoptic_seg[mask] = pred_class
+                    else:
+                        panoptic_seg[mask] = (
+                            pred_class + instance_id * INSTANCE_OFFSET)
+                        instance_id += 1
+
+        return panoptic_seg
+
+    def semantic_postprocess(self, mask_cls, mask_pred):
+        """Semantic segmentation postprocess (not implemented yet).
+
+        Args:
+            mask_cls (Tensor): Classification outputs of shape
+                (num_queries, cls_out_channels) for an image.
+                Note `cls_out_channels` should include
+                background.
+            mask_pred (Tensor): Mask outputs of shape
+                (num_queries, h, w) for an image.
+
+        Returns:
+            Tensor: Semantic segment result of shape \
+                (cls_out_channels, h, w).
+        """
+        # TODO add semantic segmentation result
+        raise NotImplementedError
+
+    def instance_postprocess(self, mask_cls, mask_pred):
+        """Instance segmentation postprocess.
+
+        Args:
+            mask_cls (Tensor): Classification outputs of shape
+                (num_queries, cls_out_channels) for an image.
+                Note `cls_out_channels` should include
+                background.
+            mask_pred (Tensor): Mask outputs of shape
+                (num_queries, h, w) for an image.
+
+        Returns:
+            tuple[Tensor]: Instance segmentation results.
+
+            - labels_per_image (Tensor): Predicted labels,\
+                shape (n, ).
+            - bboxes (Tensor): Bboxes and scores with shape (n, 5) of \
+                positive region in binary mask, the last column is scores.
+            - mask_pred_binary (Tensor): Instance masks of \
+                shape (n, h, w).
+        """
+        max_per_image = self.test_cfg.get('max_per_image', 100)
+        num_queries = mask_cls.shape[0]
+        # shape (num_queries, num_class)
+        scores = F.softmax(mask_cls, dim=-1)[:, :-1]
+        # shape (num_queries * num_class, )
+        labels = torch.arange(self.num_classes, device=mask_cls.device).\
+            unsqueeze(0).repeat(num_queries, 1).flatten(0, 1)
+        # topk raises when k exceeds the number of candidates, so clamp it.
+        scores_per_image, top_indices = scores.flatten(0, 1).topk(
+            min(max_per_image, scores.numel()), sorted=False)
+        labels_per_image = labels[top_indices]
+        query_indices = top_indices // self.num_classes
+        mask_pred = mask_pred[query_indices]
+
+        # extract things
+        is_thing = labels_per_image < self.num_things_classes
+        scores_per_image = scores_per_image[is_thing]
+        labels_per_image = labels_per_image[is_thing]
+        mask_pred = mask_pred[is_thing]
+
+        mask_pred_binary = (mask_pred > 0).float()
+        mask_scores_per_image = (mask_pred.sigmoid() *
+                                 mask_pred_binary).flatten(1).sum(1) / (
+                                     mask_pred_binary.flatten(1).sum(1) + 1e-6)
+        det_scores = scores_per_image * mask_scores_per_image
+        mask_pred_binary = mask_pred_binary.bool()
+        bboxes = mask2bbox(mask_pred_binary)
+        bboxes = torch.cat([bboxes, det_scores[:, None]], dim=-1)
+
+        return labels_per_image, bboxes, mask_pred_binary
+
+    def simple_test(self,
+                    mask_cls_results,
+                    mask_pred_results,
+                    img_metas,
+                    rescale=False,
+                    **kwargs):
+        """Test segment without test-time augmentation.
+
+        Only the output of last decoder layers was used.
+
+        Args:
+            mask_cls_results (Tensor): Mask classification logits,
+                shape (batch_size, num_queries, cls_out_channels).
+                Note `cls_out_channels` should include background.
+            mask_pred_results (Tensor): Mask logits, shape
+                (batch_size, num_queries, h, w).
+            img_metas (list[dict]): List of image information.
+            rescale (bool, optional): If True, return results in
+                original image space. Default False.
+
+        Returns:
+            list[dict[str, Tensor | tuple[Tensor]]]: The results that \
+                are switched on in ``test_cfg`` (panoptic and/or \
+                instance) for each image.
+
+            .. code-block:: none
+
+                [
+                    {
+                        'pan_results': Tensor, # shape = [h, w]
+                        'ins_results': tuple[Tensor],
+                        # semantic segmentation results are not supported yet
+                        'sem_results': Tensor
+                    },
+                    ...
+                ]
+        """
+        panoptic_on = self.test_cfg.get('panoptic_on', True)
+        semantic_on = self.test_cfg.get('semantic_on', False)
+        instance_on = self.test_cfg.get('instance_on', False)
+        assert not semantic_on, 'semantic segmentation '\
+            'results are not supported yet.'
+
+        results = []
+        for mask_cls_result, mask_pred_result, meta in zip(
+                mask_cls_results, mask_pred_results, img_metas):
+            # remove padding
+            img_height, img_width = meta['img_shape'][:2]
+            mask_pred_result = mask_pred_result[:, :img_height, :img_width]
+
+            if rescale:
+                # return result in original resolution
+                ori_height, ori_width = meta['ori_shape'][:2]
+                mask_pred_result = F.interpolate(
+                    mask_pred_result[:, None],
+                    size=(ori_height, ori_width),
+                    mode='bilinear',
+                    align_corners=False)[:, 0]
+
+            result = dict()
+            if panoptic_on:
+                pan_results = self.panoptic_postprocess(
+                    mask_cls_result, mask_pred_result)
+                result['pan_results'] = pan_results
+
+            if instance_on:
+                ins_results = self.instance_postprocess(
+                    mask_cls_result, mask_pred_result)
+                result['ins_results'] = ins_results
+
+            if semantic_on:
+                sem_results = self.semantic_postprocess(
+                    mask_cls_result, mask_pred_result)
+                result['sem_results'] = sem_results
+
+            results.append(result)
+
+        return results
diff --git a/mmdet/models/utils/__init__.py b/mmdet/models/utils/__init__.py
new file mode 100755
index 0000000..e74ba89
--- /dev/null
+++ b/mmdet/models/utils/__init__.py
@@ -0,0 +1,34 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .brick_wrappers import AdaptiveAvgPool2d, adaptive_avg_pool2d
+from .builder import build_linear_layer, build_transformer
+from .ckpt_convert import pvt_convert
+from .conv_upsample import ConvUpsample
+from .csp_layer import CSPLayer
+from .gaussian_target import gaussian_radius, gen_gaussian_target
+from .inverted_residual import InvertedResidual
+from .make_divisible import make_divisible
+from .misc import interpolate_as, sigmoid_geometric_mean
+from .normed_predictor import NormedConv2d, NormedLinear
+from .panoptic_gt_processing import preprocess_panoptic_gt
+from .point_sample import (get_uncertain_point_coords_with_randomness,
+                           get_uncertainty)
+from .positional_encoding import (LearnedPositionalEncoding,
+                                  SinePositionalEncoding)
+from .res_layer import ResLayer, SimplifiedBasicBlock
+from .se_layer import DyReLU, SELayer
+from .transformer import (DetrTransformerDecoder, DetrTransformerDecoderLayer,
+                          DynamicConv, PatchEmbed, Transformer, nchw_to_nlc,
+                          nlc_to_nchw)
+
+__all__ = [
+    'ResLayer', 'gaussian_radius', 'gen_gaussian_target',
+    'DetrTransformerDecoderLayer', 'DetrTransformerDecoder', 'Transformer',
+    'build_transformer', 'build_linear_layer', 'SinePositionalEncoding',
+    'LearnedPositionalEncoding', 'DynamicConv', 'SimplifiedBasicBlock',
+    'NormedLinear', 'NormedConv2d', 'make_divisible', 'InvertedResidual',
+    'SELayer', 'interpolate_as', 'ConvUpsample', 'CSPLayer',
+    'adaptive_avg_pool2d', 'AdaptiveAvgPool2d', 'PatchEmbed', 'nchw_to_nlc',
+    'nlc_to_nchw', 'pvt_convert', 'sigmoid_geometric_mean',
+    'preprocess_panoptic_gt', 'DyReLU',
+    'get_uncertain_point_coords_with_randomness', 'get_uncertainty'
+]
diff --git a/mmdet/models/utils/brick_wrappers.py b/mmdet/models/utils/brick_wrappers.py
new file mode 100755
index 0000000..fa0279a
--- /dev/null
+++ b/mmdet/models/utils/brick_wrappers.py
@@ -0,0 +1,51 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn.bricks.wrappers import NewEmptyTensorOp, obsolete_torch_version
+
+# The version string is kept as-is when it equals 'parrots'.
+TORCH_VERSION = torch.__version__
+if TORCH_VERSION != 'parrots':
+    # torch.__version__ could be 1.3.1+cu92, we only need the first two
+    # for comparison
+    TORCH_VERSION = tuple(int(x) for x in TORCH_VERSION.split('.')[:2])
+
+
+def adaptive_avg_pool2d(input, output_size):
+    """Handle empty batch dimension to adaptive_avg_pool2d.
+
+    Args:
+        input (tensor): 4D tensor.
+        output_size (int, tuple[int,int]): the target output size; a None
+            entry keeps the input size of that dimension.
+    """
+    if input.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)):
+        if isinstance(output_size, int):
+            output_size = [output_size, output_size]
+        output_size = [v if v is not None else d
+                       for v, d in zip(output_size, input.shape[-2:])]
+        return NewEmptyTensorOp.apply(input, [*input.shape[:2], *output_size])
+    return F.adaptive_avg_pool2d(input, output_size)
+
+
+class AdaptiveAvgPool2d(nn.AdaptiveAvgPool2d):
+    """Handle empty batch dimension to AdaptiveAvgPool2d."""
+
+    def forward(self, x):
+        """Pool ``x``; an empty ``x`` may take the empty-tensor path."""
+        needs_workaround = x.numel() == 0 and obsolete_torch_version(
+            TORCH_VERSION, (1, 9))
+        if not needs_workaround:
+            return super().forward(x)
+
+        # PyTorch 1.9 does not support empty tensor inference yet
+        target = self.output_size
+        if isinstance(target, int):
+            target = [target, target]
+        else:
+            # A None entry keeps the input size of that dimension.
+            target = [
+                d if v is None else v for v, d in zip(target, x.size()[-2:])
+            ]
+        return NewEmptyTensorOp.apply(x, [*x.shape[:2], *target])
diff --git a/mmdet/models/utils/builder.py b/mmdet/models/utils/builder.py
new file mode 100755
index 0000000..20fe7a6
--- /dev/null
+++ b/mmdet/models/utils/builder.py
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.utils import Registry, build_from_cfg
+
+TRANSFORMER = Registry('Transformer')
+LINEAR_LAYERS = Registry('linear layers')
+
+
+def build_transformer(cfg, default_args=None):
+    """Build a module from ``cfg`` using the ``TRANSFORMER`` registry."""
+    return build_from_cfg(cfg, TRANSFORMER, default_args)
+
+
+LINEAR_LAYERS.register_module('Linear', module=nn.Linear)
+
+
+def build_linear_layer(cfg, *args, **kwargs):
+    """Build a linear layer from a config dict.
+
+    Args:
+        cfg (None or dict): Layer config; ``None`` means ``type='Linear'``.
+            - type (str): Layer type.
+            - layer args: Args needed to instantiate a linear layer.
+        args (argument list): Arguments passed to the `__init__`
+            method of the corresponding linear layer.
+        kwargs (keyword arguments): Keyword arguments passed to the
+            `__init__` method of the corresponding linear layer.
+
+    Returns:
+        nn.Module: Created linear layer.
+
+    Raises:
+        TypeError: If ``cfg`` is neither ``None`` nor a dict.
+        KeyError: If 'type' is missing or is not a registered type.
+    """
+    if cfg is None:
+        layer_args = dict(type='Linear')
+    elif not isinstance(cfg, dict):
+        raise TypeError('cfg must be a dict')
+    elif 'type' not in cfg:
+        raise KeyError('the cfg dict must contain the key "type"')
+    else:
+        layer_args = cfg.copy()  # keep the caller's dict intact
+    layer_type = layer_args.pop('type')
+    if layer_type not in LINEAR_LAYERS:
+        raise KeyError(f'Unrecognized linear type {layer_type}')
+    return LINEAR_LAYERS.get(layer_type)(*args, **kwargs, **layer_args)
diff --git a/mmdet/models/utils/ckpt_convert.py b/mmdet/models/utils/ckpt_convert.py
new file mode 100755
index 0000000..4d660c4
--- /dev/null
+++ b/mmdet/models/utils/ckpt_convert.py
@@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+# This script consists of several convert functions which
+# modify the weights of a model from the original repo so
+# they can be loaded as pre-trained weights.
+
+from collections import OrderedDict
+
+import torch
+
+
+def pvt_convert(ckpt):
+    new_ckpt = OrderedDict()
+    # Remap keys to the 'layers.*' layout; each attn q weight is fused with kv
+    use_abs_pos_embed = False  # set if any key starts with 'pos_embed'
+    use_conv_ffn = False  # set if any key contains 'dwconv'
+    for k in ckpt.keys():
+        if k.startswith('pos_embed'):
+            use_abs_pos_embed = True
+        if k.find('dwconv') >= 0:
+            use_conv_ffn = True
+    for k, v in ckpt.items():
+        if k.startswith('head'):  # head*, norm.* and cls_token* are dropped
+            continue
+        if k.startswith('norm.'):
+            continue
+        if k.startswith('cls_token'):
+            continue
+        if k.startswith('pos_embed'):
+            stage_i = int(k.replace('pos_embed', ''))
+            new_k = k.replace(f'pos_embed{stage_i}',
+                              f'layers.{stage_i - 1}.1.0.pos_embed')
+            if stage_i == 4 and v.size(1) == 50:  # 1 (cls token) + 7 * 7
+                new_v = v[:, 1:, :]  # remove cls token
+            else:
+                new_v = v
+        elif k.startswith('patch_embed'):
+            stage_i = int(k.split('.')[0].replace('patch_embed', ''))
+            new_k = k.replace(f'patch_embed{stage_i}',
+                              f'layers.{stage_i - 1}.0')
+            new_v = v
+            if 'proj.' in new_k:
+                new_k = new_k.replace('proj.', 'projection.')
+        elif k.startswith('block'):
+            stage_i = int(k.split('.')[0].replace('block', ''))
+            layer_i = int(k.split('.')[1])
+            new_layer_i = layer_i + use_abs_pos_embed  # bool adds 0 or 1
+            new_k = k.replace(f'block{stage_i}.{layer_i}',
+                              f'layers.{stage_i - 1}.1.{new_layer_i}')
+            new_v = v
+            if 'attn.q.' in new_k:
+                sub_item_k = k.replace('q.', 'kv.')
+                new_k = new_k.replace('q.', 'attn.in_proj_')
+                new_v = torch.cat([v, ckpt[sub_item_k]], dim=0)  # q then kv
+            elif 'attn.kv.' in new_k:
+                continue  # kv is stored as part of the fused q entry
+            elif 'attn.proj.' in new_k:
+                new_k = new_k.replace('proj.', 'attn.out_proj.')
+            elif 'attn.sr.' in new_k:
+                new_k = new_k.replace('sr.', 'sr.')  # no-op replacement
+            elif 'mlp.' in new_k:
+                string = f'{new_k}-'  # NOTE(review): never read afterwards
+                new_k = new_k.replace('mlp.', 'ffn.layers.')
+                if 'fc1.weight' in new_k or 'fc2.weight' in new_k:
+                    new_v = v.reshape((*v.shape, 1, 1))  # add two unit dims
+                new_k = new_k.replace('fc1.', '0.')
+                new_k = new_k.replace('dwconv.dwconv.', '1.')
+                if use_conv_ffn:
+                    new_k = new_k.replace('fc2.', '4.')
+                else:
+                    new_k = new_k.replace('fc2.', '3.')
+                string += f'{new_k} {v.shape}-{new_v.shape}'
+        elif k.startswith('norm'):
+            stage_i = int(k[4])  # the character right after 'norm'
+            new_k = k.replace(f'norm{stage_i}', f'layers.{stage_i - 1}.2')
+            new_v = v
+        else:
+            new_k = k
+            new_v = v
+        new_ckpt[new_k] = new_v
+
+    return new_ckpt
+
+
+def swin_converter(ckpt):
+    """Rename checkpoint keys and prefix each kept key with 'backbone.'."""
+    new_ckpt = OrderedDict()
+
+    def correct_unfold_reduction_order(x):
+        out_channel, in_channel = x.shape
+        x = x.reshape(out_channel, 4, in_channel // 4)  # 4 channel groups
+        x = x[:, [0, 2, 1, 3], :].transpose(1,
+                                            2).reshape(out_channel, in_channel)
+        return x
+
+    def correct_unfold_norm_order(x):
+        in_channel = x.shape[0]
+        x = x.reshape(4, in_channel // 4)  # same regrouping for 1-D params
+        x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel)
+        return x
+
+    for k, v in ckpt.items():
+        if k.startswith('head'):  # head* entries are dropped
+            continue
+        elif k.startswith('layers'):
+            new_v = v
+            if 'attn.' in k:
+                new_k = k.replace('attn.', 'attn.w_msa.')
+            elif 'mlp.' in k:
+                if 'mlp.fc1.' in k:
+                    new_k = k.replace('mlp.fc1.', 'ffn.layers.0.0.')
+                elif 'mlp.fc2.' in k:
+                    new_k = k.replace('mlp.fc2.', 'ffn.layers.1.')
+                else:
+                    new_k = k.replace('mlp.', 'ffn.')
+            elif 'downsample' in k:
+                new_k = k  # key is kept; only the values are reordered
+                if 'reduction.' in k:
+                    new_v = correct_unfold_reduction_order(v)
+                elif 'norm.' in k:
+                    new_v = correct_unfold_norm_order(v)
+            else:
+                new_k = k
+            new_k = new_k.replace('layers', 'stages', 1)  # first match only
+        elif k.startswith('patch_embed'):
+            new_v = v
+            if 'proj' in k:
+                new_k = k.replace('proj', 'projection')
+            else:
+                new_k = k
+        else:
+            new_v = v
+            new_k = k
+
+        new_ckpt['backbone.' + new_k] = new_v
+
+    return new_ckpt
diff --git a/mmdet/models/utils/conv_upsample.py b/mmdet/models/utils/conv_upsample.py
new file mode 100755
index 0000000..bb5ba76
--- /dev/null
+++ b/mmdet/models/utils/conv_upsample.py
@@ -0,0 +1,67 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule, ModuleList
+
+
+class ConvUpsample(BaseModule):
+    """ConvUpsample performs 2x upsampling after Conv.
+
+    There are several `ConvModule` layers. In the first few layers, upsampling
+    will be applied after each layer of convolution. The number of upsampling
+    must be no more than the number of ConvModule layers.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        inner_channels (int): Number of channels produced by the convolution.
+        num_layers (int): Number of convolution layers. Default: 1.
+        num_upsample (int | optional): Number of upsampling layers. Must be no
+            more than num_layers. Upsampling will be applied after the first
+            ``num_upsample`` layers of convolution. Default: ``num_layers``.
+        conv_cfg (dict): Config dict for convolution layer. Default: None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        init_cfg (dict): Config dict for initialization. Default: None.
+        kwargs (keyword arguments): Other arguments passed to ConvModule.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 inner_channels,
+                 num_layers=1,
+                 num_upsample=None,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 init_cfg=None,
+                 **kwargs):
+        super(ConvUpsample, self).__init__(init_cfg)
+        if num_upsample is None:
+            num_upsample = num_layers
+        assert num_upsample <= num_layers, \
+            f'num_upsample({num_upsample})must be no more than ' \
+            f'num_layers({num_layers})'
+        self.num_layers = num_layers
+        self.num_upsample = num_upsample
+        self.conv = ModuleList()
+        for i in range(num_layers):
+            self.conv.append(
+                ConvModule(
+                    in_channels,
+                    inner_channels,
+                    3,
+                    padding=1,
+                    stride=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    **kwargs))
+            in_channels = inner_channels  # later layers take inner_channels
+
+    def forward(self, x):
+        num_upsample = self.num_upsample  # countdown of upsamples left
+        for i in range(self.num_layers):
+            x = self.conv[i](x)
+            if num_upsample > 0:  # only the first num_upsample layers
+                num_upsample -= 1
+                x = F.interpolate(
+                    x, scale_factor=2, mode='bilinear', align_corners=False)
+        return x
diff --git a/mmdet/models/utils/csp_layer.py b/mmdet/models/utils/csp_layer.py
new file mode 100755
index 0000000..5760b01
--- /dev/null
+++ b/mmdet/models/utils/csp_layer.py
@@ -0,0 +1,150 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmcv.runner import BaseModule
+
+
+class DarknetBottleneck(BaseModule):
+    """The basic bottleneck block used in Darknet.
+
+    Each block consists of two conv modules; the input is added to the final
+    output only if ``add_identity`` is set and in/out channels are equal.
+    The first conv has a 1x1 kernel and the second a 3x3 kernel; norm and
+    activation come from ``norm_cfg``/``act_cfg`` (BN and Swish by default).
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        expansion (float): Ratio of hidden to output channels. Default: 0.5
+        add_identity (bool): Whether to add identity to the out.
+            Default: True
+        use_depthwise (bool): Whether to use depthwise separable convolution.
+            Default: False
+        conv_cfg (dict): Config dict for convolution layer. Default: None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN', momentum=0.03, eps=0.001).
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='Swish').
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 expansion=0.5,
+                 add_identity=True,
+                 use_depthwise=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+                 act_cfg=dict(type='Swish'),
+                 init_cfg=None):
+        super().__init__(init_cfg)
+        hidden_channels = int(out_channels * expansion)
+        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
+        self.conv1 = ConvModule(  # 1x1, always a plain ConvModule
+            in_channels,
+            hidden_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.conv2 = conv(  # 3x3, depthwise separable if use_depthwise
+            hidden_channels,
+            out_channels,
+            3,
+            stride=1,
+            padding=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.add_identity = \
+            add_identity and in_channels == out_channels  # needs equal dims
+
+    def forward(self, x):
+        identity = x
+        out = self.conv1(x)
+        out = self.conv2(out)
+
+        if self.add_identity:
+            return out + identity
+        else:
+            return out
+
+
+class CSPLayer(BaseModule):
+    """Cross Stage Partial Layer.
+
+    Args:
+        in_channels (int): The input channels of the CSP layer.
+        out_channels (int): The output channels of the CSP layer.
+        expand_ratio (float): Ratio to adjust the number of channels of the
+            hidden layer. Default: 0.5
+        num_blocks (int): Number of DarknetBottleneck blocks. Default: 1
+        add_identity (bool): Whether to add identity in blocks.
+            Default: True
+        use_depthwise (bool): Whether to use depthwise separable
+            convolution in blocks. Default: False
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN', momentum=0.03, eps=0.001)
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='Swish')
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 expand_ratio=0.5,
+                 num_blocks=1,
+                 add_identity=True,
+                 use_depthwise=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+                 act_cfg=dict(type='Swish'),
+                 init_cfg=None):
+        super().__init__(init_cfg)
+        mid_channels = int(out_channels * expand_ratio)
+        self.main_conv = ConvModule(
+            in_channels,
+            mid_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.short_conv = ConvModule(
+            in_channels,
+            mid_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.final_conv = ConvModule(
+            2 * mid_channels,  # main and short outputs are concatenated
+            out_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+        self.blocks = nn.Sequential(*[
+            DarknetBottleneck(
+                mid_channels,
+                mid_channels,
+                1.0,  # expansion: hidden width equals mid_channels
+                add_identity,
+                use_depthwise,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg) for _ in range(num_blocks)
+        ])
+
+    def forward(self, x):
+        x_short = self.short_conv(x)
+
+        x_main = self.main_conv(x)
+        x_main = self.blocks(x_main)
+
+        x_final = torch.cat((x_main, x_short), dim=1)  # join along dim 1
+        return self.final_conv(x_final)
diff --git a/mmdet/models/utils/gaussian_target.py b/mmdet/models/utils/gaussian_target.py
new file mode 100755
index 0000000..9997d3b
--- /dev/null
+++ b/mmdet/models/utils/gaussian_target.py
@@ -0,0 +1,268 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from math import sqrt
+
+import torch
+import torch.nn.functional as F
+
+
+def gaussian2D(radius, sigma=1, dtype=torch.float32, device='cpu'):
+    """Generate 2D gaussian kernel.
+
+    Args:
+        radius (int): Radius of gaussian kernel.
+        sigma (int | float): Sigma of gaussian function. Default: 1.
+        dtype (torch.dtype): Dtype of gaussian tensor. Default: torch.float32.
+        device (str): Device of gaussian tensor. Default: 'cpu'.
+
+    Returns:
+        h (Tensor): Gaussian kernel with a
+            ``(2 * radius + 1) * (2 * radius + 1)`` shape.
+    """
+    x = torch.arange(
+        -radius, radius + 1, dtype=dtype, device=device).view(1, -1)
+    y = torch.arange(
+        -radius, radius + 1, dtype=dtype, device=device).view(-1, 1)
+
+    h = (-(x * x + y * y) / (2 * sigma * sigma)).exp()  # row x, column y
+
+    h[h < torch.finfo(h.dtype).eps * h.max()] = 0  # zero negligible tails
+    return h
+
+
+def gen_gaussian_target(heatmap, center, radius, k=1):
+    """Generate 2D gaussian heatmap.
+
+    Args:
+        heatmap (Tensor): Input heatmap, the gaussian kernel will cover on
+            it and maintain the max value. It is updated in place.
+        center (list[int]): ``(x, y)`` coord of gaussian kernel's center.
+        radius (int): Radius of gaussian kernel.
+        k (int): Coefficient of gaussian kernel. Default: 1.
+
+    Returns:
+        out_heatmap (Tensor): Updated heatmap covered by gaussian kernel.
+    """
+    diameter = 2 * radius + 1
+    gaussian_kernel = gaussian2D(
+        radius, sigma=diameter / 6, dtype=heatmap.dtype, device=heatmap.device)
+
+    x, y = center
+
+    height, width = heatmap.shape[:2]
+    # Clip the kernel window so that it stays inside the heatmap borders.
+    left, right = min(x, radius), min(width - x, radius + 1)
+    top, bottom = min(y, radius), min(height - y, radius + 1)
+
+    masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
+    masked_gaussian = gaussian_kernel[radius - top:radius + bottom,
+                                      radius - left:radius + right]
+    out_heatmap = heatmap  # alias, so the input tensor is written to
+    torch.max(
+        masked_heatmap,
+        masked_gaussian * k,
+        out=out_heatmap[y - top:y + bottom, x - left:x + right])
+
+    return out_heatmap
+
+
+def gaussian_radius(det_size, min_overlap):
+    r"""Generate 2D gaussian radius.
+
+    This function is modified from the `official github repo
+    <https://github.com/princeton-vl/CornerNet-Lite/blob/master/core/sample/
+    utils.py#L65>`_.
+
+    Given ``min_overlap``, radius could be computed by a quadratic equation
+    according to Vieta's formulas.
+
+    There are 3 cases for computing gaussian radius, details are following:
+
+    - Explanation of figure: ``lt`` and ``br`` indicates the left-top and
+      bottom-right corner of ground truth box. ``x`` indicates the
+      generated corner at the limited position when ``radius=r``.
+
+    - Case1: one corner is inside the gt box and the other is outside.
+
+    .. code:: text
+
+        |<   width   >|
+
+        lt-+----------+         -
+        |  |          |         ^
+        +--x----------+--+
+        |  |          |  |
+        |  |          |  |    height
+        |  | overlap  |  |
+        |  |          |  |
+        |  |          |  |      v
+        +--+---------br--+      -
+           |          |  |
+           +----------+--x
+
+    To ensure IoU of generated box and gt box is larger than ``min_overlap``:
+
+    .. math::
+        \cfrac{(w-r)*(h-r)}{w*h+(w+h)r-r^2} \ge {iou} \quad\Rightarrow\quad
+        {r^2-(w+h)r+\cfrac{1-iou}{1+iou}*w*h} \ge 0 \\
+        {a} = 1,\quad{b} = {-(w+h)},\quad{c} = {\cfrac{1-iou}{1+iou}*w*h} \\
+        {r} \le \cfrac{-b-\sqrt{b^2-4*a*c}}{2*a}
+
+    - Case2: both two corners are inside the gt box.
+
+    .. code:: text
+
+        |<   width   >|
+
+        lt-+----------+         -
+        |  |          |         ^
+        +--x-------+  |
+        |  |       |  |
+        |  |overlap|  |       height
+        |  |       |  |
+        |  +-------x--+
+        |          |  |         v
+        +----------+-br         -
+
+    To ensure IoU of generated box and gt box is larger than ``min_overlap``:
+
+    .. math::
+        \cfrac{(w-2*r)*(h-2*r)}{w*h} \ge {iou} \quad\Rightarrow\quad
+        {4r^2-2(w+h)r+(1-iou)*w*h} \ge 0 \\
+        {a} = 4,\quad {b} = {-2(w+h)},\quad {c} = {(1-iou)*w*h} \\
+        {r} \le \cfrac{-b-\sqrt{b^2-4*a*c}}{2*a}
+
+    - Case3: both two corners are outside the gt box.
+
+    .. code:: text
+
+           |<   width   >|
+
+        x--+----------------+
+        |  |                |
+        +-lt-------------+  |   -
+        |  |             |  |   ^
+        |  |             |  |
+        |  |   overlap   |  | height
+        |  |             |  |
+        |  |             |  |   v
+        |  +------------br--+   -
+        |                |  |
+        +----------------+--x
+
+    To ensure IoU of generated box and gt box is larger than ``min_overlap``:
+
+    .. math::
+        \cfrac{w*h}{(w+2*r)*(h+2*r)} \ge {iou} \quad\Rightarrow\quad
+        {4*iou*r^2+2*iou*(w+h)r+(iou-1)*w*h} \le 0 \\
+        {a} = {4*iou},\quad {b} = {2*iou*(w+h)},\quad {c} = {(iou-1)*w*h} \\
+        {r} \le \cfrac{-b+\sqrt{b^2-4*a*c}}{2*a}
+
+    Args:
+        det_size (list[int]): ``(height, width)`` of the object.
+        min_overlap (float): Min IoU with ground truth for boxes generated by
+            keypoints inside the gaussian kernel.
+
+    Returns:
+        radius (float): Smallest of the three case radii; not rounded here.
+    """
+    height, width = det_size
+    # Case1; b1 holds -b of the quadratic given in the docstring.
+    a1 = 1
+    b1 = (height + width)
+    c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
+    sq1 = sqrt(b1**2 - 4 * a1 * c1)
+    r1 = (b1 - sq1) / (2 * a1)
+    # Case2; b2 holds -b of the quadratic given in the docstring.
+    a2 = 4
+    b2 = 2 * (height + width)
+    c2 = (1 - min_overlap) * width * height
+    sq2 = sqrt(b2**2 - 4 * a2 * c2)
+    r2 = (b2 - sq2) / (2 * a2)
+    # Case3; b3 holds -b of the quadratic given in the docstring.
+    a3 = 4 * min_overlap
+    b3 = -2 * min_overlap * (height + width)
+    c3 = (min_overlap - 1) * width * height
+    sq3 = sqrt(b3**2 - 4 * a3 * c3)
+    r3 = (b3 + sq3) / (2 * a3)
+    return min(r1, r2, r3)
+
+
+def get_local_maximum(heat, kernel=3):
+    """Extract local maximum pixel with given kernel.
+
+    Args:
+        heat (Tensor): Target heatmap.
+        kernel (int): Kernel size of max pooling. Default: 3.
+
+    Returns:
+        heat (Tensor): A heatmap where local maximum pixels keep their
+            own value and other positions are 0.
+    """
+    pad = (kernel - 1) // 2
+    hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad)
+    keep = (hmax == heat).float()  # 1.0 where a pixel equals its window max
+    return heat * keep
+
+
+def get_topk_from_heatmap(scores, k=20):
+    """Get top k positions from heatmap.
+
+    Args:
+        scores (Tensor): Target heatmap with shape
+            [batch, num_classes, height, width].
+        k (int): Target number. Default: 20.
+
+    Returns:
+        tuple[torch.Tensor]: Scores, indexes, categories and coords of
+            topk keypoint. Containing following Tensors:
+
+        - topk_scores (Tensor): Max scores of each topk keypoint.
+        - topk_inds (Tensor): Flattened ``y * width + x`` index of each one.
+        - topk_clses (Tensor): Categories of each topk keypoint.
+        - topk_ys (Tensor): Y-coord of each topk keypoint.
+        - topk_xs (Tensor): X-coord of each topk keypoint (float).
+    """
+    batch, _, height, width = scores.size()
+    topk_scores, topk_inds = torch.topk(scores.view(batch, -1), k)
+    topk_clses = topk_inds // (height * width)  # class of each hit
+    topk_inds = topk_inds % (height * width)  # position within its class map
+    topk_ys = topk_inds // width
+    topk_xs = (topk_inds % width).int().float()
+    return topk_scores, topk_inds, topk_clses, topk_ys, topk_xs
+
+
+def gather_feat(feat, ind, mask=None):
+    """Gather feature according to index.
+
+    Args:
+        feat (Tensor): Target feature map, gathered along dim 1.
+        ind (Tensor): Target coord index into dim 1 of ``feat``.
+        mask (Tensor | None): Mask of feature map. Default: None.
+
+    Returns:
+        feat (Tensor): Gathered feature; ``(-1, dim)`` if ``mask`` is given.
+    """
+    dim = feat.size(2)
+    ind = ind.unsqueeze(2).repeat(1, 1, dim)  # one copy per dim-2 entry
+    feat = feat.gather(1, ind)
+    if mask is not None:
+        mask = mask.unsqueeze(2).expand_as(feat)
+        feat = feat[mask]
+        feat = feat.view(-1, dim)
+    return feat
+
+
+def transpose_and_gather_feat(feat, ind):
+    """Transpose and gather feature according to index.
+
+    Args:
+        feat (Tensor): Target 4D feature map; its dim 1 is moved last.
+        ind (Tensor): Target coord index.
+
+    Returns:
+        feat (Tensor): Transposed and gathered feature.
+    """
+    feat = feat.permute(0, 2, 3, 1).contiguous()
+    feat = feat.view(feat.size(0), -1, feat.size(3))  # merge dims 1 and 2
+    feat = gather_feat(feat, ind)
+    return feat
diff --git a/mmdet/models/utils/inverted_residual.py b/mmdet/models/utils/inverted_residual.py
new file mode 100755
index 0000000..1f241ae
--- /dev/null
+++ b/mmdet/models/utils/inverted_residual.py
@@ -0,0 +1,130 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import ConvModule
+from mmcv.cnn.bricks import DropPath
+from mmcv.runner import BaseModule
+
+from .se_layer import SELayer
+
+
+class InvertedResidual(BaseModule):
+    """Inverted Residual Block.
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        mid_channels (int): The input channels of the depthwise convolution.
+        kernel_size (int): The kernel size of the depthwise convolution.
+            Default: 3.
+        stride (int): The stride of the depthwise convolution. Must be 1 or 2.
+            Default: 1.
+        se_cfg (dict): Config dict for se layer. Default: None, which means no
+            se layer.
+        with_expand_conv (bool): Use expand conv or not. If set False,
+            mid_channels must be the same as in_channels.
+            Default: True.
+        conv_cfg (dict): Config dict for convolution layer. Default: None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='ReLU').
+        drop_path_rate (float): stochastic depth rate. Defaults to 0.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+
+    Returns:
+        Tensor: The output tensor.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 mid_channels,
+                 kernel_size=3,
+                 stride=1,
+                 se_cfg=None,
+                 with_expand_conv=True,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU'),
+                 drop_path_rate=0.,
+                 with_cp=False,
+                 init_cfg=None):
+        super(InvertedResidual, self).__init__(init_cfg)
+        self.with_res_shortcut = (stride == 1 and in_channels == out_channels)
+        assert stride in [1, 2], f'stride must in [1, 2]. ' \
+            f'But received {stride}.'
+        self.with_cp = with_cp
+        self.drop_path = DropPath(
+            drop_path_rate) if drop_path_rate > 0 else nn.Identity()
+        self.with_se = se_cfg is not None
+        self.with_expand_conv = with_expand_conv
+
+        if self.with_se:
+            assert isinstance(se_cfg, dict)
+        if not self.with_expand_conv:
+            assert mid_channels == in_channels
+
+        if self.with_expand_conv:
+            self.expand_conv = ConvModule(
+                in_channels=in_channels,
+                out_channels=mid_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+        self.depthwise_conv = ConvModule(
+            in_channels=mid_channels,
+            out_channels=mid_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=kernel_size // 2,
+            groups=mid_channels,  # one group per channel
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+        if self.with_se:
+            self.se = SELayer(**se_cfg)
+
+        self.linear_conv = ConvModule(
+            in_channels=mid_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=None)  # no activation after this projection
+
+    def forward(self, x):
+        """Apply the block, through ``cp.checkpoint`` when it is enabled."""
+        def _inner_forward(x):
+            out = x
+
+            if self.with_expand_conv:
+                out = self.expand_conv(out)
+
+            out = self.depthwise_conv(out)
+
+            if self.with_se:
+                out = self.se(out)
+
+            out = self.linear_conv(out)
+
+            if self.with_res_shortcut:
+                return x + self.drop_path(out)
+            else:
+                return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        return out
diff --git a/mmdet/models/utils/make_divisible.py b/mmdet/models/utils/make_divisible.py
new file mode 100755
index 0000000..ed42c2e
--- /dev/null
+++ b/mmdet/models/utils/make_divisible.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+def make_divisible(value, divisor, min_value=None, min_ratio=0.9):
+    """Make divisible function.
+
+    This function rounds the channel number to the nearest value that can be
+    divisible by the divisor. It is taken from the original tf repo. It ensures
+    that all layers have a channel number that is divisible by divisor. It can
+    be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py  # noqa
+
+    Args:
+        value (int): The original channel number.
+        divisor (int): The divisor to fully divide the channel number.
+        min_value (int): The minimum value of the output channel.
+            Default: None, means that the minimum value is the divisor.
+        min_ratio (float): The minimum ratio of the rounded channel number to
+            the original channel number. Default: 0.9.
+
+    Returns:
+        int: The modified output channel number.
+    """
+
+    if min_value is None:
+        min_value = divisor
+    new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
+    # If rounding fell below min_ratio * value, step up by one divisor.
+    if new_value < min_ratio * value:
+        new_value += divisor
+    return new_value
diff --git a/mmdet/models/utils/misc.py b/mmdet/models/utils/misc.py
new file mode 100755
index 0000000..8f9be9a
--- /dev/null
+++ b/mmdet/models/utils/misc.py
@@ -0,0 +1,72 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from torch.autograd import Function
+from torch.nn import functional as F
+
+
+class SigmoidGeometricMean(Function):
+    """Forward and backward function of geometric mean of two sigmoid
+    functions.
+
+    This implementation with analytical gradient function substitutes
+    the autograd function of (x.sigmoid() * y.sigmoid()).sqrt(). The
+    original implementation incurs none (presumably NaN) during gradient
+    backpropagation if both x and y are very small values.
+    """
+
+    @staticmethod
+    def forward(ctx, x, y):
+        x_sigmoid = x.sigmoid()
+        y_sigmoid = y.sigmoid()
+        z = (x_sigmoid * y_sigmoid).sqrt()
+        ctx.save_for_backward(x_sigmoid, y_sigmoid, z)  # reused in backward
+        return z
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        x_sigmoid, y_sigmoid, z = ctx.saved_tensors
+        grad_x = grad_output * z * (1 - x_sigmoid) / 2  # dz/dx = z(1-s(x))/2
+        grad_y = grad_output * z * (1 - y_sigmoid) / 2  # dz/dy = z(1-s(y))/2
+        return grad_x, grad_y
+
+
+sigmoid_geometric_mean = SigmoidGeometricMean.apply  # functional alias
+
+
+def interpolate_as(source, target, mode='bilinear', align_corners=False):
+    """Interpolate the `source` to the shape of the `target`.
+
+    The `source` must be a Tensor, but the `target` can be a Tensor or a
+    np.ndarray with the shape (..., target_h, target_w).
+
+    Args:
+        source (Tensor): A 3D/4D Tensor with the shape (N, H, W) or
+            (N, C, H, W).
+        target (Tensor | np.ndarray): The interpolation target with the shape
+            (..., target_h, target_w). Only its shape is read.
+        mode (str): Algorithm used for interpolation. The options are the
+            same as those in F.interpolate(). Default: ``'bilinear'``.
+        align_corners (bool): The same as the argument in F.interpolate().
+
+    Returns:
+        Tensor: The interpolated source Tensor.
+    """
+    assert len(target.shape) >= 2
+
+    def _interpolate_as(source, target, mode='bilinear', align_corners=False):
+        """Interpolate the `source` (4D) to the shape of the `target`."""
+        target_h, target_w = target.shape[-2:]
+        source_h, source_w = source.shape[-2:]
+        if target_h != source_h or target_w != source_w:  # else unchanged
+            source = F.interpolate(
+                source,
+                size=(target_h, target_w),
+                mode=mode,
+                align_corners=align_corners)
+        return source
+
+    if len(source.shape) == 3:
+        source = source[:, None, :, :]  # add a dim 1 for the 4D helper
+        source = _interpolate_as(source, target, mode, align_corners)
+        return source[:, 0, :, :]  # drop the added dim again
+    else:
+        return _interpolate_as(source, target, mode, align_corners)
diff --git a/mmdet/models/utils/normed_predictor.py b/mmdet/models/utils/normed_predictor.py
new file mode 100755
index 0000000..f0eeef7
--- /dev/null
+++ b/mmdet/models/utils/normed_predictor.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import CONV_LAYERS
+
+from .builder import LINEAR_LAYERS
+
+
+@LINEAR_LAYERS.register_module(name='NormedLinear')
+class NormedLinear(nn.Linear):
+    """Linear layer applied to normalized input and normalized weight.
+
+    Args:
+        tempearture (float, optional): Temperature term that scales the
+            normalized input. Default to 20.
+        power (int, optional): Power term. Default to 1.0.
+        eps (float, optional): The minimal value of divisor to
+             keep numerical stability. Default to 1e-6.
+    """
+
+    def __init__(self, *args, tempearture=20, power=1.0, eps=1e-6, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.tempearture = tempearture
+        self.power = power
+        self.eps = eps
+        self.init_weights()
+
+    def init_weights(self):
+        nn.init.normal_(self.weight, mean=0, std=0.01)
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0)
+
+    def forward(self, x):
+        # Both norms are taken along dim 1 and raised to ``power``.
+        w_norm = self.weight.norm(dim=1, keepdim=True).pow(self.power)
+        x_norm = x.norm(dim=1, keepdim=True).pow(self.power)
+        scaled_x = x / (x_norm + self.eps) * self.tempearture
+        return F.linear(scaled_x, self.weight / (w_norm + self.eps), self.bias)
+
+
+@CONV_LAYERS.register_module(name='NormedConv2d')
+class NormedConv2d(nn.Conv2d):
+    """Normalized Conv2d Layer.
+
+    Args:
+        tempearture (float, optional): Temperature term. Default to 20.
+        power (int, optional): Power term. Default to 1.0.
+        eps (float, optional): The minimal value of divisor to
+             keep numerical stability. Default to 1e-6.
+        norm_over_kernel (bool, optional): Normalize over kernel.
+             Default to False.
+    """
+
+    def __init__(self,
+                 *args,
+                 tempearture=20,
+                 power=1.0,
+                 eps=1e-6,
+                 norm_over_kernel=False,
+                 **kwargs):
+        super(NormedConv2d, self).__init__(*args, **kwargs)
+        self.tempearture = tempearture
+        self.power = power
+        self.norm_over_kernel = norm_over_kernel
+        self.eps = eps
+
+    def forward(self, x):
+        if not self.norm_over_kernel:
+            weight_ = self.weight / (
+                self.weight.norm(dim=1, keepdim=True).pow(self.power) +
+                self.eps)
+        else:
+            weight_ = self.weight / (
+                self.weight.view(self.weight.size(0), -1).norm(
+                    dim=1, keepdim=True).pow(self.power)[..., None, None] +
+                self.eps)
+        x_ = x / (x.norm(dim=1, keepdim=True).pow(self.power) + self.eps)
+        x_ = x_ * self.tempearture
+
+        if hasattr(self, 'conv2d_forward'):
+            return self.conv2d_forward(x_, weight_)
+        # Compare numerically: as strings, '1.10' sorts before '1.8', so
+        # torch 1.10-1.13 would wrongly take the legacy no-bias call.
+        ver = torch.__version__.split('+')[0].split('.')[:2]
+        if not all(v.isdigit() for v in ver) or tuple(map(int, ver)) >= (1, 8):
+            return self._conv_forward(x_, weight_, self.bias)
+        return self._conv_forward(x_, weight_)
diff --git a/mmdet/models/utils/panoptic_gt_processing.py b/mmdet/models/utils/panoptic_gt_processing.py
new file mode 100755
index 0000000..7685ac9
--- /dev/null
+++ b/mmdet/models/utils/panoptic_gt_processing.py
@@ -0,0 +1,68 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+
+def preprocess_panoptic_gt(gt_labels, gt_masks, gt_semantic_seg, num_things,
+                           num_stuff, img_metas):
+    """Build per-image labels and masks for things (and stuff) targets.
+
+    Args:
+        gt_labels (Tensor): Ground truth labels of each bbox,
+            with shape (num_gts, ).
+        gt_masks (BitmapMasks): Ground truth masks of each instances
+            of an image, shape (num_gts, h, w).
+        gt_semantic_seg (Tensor | None): Ground truth of semantic
+            segmentation with the shape (1, h, w).
+            [0, num_thing_class - 1] means things,
+            [num_thing_class, num_class-1] means stuff,
+            255 means VOID. It's None when training instance segmentation.
+        num_things (int): Number of thing classes.
+        num_stuff (int): Number of stuff classes.
+        img_metas (dict): Image meta information; only ``'pad_shape'`` is
+            read here.
+
+    Returns:
+        tuple: a tuple containing the following targets.
+
+            - labels (Tensor): Ground truth class indices for an
+                image, with shape (n, ), n is the sum of number
+                of stuff type and number of instance in an image.
+            - masks (Tensor): Ground truth mask for an image, with
+                shape (n, h, w). Contains stuff and things when training
+                panoptic segmentation, and things only when training
+                instance segmentation.
+    """
+    num_classes = num_things + num_stuff
+
+    # Pad the instance masks to ``pad_shape``, then convert to a bool Tensor.
+    padded_masks = gt_masks.pad(img_metas['pad_shape'][:2], pad_val=0)
+    things_masks = padded_masks.to_tensor(
+        dtype=torch.bool, device=gt_labels.device)
+
+    if gt_semantic_seg is None:
+        # Instance segmentation: there are no stuff targets to add.
+        return gt_labels, things_masks.long()
+
+    gt_semantic_seg = gt_semantic_seg.squeeze(0)
+    semantic_labels = torch.unique(
+        gt_semantic_seg, sorted=False, return_inverse=False,
+        return_counts=False)
+
+    # Keep only stuff ids, i.e. those in [num_things, num_classes).
+    stuff_labels_list = [
+        label for label in semantic_labels
+        if not (label < num_things or label >= num_classes)
+    ]
+    stuff_masks_list = [
+        gt_semantic_seg == label for label in stuff_labels_list
+    ]
+
+    if not stuff_masks_list:
+        return gt_labels, things_masks.long()
+
+    # Things come first, followed by one mask per present stuff class.
+    stuff_labels = torch.stack(stuff_labels_list, dim=0)
+    stuff_masks = torch.stack(stuff_masks_list, dim=0)
+    labels = torch.cat([gt_labels, stuff_labels], dim=0)
+    masks = torch.cat([things_masks, stuff_masks], dim=0)
+    return labels, masks.long()
diff --git a/mmdet/models/utils/point_sample.py b/mmdet/models/utils/point_sample.py
new file mode 100755
index 0000000..c2c3cf9
--- /dev/null
+++ b/mmdet/models/utils/point_sample.py
@@ -0,0 +1,87 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.ops import point_sample
+
+
+def get_uncertainty(mask_pred, labels):
+    """Estimate uncertainty based on pred logits.
+
+    We estimate uncertainty as L1 distance between 0.0 and the logits
+    prediction in `mask_pred` for the foreground class in `labels`.
+
+    Args:
+        mask_pred (Tensor): mask prediction logits, shape (num_rois,
+            num_classes, mask_height, mask_width).
+        labels (list[Tensor]): Either predicted or ground truth label for
+            each predicted mask, of length num_rois. It is not read when
+            `mask_pred` has a single (class-agnostic) channel.
+
+    Returns:
+        scores (Tensor): Uncertainty scores with the most uncertain
+            locations having the highest uncertainty score,
+            shape (num_rois, 1, mask_height, mask_width)
+    """
+    if mask_pred.shape[1] != 1:
+        # Class-specific prediction: pick the channel of each roi's label.
+        roi_inds = torch.arange(mask_pred.shape[0], device=mask_pred.device)
+        return -torch.abs(mask_pred[roi_inds, labels].unsqueeze(1))
+    # Class-agnostic prediction: the only channel is the foreground one.
+    return -torch.abs(mask_pred.clone())
+
+
+def get_uncertain_point_coords_with_randomness(mask_pred, labels, num_points,
+                                               oversample_ratio,
+                                               importance_sample_ratio):
+    """Get ``num_points`` most uncertain points with random points during
+    train.
+
+    Sample points in [0, 1] x [0, 1] coordinate space based on their
+    uncertainty. The uncertainties are calculated for each point using
+    the ``get_uncertainty()`` function that takes the point's logit
+    prediction as input.
+
+    Args:
+        mask_pred (Tensor): A tensor of shape (num_rois, num_classes,
+            mask_height, mask_width) for class-specific or class-agnostic
+            prediction.
+        labels (list): The ground truth class for each instance.
+        num_points (int): The number of points to sample.
+        oversample_ratio (int): Oversampling parameter, must be >= 1.
+        importance_sample_ratio (float): Ratio of points, in [0, 1], that
+            are sampled via importance sampling.
+
+    Returns:
+        point_coords (Tensor): A tensor of shape (num_rois, num_points, 2)
+            that contains the coordinates of the sampled points.
+    """
+    assert oversample_ratio >= 1
+    assert 0 <= importance_sample_ratio <= 1
+    device = mask_pred.device
+    num_rois = mask_pred.shape[0]
+    num_sampled = int(num_points * oversample_ratio)
+    candidates = torch.rand(num_rois, num_sampled, 2, device=device)
+    # Uncertainty must be computed on the *sampled* logits. Computing it on
+    # the coarse prediction and sampling afterwards gives wrong results:
+    # with func(logits)=-abs(logits), a point sampled between two coarse
+    # predictions with logits -1 and 1 has logit 0 and thus uncertainty 0,
+    # whereas sampling the coarse uncertainties (both -1) would yield -1.
+    point_uncertainties = get_uncertainty(
+        point_sample(mask_pred, candidates), labels)
+
+    num_uncertain_points = int(importance_sample_ratio * num_points)
+    num_random_points = num_points - num_uncertain_points
+    _, top_idx = torch.topk(
+        point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)
+    # Offset the per-roi indices so they address the flattened candidates.
+    offsets = num_sampled * torch.arange(
+        num_rois, dtype=torch.long, device=device)
+    flat_idx = (top_idx + offsets[:, None]).view(-1)
+    point_coords = candidates.view(-1, 2)[flat_idx, :].view(
+        num_rois, num_uncertain_points, 2)
+
+    if num_random_points <= 0:
+        return point_coords
+    # Fill the remaining budget with uniformly random points.
+    rand_roi_coords = torch.rand(
+        num_rois, num_random_points, 2, device=device)
+    return torch.cat((point_coords, rand_roi_coords), dim=1)
diff --git a/mmdet/models/utils/positional_encoding.py b/mmdet/models/utils/positional_encoding.py
new file mode 100755
index 0000000..dd29cd6
--- /dev/null
+++ b/mmdet/models/utils/positional_encoding.py
@@ -0,0 +1,163 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING
+from mmcv.runner import BaseModule
+
+
+@POSITIONAL_ENCODING.register_module()
+class SinePositionalEncoding(BaseModule):
+    """Position encoding with sine and cosine functions.
+
+    See `End-to-End Object Detection with Transformers
+    <https://arxiv.org/pdf/2005.12872>`_ for details.
+
+    Args:
+        num_feats (int): The feature dimension for each position
+            along x-axis or y-axis. Note the final returned dimension
+            for each position is 2 times of this value.
+        temperature (int, optional): The temperature used for scaling
+            the position embedding. Defaults to 10000.
+        normalize (bool, optional): Whether to normalize the position
+            embedding. Defaults to False.
+        scale (float, optional): A scale factor that scales the position
+            embedding. The scale will be used only when `normalize` is True.
+            Defaults to 2*pi.
+        eps (float, optional): A value added to the denominator for
+            numerical stability. Defaults to 1e-6.
+        offset (float): Offset added to the embedding when normalizing.
+            Defaults to 0.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 num_feats,
+                 temperature=10000,
+                 normalize=False,
+                 scale=2 * math.pi,
+                 eps=1e-6,
+                 offset=0.,
+                 init_cfg=None):
+        super().__init__(init_cfg)
+        if normalize:
+            assert isinstance(scale, (float, int)), 'when normalize is set,' \
+                'scale should be provided and in float or int type, ' \
+                f'found {type(scale)}'
+        self.num_feats = num_feats
+        self.temperature = temperature
+        self.normalize = normalize
+        self.scale = scale
+        self.eps = eps
+        self.offset = offset
+
+    def forward(self, mask):
+        """Forward function for `SinePositionalEncoding`.
+
+        Args:
+            mask (Tensor): ByteTensor mask. Non-zero values representing
+                ignored positions, while zero values means valid positions
+                for this image. Shape [bs, h, w].
+
+        Returns:
+            pos (Tensor): Returned position embedding with shape
+                [bs, num_feats*2, h, w].
+        """
+        # For convenience of exporting to ONNX, it's required to convert
+        # `masks` from bool to int.
+        mask = mask.to(torch.int)
+        batch, height, width = mask.size()
+        valid = 1 - mask  # logical_not
+        # Running count of valid positions along rows (y) and columns (x).
+        y_embed = valid.cumsum(1, dtype=torch.float32)
+        x_embed = valid.cumsum(2, dtype=torch.float32)
+        if self.normalize:
+            y_total = y_embed[:, -1:, :] + self.eps
+            x_total = x_embed[:, :, -1:] + self.eps
+            y_embed = (y_embed + self.offset) / y_total * self.scale
+            x_embed = (x_embed + self.offset) / x_total * self.scale
+        dim_t = torch.arange(
+            self.num_feats, dtype=torch.float32, device=mask.device)
+        dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats)
+
+        def _sin_cos(embed):
+            # use `view` instead of `flatten` for dynamically exporting to ONNX
+            angles = embed[:, :, :, None] / dim_t
+            pair = (angles[:, :, :, 0::2].sin(), angles[:, :, :, 1::2].cos())
+            return torch.stack(pair, dim=4).view(batch, height, width, -1)
+
+        # Channel layout of the output: y encoding first, then x encoding.
+        pos = torch.cat((_sin_cos(y_embed), _sin_cos(x_embed)), dim=3)
+        return pos.permute(0, 3, 1, 2)
+
+    def __repr__(self):
+        """str: a string that describes the module"""
+        fields = (
+            f'num_feats={self.num_feats}',
+            f'temperature={self.temperature}',
+            f'normalize={self.normalize}',
+            f'scale={self.scale}',
+            f'eps={self.eps}')
+        return f'{self.__class__.__name__}({", ".join(fields)})'
+
+
+@POSITIONAL_ENCODING.register_module()
+class LearnedPositionalEncoding(BaseModule):
+    """Position embedding with learnable embedding weights.
+
+    Args:
+        num_feats (int): The feature dimension for each position
+            along x-axis or y-axis. The final returned dimension for
+            each position is 2 times of this value.
+        row_num_embed (int, optional): The dictionary size of row embeddings.
+            Default 50.
+        col_num_embed (int, optional): The dictionary size of col embeddings.
+            Default 50.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_feats,
+                 row_num_embed=50,
+                 col_num_embed=50,
+                 init_cfg=dict(type='Uniform', layer='Embedding')):
+        super().__init__(init_cfg)
+        self.row_embed = nn.Embedding(row_num_embed, num_feats)
+        self.col_embed = nn.Embedding(col_num_embed, num_feats)
+        self.num_feats = num_feats
+        self.row_num_embed = row_num_embed
+        self.col_num_embed = col_num_embed
+
+    def forward(self, mask):
+        """Forward function for `LearnedPositionalEncoding`.
+
+        Args:
+            mask (Tensor): ByteTensor mask. Non-zero values representing
+                ignored positions, while zero values means valid positions
+                for this image. Shape [bs, h, w].
+
+        Returns:
+            pos (Tensor): Returned position embedding with shape
+                [bs, num_feats*2, h, w].
+        """
+        batch = mask.shape[0]
+        h, w = mask.shape[-2:]
+        col_pos = self.col_embed(torch.arange(w, device=mask.device))
+        row_pos = self.row_embed(torch.arange(h, device=mask.device))
+
+        # Tile both embeddings over the (h, w) grid; the column (x) part
+        # fills the first ``num_feats`` channels, the row (y) part the rest.
+        x_grid = col_pos.unsqueeze(0).repeat(h, 1, 1)
+        y_grid = row_pos.unsqueeze(1).repeat(1, w, 1)
+        pos = torch.cat((x_grid, y_grid), dim=-1).permute(2, 0, 1)
+        return pos.unsqueeze(0).repeat(batch, 1, 1, 1)
+
+    def __repr__(self):
+        """str: a string that describes the module"""
+        cls_name = self.__class__.__name__
+        return (f'{cls_name}('
+                f'num_feats={self.num_feats}, '
+                f'row_num_embed={self.row_num_embed}, '
+                f'col_num_embed={self.col_num_embed})')
diff --git a/mmdet/models/utils/res_layer.py b/mmdet/models/utils/res_layer.py
new file mode 100755
index 0000000..5c3e89f
--- /dev/null
+++ b/mmdet/models/utils/res_layer.py
@@ -0,0 +1,190 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmcv.runner import BaseModule, Sequential
+from torch import nn as nn
+
+
+class ResLayer(Sequential):
+    """ResLayer to build ResNet style backbone.
+
+    A stack of ``num_blocks`` blocks in which only one block (the first or
+    the last one, see ``downsample_first``) receives ``stride`` and the
+    shortcut projection.
+
+    Args:
+        block (nn.Module): block used to build ResLayer.
+        inplanes (int): inplanes of block.
+        planes (int): planes of block.
+        num_blocks (int): number of blocks.
+        stride (int): stride of the downsampling block. Default: 1
+        avg_down (bool): Use AvgPool instead of stride conv when
+            downsampling in the bottleneck. Default: False
+        conv_cfg (dict): dictionary to construct and config conv layer.
+            Default: None
+        norm_cfg (dict): dictionary to construct and config norm layer.
+            Default: dict(type='BN')
+        downsample_first (bool): Downsample at the first block or last block.
+            False for Hourglass, True for ResNet. Default: True
+        kwargs: Extra keyword arguments passed unchanged to every
+            ``block`` call.
+    """
+
+    def __init__(self,
+                 block,
+                 inplanes,
+                 planes,
+                 num_blocks,
+                 stride=1,
+                 avg_down=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 downsample_first=True,
+                 **kwargs):
+        self.block = block
+        out_channels = planes * block.expansion
+
+        downsample = None
+        if stride != 1 or inplanes != out_channels:
+            # Shortcut projection for a strided and/or widened output.
+            pool = []
+            conv_stride = stride
+            if avg_down:
+                # Downsample by pooling and keep the 1x1 conv unstrided.
+                conv_stride = 1
+                pool.append(
+                    nn.AvgPool2d(
+                        kernel_size=stride,
+                        stride=stride,
+                        ceil_mode=True,
+                        count_include_pad=False))
+            downsample = nn.Sequential(
+                *pool,
+                build_conv_layer(
+                    conv_cfg,
+                    inplanes,
+                    out_channels,
+                    kernel_size=1,
+                    stride=conv_stride,
+                    bias=False),
+                build_norm_layer(norm_cfg, out_channels)[1])
+
+        # Arguments shared by every block of the layer.
+        shared = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, **kwargs)
+        layers = []
+        if downsample_first:
+            # The first block changes stride/width, the rest keep the shape.
+            layers.append(
+                block(
+                    inplanes=inplanes,
+                    planes=planes,
+                    stride=stride,
+                    downsample=downsample,
+                    **shared))
+            layers.extend(
+                block(
+                    inplanes=out_channels,
+                    planes=planes,
+                    stride=1,
+                    **shared) for _ in range(1, num_blocks))
+        else:  # downsample_first=False is for HourglassModule
+            # All but the last block keep ``inplanes`` channels.
+            layers.extend(
+                block(
+                    inplanes=inplanes,
+                    planes=inplanes,
+                    stride=1,
+                    **shared) for _ in range(num_blocks - 1))
+            layers.append(
+                block(
+                    inplanes=inplanes,
+                    planes=planes,
+                    stride=stride,
+                    downsample=downsample,
+                    **shared))
+        super().__init__(*layers)
+
+
+class SimplifiedBasicBlock(BaseModule):
+    """Simplified version of original basic residual block. This is used in
+    `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+    - Norm layer is now optional
+    - Last ReLU in forward function is removed
+    """
+    expansion = 1
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 dilation=1,
+                 downsample=None,
+                 style='pytorch',
+                 with_cp=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 dcn=None,
+                 plugins=None,
+                 init_fg=None):
+        super().__init__(init_fg)
+        assert dcn is None, 'Not implemented yet.'
+        assert plugins is None, 'Not implemented yet.'
+        assert not with_cp, 'Not implemented yet.'
+        self.with_norm = norm_cfg is not None
+        # Convs carry their own bias only when no norm layer follows them.
+        with_bias = not self.with_norm
+        self.conv1 = build_conv_layer(
+            conv_cfg,
+            inplanes,
+            planes,
+            3,
+            stride=stride,
+            padding=dilation,
+            dilation=dilation,
+            bias=with_bias)
+        if self.with_norm:
+            self.norm1_name, norm1 = build_norm_layer(
+                norm_cfg, planes, postfix=1)
+            self.add_module(self.norm1_name, norm1)
+        self.conv2 = build_conv_layer(
+            conv_cfg, planes, planes, 3, padding=1, bias=with_bias)
+        if self.with_norm:
+            self.norm2_name, norm2 = build_norm_layer(
+                norm_cfg, planes, postfix=2)
+            self.add_module(self.norm2_name, norm2)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+        self.dilation = dilation
+        self.with_cp = with_cp
+
+    @property
+    def norm1(self):
+        """nn.Module: normalization layer after the first convolution layer"""
+        return getattr(self, self.norm1_name) if self.with_norm else None
+
+    @property
+    def norm2(self):
+        """nn.Module: normalization layer after the second convolution layer"""
+        return getattr(self, self.norm2_name) if self.with_norm else None
+
+    def forward(self, x):
+        """Forward function.
+
+        Runs ``conv1 -> [norm1] -> relu -> conv2 -> [norm2]`` and adds
+        the (optionally downsampled) input; no ReLU follows the sum.
+        """
+        out = self.conv1(x)
+        if self.with_norm:
+            out = self.norm1(out)
+        out = self.conv2(self.relu(out))
+        if self.with_norm:
+            out = self.norm2(out)
+
+        # Project the shortcut only when a ``downsample`` module is set.
+        shortcut = x if self.downsample is None else self.downsample(x)
+
+        # In-place accumulation into the conv branch output.
+        out += shortcut
+        return out
diff --git a/mmdet/models/utils/se_layer.py b/mmdet/models/utils/se_layer.py
new file mode 100755
index 0000000..a249210
--- /dev/null
+++ b/mmdet/models/utils/se_layer.py
@@ -0,0 +1,127 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule
+
+
+class SELayer(BaseModule):
+    """Squeeze-and-Excitation Module.
+
+    Args:
+        channels (int): The input (and output) channels of the SE layer.
+        ratio (int): Squeeze ratio in SELayer, the intermediate channel will be
+            ``int(channels/ratio)``. Default: 16.
+        conv_cfg (None or dict): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        act_cfg (dict or tuple[dict]): Config dict for activation layer.
+            If act_cfg is a dict, two activation layers will be configured
+            by this dict. If act_cfg is a tuple of dicts, the first
+            activation layer will be configured by the first dict and the
+            second activation layer will be configured by the second dict.
+            Default: (dict(type='ReLU'), dict(type='Sigmoid'))
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 channels,
+                 ratio=16,
+                 conv_cfg=None,
+                 act_cfg=(dict(type='ReLU'), dict(type='Sigmoid')),
+                 init_cfg=None):
+        super().__init__(init_cfg)
+        if isinstance(act_cfg, dict):
+            act_cfg = (act_cfg, act_cfg)
+        assert len(act_cfg) == 2
+        assert mmcv.is_tuple_of(act_cfg, dict)
+        # Width of the bottleneck between the two 1x1 convolutions.
+        mid_channels = int(channels / ratio)
+        self.global_avgpool = nn.AdaptiveAvgPool2d(1)
+        self.conv1 = ConvModule(
+            in_channels=channels,
+            out_channels=mid_channels,
+            kernel_size=1,
+            stride=1,
+            conv_cfg=conv_cfg,
+            act_cfg=act_cfg[0])
+        self.conv2 = ConvModule(
+            in_channels=mid_channels,
+            out_channels=channels,
+            kernel_size=1,
+            stride=1,
+            conv_cfg=conv_cfg,
+            act_cfg=act_cfg[1])
+
+    def forward(self, x):
+        # Per-channel gates computed from the globally pooled input.
+        return x * self.conv2(self.conv1(self.global_avgpool(x)))
+
+
+class DyReLU(BaseModule):
+    """Dynamic ReLU (DyReLU) module.
+
+    See `Dynamic ReLU <https://arxiv.org/abs/2003.10027>`_ for details.
+    Current implementation is specialized for task-aware attention in DyHead.
+    HSigmoid arguments in default act_cfg follow DyHead official code.
+    https://github.com/microsoft/DynamicHead/blob/master/dyhead/dyrelu.py
+
+    Args:
+        channels (int): The input (and output) channels of DyReLU module.
+        ratio (int): Squeeze ratio in Squeeze-and-Excitation-like module,
+            the intermediate channel will be ``int(channels/ratio)``.
+            Default: 4.
+        conv_cfg (None or dict): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        act_cfg (dict or tuple[dict]): Config dict for activation layer.
+            If act_cfg is a dict, two activation layers will be configured
+            by this dict. If act_cfg is a tuple of dicts, the first
+            activation layer will be configured by the first dict and the
+            second activation layer will be configured by the second dict.
+            Default: (dict(type='ReLU'), dict(type='HSigmoid', bias=3.0,
+            divisor=6.0))
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 channels,
+                 ratio=4,
+                 conv_cfg=None,
+                 act_cfg=(dict(type='ReLU'),
+                          dict(type='HSigmoid', bias=3.0, divisor=6.0)),
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+        if isinstance(act_cfg, dict):
+            act_cfg = (act_cfg, act_cfg)
+        assert len(act_cfg) == 2
+        assert mmcv.is_tuple_of(act_cfg, dict)
+        self.channels = channels
+        self.expansion = 4  # for a1, b1, a2, b2
+        mid_channels = int(channels / ratio)
+        self.global_avgpool = nn.AdaptiveAvgPool2d(1)
+        self.conv1 = ConvModule(
+            in_channels=channels,
+            out_channels=mid_channels,
+            kernel_size=1,
+            stride=1,
+            conv_cfg=conv_cfg,
+            act_cfg=act_cfg[0])
+        self.conv2 = ConvModule(
+            in_channels=mid_channels,
+            out_channels=channels * self.expansion,
+            kernel_size=1,
+            stride=1,
+            conv_cfg=conv_cfg,
+            act_cfg=act_cfg[1])
+
+    def forward(self, x):
+        """Forward function."""
+        pooled = self.global_avgpool(x)
+        # range [-0.5, 0.5], assuming the second activation outputs [0, 1]
+        coeffs = self.conv2(self.conv1(pooled)) - 0.5
+        a1, b1, a2, b2 = torch.split(coeffs, self.channels, dim=1)
+        a1 = a1 * 2.0 + 1.0  # [-1.0, 1.0] + 1.0
+        a2 = a2 * 2.0  # [-1.0, 1.0]
+        return torch.max(x * a1 + b1, x * a2 + b2)
diff --git a/mmdet/models/utils/transformer.py b/mmdet/models/utils/transformer.py
new file mode 100755
index 0000000..3c390c8
--- /dev/null
+++ b/mmdet/models/utils/transformer.py
@@ -0,0 +1,1167 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+import warnings
+from typing import Sequence
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import (build_activation_layer, build_conv_layer,
+                      build_norm_layer, xavier_init)
+from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER,
+                                      TRANSFORMER_LAYER_SEQUENCE)
+from mmcv.cnn.bricks.transformer import (BaseTransformerLayer,
+                                         TransformerLayerSequence,
+                                         build_transformer_layer_sequence)
+from mmcv.runner.base_module import BaseModule
+from mmcv.utils import to_2tuple
+from torch.nn.init import normal_
+
+from mmdet.models.utils.builder import TRANSFORMER
+
+try:
+    from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention
+
+except ImportError:
+    warnings.warn(
+        '`MultiScaleDeformableAttention` in MMCV has been moved to '
+        '`mmcv.ops.multi_scale_deform_attn`, please update your MMCV')
+    from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention
+
+
+def nlc_to_nchw(x, hw_shape):
+    """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor.
+
+    Args:
+        x (Tensor): The input tensor of shape [N, L, C] before conversion.
+        hw_shape (Sequence[int]): Output (H, W); H * W must equal L.
+
+    Returns:
+        Tensor: The output tensor of shape [N, C, H, W] after conversion.
+    """
+    H, W = hw_shape
+    assert len(x.shape) == 3  # only 3-D input is accepted
+    B, L, C = x.shape  # B is the batch size (N in the docstring)
+    assert L == H * W, 'The seq_len does not match H, W'
+    return x.transpose(1, 2).reshape(B, C, H, W).contiguous()
+
+
+def nchw_to_nlc(x):
+    """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor.
+
+    Args:
+        x (Tensor): The input tensor of shape [N, C, H, W] before conversion.
+
+    Returns:
+        Tensor: The output tensor of shape [N, L, C], where L = H * W.
+    """
+    assert len(x.shape) == 4  # only 4-D input is accepted
+    return x.flatten(2).transpose(1, 2).contiguous()
+
+
+class AdaptivePadding(nn.Module):
+    """Applies padding to input (if needed) so that input can get fully covered
+    by filter you specified. It supports two modes "same" and "corner". The
+    "same" mode is same with "SAME" padding mode in TensorFlow, pad zero around
+    input. The "corner" mode would pad zero to bottom right.
+
+    Args:
+        kernel_size (int | tuple): Size of the kernel. Default: 1.
+        stride (int | tuple): Stride of the filter. Default: 1.
+        dilation (int | tuple): Spacing between kernel elements.
+            Default: 1.
+        padding (str): Support "same" and "corner", "corner" mode
+            would pad zero to bottom right, and "same" mode would
+            pad zero around input. Default: "corner".
+    Example:
+        >>> kernel_size = 16
+        >>> stride = 16
+        >>> dilation = 1
+        >>> input = torch.rand(1, 1, 15, 17)
+        >>> adap_pad = AdaptivePadding(
+        >>>     kernel_size=kernel_size,
+        >>>     stride=stride,
+        >>>     dilation=dilation,
+        >>>     padding="corner")
+        >>> out = adap_pad(input)
+        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
+        >>> input = torch.rand(1, 1, 16, 17)
+        >>> out = adap_pad(input)
+        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
+    """
+
+    def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
+
+        super(AdaptivePadding, self).__init__()
+
+        assert padding in ('same', 'corner')
+
+        kernel_size = to_2tuple(kernel_size)
+        stride = to_2tuple(stride)
+        padding = to_2tuple(padding)  # NOTE(review): forward() needs a str
+        dilation = to_2tuple(dilation)
+
+        self.padding = padding
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.dilation = dilation
+
+    def get_pad_shape(self, input_shape):
+        input_h, input_w = input_shape
+        kernel_h, kernel_w = self.kernel_size
+        stride_h, stride_w = self.stride
+        output_h = math.ceil(input_h / stride_h)  # windows needed along H
+        output_w = math.ceil(input_w / stride_w)  # windows needed along W
+        pad_h = max((output_h - 1) * stride_h +
+                    (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
+        pad_w = max((output_w - 1) * stride_w +
+                    (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
+        return pad_h, pad_w  # total padding per axis, never negative
+
+    def forward(self, x):
+        pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
+        if pad_h > 0 or pad_w > 0:  # otherwise x is returned untouched
+            if self.padding == 'corner':
+                x = F.pad(x, [0, pad_w, 0, pad_h])  # right and bottom only
+            elif self.padding == 'same':
+                x = F.pad(x, [  # odd remainder goes to right / bottom
+                    pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
+                    pad_h - pad_h // 2
+                ])
+        return x
+
+
+class PatchEmbed(BaseModule):
+    """Image to Patch Embedding.
+
+    We use a conv layer to implement PatchEmbed.
+
+    Args:
+        in_channels (int): The num of input channels. Default: 3
+        embed_dims (int): The dimensions of embedding. Default: 768
+        conv_type (str): The type of the embedding conv layer, passed
+            to `build_conv_layer`. Default: "Conv2d".
+        kernel_size (int): The kernel_size of embedding conv. Default: 16.
+        stride (int): The slide stride of embedding conv.
+            Default: 16. If None, it is set to `kernel_size`.
+        padding (int | tuple | string ): The padding length of
+            embedding conv. When it is a string, it means the mode
+            of adaptive padding, support "same" and "corner" now.
+            Default: "corner".
+        dilation (int): The dilation rate of embedding conv. Default: 1.
+        bias (bool): Bias of embed conv. Default: True.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Default: None.
+        input_size (int | tuple | None): The size of input, which is
+            used to pre-compute `init_input_size` and `init_out_size`.
+            Default: None.
+        init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(
+        self,
+        in_channels=3,
+        embed_dims=768,
+        conv_type='Conv2d',
+        kernel_size=16,
+        stride=16,
+        padding='corner',
+        dilation=1,
+        bias=True,
+        norm_cfg=None,
+        input_size=None,
+        init_cfg=None,
+    ):
+        super(PatchEmbed, self).__init__(init_cfg=init_cfg)
+
+        self.embed_dims = embed_dims
+        if stride is None:
+            stride = kernel_size
+
+        kernel_size = to_2tuple(kernel_size)
+        stride = to_2tuple(stride)
+        dilation = to_2tuple(dilation)
+
+        if isinstance(padding, str):
+            self.adap_padding = AdaptivePadding(
+                kernel_size=kernel_size,
+                stride=stride,
+                dilation=dilation,
+                padding=padding)
+            # disable the padding of conv
+            padding = 0
+        else:
+            self.adap_padding = None
+        padding = to_2tuple(padding)
+
+        self.projection = build_conv_layer(
+            dict(type=conv_type),
+            in_channels=in_channels,
+            out_channels=embed_dims,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        if norm_cfg is not None:
+            self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
+        else:
+            self.norm = None
+
+        if input_size:
+            input_size = to_2tuple(input_size)
+            # `init_out_size` is exposed so that callers can compute
+            # the number of patches ahead of time, e.g. for an
+            # absolute position embedding.
+            self.init_input_size = input_size  # size before adaptive padding
+            if self.adap_padding:
+                pad_h, pad_w = self.adap_padding.get_pad_shape(input_size)
+                input_h, input_w = input_size
+                input_h = input_h + pad_h
+                input_w = input_w + pad_w
+                input_size = (input_h, input_w)
+
+            # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
+            h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
+                     (kernel_size[0] - 1) - 1) // stride[0] + 1
+            w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
+                     (kernel_size[1] - 1) - 1) // stride[1] + 1
+            self.init_out_size = (h_out, w_out)
+        else:
+            self.init_input_size = None
+            self.init_out_size = None
+
+    def forward(self, x):
+        """
+        Args:
+            x (Tensor): Has shape (B, C, H, W). In most case, C is 3.
+
+        Returns:
+            tuple: Contains merged results and its spatial shape.
+
+                - x (Tensor): Has shape (B, out_h * out_w, embed_dims)
+                - out_size (tuple[int]): Spatial shape of x, arrange as
+                    (out_h, out_w).
+        """
+
+        if self.adap_padding:
+            x = self.adap_padding(x)
+
+        x = self.projection(x)
+        out_size = (x.shape[2], x.shape[3])  # read before flattening
+        x = x.flatten(2).transpose(1, 2)  # (B, C, H, W) -> (B, H*W, C)
+        if self.norm is not None:
+            x = self.norm(x)
+        return x, out_size
+
+
+class PatchMerging(BaseModule):
+    """Merge patch feature map.
+
+    This layer groups feature map by kernel_size, and applies norm and linear
+    layers to the grouped feature map. Our implementation uses `nn.Unfold` to
+    merge patch, which is about 25% faster than original implementation.
+    As a trade-off, pretrained models must be modified for compatibility.
+
+    Args:
+        in_channels (int): The num of input channels. The unfolded
+            feature has `kernel_size[0] * kernel_size[1] * in_channels`
+            channels before the linear reduction.
+        out_channels (int): The num of output channels.
+        kernel_size (int | tuple, optional): the kernel size in the unfold
+            layer. Defaults to 2.
+        stride (int | tuple, optional): the stride of the sliding blocks in the
+            unfold layer. Default: None. (Would be set as `kernel_size`)
+        padding (int | tuple | string ): The padding length of
+            the unfold layer. When it is a string, it means the mode
+            of adaptive padding, support "same" and "corner" now.
+            Default: "corner".
+        dilation (int | tuple, optional): dilation parameter in the unfold
+            layer. Default: 1.
+        bias (bool, optional): Whether to add bias in linear layer or not.
+            Default: False.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Default: dict(type='LN').
+        init_cfg (dict, optional): The extra config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=2,
+                 stride=None,
+                 padding='corner',
+                 dilation=1,
+                 bias=False,
+                 norm_cfg=dict(type='LN'),
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        if stride:  # any falsy stride (None, 0) falls back to kernel_size
+            stride = stride
+        else:
+            stride = kernel_size
+
+        kernel_size = to_2tuple(kernel_size)
+        stride = to_2tuple(stride)
+        dilation = to_2tuple(dilation)
+
+        if isinstance(padding, str):
+            self.adap_padding = AdaptivePadding(
+                kernel_size=kernel_size,
+                stride=stride,
+                dilation=dilation,
+                padding=padding)
+            # disable the padding of unfold
+            padding = 0
+        else:
+            self.adap_padding = None
+
+        padding = to_2tuple(padding)
+        self.sampler = nn.Unfold(
+            kernel_size=kernel_size,
+            dilation=dilation,
+            padding=padding,
+            stride=stride)
+
+        sample_dim = kernel_size[0] * kernel_size[1] * in_channels
+
+        if norm_cfg is not None:
+            self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
+        else:
+            self.norm = None
+
+        self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
+
+    def forward(self, x, input_size):
+        """
+        Args:
+            x (Tensor): Has shape (B, H*W, C_in).
+            input_size (Sequence[int]): The spatial shape of x, arranged
+                as (H, W). Required; asserted to be a `Sequence`.
+
+        Returns:
+            tuple: Contains merged results and its spatial shape.
+
+                - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
+                - out_size (tuple[int]): Spatial shape of x, arrange as
+                    (Merged_H, Merged_W).
+        """
+        B, L, C = x.shape
+        assert isinstance(input_size, Sequence), f'Expect ' \
+                                                 f'input_size is ' \
+                                                 f'`Sequence` ' \
+                                                 f'but get {input_size}'
+
+        H, W = input_size
+        assert L == H * W, 'input feature has wrong size'
+
+        x = x.view(B, H, W, C).permute([0, 3, 1, 2])  # B, C, H, W
+        # Use nn.Unfold to merge patch. About 25% faster than original method,
+        # but need to modify pretrained model for compatibility
+
+        if self.adap_padding:
+            x = self.adap_padding(x)
+            H, W = x.shape[-2:]  # padded size feeds the out_h/out_w formula
+
+        x = self.sampler(x)
+        # if kernel_size=2 and stride=2, x should have shape (B, 4*C, H/2*W/2)
+
+        out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
+                 (self.sampler.kernel_size[0] - 1) -
+                 1) // self.sampler.stride[0] + 1
+        out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
+                 (self.sampler.kernel_size[1] - 1) -
+                 1) // self.sampler.stride[1] + 1
+
+        output_size = (out_h, out_w)
+        x = x.transpose(1, 2)  # B, H/2*W/2, 4*C
+        x = self.norm(x) if self.norm else x
+        x = self.reduction(x)
+        return x, output_size
+
+
+def inverse_sigmoid(x, eps=1e-5):
+    """Inverse function of sigmoid.
+
+    Args:
+        x (Tensor): The tensor to invert; values are
+            clamped to [0, 1] first.
+        eps (float): Lower bound applied to both x and 1 - x
+            before the division and log. Defaults to 1e-5.
+    Returns:
+        Tensor: log(x / (1 - x)) computed on the clamped
+            values, with the same shape as the
+            input.
+    """
+    x = x.clamp(min=0, max=1)
+    x1 = x.clamp(min=eps)  # numerator, kept away from 0
+    x2 = (1 - x).clamp(min=eps)  # denominator, kept away from 0
+    return torch.log(x1 / x2)
+
+
+@TRANSFORMER_LAYER.register_module()
+class DetrTransformerDecoderLayer(BaseTransformerLayer):
+    """Implements decoder layer in DETR transformer.
+
+    Args:
+        attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict):
+            Configs for self_attention or cross_attention, the order
+            should be consistent with it in `operation_order`. If it is
+            a dict, it would be expand to the number of attention in
+            `operation_order`.
+        feedforward_channels (int): The hidden dimension for FFNs.
+        ffn_dropout (float): Probability of an element to be zeroed
+            in ffn. Default 0.0.
+        operation_order (tuple[str]): The execution order of operation
+            in transformer. Must contain 6 operations covering exactly
+            'self_attn', 'norm', 'cross_attn' and 'ffn'. Default: None.
+        act_cfg (dict): The activation config for FFNs. Default: `ReLU`.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: `LN`.
+        ffn_num_fcs (int): The number of fully-connected layers in FFNs.
+            Default: 2.
+    """
+
+    def __init__(self,
+                 attn_cfgs,
+                 feedforward_channels,
+                 ffn_dropout=0.0,
+                 operation_order=None,
+                 act_cfg=dict(type='ReLU', inplace=True),
+                 norm_cfg=dict(type='LN'),
+                 ffn_num_fcs=2,
+                 **kwargs):
+        super(DetrTransformerDecoderLayer, self).__init__(
+            attn_cfgs=attn_cfgs,
+            feedforward_channels=feedforward_channels,
+            ffn_dropout=ffn_dropout,
+            operation_order=operation_order,
+            act_cfg=act_cfg,
+            norm_cfg=norm_cfg,
+            ffn_num_fcs=ffn_num_fcs,
+            **kwargs)
+        assert len(operation_order) == 6  # checked after the parent is built
+        assert set(operation_order) == set(
+            ['self_attn', 'norm', 'cross_attn', 'ffn'])
+
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class DetrTransformerEncoder(TransformerLayerSequence):
+    """TransformerEncoder of DETR.
+
+    Args:
+        post_norm_cfg (dict): Config of last normalization layer. Default:
+            `LN`. Only used when `self.pre_norm` is `True`
+    """
+
+    def __init__(self, *args, post_norm_cfg=dict(type='LN'), **kwargs):
+        super(DetrTransformerEncoder, self).__init__(*args, **kwargs)
+        if post_norm_cfg is not None:
+            self.post_norm = build_norm_layer(
+                post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None
+        else:
+            assert not self.pre_norm, f'Use prenorm in ' \
+                                      f'{self.__class__.__name__},' \
+                                      f'Please specify post_norm_cfg'
+            self.post_norm = None
+
+    def forward(self, *args, **kwargs):
+        """Forward function for `DetrTransformerEncoder`.
+
+        Returns:
+            Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+        x = super(DetrTransformerEncoder, self).forward(*args, **kwargs)
+        if self.post_norm is not None:  # only built when pre_norm is used
+            x = self.post_norm(x)
+        return x
+
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class DetrTransformerDecoder(TransformerLayerSequence):
+    """Implements the decoder in DETR transformer.
+
+    Args:
+        return_intermediate (bool): Whether to return intermediate outputs.
+        post_norm_cfg (dict): Config of last normalization layer. Default:
+            `LN`. Pass None to skip the final normalization.
+    """
+
+    def __init__(self,
+                 *args,
+                 post_norm_cfg=dict(type='LN'),
+                 return_intermediate=False,
+                 **kwargs):
+
+        super(DetrTransformerDecoder, self).__init__(*args, **kwargs)
+        self.return_intermediate = return_intermediate
+        if post_norm_cfg is not None:
+            self.post_norm = build_norm_layer(post_norm_cfg,
+                                              self.embed_dims)[1]
+        else:
+            self.post_norm = None
+
+    def forward(self, query, *args, **kwargs):
+        """Forward function for `TransformerDecoder`.
+
+        Args:
+            query (Tensor): Input query with shape
+                `(num_query, bs, embed_dims)`.
+
+        Returns:
+            Tensor: Results with shape [1, num_query, bs, embed_dims] when
+                return_intermediate is `False`, otherwise it has shape
+                [num_layers, num_query, bs, embed_dims].
+        """
+        if not self.return_intermediate:
+            x = super().forward(query, *args, **kwargs)
+            if self.post_norm is not None:
+                x = self.post_norm(x)
+            return x[None]  # always add the leading dim, even without norm
+
+        intermediate = []
+        for layer in self.layers:
+            query = layer(query, *args, **kwargs)
+            if self.post_norm is not None:
+                intermediate.append(self.post_norm(query))
+            else:
+                intermediate.append(query)
+        # every per-layer output is kept; stacking adds the num_layers dim
+        return torch.stack(intermediate)
+
+
+@TRANSFORMER.register_module()
+class Transformer(BaseModule):
+    """Implements the DETR transformer.
+
+    Following the official DETR implementation, this module is copy-pasted
+    from torch.nn.Transformer with modifications:
+
+        * positional encodings are passed in MultiheadAttention
+        * extra LN at the end of encoder is removed
+        * decoder returns a stack of activations from all decoding layers
+
+    See `paper: End-to-End Object Detection with Transformers
+    <https://arxiv.org/pdf/2005.12872>`_ for details.
+
+    Args:
+        encoder (`mmcv.ConfigDict` | Dict): Config of
+            TransformerEncoder. Defaults to None.
+        decoder (`mmcv.ConfigDict` | Dict): Config of
+            TransformerDecoder. Defaults to None.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Defaults to None.
+    """
+
+    def __init__(self, encoder=None, decoder=None, init_cfg=None):
+        super(Transformer, self).__init__(init_cfg=init_cfg)
+        self.encoder = build_transformer_layer_sequence(encoder)
+        self.decoder = build_transformer_layer_sequence(decoder)
+        self.embed_dims = self.encoder.embed_dims
+
+    def init_weights(self):
+        # follow the official DETR to init parameters
+        for m in self.modules():
+            if hasattr(m, 'weight') and m.weight.dim() > 1:
+                xavier_init(m, distribution='uniform')
+        self._is_init = True  # presumably read by BaseModule -- confirm
+
+    def forward(self, x, mask, query_embed, pos_embed):
+        """Forward function for `Transformer`.
+
+        Args:
+            x (Tensor): Input query with shape [bs, c, h, w] where
+                c = embed_dims.
+            mask (Tensor): The key_padding_mask used for encoder and decoder,
+                with shape [bs, h, w].
+            query_embed (Tensor): The query embedding for decoder, with shape
+                [num_query, c].
+            pos_embed (Tensor): The positional encoding for encoder and
+                decoder, with the same shape as `x`.
+
+        Returns:
+            tuple[Tensor]: results of decoder containing the following tensor.
+
+                - out_dec: Output from decoder. If return_intermediate_dec \
+                      is True output has shape [num_dec_layers, bs, \
+                      num_query, embed_dims], else has shape [1, bs, \
+                      num_query, embed_dims].
+                - memory: Output results from encoder, with shape \
+                      [bs, embed_dims, h, w].
+        """
+        bs, c, h, w = x.shape
+        # use `view` instead of `flatten` for dynamically exporting to ONNX
+        x = x.view(bs, c, -1).permute(2, 0, 1)  # [bs, c, h, w] -> [h*w, bs, c]
+        pos_embed = pos_embed.view(bs, c, -1).permute(2, 0, 1)
+        query_embed = query_embed.unsqueeze(1).repeat(
+            1, bs, 1)  # [num_query, dim] -> [num_query, bs, dim]
+        mask = mask.view(bs, -1)  # [bs, h, w] -> [bs, h*w]
+        memory = self.encoder(
+            query=x,
+            key=None,
+            value=None,
+            query_pos=pos_embed,
+            query_key_padding_mask=mask)
+        target = torch.zeros_like(query_embed)  # decoder input starts at zero
+        # out_dec: [num_layers, num_query, bs, dim]
+        out_dec = self.decoder(
+            query=target,
+            key=memory,
+            value=memory,
+            key_pos=pos_embed,
+            query_pos=query_embed,
+            key_padding_mask=mask)
+        out_dec = out_dec.transpose(1, 2)  # -> [layers, bs, num_query, dim]
+        memory = memory.permute(1, 2, 0).reshape(bs, c, h, w)
+        return out_dec, memory
+
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class DeformableDetrTransformerDecoder(TransformerLayerSequence):
+    """Implements the decoder in Deformable DETR transformer.
+
+    Args:
+        return_intermediate (bool): Whether to return intermediate outputs.
+        *args, **kwargs: Passed through to `TransformerLayerSequence`; this
+            class itself builds no extra normalization layer.
+    """
+
+    def __init__(self, *args, return_intermediate=False, **kwargs):
+
+        super(DeformableDetrTransformerDecoder, self).__init__(*args, **kwargs)
+        self.return_intermediate = return_intermediate
+
+    def forward(self,
+                query,
+                *args,
+                reference_points=None,
+                valid_ratios=None,
+                reg_branches=None,
+                **kwargs):
+        """Forward function for `DeformableDetrTransformerDecoder`.
+
+        Args:
+            query (Tensor): Input query with shape
+                `(num_query, bs, embed_dims)`.
+            reference_points (Tensor): The reference
+                points of offset. Has shape
+                (bs, num_query, 4) when as_two_stage,
+                otherwise has shape (bs, num_query, 2).
+            valid_ratios (Tensor): The ratios of valid
+                points on the feature map, has shape
+                (bs, num_levels, 2)
+            reg_branches (obj:`nn.ModuleList`): Used for
+                refining the regression results. Only would
+                be passed when with_box_refine is True,
+                otherwise would be passed a `None`.
+
+        Returns:
+            tuple[Tensor, Tensor]: The last layer's output and reference
+                points when return_intermediate is `False`, otherwise the
+                per-layer outputs and reference points, each stacked on dim 0.
+        """
+        output = query
+        intermediate = []
+        intermediate_reference_points = []
+        for lid, layer in enumerate(self.layers):
+            # reference_points is required despite its None default
+            if reference_points.shape[-1] == 4:
+                reference_points_input = reference_points[:, :, None] * \
+                    torch.cat([valid_ratios, valid_ratios], -1)[:, None]
+            else:
+                assert reference_points.shape[-1] == 2
+                reference_points_input = reference_points[:, :, None] * \
+                    valid_ratios[:, None]
+            output = layer(
+                output,
+                *args,
+                reference_points=reference_points_input,
+                **kwargs)
+            output = output.permute(1, 0, 2)  # swap dims 0 and 1 for refine
+
+            if reg_branches is not None:
+                tmp = reg_branches[lid](output)
+                if reference_points.shape[-1] == 4:
+                    new_reference_points = tmp + inverse_sigmoid(
+                        reference_points)
+                    new_reference_points = new_reference_points.sigmoid()
+                else:
+                    assert reference_points.shape[-1] == 2
+                    new_reference_points = tmp  # alias: tmp is written below
+                    new_reference_points[..., :2] = tmp[
+                        ..., :2] + inverse_sigmoid(reference_points)
+                    new_reference_points = new_reference_points.sigmoid()
+                reference_points = new_reference_points.detach()  # no grad
+
+            output = output.permute(1, 0, 2)  # undo the swap above
+            if self.return_intermediate:
+                intermediate.append(output)
+                intermediate_reference_points.append(reference_points)
+
+        if self.return_intermediate:
+            return torch.stack(intermediate), torch.stack(
+                intermediate_reference_points)
+
+        return output, reference_points
+
+
+@TRANSFORMER.register_module()
+class DeformableDetrTransformer(Transformer):
+    """Implements the DeformableDETR transformer.
+
+    Args:
+        as_two_stage (bool): Generate query from encoder features.
+            Default: False.
+        num_feature_levels (int): Number of feature maps from FPN:
+            Default: 4.
+        two_stage_num_proposals (int): Number of proposals when set
+            `as_two_stage` as True. Default: 300.
+    """
+
+    def __init__(self,
+                 as_two_stage=False,
+                 num_feature_levels=4,
+                 two_stage_num_proposals=300,
+                 **kwargs):
+        super(DeformableDetrTransformer, self).__init__(**kwargs)
+        self.as_two_stage = as_two_stage
+        self.num_feature_levels = num_feature_levels
+        self.two_stage_num_proposals = two_stage_num_proposals
+        self.embed_dims = self.encoder.embed_dims  # same value parent set
+        self.init_layers()  # reads the attributes assigned above
+
+    def init_layers(self):
+        """Initialize layers of the DeformableDetrTransformer."""
+        self.level_embeds = nn.Parameter(  # values filled by init_weights()
+            torch.Tensor(self.num_feature_levels, self.embed_dims))
+
+        if self.as_two_stage:  # layers used only in two-stage mode
+            self.enc_output = nn.Linear(self.embed_dims, self.embed_dims)
+            self.enc_output_norm = nn.LayerNorm(self.embed_dims)
+            self.pos_trans = nn.Linear(self.embed_dims * 2,
+                                       self.embed_dims * 2)
+            self.pos_trans_norm = nn.LayerNorm(self.embed_dims * 2)
+        else:
+            self.reference_points = nn.Linear(self.embed_dims, 2)  # 2 coords
+
+    def init_weights(self):
+        """Initialize the transformer weights."""
+        for p in self.parameters():
+            if p.dim() > 1:  # params with fewer than 2 dims are skipped
+                nn.init.xavier_uniform_(p)
+        for m in self.modules():
+            if isinstance(m, MultiScaleDeformableAttention):
+                m.init_weights()  # runs after, so it overrides xavier above
+        if not self.as_two_stage:  # reference_points only exists in this mode
+            xavier_init(self.reference_points, distribution='uniform', bias=0.)
+        normal_(self.level_embeds)  # last, replacing the xavier values
+
+    def gen_encoder_output_proposals(self, memory, memory_padding_mask,
+                                     spatial_shapes):
+        """Generate proposals from encoded memory.
+
+        Args:
+            memory (Tensor) : The output of encoder,
+                has shape (bs, num_key, embed_dim).  num_key is
+                equal the number of points on feature map from
+                all level.
+            memory_padding_mask (Tensor): Padding mask for memory.
+                has shape (bs, num_key).
+            spatial_shapes (Tensor): The shape of all feature maps.
+                has shape (num_level, 2).
+
+        Returns:
+            tuple: A tuple of feature map and bbox prediction.
+
+                - output_memory (Tensor): The input of decoder,  \
+                    has shape (bs, num_key, embed_dim).  num_key is \
+                    equal the number of points on feature map from \
+                    all levels.
+                - output_proposals (Tensor): The normalized proposal \
+                    after an inverse sigmoid (`inf` where padded or \
+                    invalid), has shape (bs, num_keys, 4).
+        """
+
+        N, S, C = memory.shape
+        proposals = []
+        _cur = 0  # start offset of the current level inside num_key
+        for lvl, (H, W) in enumerate(spatial_shapes):
+            mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H * W)].view(
+                N, H, W, 1)
+            valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)  # from column 0
+            valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)  # from row 0
+
+            grid_y, grid_x = torch.meshgrid(
+                torch.linspace(
+                    0, H - 1, H, dtype=torch.float32, device=memory.device),
+                torch.linspace(
+                    0, W - 1, W, dtype=torch.float32, device=memory.device))
+            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
+
+            scale = torch.cat([valid_W.unsqueeze(-1),
+                               valid_H.unsqueeze(-1)], 1).view(N, 1, 1, 2)
+            grid = (grid.unsqueeze(0).expand(N, -1, -1, -1) + 0.5) / scale
+            wh = torch.ones_like(grid) * 0.05 * (2.0**lvl)  # 2x per level
+            proposal = torch.cat((grid, wh), -1).view(N, -1, 4)
+            proposals.append(proposal)
+            _cur += (H * W)
+        output_proposals = torch.cat(proposals, 1)
+        output_proposals_valid = ((output_proposals > 0.01) &  # all 4 values
+                                  (output_proposals < 0.99)).all(
+                                      -1, keepdim=True)
+        output_proposals = torch.log(output_proposals / (1 - output_proposals))
+        output_proposals = output_proposals.masked_fill(
+            memory_padding_mask.unsqueeze(-1), float('inf'))
+        output_proposals = output_proposals.masked_fill(
+            ~output_proposals_valid, float('inf'))
+
+        output_memory = memory
+        output_memory = output_memory.masked_fill(
+            memory_padding_mask.unsqueeze(-1), float(0))
+        output_memory = output_memory.masked_fill(~output_proposals_valid,
+                                                  float(0))
+        output_memory = self.enc_output_norm(self.enc_output(output_memory))
+        return output_memory, output_proposals
+
+    @staticmethod
+    def get_reference_points(spatial_shapes, valid_ratios, device):
+        """Build the normalized reference points of every feature level.
+
+        Args:
+            spatial_shapes (Tensor): Iterated as one (H, W) pair per
+                feature level; documented shape is (num_level, 2).
+            valid_ratios (Tensor): The ratios of valid points on the
+                feature map, read as [..., 0] for width and [..., 1]
+                for height; documented shape is (bs, num_levels, 2).
+            device (obj:`device`): Device the grids are created on.
+
+        Returns:
+            Tensor: reference points used in decoder, has \
+                shape (bs, num_keys, num_levels, 2).
+        """
+        per_level = []
+        for lvl, (H, W) in enumerate(spatial_shapes):
+            # TODO: check this 0.5 offset. The grids below sample
+            # 0.5, 1.5, ..., H - 0.5 (and likewise along W).
+            ys = torch.linspace(
+                0.5, H - 0.5, H, dtype=torch.float32, device=device)
+            xs = torch.linspace(
+                0.5, W - 0.5, W, dtype=torch.float32, device=device)
+            grid_y, grid_x = torch.meshgrid(ys, xs)
+            # Divide by the valid extent (ratio * size) of this level.
+            norm_y = grid_y.reshape(-1)[None] / (
+                valid_ratios[:, None, lvl, 1] * H)
+            norm_x = grid_x.reshape(-1)[None] / (
+                valid_ratios[:, None, lvl, 0] * W)
+            per_level.append(torch.stack((norm_x, norm_y), -1))
+        stacked = torch.cat(per_level, 1)
+        # Scale by every level's valid ratio along a new level axis.
+        return stacked[:, :, None] * valid_ratios[:, None]
+
+    def get_valid_ratio(self, mask):
+        """Return per-sample (width, height) ratios of unmasked entries."""
+        height, width = mask.shape[1:]
+        # Count unmasked entries down the first column / along the first row.
+        rows_valid = torch.sum(~mask[:, :, 0], 1).float()
+        cols_valid = torch.sum(~mask[:, 0, :], 1).float()
+        ratio_h = rows_valid / height
+        ratio_w = cols_valid / width
+        return torch.stack([ratio_w, ratio_h], -1)
+
+    def get_proposal_pos_embed(self,
+                               proposals,
+                               num_pos_feats=128,
+                               temperature=10000):
+        """Encode proposals with interleaved sine/cosine position features."""
+        channel_idx = torch.arange(
+            num_pos_feats, dtype=torch.float32, device=proposals.device)
+        freqs = temperature**(2 * (channel_idx // 2) / num_pos_feats)
+        # N, L, 4: squash with sigmoid, then scale by 2 * pi
+        angles = proposals.sigmoid() * (2 * math.pi)
+        # N, L, 4, 128 (with the default num_pos_feats)
+        scaled = angles[:, :, :, None] / freqs
+        sin_part = scaled[:, :, :, 0::2].sin()
+        cos_part = scaled[:, :, :, 1::2].cos()
+        # Pair sin/cos on a new last axis (N, L, 4, 64, 2 by default),
+        # then flatten everything from dim 2 onwards.
+        return torch.stack((sin_part, cos_part), dim=4).flatten(2)
+
+    def forward(self,
+                mlvl_feats,
+                mlvl_masks,
+                query_embed,
+                mlvl_pos_embeds,
+                reg_branches=None,
+                cls_branches=None,
+                **kwargs):
+        """Forward function for `Transformer`.
+
+        Args:
+            mlvl_feats (list(Tensor)): Input queries from
+                different level. Each element has shape
+                [bs, embed_dims, h, w].
+            mlvl_masks (list(Tensor)): The key_padding_mask from
+                different level used for encoder and decoder,
+                each element has shape [bs, h, w].
+            query_embed (Tensor): The query embedding for decoder,
+                with shape [num_query, c]. It is split along dim 1 into
+                `query_pos` and `query`; unused when `as_two_stage` is True.
+            mlvl_pos_embeds (list(Tensor)): The positional encoding
+                of feats from different level, has the shape
+                [bs, embed_dims, h, w].
+            reg_branches (obj:`nn.ModuleList`): Regression heads for
+                feature maps from each decoder layer. Only would be
+                passed when `with_box_refine` is True. Default to None.
+            cls_branches (obj:`nn.ModuleList`): Classification heads
+                for feature maps from each decoder layer. Only would
+                be passed when `as_two_stage` is True. Default to None.
+            kwargs: Forwarded unchanged to the encoder and the decoder.
+
+        Returns:
+            tuple[Tensor]: results of decoder containing the following tensor.
+
+                - inter_states: Outputs from decoder. If
+                    return_intermediate_dec is True output has shape \
+                      (num_dec_layers, bs, num_query, embed_dims), else has \
+                      shape (1, bs, num_query, embed_dims).
+                - init_reference_out: The initial value of reference \
+                    points, has shape (bs, num_queries, 4).
+                - inter_references_out: The internal value of reference \
+                    points in decoder, has shape \
+                    (num_dec_layers, bs, num_query, embed_dims)
+                - enc_outputs_class: The classification score of \
+                    proposals generated from encoder's feature maps, \
+                    has shape (batch, h*w, num_classes). Only would be \
+                    returned when `as_two_stage` is True, otherwise None.
+                - enc_outputs_coord_unact: The regression results \
+                    generated from encoder's feature maps, has shape \
+                    (batch, h*w, 4). Only would be returned when \
+                    `as_two_stage` is True, otherwise None.
+        """
+        assert self.as_two_stage or query_embed is not None
+
+        feat_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes = []
+        # Flatten each level to (bs, h*w, c) and add that level's entry of
+        # `self.level_embeds` to its positional encoding.
+        for lvl, (feat, mask, pos_embed) in enumerate(
+                zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)):
+            bs, c, h, w = feat.shape
+            spatial_shape = (h, w)
+            spatial_shapes.append(spatial_shape)
+            feat = feat.flatten(2).transpose(1, 2)
+            mask = mask.flatten(1)
+            pos_embed = pos_embed.flatten(2).transpose(1, 2)
+            lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1)
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            feat_flatten.append(feat)
+            mask_flatten.append(mask)
+        feat_flatten = torch.cat(feat_flatten, 1)
+        mask_flatten = torch.cat(mask_flatten, 1)
+        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
+        spatial_shapes = torch.as_tensor(
+            spatial_shapes, dtype=torch.long, device=feat_flatten.device)
+        # Index of each level's first element in the concatenated sequence.
+        level_start_index = torch.cat((spatial_shapes.new_zeros(
+            (1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        valid_ratios = torch.stack(
+            [self.get_valid_ratio(m) for m in mlvl_masks], 1)
+
+        reference_points = \
+            self.get_reference_points(spatial_shapes,
+                                      valid_ratios,
+                                      device=feat.device)
+
+        feat_flatten = feat_flatten.permute(1, 0, 2)  # (H*W, bs, embed_dims)
+        lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute(
+            1, 0, 2)  # (H*W, bs, embed_dims)
+        memory = self.encoder(
+            query=feat_flatten,
+            key=None,
+            value=None,
+            query_pos=lvl_pos_embed_flatten,
+            query_key_padding_mask=mask_flatten,
+            spatial_shapes=spatial_shapes,
+            reference_points=reference_points,
+            level_start_index=level_start_index,
+            valid_ratios=valid_ratios,
+            **kwargs)
+
+        memory = memory.permute(1, 0, 2)
+        bs, _, c = memory.shape
+        if self.as_two_stage:
+            output_memory, output_proposals = \
+                self.gen_encoder_output_proposals(
+                    memory, mask_flatten, spatial_shapes)
+            enc_outputs_class = cls_branches[self.decoder.num_layers](
+                output_memory)
+            enc_outputs_coord_unact = \
+                reg_branches[
+                    self.decoder.num_layers](output_memory) + output_proposals
+
+            topk = self.two_stage_num_proposals
+            # We only use the first channel in enc_outputs_class as foreground,
+            # the other (num_classes - 1) channels are actually not used.
+            # Its targets are set to be 0s, which indicates the first
+            # class (foreground) because we use [0, num_classes - 1] to
+            # indicate class labels, background class is indicated by
+            # num_classes (similar convention in RPN).
+            # See https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/deformable_detr_head.py#L241 # noqa
+            # This follows the official implementation of Deformable DETR.
+            topk_proposals = torch.topk(
+                enc_outputs_class[..., 0], topk, dim=1)[1]
+            topk_coords_unact = torch.gather(
+                enc_outputs_coord_unact, 1,
+                topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
+            topk_coords_unact = topk_coords_unact.detach()
+            reference_points = topk_coords_unact.sigmoid()
+            init_reference_out = reference_points
+            pos_trans_out = self.pos_trans_norm(
+                self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact)))
+            # Split the last dim into `query_pos` and `query` chunks of size c.
+            query_pos, query = torch.split(pos_trans_out, c, dim=2)
+        else:
+            query_pos, query = torch.split(query_embed, c, dim=1)
+            query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1)
+            query = query.unsqueeze(0).expand(bs, -1, -1)
+            reference_points = self.reference_points(query_pos).sigmoid()
+            init_reference_out = reference_points
+
+        # decoder: queries and memory are permuted back to sequence-first
+        query = query.permute(1, 0, 2)
+        memory = memory.permute(1, 0, 2)
+        query_pos = query_pos.permute(1, 0, 2)
+        inter_states, inter_references = self.decoder(
+            query=query,
+            key=None,
+            value=memory,
+            query_pos=query_pos,
+            key_padding_mask=mask_flatten,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            valid_ratios=valid_ratios,
+            reg_branches=reg_branches,
+            **kwargs)
+
+        inter_references_out = inter_references
+        if self.as_two_stage:
+            return inter_states, init_reference_out,\
+                inter_references_out, enc_outputs_class,\
+                enc_outputs_coord_unact
+        return inter_states, init_reference_out, \
+            inter_references_out, None, None
+
+
+@TRANSFORMER.register_module()
+class DynamicConv(BaseModule):
+    """Dynamic convolution whose weights are predicted per sample.
+
+    A linear layer maps each parameter feature to two weight matrices,
+    which are then applied to the input feature with ``torch.bmm`` (i.e. a
+    1*1 convolution). Code is modified from the `official github repo
+    <https://github.com/PeizeSun/
+    SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/head.py#L258>`_ .
+
+    Args:
+        in_channels (int): The input feature channel.
+            Defaults to 256.
+        feat_channels (int): The inner feature channel.
+            Defaults to 64.
+        out_channels (int, optional): The output feature channel.
+            When not specified (or falsy), `in_channels` is used.
+        input_feat_shape (int): The shape of input feature.
+            Defaults to 7.
+        with_proj (bool): Project two-dimensional feature to
+            one-dimensional feature. Default to True.
+        act_cfg (dict): The activation config for DynamicConv.
+        norm_cfg (dict): Config dict for normalization layer. Default
+            layer normalization.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels=256,
+                 feat_channels=64,
+                 out_channels=None,
+                 input_feat_shape=7,
+                 with_proj=True,
+                 act_cfg=dict(type='ReLU', inplace=True),
+                 norm_cfg=dict(type='LN'),
+                 init_cfg=None):
+        super(DynamicConv, self).__init__(init_cfg)
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.out_channels_raw = out_channels
+        self.input_feat_shape = input_feat_shape
+        self.with_proj = with_proj
+        self.act_cfg = act_cfg
+        self.norm_cfg = norm_cfg
+        # A falsy `out_channels` (None or 0) falls back to `in_channels`.
+        self.out_channels = out_channels or in_channels
+
+        # Sizes of the two flattened weight matrices predicted per sample.
+        self.num_params_in = in_channels * feat_channels
+        self.num_params_out = self.out_channels * feat_channels
+        total_params = self.num_params_in + self.num_params_out
+        self.dynamic_layer = nn.Linear(in_channels, total_params)
+        self.norm_in = build_norm_layer(norm_cfg, feat_channels)[1]
+        self.norm_out = build_norm_layer(norm_cfg, self.out_channels)[1]
+        self.activation = build_activation_layer(act_cfg)
+
+        flat_dim = self.out_channels * input_feat_shape**2
+        if with_proj:
+            self.fc_layer = nn.Linear(flat_dim, self.out_channels)
+            self.fc_norm = build_norm_layer(norm_cfg, self.out_channels)[1]
+
+    def forward(self, param_feature, input_feature):
+        """Apply sample-specific weights to ``input_feature``.
+
+        ``self.dynamic_layer`` turns ``param_feature`` into two weight
+        matrices, which are applied one after the other with
+        ``torch.bmm``, each followed by normalization and activation.
+
+        Args:
+            param_feature (Tensor): The feature can be used
+                to generate the parameter, has shape
+                (num_all_proposals, in_channels).
+            input_feature (Tensor): Feature that
+                interact with parameters, has shape
+                (num_all_proposals, in_channels, H, W).
+
+        Returns:
+            Tensor: The output feature has shape
+            (num_all_proposals, out_channels).
+        """
+        # (num_all_proposals, in_channels, H, W)
+        #   -> (num_all_proposals, H*W, in_channels)
+        tokens = input_feature.flatten(2).permute(0, 2, 1)
+        weights = self.dynamic_layer(param_feature)
+
+        # The leading `num_params_in` values form the first weight matrix,
+        # the trailing `num_params_out` values the second one.
+        w_in = weights[:, :self.num_params_in].view(
+            -1, self.in_channels, self.feat_channels)
+        w_out = weights[:, -self.num_params_out:].view(
+            -1, self.feat_channels, self.out_channels)
+
+        # (.., H*W, in_channels) x (.., in_channels, feat_channels)
+        hidden = torch.bmm(tokens, w_in)
+        hidden = self.activation(self.norm_in(hidden))
+        # (.., H*W, feat_channels) x (.., feat_channels, out_channels)
+        out = torch.bmm(hidden, w_out)
+        out = self.activation(self.norm_out(out))
+
+        if not self.with_proj:
+            return out
+        # Flatten everything after dim 0, then project with `fc_layer`.
+        out = self.fc_layer(out.flatten(1))
+        out = self.fc_norm(out)
+        return self.activation(out)
diff --git a/mmdet/utils/__init__.py b/mmdet/utils/__init__.py
new file mode 100755
index 0000000..b5a2b6b
--- /dev/null
+++ b/mmdet/utils/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .ascend_util import (batch_images_to_levels,
+                          get_max_num_gt_division_factor, masked_fill)
+from .collect_env import collect_env
+from .compat_config import compat_cfg
+from .logger import get_caller_name, get_root_logger, log_img_scale
+from .memory import AvoidCUDAOOM, AvoidOOM
+from .misc import find_latest_checkpoint, update_data_root
+from .replace_cfg_vals import replace_cfg_vals
+from .rfnext import rfnext_init_model
+from .setup_env import setup_multi_processes
+from .split_batch import split_batch
+from .util_distribution import build_ddp, build_dp, get_device
+
+__all__ = [
+    'get_root_logger', 'collect_env', 'find_latest_checkpoint',
+    'update_data_root', 'setup_multi_processes', 'get_caller_name',
+    'log_img_scale', 'compat_cfg', 'split_batch', 'build_ddp', 'build_dp',
+    'get_device', 'replace_cfg_vals', 'AvoidOOM', 'AvoidCUDAOOM',
+    'get_max_num_gt_division_factor', 'masked_fill', 'batch_images_to_levels',
+    'rfnext_init_model'
+]
diff --git a/mmdet/utils/ascend_util.py b/mmdet/utils/ascend_util.py
new file mode 100755
index 0000000..df90dec
--- /dev/null
+++ b/mmdet/utils/ascend_util.py
@@ -0,0 +1,69 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+
+def masked_fill(ori_tensor, mask, new_value, neg=False):
+    """Blend ``ori_tensor`` and ``new_value`` arithmetically via ``mask``.
+
+    Args:
+        ori_tensor (Tensor): Input tensor.
+        mask (Tensor): Multiplicative weight; ``None`` disables filling.
+        new_value(Tensor | scalar): Value blended into ``ori_tensor``.
+        neg (bool): If True, ``mask`` weights ``ori_tensor`` and
+            ``1 - mask`` weights ``new_value``; if False, the reverse.
+    Returns:
+        Tensor: ``ori_tensor`` itself when ``mask`` is None, otherwise
+            the weighted sum described above.
+    """
+    if mask is None:
+        return ori_tensor
+    # Complement of the mask, used as the weight of the other operand.
+    inverse = 1 - mask
+    keep, fill = (mask, inverse) if neg else (inverse, mask)
+    return ori_tensor * keep + new_value * fill
+
+
+def batch_images_to_levels(target, num_levels):
+    """Split per-image targets along dim 1 into per-level chunks.
+
+    [target_img0, target_img1] -> [target_level0, target_level1, ...]  or
+    target_imgs -> [target_level0, target_level1, ...]
+    Args:
+        target (Tensor | List[Tensor]): Targets of all images; a list is
+            stacked along a new leading dim first.
+        num_levels (List[int]): Number of entries belonging to each level.
+    Returns:
+        list[Tensor]: One slice ``target[:, start:end]`` per level.
+    """
+    if not isinstance(target, torch.Tensor):
+        target = torch.stack(target, 0)
+    chunks = []
+    offset = 0
+    for size in num_levels:
+        # Slices keep the leading image dim (no squeeze).
+        chunks.append(target[:, offset:offset + size])
+        offset = offset + size
+    return chunks
+
+
+def get_max_num_gt_division_factor(gt_nums, min_num_gt=32,
+                                   max_num_gt=1024, division_factor=2):
+    """Round the largest gt count up to ``min_num_gt * division_factor**k``.
+
+    Args:
+        gt_nums (List[int]):  Ground truth bboxes num of images.
+        min_num_gt (int): Min num of ground truth bboxes.
+        max_num_gt (int): Max num of ground truth bboxes.
+        division_factor (int): Division factor of result.
+    Returns:
+        max_gt_nums_align: (int): max num of ground truth bboxes.
+    Raises:
+        RuntimeError: If the aligned number exceeds ``max_num_gt``.
+    """
+    max_gt_nums = max(gt_nums)
+    aligned = min_num_gt
+    while aligned < max_gt_nums:
+        aligned *= division_factor
+    if aligned > max_num_gt:
+        raise RuntimeError(f'gt num {aligned} exceeds max_num_gt={max_num_gt}')
+    return aligned
diff --git a/mmdet/utils/collect_env.py b/mmdet/utils/collect_env.py
new file mode 100755
index 0000000..97e25c0
--- /dev/null
+++ b/mmdet/utils/collect_env.py
@@ -0,0 +1,17 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.utils import collect_env as collect_base_env
+from mmcv.utils import get_git_hash
+
+import mmdet
+
+
+def collect_env():
+    """Gather the base environment info plus the MMDetection version."""
+    info = collect_base_env()
+    info['MMDetection'] = f'{mmdet.__version__}+{get_git_hash()[:7]}'
+    return info
+
+
+if __name__ == '__main__':
+    for key, value in collect_env().items():
+        print(f'{key}: {value}')
diff --git a/mmdet/utils/compat_config.py b/mmdet/utils/compat_config.py
new file mode 100755
index 0000000..05aa37d
--- /dev/null
+++ b/mmdet/utils/compat_config.py
@@ -0,0 +1,139 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+
+from mmcv import ConfigDict
+
+
+def compat_cfg(cfg):
+    """Return a copy of ``cfg`` with deprecated fields moved into place.
+
+    The input config is deep-copied first and then passed, in order, through
+    ``compat_imgs_per_gpu``, ``compat_loader_args`` and
+    ``compat_runner_args``.
+    """
+    cfg = copy.deepcopy(cfg)
+    for convert in (compat_imgs_per_gpu, compat_loader_args,
+                    compat_runner_args):
+        cfg = convert(cfg)
+    return cfg
+
+
+def compat_runner_args(cfg):
+    """Ensure ``cfg.runner`` exists, deriving it from ``total_epochs``."""
+    if 'runner' in cfg:
+        # Legacy `total_epochs`, if still given, must agree with the runner.
+        if 'total_epochs' in cfg:
+            assert cfg.total_epochs == cfg.runner.max_epochs
+        return cfg
+    cfg.runner = ConfigDict(
+        dict(type='EpochBasedRunner', max_epochs=cfg.total_epochs))
+    warnings.warn(
+        'config is now expected to have a `runner` section, '
+        'please set `runner` in your config.', UserWarning)
+    return cfg
+
+
+def compat_imgs_per_gpu(cfg):
+    cfg = copy.deepcopy(cfg)  # work on a copy; the caller's cfg is untouched
+    if 'imgs_per_gpu' in cfg.data:
+        warnings.warn('"imgs_per_gpu" is deprecated in MMDet V2.0. '
+                      'Please use "samples_per_gpu" instead')
+        if 'samples_per_gpu' in cfg.data:
+            warnings.warn(
+                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
+                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
+                f'={cfg.data.imgs_per_gpu} is used in this experiments')
+        else:
+            warnings.warn('Automatically set "samples_per_gpu"="imgs_per_gpu"='
+                          f'{cfg.data.imgs_per_gpu} in this experiments')
+        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu  # legacy key wins
+    return cfg
+
+
+def compat_loader_args(cfg):
+    """Move deprecated loader args in cfg.data to the `*_dataloader` dicts."""
+
+    cfg = copy.deepcopy(cfg)
+    if 'train_dataloader' not in cfg.data:
+        cfg.data['train_dataloader'] = ConfigDict()
+    if 'val_dataloader' not in cfg.data:
+        cfg.data['val_dataloader'] = ConfigDict()
+    if 'test_dataloader' not in cfg.data:
+        cfg.data['test_dataloader'] = ConfigDict()
+
+    # special process for train_dataloader
+    if 'samples_per_gpu' in cfg.data:
+        # popped, so the key no longer exists at the `data` level afterwards
+        samples_per_gpu = cfg.data.pop('samples_per_gpu')
+        assert 'samples_per_gpu' not in \
+               cfg.data.train_dataloader, ('`samples_per_gpu` are set '
+                                           'in `data` field and ` '
+                                           'data.train_dataloader` '
+                                           'at the same time. '
+                                           'Please only set it in '
+                                           '`data.train_dataloader`. ')
+        cfg.data.train_dataloader['samples_per_gpu'] = samples_per_gpu
+
+    if 'persistent_workers' in cfg.data:
+
+        persistent_workers = cfg.data.pop('persistent_workers')
+        assert 'persistent_workers' not in \
+               cfg.data.train_dataloader, ('`persistent_workers` are set '
+                                           'in `data` field and ` '
+                                           'data.train_dataloader` '
+                                           'at the same time. '
+                                           'Please only set it in '
+                                           '`data.train_dataloader`. ')
+        cfg.data.train_dataloader['persistent_workers'] = persistent_workers
+
+    if 'workers_per_gpu' in cfg.data:
+        # one value is copied to the train, val and test dataloaders
+        workers_per_gpu = cfg.data.pop('workers_per_gpu')
+        cfg.data.train_dataloader['workers_per_gpu'] = workers_per_gpu
+        cfg.data.val_dataloader['workers_per_gpu'] = workers_per_gpu
+        cfg.data.test_dataloader['workers_per_gpu'] = workers_per_gpu
+
+    # special process for val_dataloader
+    if 'samples_per_gpu' in cfg.data.val:
+        # keep default value of `sample_per_gpu` is 1
+        assert 'samples_per_gpu' not in \
+               cfg.data.val_dataloader, ('`samples_per_gpu` are set '
+                                         'in `data.val` field and ` '
+                                         'data.val_dataloader` at '
+                                         'the same time. '
+                                         'Please only set it in '
+                                         '`data.val_dataloader`. ')
+        cfg.data.val_dataloader['samples_per_gpu'] = \
+            cfg.data.val.pop('samples_per_gpu')
+    # special process for test_dataloader
+
+    # in case the test dataset is concatenated
+    if isinstance(cfg.data.test, dict):
+        if 'samples_per_gpu' in cfg.data.test:
+            assert 'samples_per_gpu' not in \
+                   cfg.data.test_dataloader, ('`samples_per_gpu` are set '
+                                              'in `data.test` field and ` '
+                                              'data.test_dataloader` '
+                                              'at the same time. '
+                                              'Please only set it in '
+                                              '`data.test_dataloader`. ')
+
+            cfg.data.test_dataloader['samples_per_gpu'] = \
+                cfg.data.test.pop('samples_per_gpu')
+
+    elif isinstance(cfg.data.test, list):
+        for ds_cfg in cfg.data.test:
+            if 'samples_per_gpu' in ds_cfg:
+                assert 'samples_per_gpu' not in \
+                       cfg.data.test_dataloader, ('`samples_per_gpu` are set '
+                                                  'in `data.test` field and ` '
+                                                  'data.test_dataloader` at'
+                                                  ' the same time. '
+                                                  'Please only set it in '
+                                                  '`data.test_dataloader`. ')
+        samples_per_gpu = max(
+            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
+        cfg.data.test_dataloader['samples_per_gpu'] = samples_per_gpu
+
+    return cfg
diff --git a/mmdet/utils/contextmanagers.py b/mmdet/utils/contextmanagers.py
new file mode 100755
index 0000000..fa12bfc
--- /dev/null
+++ b/mmdet/utils/contextmanagers.py
@@ -0,0 +1,122 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import asyncio
+import contextlib
+import logging
+import os
+import time
+from typing import List
+
+import torch
+
+logger = logging.getLogger(__name__)
+# NOTE: any non-empty value (even "0" or "false") enables the debug timing.
+DEBUG_COMPLETED_TIME = bool(os.environ.get('DEBUG_COMPLETED_TIME', False))
+
+
+@contextlib.asynccontextmanager
+async def completed(trace_name='',
+                    name='',
+                    sleep_interval=0.05,
+                    streams: List[torch.cuda.Stream] = None):
+    """Async context manager that waits, without blocking the event loop,
+    until the given CUDA streams (default: the current one) are done."""
+    if not torch.cuda.is_available():
+        yield
+        return
+
+    stream_before_context_switch = torch.cuda.current_stream()
+    if not streams:
+        streams = [stream_before_context_switch]
+    else:
+        streams = [s if s else stream_before_context_switch for s in streams]
+
+    end_events = [
+        torch.cuda.Event(enable_timing=DEBUG_COMPLETED_TIME) for _ in streams
+    ]
+
+    if DEBUG_COMPLETED_TIME:
+        start = torch.cuda.Event(enable_timing=True)
+        stream_before_context_switch.record_event(start)
+        # CPU-side reference point for the wall-clock time logged on exit.
+        cpu_start = time.monotonic()
+    logger.debug('%s %s starting, streams: %s', trace_name, name, streams)
+    grad_enabled_before = torch.is_grad_enabled()
+    try:
+        yield
+    finally:
+        current_stream = torch.cuda.current_stream()
+        assert current_stream == stream_before_context_switch
+
+        if DEBUG_COMPLETED_TIME:
+            cpu_end = time.monotonic()
+        for i, stream in enumerate(streams):
+            event = end_events[i]
+            stream.record_event(event)
+
+        grad_enabled_after = torch.is_grad_enabled()
+
+        # observed change of torch.is_grad_enabled() during concurrent run of
+        # async_test_bboxes code
+        assert (grad_enabled_before == grad_enabled_after
+                ), 'Unexpected is_grad_enabled() value change'
+        # Poll the end events, yielding to the event loop between polls.
+        are_done = [e.query() for e in end_events]
+        logger.debug('%s %s completed: %s streams: %s', trace_name, name,
+                     are_done, streams)
+        with torch.cuda.stream(stream_before_context_switch):
+            while not all(are_done):
+                await asyncio.sleep(sleep_interval)
+                are_done = [e.query() for e in end_events]
+                logger.debug(
+                    '%s %s completed: %s streams: %s',
+                    trace_name,
+                    name,
+                    are_done,
+                    streams,
+                )
+
+        current_stream = torch.cuda.current_stream()
+        assert current_stream == stream_before_context_switch
+
+        if DEBUG_COMPLETED_TIME:
+            cpu_time = (cpu_end - cpu_start) * 1000
+            stream_times_ms = ''
+            for i, stream in enumerate(streams):
+                elapsed_time = start.elapsed_time(end_events[i])
+                stream_times_ms += f' {stream} {elapsed_time:.2f} ms'
+            logger.info('%s %s %.2f ms %s', trace_name, name, cpu_time,
+                        stream_times_ms)
+
+
+@contextlib.asynccontextmanager
+async def concurrent(streamqueue: asyncio.Queue,
+                     trace_name='concurrent',
+                     name='stream'):
+    """Run code concurrently in different streams.
+
+    :param streamqueue: asyncio.Queue instance.
+
+    Queue tasks define the pool of streams used for concurrent execution.
+    """
+    if not torch.cuda.is_available():
+        yield
+        return
+
+    initial_stream = torch.cuda.current_stream()
+
+    with torch.cuda.stream(initial_stream):
+        stream = await streamqueue.get()
+        assert isinstance(stream, torch.cuda.Stream)
+        # The stream is always put back into the queue, even on errors.
+        try:
+            with torch.cuda.stream(stream):
+                logger.debug('%s %s is starting, stream: %s', trace_name, name,
+                             stream)
+                yield
+                current = torch.cuda.current_stream()
+                assert current == stream
+                logger.debug('%s %s has finished, stream: %s', trace_name,
+                             name, stream)
+        finally:
+            streamqueue.task_done()
+            streamqueue.put_nowait(stream)
diff --git a/mmdet/utils/logger.py b/mmdet/utils/logger.py
new file mode 100755
index 0000000..485f641
--- /dev/null
+++ b/mmdet/utils/logger.py
@@ -0,0 +1,65 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import inspect
+import logging
+
+from mmcv.utils import get_logger
+
+
+def get_root_logger(log_file=None, log_level=logging.INFO):
+    """Return the ``mmdet`` logger.
+
+    Args:
+        log_file (str, optional): File path of log. Defaults to None.
+        log_level (int, optional): The level of logger.
+            Defaults to logging.INFO.
+
+    Returns:
+        :obj:`logging.Logger`: The logger obtained from ``get_logger``.
+    """
+    # the name is fixed, so every caller asks ``get_logger`` for 'mmdet'
+    logger_kwargs = dict(log_file=log_file, log_level=log_level)
+    return get_logger(name='mmdet', **logger_kwargs)
+
+
+def get_caller_name():
+    """Get the name of the function or method that called our caller."""
+    # stack()[0] is this function and stack()[1] its direct caller (e.g.
+    # log_img_scale); index 2 is the frame that called that caller.
+    frame = inspect.stack()[2][0]
+    method_name = frame.f_code.co_name
+    if 'self' not in frame.f_locals:
+        # the caller is a plain function
+        return method_name
+    class_name = frame.f_locals['self'].__class__.__name__
+    return f'{class_name}.{method_name}'
+
+
+def log_img_scale(img_scale, shape_order='hw', skip_square=False):
+    """Log the height and width of an image scale.
+
+    Args:
+        img_scale (tuple): Image size to be logged.
+        shape_order (str, optional): The order of image shape.
+            'hw' for (height, width) and 'wh' for (width, height).
+            Defaults to 'hw'.
+        skip_square (bool, optional): Whether to skip logging for square
+            img_scale. Defaults to False.
+
+    Returns:
+        bool: True if the shape was logged, False if it was skipped.
+    """
+    if shape_order not in ('hw', 'wh'):
+        raise ValueError(f'Invalid shape_order {shape_order}.')
+    first, second = img_scale
+    if shape_order == 'hw':
+        height, width = first, second
+    else:
+        height, width = second, first
+
+    if skip_square and height == width:
+        return False
+    log = get_root_logger()
+    # get_caller_name() inspects the stack, so call it from this frame
+    caller = get_caller_name()
+    log.info(f'image shape: height={height}, width={width} in {caller}')
+    return True
diff --git a/mmdet/utils/memory.py b/mmdet/utils/memory.py
new file mode 100755
index 0000000..eb212bc
--- /dev/null
+++ b/mmdet/utils/memory.py
@@ -0,0 +1,213 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from collections import abc
+from contextlib import contextmanager
+from functools import wraps
+
+import torch
+
+from mmdet.utils import get_root_logger
+
+
+def cast_tensor_type(inputs, src_type=None, dst_type=None):
+    """Recursively convert Tensor in inputs from ``src_type`` to ``dst_type``.
+
+    Args:
+        inputs: Inputs that to be casted. Tensors, mappings and iterables
+            are handled; anything else is returned unchanged.
+        src_type (torch.dtype | torch.device | None): Source type. Only
+            tensors of this dtype/device are converted; None converts all.
+        dst_type (torch.dtype | torch.device): Destination type.
+
+    Returns:
+        The same type with inputs, but all contained Tensors have been cast.
+
+    Note:
+        ``str`` and ``bytes`` inputs are returned unchanged.
+    """
+    assert dst_type is not None
+    if isinstance(inputs, torch.Tensor):
+        # compare against the attribute matching the kind of ``dst_type``
+        if isinstance(dst_type, torch.device):
+            current = inputs.device
+        else:
+            current = inputs.dtype
+        # only tensors whose current type equals ``src_type`` are cast
+        if src_type is None or current == src_type:
+            return inputs.to(dst_type)
+        return inputs
+    elif isinstance(inputs, (str, bytes)):
+        # text is Iterable too, but ``str(<generator>)`` would return the
+        # generator's repr instead of the text; it holds no tensors anyway
+        return inputs
+    elif isinstance(inputs, abc.Mapping):
+        return type(inputs)({
+            k: cast_tensor_type(v, src_type=src_type, dst_type=dst_type)
+            for k, v in inputs.items()
+        })
+    elif isinstance(inputs, abc.Iterable):
+        return type(inputs)(
+            cast_tensor_type(item, src_type=src_type, dst_type=dst_type)
+            for item in inputs)
+    # TODO: Currently not supported
+    # elif isinstance(inputs, InstanceData):
+    #     for key, value in inputs.items():
+    #         inputs[key] = cast_tensor_type(
+    #             value, src_type=src_type, dst_type=dst_type)
+    #     return inputs
+    else:
+        return inputs
+
+
+@contextmanager
+def _ignore_torch_cuda_oom():
+    """Context manager that swallows PyTorch CUDA out-of-memory errors.
+
+    Any other exception propagates unchanged. Code is modified from
+    <https://github.com/facebookresearch/detectron2/blob/main/detectron2/utils/memory.py>  # noqa: E501
+    """
+    try:
+        yield
+    except RuntimeError as err:
+        # NOTE: OOM is recognised by its message, which may change between
+        # PyTorch releases.
+        is_cuda_oom = 'CUDA out of memory. ' in str(err)
+        if not is_cuda_oom:
+            raise
+
+
+class AvoidOOM:
+    """Try to convert inputs to FP16 and CPU if got a PyTorch's CUDA Out of
+    Memory error. It will do the following steps:
+
+        1. First retry after calling `torch.cuda.empty_cache()`.
+        2. If that still fails, it will then retry by converting inputs
+          to FP16.
+        3. If that still fails trying to convert inputs to CPUs.
+          In this case, it expects the function to dispatch to
+          CPU implementation.
+
+    Args:
+        to_cpu (bool): Whether to retry with inputs on CPU if get an OOM
+            error. This will slow down the code significantly.
+            Defaults to True.
+        test (bool): Skip `_ignore_torch_cuda_oom` operate that can use
+            lightweight data in unit test, only used in
+            test unit. Defaults to False.
+
+    Examples:
+        >>> from mmdet.utils.memory import AvoidOOM
+        >>> AvoidCUDAOOM = AvoidOOM()
+        >>> output = AvoidCUDAOOM.retry_if_cuda_oom(
+        >>>     some_torch_function)(input1, input2)
+        >>> # To use as a decorator
+        >>> # from mmdet.utils import AvoidCUDAOOM
+        >>> @AvoidCUDAOOM.retry_if_cuda_oom
+        >>> def function(*args, **kwargs):
+        >>>     return None
+
+    Note:
+        1. The output may be on CPU even if inputs are on GPU. Processing
+            on CPU will slow down the code significantly.
+        2. Inputs are converted with ``cast_tensor_type``, called once on
+            the positional arguments and once on the keyword arguments,
+            so only tensors it can reach are moved to FP16 or CPU.
+        3. Since the function might be called more than once, it has to be
+            stateless.
+        4. ``ValueError`` is raised if the fallback finds no Tensor argument.
+    """
+
+    def __init__(self, to_cpu=True, test=False):
+        self.to_cpu = to_cpu
+        self.test = test
+
+    def retry_if_cuda_oom(self, func):
+        """Makes a function retry itself after encountering pytorch's CUDA OOM
+        error.
+
+        The implementation logic is referred to
+        https://github.com/facebookresearch/detectron2/blob/main/detectron2/utils/memory.py
+
+        Args:
+            func: a stateless callable that takes tensor-like objects
+                as arguments.
+        Returns:
+            func: a callable which retries `func` if OOM is encountered.
+        """  # noqa: W605
+
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+
+            # raw function
+            if not self.test:
+                with _ignore_torch_cuda_oom():
+                    return func(*args, **kwargs)
+
+                # Clear cache and retry
+                torch.cuda.empty_cache()
+                with _ignore_torch_cuda_oom():
+                    return func(*args, **kwargs)
+
+            # get the type and device of first tensor
+            dtype, device = None, None
+            values = args + tuple(kwargs.values())
+            for value in values:
+                if isinstance(value, torch.Tensor):
+                    dtype = value.dtype
+                    device = value.device
+                    break
+            if dtype is None or device is None:
+                raise ValueError('There is no tensor in the inputs, '
+                                 'cannot get dtype and device.')
+
+            # Convert to FP16
+            fp16_args = cast_tensor_type(args, dst_type=torch.half)
+            fp16_kwargs = cast_tensor_type(kwargs, dst_type=torch.half)
+            logger = get_root_logger()
+            logger.warning(f'Attempting to copy inputs of {str(func)} '
+                           'to FP16 due to CUDA OOM')
+
+            # get input tensor type, the output type will same as
+            # the first parameter type.
+            with _ignore_torch_cuda_oom():
+                output = func(*fp16_args, **fp16_kwargs)
+                output = cast_tensor_type(
+                    output, src_type=torch.half, dst_type=dtype)
+                if not self.test:
+                    return output
+            logger.warning('Using FP16 still meet CUDA OOM')
+
+            # Try on CPU. This will slow down the code significantly,
+            # therefore print a notice.
+            if self.to_cpu:
+                logger.warning(f'Attempting to copy inputs of {str(func)} '
+                               'to CPU due to CUDA OOM')
+                cpu_device = torch.empty(0).device
+                cpu_args = cast_tensor_type(args, dst_type=cpu_device)
+                cpu_kwargs = cast_tensor_type(kwargs, dst_type=cpu_device)
+
+                # convert outputs to GPU
+                with _ignore_torch_cuda_oom():
+                    logger.warning(f'Convert outputs to GPU (device={device})')
+                    output = func(*cpu_args, **cpu_kwargs)
+                    output = cast_tensor_type(
+                        output, src_type=cpu_device, dst_type=device)
+                    return output
+
+                warnings.warn('Cannot convert output to GPU due to CUDA OOM, '
+                              'the output is now on CPU, which might cause '
+                              'errors if the output need to interact with GPU '
+                              'data in subsequent operations')
+                logger.warning('Cannot convert output to GPU due to '
+                               'CUDA OOM, the output is on CPU now.')
+
+                return func(*cpu_args, **cpu_kwargs)
+            else:
+                # may still get CUDA OOM error
+                return func(*args, **kwargs)
+
+        return wrapped
+
+
+# To use AvoidOOM as a decorator
+AvoidCUDAOOM = AvoidOOM()
diff --git a/mmdet/utils/misc.py b/mmdet/utils/misc.py
new file mode 100755
index 0000000..2017cbb
--- /dev/null
+++ b/mmdet/utils/misc.py
@@ -0,0 +1,89 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import glob
+import os
+import os.path as osp
+import warnings
+
+import mmcv
+import torch
+from mmcv.utils import TORCH_VERSION, digit_version, print_log
+
+
+def find_latest_checkpoint(path, suffix='pth'):
+    """Find the latest checkpoint from the working directory.
+
+    Args:
+        path(str): The path to find checkpoints.
+        suffix(str): File extension.
+            Defaults to pth.
+
+    Returns:
+        latest_path(str | None): ``latest.{suffix}`` if it exists, else the
+            file with the largest trailing number; None if nothing matches.
+    References:
+        .. [1] https://github.com/microsoft/SoftTeacher/blob/main/ssod/utils/patch.py  # noqa: E501
+    """
+    if not osp.exists(path):
+        warnings.warn('The path of checkpoints does not exist.')
+        return None
+    if osp.exists(osp.join(path, f'latest.{suffix}')):
+        return osp.join(path, f'latest.{suffix}')
+
+    checkpoints = glob.glob(osp.join(path, f'*.{suffix}'))
+    if not checkpoints:
+        warnings.warn('There are no checkpoints in the path.')
+        return None
+    latest, latest_path = -1, None
+    for checkpoint in checkpoints:
+        # e.g. ``epoch_12.pth`` -> ``12``; names without a numeric tail
+        # (``best.pth``) are skipped instead of crashing ``int()``.
+        stem = osp.basename(checkpoint).split('_')[-1].split('.')[0]
+        if stem.isdecimal() and int(stem) > latest:
+            latest, latest_path = int(stem), checkpoint
+    return latest_path
+
+
+def update_data_root(cfg, logger=None):
+    """Update data root according to env MMDET_DATASETS.
+
+    If set env MMDET_DATASETS, update cfg.data_root according to
+    MMDET_DATASETS. Otherwise, using cfg.data_root as default.
+
+    Args:
+        cfg (mmcv.Config): The model config need to modify
+        logger (logging.Logger | str | None): the way to print msg
+    """
+    assert isinstance(cfg, mmcv.Config), \
+        f'cfg got wrong type: {type(cfg)}, expected mmcv.Config'
+
+    if 'MMDET_DATASETS' not in os.environ:
+        return
+
+    dst_root = os.environ['MMDET_DATASETS']
+    # forward ``logger`` so the message goes where the caller asked
+    print_log(
+        f'MMDET_DATASETS has been set to be {dst_root}. '
+        f'Using {dst_root} as data root.',
+        logger=logger)
+
+    def update(cfg, src_str, dst_str):
+        for k, v in cfg.items():
+            if isinstance(v, mmcv.ConfigDict):
+                update(cfg[k], src_str, dst_str)
+            if isinstance(v, str) and src_str in v:
+                cfg[k] = v.replace(src_str, dst_str)
+
+    update(cfg.data, cfg.data_root, dst_root)
+    cfg.data_root = dst_root
+
+
+_torch_version_div_indexing = (
+    'parrots' not in TORCH_VERSION
+    and digit_version(TORCH_VERSION) >= digit_version('1.8'))
+
+
+def floordiv(dividend, divisor, rounding_mode='trunc'):
+    """Divide via ``torch.div`` with a rounding mode if supported, else ``//``."""
+    if not _torch_version_div_indexing:
+        return dividend // divisor
+    return torch.div(dividend, divisor, rounding_mode=rounding_mode)
diff --git a/mmdet/utils/profiling.py b/mmdet/utils/profiling.py
new file mode 100755
index 0000000..2f53f45
--- /dev/null
+++ b/mmdet/utils/profiling.py
@@ -0,0 +1,40 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import contextlib
+import sys
+import time
+
+import torch
+
+if sys.version_info >= (3, 7):
+
+    @contextlib.contextmanager
+    def profile_time(trace_name,
+                     name,
+                     enabled=True,
+                     stream=None,
+                     end_stream=None):
+        """Print CPU and GPU time of the body; untimed if disabled or no CUDA.
+
+        Useful as a temporary context manager to find sweet spots of code
+        suitable for async implementation.
+        """
+        if (not enabled) or not torch.cuda.is_available():
+            yield
+            return
+        stream = stream if stream else torch.cuda.current_stream()
+        end_stream = end_stream if end_stream else stream
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+        stream.record_event(start)
+        try:
+            cpu_start = time.monotonic()
+            yield
+        finally:
+            cpu_end = time.monotonic()
+            end_stream.record_event(end)
+            end.synchronize()  # wait for the end event to complete
+            cpu_time = (cpu_end - cpu_start) * 1000  # seconds -> ms
+            gpu_time = start.elapsed_time(end)
+            msg = f'{trace_name} {name} cpu_time {cpu_time:.2f} ms '
+            msg += f'gpu_time {gpu_time:.2f} ms stream {stream}'
+            print(msg, end_stream)
diff --git a/mmdet/utils/replace_cfg_vals.py b/mmdet/utils/replace_cfg_vals.py
new file mode 100755
index 0000000..6ca301d
--- /dev/null
+++ b/mmdet/utils/replace_cfg_vals.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import re
+
+from mmcv.utils import Config
+
+
+def replace_cfg_vals(ori_cfg):
+    """Replace the string "${key}" with the corresponding value.
+
+    Replace the "${key}" with the value of ori_cfg.key in the config. And
+    support replacing the chained ${key}. Such as, replace "${key0.key1}"
+    with the value of cfg.key0.key1. Code is modified from `vars.py
+    < https://github.com/microsoft/SoftTeacher/blob/main/ssod/utils/vars.py>`_  # noqa: E501
+
+    Args:
+        ori_cfg (mmcv.utils.config.Config):
+            The origin config with "${key}" generated from a file.
+
+    Returns:
+        mmcv.utils.config.Config: A new config with "${key}" replaced. A
+            non-None ``model_wrapper`` entry is moved to ``model``.
+    """
+
+    def get_value(cfg, key):
+        for k in key.split('.'):
+            cfg = cfg[k]
+        return cfg
+
+    def replace_value(cfg):
+        if isinstance(cfg, dict):
+            return {key: replace_value(value) for key, value in cfg.items()}
+        elif isinstance(cfg, list):
+            return [replace_value(item) for item in cfg]
+        elif isinstance(cfg, tuple):
+            return tuple([replace_value(item) for item in cfg])
+        elif isinstance(cfg, str):
+            # the format of string cfg may be:
+            # 1) "${key}", which is replaced with cfg.key itself (any type)
+            # 2) "xxx${key}xxx" or "xxx${key1}xxx${key2}xxx", where each
+            # match is replaced with str(cfg.key); key[2:-1] strips "${" "}"
+            keys = pattern_key.findall(cfg)
+            values = [get_value(ori_cfg, key[2:-1]) for key in keys]
+            if len(keys) == 1 and keys[0] == cfg:
+                # the format of string cfg is "${key}"
+                cfg = values[0]
+            else:
+                for key, value in zip(keys, values):
+                    # the format of string cfg is
+                    # "xxx${key}xxx" or "xxx${key1}xxx${key2}xxx"
+                    assert not isinstance(value, (dict, list, tuple)), \
+                        f'for the format of string cfg is ' \
+                        f"'xxxxx${key}xxxxx' or 'xxx${key}xxx${key}xxx', " \
+                        f"the type of the value of '${key}' " \
+                        f'can not be dict, list, or tuple' \
+                        f'but you input {type(value)} in {cfg}'
+                    cfg = cfg.replace(key, str(value))
+            return cfg
+        else:
+            return cfg
+
+    # the pattern of string "${key}"
+    pattern_key = re.compile(r'\$\{[a-zA-Z\d_.]*\}')
+    # the type of ori_cfg._cfg_dict is mmcv.utils.config.ConfigDict
+    updated_cfg = Config(
+        replace_value(ori_cfg._cfg_dict), filename=ori_cfg.filename)
+    # replace the model with model_wrapper
+    if updated_cfg.get('model_wrapper', None) is not None:
+        updated_cfg.model = updated_cfg.model_wrapper
+        updated_cfg.pop('model_wrapper')
+    return updated_cfg
diff --git a/mmdet/utils/rfnext.py b/mmdet/utils/rfnext.py
new file mode 100755
index 0000000..568f3d3
--- /dev/null
+++ b/mmdet/utils/rfnext.py
@@ -0,0 +1,43 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+try:
+    from mmcv.cnn import RFSearchHook
+except ImportError:
+    RFSearchHook = None
+
+
+def rfnext_init_model(detector, cfg):
+    """Receptive field search via dilation rates.
+
+    Please refer to `RF-Next: Efficient Receptive Field
+    Search for Convolutional Neural Networks
+    <https://arxiv.org/abs/2206.06637>`_ for more details.
+
+    Args:
+        detector (nn.Module): The detector before initializing RF-Next.
+        cfg (mmcv.Config): The config for RF-Next.
+            If the RFSearchHook is defined in the cfg.custom_hooks,
+            the detector will be initialized for RF-Next.
+    """
+    if cfg.get('custom_hooks', None) is None:
+        return
+    hook_types = [hook['type'] for hook in cfg.custom_hooks]
+    try:
+        position = hook_types.index('RFSearchHook')
+    except ValueError:
+        # no RF-Next hook configured, leave the detector untouched
+        return
+
+    hook_cfg = cfg.custom_hooks[position]
+    assert hook_cfg['type'] == 'RFSearchHook'
+    assert RFSearchHook is not None, 'Please install mmcv > 1.7.0'
+
+    # initialize a RFSearchHook, using these defaults for unset options
+    defaults = dict(
+        mode='search',
+        config=None,
+        rfstructure_file=None,
+        by_epoch=True,
+        verbose=True)
+    hook = RFSearchHook(**{k: hook_cfg.get(k, v) for k, v in defaults.items()})
+    hook.init_model(detector)
+    hook_cfg['rfstructure_file'] = None
diff --git a/mmdet/utils/setup_env.py b/mmdet/utils/setup_env.py
new file mode 100755
index 0000000..6637cf8
--- /dev/null
+++ b/mmdet/utils/setup_env.py
@@ -0,0 +1,53 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import platform
+import warnings
+
+import cv2
+import torch.multiprocessing as mp
+
+
+def setup_multi_processes(cfg):
+    """Set mp start method, OpenCV threads and OMP/MKL env vars from cfg."""
+    # set multi-process start method as `fork` to speed up the training
+    if platform.system() != 'Windows':
+        mp_start_method = cfg.get('mp_start_method', 'fork')
+        current_method = mp.get_start_method(allow_none=True)
+        if current_method is not None and current_method != mp_start_method:
+            warnings.warn(
+                f'Multi-processing start method `{mp_start_method}` is '
+                f'different from the previous setting `{current_method}`.'
+                f'It will be force set to `{mp_start_method}`. You can change '
+                f'this behavior by changing `mp_start_method` in your config.')
+        mp.set_start_method(mp_start_method, force=True)
+
+    # disable opencv multithreading to avoid system being overloaded
+    opencv_num_threads = cfg.get('opencv_num_threads', 0)
+    cv2.setNumThreads(opencv_num_threads)
+
+    # setup OMP threads
+    # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py  # noqa
+    workers_per_gpu = cfg.data.get('workers_per_gpu', 1)
+    if 'train_dataloader' in cfg.data:
+        workers_per_gpu = \
+            max(cfg.data.train_dataloader.get('workers_per_gpu', 1),
+                workers_per_gpu)
+    # an OMP_NUM_THREADS value already in the environment is never replaced
+    if 'OMP_NUM_THREADS' not in os.environ and workers_per_gpu > 1:
+        omp_num_threads = 1
+        warnings.warn(
+            f'Setting OMP_NUM_THREADS environment variable for each process '
+            f'to be {omp_num_threads} in default, to avoid your system being '
+            f'overloaded, please further tune the variable for optimal '
+            f'performance in your application as needed.')
+        os.environ['OMP_NUM_THREADS'] = str(omp_num_threads)
+
+    # setup MKL threads
+    if 'MKL_NUM_THREADS' not in os.environ and workers_per_gpu > 1:
+        mkl_num_threads = 1
+        warnings.warn(
+            f'Setting MKL_NUM_THREADS environment variable for each process '
+            f'to be {mkl_num_threads} in default, to avoid your system being '
+            f'overloaded, please further tune the variable for optimal '
+            f'performance in your application as needed.')
+        os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads)
diff --git a/mmdet/utils/split_batch.py b/mmdet/utils/split_batch.py
new file mode 100755
index 0000000..0276fb3
--- /dev/null
+++ b/mmdet/utils/split_batch.py
@@ -0,0 +1,45 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+
+def split_batch(img, img_metas, kwargs):
+    """Split data_batch by tags.
+
+    Code is modified from
+    <https://github.com/microsoft/SoftTeacher/blob/main/ssod/utils/structure_utils.py> # noqa: E501
+
+    Args:
+        img (Tensor): of shape (N, C, H, W) encoding input images.
+            Typically these should be mean centered and std scaled.
+        img_metas (list[dict]): List of image info dict where each dict
+            has: 'img_shape', 'scale_factor', 'flip', and may also contain
+            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+            For details on the values of these keys, see
+            :class:`mmdet.datasets.pipelines.Collect`.
+        kwargs (dict): Specific to concrete implementation. Not modified.
+
+    Returns:
+        data_groups (dict): a dict that data_batch splited by tags,
+            such as 'sup', 'unsup_teacher', and 'unsup_student'.
+    """
+
+    # only stack img in the batch
+    def fuse_list(obj_list, obj):
+        if isinstance(obj, torch.Tensor):
+            return torch.stack(obj_list)
+        return obj_list
+
+    tags = [meta['tag'] for meta in img_metas]
+    # work on a copy so the caller's ``kwargs`` dict is left untouched
+    data_batch = dict(kwargs, img=img, img_metas=img_metas)
+    data_batch.pop('tag', None)
+
+    data_groups = {}
+    # dict.fromkeys keeps first-seen tag order, unlike an unordered set
+    for current_tag in dict.fromkeys(tags):
+        group_flag = [tag == current_tag for tag in tags]
+        data_groups[current_tag] = {
+            k: fuse_list([vv for vv, gf in zip(v, group_flag) if gf], v)
+            for k, v in data_batch.items()
+        }
+    return data_groups
diff --git a/mmdet/utils/util_distribution.py b/mmdet/utils/util_distribution.py
new file mode 100755
index 0000000..ba32cc9
--- /dev/null
+++ b/mmdet/utils/util_distribution.py
@@ -0,0 +1,92 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
+
+dp_factory = {'cuda': MMDataParallel, 'cpu': MMDataParallel}
+
+ddp_factory = {'cuda': MMDistributedDataParallel}
+
+
+def build_dp(model, device='cuda', dim=0, *args, **kwargs):
+    """build DataParallel module by device type.
+
+    if device is cuda or cpu, return a MMDataParallel model; if device is
+    mlu or npu, return a MLUDataParallel or NPUDataParallel model.
+
+    Args:
+        model (:class:`nn.Module`): model to be parallelized.
+        device (str): device type, cuda, cpu, mlu or npu. Defaults to cuda.
+        dim (int): Dimension used to scatter the data. Defaults to 0.
+
+    Returns:
+        nn.Module: the model to be parallelized.
+    """
+    if device == 'npu':
+        from mmcv.device.npu import NPUDataParallel
+        dp_factory['npu'] = NPUDataParallel
+        torch.npu.set_device(kwargs['device_ids'][0])
+        torch.npu.set_compile_mode(jit_compile=False)
+        model = model.npu()
+    elif device == 'cuda':
+        model = model.cuda(kwargs['device_ids'][0])
+    elif device == 'mlu':
+        from mmcv.device.mlu import MLUDataParallel
+        dp_factory['mlu'] = MLUDataParallel
+        model = model.mlu()
+    # a ``device`` missing from ``dp_factory`` raises KeyError here
+    return dp_factory[device](model, dim=dim, *args, **kwargs)
+
+
+def build_ddp(model, device='cuda', *args, **kwargs):
+    """Build DistributedDataParallel module by device type.
+
+    If device is cuda, return a MMDistributedDataParallel model; if device
+    is mlu or npu, return the matching MLU/NPU DistributedDataParallel model.
+
+    Args:
+        model (:class:`nn.Module`): module to be parallelized.
+        device (str): device type, cuda, mlu or npu. Defaults to cuda.
+
+    Returns:
+        :class:`nn.Module`: the module to be parallelized
+
+    References:
+        .. [1] https://pytorch.org/docs/stable/generated/torch.nn.parallel.
+                     DistributedDataParallel.html
+    """
+    assert device in ['cuda', 'mlu',
+                      'npu'], 'Only available for cuda or mlu or npu devices.'
+    if device == 'npu':
+        from mmcv.device.npu import NPUDistributedDataParallel
+        torch.npu.set_compile_mode(jit_compile=False)
+        ddp_factory['npu'] = NPUDistributedDataParallel
+        model = model.npu()
+    elif device == 'cuda':
+        model = model.cuda()
+    elif device == 'mlu':
+        from mmcv.device.mlu import MLUDistributedDataParallel
+        ddp_factory['mlu'] = MLUDistributedDataParallel
+        model = model.mlu()
+
+    return ddp_factory[device](model, *args, **kwargs)
+
+
+def is_npu_available():
+    """Return whether ``torch.npu`` exists and reports being available."""
+    return torch.npu.is_available() if hasattr(torch, 'npu') else False
+
+
+def is_mlu_available():
+    """Return whether ``torch.is_mlu_available`` exists and reports True."""
+    return torch.is_mlu_available() if hasattr(torch, 'is_mlu_available') else False
+
+
+def get_device():
+    """Return the first available device among npu, cuda, mlu, else cpu."""
+    probes = (('npu', is_npu_available()),
+              ('cuda', torch.cuda.is_available()),
+              ('mlu', is_mlu_available()))
+    for device_name, available in probes:
+        if available:
+            return device_name
+    return 'cpu'
diff --git a/mmdet/utils/util_mixins.py b/mmdet/utils/util_mixins.py
new file mode 100755
index 0000000..b83b661
--- /dev/null
+++ b/mmdet/utils/util_mixins.py
@@ -0,0 +1,105 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""This module defines the :class:`NiceRepr` mixin class, which defines a
+``__repr__`` and ``__str__`` method that only depend on a custom ``__nice__``
+method, which you must define. This means you only have to overload one
+function instead of two.  Furthermore, if the object defines a ``__len__``
+method, then the ``__nice__`` method defaults to something sensible, otherwise
+it is treated as abstract and raises ``NotImplementedError``.
+
+To use simply have your object inherit from :class:`NiceRepr`
+(multi-inheritance should be ok).
+
+This code was copied from the ubelt library: https://github.com/Erotemic/ubelt
+
+Example:
+    >>> # Objects that define __nice__ have a default __str__ and __repr__
+    >>> class Student(NiceRepr):
+    ...    def __init__(self, name):
+    ...        self.name = name
+    ...    def __nice__(self):
+    ...        return self.name
+    >>> s1 = Student('Alice')
+    >>> s2 = Student('Bob')
+    >>> print(f's1 = {s1}')
+    >>> print(f's2 = {s2}')
+    s1 = <Student(Alice)>
+    s2 = <Student(Bob)>
+
+Example:
+    >>> # Objects that define __len__ have a default __nice__
+    >>> class Group(NiceRepr):
+    ...    def __init__(self, data):
+    ...        self.data = data
+    ...    def __len__(self):
+    ...        return len(self.data)
+    >>> g = Group([1, 2, 3])
+    >>> print(f'g = {g}')
+    g = <Group(3)>
+"""
+import warnings
+
+
+class NiceRepr:
+    """Mixin that builds ``__str__`` and ``__repr__`` from one ``__nice__``
+    method.
+
+    Subclasses of :class:`NiceRepr` should redefine ``__nice__`` to return a
+    short summary string. If the subclass has a ``__len__`` method, the
+    default ``__nice__`` returns its length; otherwise printing falls back
+    to ``object.__repr__`` and emits a ``RuntimeWarning``.
+
+    Example:
+        >>> class Foo(NiceRepr):
+        ...    def __nice__(self):
+        ...        return 'info'
+        >>> foo = Foo()
+        >>> assert str(foo) == '<Foo(info)>'
+        >>> assert repr(foo).startswith('<Foo(info) at ')
+
+    Example:
+        >>> class Bar(NiceRepr):
+        ...    pass
+        >>> bar = Bar()
+        >>> import pytest
+        >>> with pytest.warns(None) as record:
+        >>>     assert 'object at' in str(bar)
+        >>>     assert 'object at' in repr(bar)
+
+    Example:
+        >>> class Baz(NiceRepr):
+        ...    def __len__(self):
+        ...        return 5
+        >>> baz = Baz()
+        >>> assert str(baz) == '<Baz(5)>'
+    """
+
+    def __nice__(self):
+        """str: a "nice" summary string describing this module"""
+        if not hasattr(self, '__len__'):
+            # Without __len__ there is no sensible default, so subclasses
+            # are forced to overload __nice__
+            raise NotImplementedError(
+                f'Define the __nice__ method for {self.__class__!r}')
+        # It is a common pattern for objects to use __len__ in __nice__,
+        # so the length is used as a convenient default summary
+        return str(len(self))
+
+    def __repr__(self):
+        """str: ``<ClassName(nice) at 0x...>``, or the default repr"""
+        try:
+            summary = self.__nice__()
+        except NotImplementedError as err:
+            warnings.warn(str(err), category=RuntimeWarning)
+            return object.__repr__(self)
+        cls_name = self.__class__.__name__
+        return f'<{cls_name}({summary}) at {hex(id(self))}>'
+
+    def __str__(self):
+        """str: ``<ClassName(nice)>``, or the default repr"""
+        try:
+            summary = self.__nice__()
+        except NotImplementedError as err:
+            warnings.warn(str(err), category=RuntimeWarning)
+            return object.__repr__(self)
+        cls_name = self.__class__.__name__
+        return f'<{cls_name}({summary})>'
diff --git a/mmdet/utils/util_random.py b/mmdet/utils/util_random.py
new file mode 100755
index 0000000..dc1ecb6
--- /dev/null
+++ b/mmdet/utils/util_random.py
@@ -0,0 +1,34 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Helpers for random number generators."""
+import numpy as np
+
+
+def ensure_rng(rng=None):
+    """Coerces input into a random number generator.
+
+    If the input is None, then a global random state is returned.
+
+    If the input is an integer (a Python int or a numpy integer), then that
+    is used as a seed to construct a random state. Otherwise the input is
+    returned as-is.
+
+    Adapted from [1]_.
+
+    Args:
+        rng (int | numpy.integer | numpy.random.RandomState | None):
+            if None, then defaults to the global rng. Otherwise this can be an
+            integer seed or a RandomState class
+    Returns:
+        (numpy.random.RandomState) : rng -
+            a numpy random number generator
+
+    References:
+        .. [1] https://gitlab.kitware.com/computer-vision/kwarray/blob/master/kwarray/util_random.py#L270  # noqa: E501
+    """
+
+    if rng is None:
+        # numpy's global RandomState instance
+        return np.random.mtrand._rand
+    if isinstance(rng, (int, np.integer)):
+        return np.random.RandomState(rng)
+    return rng
diff --git a/mmdet/version.py b/mmdet/version.py
new file mode 100755
index 0000000..fecd645
--- /dev/null
+++ b/mmdet/version.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+__version__ = '2.28.2'
+short_version = __version__
+
+
+def parse_version_info(version_str):
+    """Parse a version string such as ``'2.28.2'`` or ``'1.0.0rc1'``."""
+    collected = []
+    for part in version_str.split('.'):
+        if part.isdigit():
+            collected.append(int(part))
+        elif 'rc' in part:
+            number, candidate = part.split('rc')[:2]
+            collected.extend((int(number), f'rc{candidate}'))
+    return tuple(collected)
+
+
+version_info = parse_version_info(__version__)
diff --git a/model-index.yml b/model-index.yml
new file mode 100755
index 0000000..587255b
--- /dev/null
+++ b/model-index.yml
@@ -0,0 +1,73 @@
+Import:
+  - configs/atss/metafile.yml
+  - configs/autoassign/metafile.yml
+  - configs/carafe/metafile.yml
+  - configs/cascade_rcnn/metafile.yml
+  - configs/cascade_rpn/metafile.yml
+  - configs/centernet/metafile.yml
+  - configs/centripetalnet/metafile.yml
+  - configs/cornernet/metafile.yml
+  - configs/convnext/metafile.yml
+  - configs/dcn/metafile.yml
+  - configs/dcnv2/metafile.yml
+  - configs/deformable_detr/metafile.yml
+  - configs/detectors/metafile.yml
+  - configs/detr/metafile.yml
+  - configs/double_heads/metafile.yml
+  - configs/dyhead/metafile.yml
+  - configs/dynamic_rcnn/metafile.yml
+  - configs/efficientnet/metafile.yml
+  - configs/empirical_attention/metafile.yml
+  - configs/faster_rcnn/metafile.yml
+  - configs/fcos/metafile.yml
+  - configs/foveabox/metafile.yml
+  - configs/fpg/metafile.yml
+  - configs/free_anchor/metafile.yml
+  - configs/fsaf/metafile.yml
+  - configs/gcnet/metafile.yml
+  - configs/gfl/metafile.yml
+  - configs/ghm/metafile.yml
+  - configs/gn/metafile.yml
+  - configs/gn+ws/metafile.yml
+  - configs/grid_rcnn/metafile.yml
+  - configs/groie/metafile.yml
+  - configs/guided_anchoring/metafile.yml
+  - configs/hrnet/metafile.yml
+  - configs/htc/metafile.yml
+  - configs/instaboost/metafile.yml
+  - configs/lad/metafile.yml
+  - configs/ld/metafile.yml
+  - configs/libra_rcnn/metafile.yml
+  - configs/mask_rcnn/metafile.yml
+  - configs/ms_rcnn/metafile.yml
+  - configs/nas_fcos/metafile.yml
+  - configs/nas_fpn/metafile.yml
+  - configs/openimages/metafile.yml
+  - configs/paa/metafile.yml
+  - configs/pafpn/metafile.yml
+  - configs/panoptic_fpn/metafile.yml
+  - configs/pvt/metafile.yml
+  - configs/pisa/metafile.yml
+  - configs/point_rend/metafile.yml
+  - configs/queryinst/metafile.yml
+  - configs/regnet/metafile.yml
+  - configs/reppoints/metafile.yml
+  - configs/res2net/metafile.yml
+  - configs/resnest/metafile.yml
+  - configs/retinanet/metafile.yml
+  - configs/sabl/metafile.yml
+  - configs/scnet/metafile.yml
+  - configs/scratch/metafile.yml
+  - configs/seesaw_loss/metafile.yml
+  - configs/sparse_rcnn/metafile.yml
+  - configs/solo/metafile.yml
+  - configs/ssd/metafile.yml
+  - configs/swin/metafile.yml
+  - configs/tridentnet/metafile.yml
+  - configs/tood/metafile.yml
+  - configs/vfnet/metafile.yml
+  - configs/yolact/metafile.yml
+  - configs/yolo/metafile.yml
+  - configs/yolof/metafile.yml
+  - configs/yolox/metafile.yml
+  - configs/rfnext/metafile.yml
diff --git a/pytest.ini b/pytest.ini
new file mode 100755
index 0000000..9796e87
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,7 @@
+[pytest]
+addopts = --xdoctest --xdoctest-style=auto
+norecursedirs = .git ignore build __pycache__ data docker docs .eggs
+
+filterwarnings= default
+                ignore:.*No cfgstr given in Cacher constructor or call.*:Warning
+                ignore:.*Define the __nice__ method for.*:Warning
diff --git a/requirements.txt b/requirements.txt
new file mode 100755
index 0000000..6981bd7
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+-r requirements/build.txt
+-r requirements/optional.txt
+-r requirements/runtime.txt
+-r requirements/tests.txt
diff --git a/requirements/albu.txt b/requirements/albu.txt
new file mode 100755
index 0000000..f421fbb
--- /dev/null
+++ b/requirements/albu.txt
@@ -0,0 +1 @@
+albumentations>=0.3.2 --no-binary qudida,albumentations
diff --git a/requirements/build.txt b/requirements/build.txt
new file mode 100755
index 0000000..8155829
--- /dev/null
+++ b/requirements/build.txt
@@ -0,0 +1,3 @@
+# These must be installed before building mmdetection
+cython
+numpy
diff --git a/requirements/docs.txt b/requirements/docs.txt
new file mode 100755
index 0000000..b562600
--- /dev/null
+++ b/requirements/docs.txt
@@ -0,0 +1,8 @@
+docutils==0.16.0
+markdown>=3.4.0
+myst-parser
+-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
+sphinx==5.3.0
+sphinx-copybutton
+sphinx_markdown_tables>=0.0.17
+sphinx_rtd_theme
diff --git a/requirements/mminstall.txt b/requirements/mminstall.txt
new file mode 100755
index 0000000..b53dbf4
--- /dev/null
+++ b/requirements/mminstall.txt
@@ -0,0 +1 @@
+mmcv-full>=1.3.17
diff --git a/requirements/optional.txt b/requirements/optional.txt
new file mode 100755
index 0000000..4f0065a
--- /dev/null
+++ b/requirements/optional.txt
@@ -0,0 +1,3 @@
+cityscapesscripts
+imagecorruptions
+scikit-learn
diff --git a/requirements/readthedocs.txt b/requirements/readthedocs.txt
new file mode 100755
index 0000000..e1bf21b
--- /dev/null
+++ b/requirements/readthedocs.txt
@@ -0,0 +1,4 @@
+mmcv
+scipy
+torch
+torchvision
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
new file mode 100755
index 0000000..c815aef
--- /dev/null
+++ b/requirements/runtime.txt
@@ -0,0 +1,6 @@
+matplotlib
+numpy
+pycocotools
+scipy
+six
+terminaltables
diff --git a/requirements/tests.txt b/requirements/tests.txt
new file mode 100755
index 0000000..2ff795a
--- /dev/null
+++ b/requirements/tests.txt
@@ -0,0 +1,15 @@
+asynctest
+codecov
+flake8
+interrogate
+isort==4.3.21
+# Note: used for kwarray.group_items, this may be ported to mmcv in the future.
+kwarray
+-e git+https://github.com/open-mmlab/mmtracking#egg=mmtrack
+onnx==1.7.0
+onnxruntime>=1.8.0
+protobuf<=3.20.1
+pytest
+ubelt
+xdoctest>=0.10.0
+yapf
diff --git a/resources/coco_test_12510.jpg b/resources/coco_test_12510.jpg
new file mode 100755
index 0000000..1271ae1
Binary files /dev/null and b/resources/coco_test_12510.jpg differ
diff --git a/resources/corruptions_sev_3.png b/resources/corruptions_sev_3.png
new file mode 100755
index 0000000..bbbd19a
Binary files /dev/null and b/resources/corruptions_sev_3.png differ
diff --git a/resources/data_pipeline.png b/resources/data_pipeline.png
new file mode 100755
index 0000000..6ac3fee
Binary files /dev/null and b/resources/data_pipeline.png differ
diff --git a/resources/loss_curve.png b/resources/loss_curve.png
new file mode 100755
index 0000000..0242555
Binary files /dev/null and b/resources/loss_curve.png differ
diff --git a/resources/mmdet-logo.png b/resources/mmdet-logo.png
new file mode 100755
index 0000000..a0b6fbd
Binary files /dev/null and b/resources/mmdet-logo.png differ
diff --git a/resources/zhihu_qrcode.jpg b/resources/zhihu_qrcode.jpg
new file mode 100755
index 0000000..c745fb0
Binary files /dev/null and b/resources/zhihu_qrcode.jpg differ
diff --git a/setup.cfg b/setup.cfg
new file mode 100755
index 0000000..6072221
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,21 @@
+[isort]
+line_length = 79
+multi_line_output = 0
+extra_standard_library = setuptools
+known_first_party = mmdet
+known_third_party = PIL,asynctest,cityscapesscripts,cv2,gather_models,matplotlib,mmcv,numpy,onnx,onnxruntime,pycocotools,pytest,pytorch_sphinx_theme,requests,scipy,seaborn,six,terminaltables,torch,ts,yaml
+no_lines_before = STDLIB,LOCALFOLDER
+default_section = THIRDPARTY
+
+[yapf]
+BASED_ON_STYLE = pep8
+BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
+SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
+
+# ignore-words-list needs to be lowercase format. For example, if we want to
+# ignore word "BA", then we need to append "ba" to ignore-words-list rather
+# than "BA"
+[codespell]
+skip = *.ipynb
+quiet-level = 3
+ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids,TOOD,tood,ba,warmup,nam,dota,DOTA
diff --git a/setup.py b/setup.py
new file mode 100755
index 0000000..535d90e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,220 @@
+#!/usr/bin/env python
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+import platform
+import shutil
+import sys
+import warnings
+from setuptools import find_packages, setup
+
+import torch
+from torch.utils.cpp_extension import (BuildExtension, CppExtension,
+                                       CUDAExtension)
+
+
+def readme():
+    with open('README.md', encoding='utf-8') as f:
+        return f.read()  # whole README text, used as the long description
+
+
+version_file = 'mmdet/version.py'
+
+
+def get_version():
+    scope = {}  # explicit namespace: locals() is only a snapshot on 3.13+
+    with open(version_file, 'r') as f:
+        exec(compile(f.read(), version_file, 'exec'), scope)
+    return scope['__version__']
+
+
+def make_cuda_ext(name, module, sources, sources_cuda=()):
+    """Build the ``{module}.{name}`` extension, with CUDA when enabled.
+
+    CUDA sources are compiled in if torch sees a device or FORCE_CUDA=1 is
+    set. The ``sources`` list passed by the caller is never modified.
+    """
+    define_macros = []
+    extra_compile_args = {'cxx': []}
+    sources = list(sources)  # private copy: never extend the caller's list
+    if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
+        define_macros += [('WITH_CUDA', None)]
+        extension = CUDAExtension
+        extra_compile_args['nvcc'] = [
+            '-D__CUDA_NO_HALF_OPERATORS__', '-D__CUDA_NO_HALF_CONVERSIONS__',
+            '-D__CUDA_NO_HALF2_OPERATORS__']
+        sources.extend(sources_cuda)
+    else:
+        print(f'Compiling {name} without CUDA')
+        extension = CppExtension
+    return extension(
+        name=f'{module}.{name}', define_macros=define_macros,
+        sources=[os.path.join(*module.split('.'), p) for p in sources],
+        extra_compile_args=extra_compile_args)
+
+
+def parse_requirements(fname='requirements.txt', with_version=True):
+    """Parse the package dependencies listed in a requirements file.
+
+    ``-r`` lines are followed into the referenced file, ``-e`` lines yield
+    the name after ``#egg=`` and ``@git+`` lines are kept verbatim. Trailing
+    pip options (e.g. ``--no-binary``) are dropped.
+
+    Args:
+        fname (str): path to requirements file
+        with_version (bool, default=True): if True include version specs,
+            otherwise only the package names are returned
+
+    Returns:
+        List[str]: list of requirements items (empty if ``fname`` is missing)
+
+    CommandLine:
+        python -c "import setup; print(setup.parse_requirements())"
+    """
+    import re
+    from os.path import exists
+    require_fpath = fname
+    # Two-character operators come first so '>=' is never split as '>'
+    version_pat = '(>=|<=|==|!=|~=|>|<)'
+
+    def parse_line(line):
+        """Parse information from a line in a requirements text file."""
+        if line.startswith('-r '):
+            # Allow specifying requirements in other files
+            target = line.split(' ')[1]
+            for info in parse_require_file(target):
+                yield info
+        else:
+            info = {'line': line}
+            if line.startswith('-e '):
+                info['package'] = line.split('#egg=')[1]
+            elif '@git+' in line:
+                info['package'] = line
+            else:
+                # Drop pip-only options, then split name from version spec
+                spec = line.split(' --')[0]
+                parts = re.split(version_pat, spec, maxsplit=1)
+                parts = [p.strip() for p in parts]
+                info['package'] = parts[0]
+                if len(parts) > 1:
+                    op, rest = parts[1:]
+                    if ';' in rest:
+                        # Handle platform specific dependencies
+                        # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies
+                        version, platform_deps = map(str.strip,
+                                                     rest.split(';'))
+                        info['platform_deps'] = platform_deps
+                    else:
+                        version = rest  # NOQA
+                    info['version'] = (op, version)
+            yield info
+
+    def parse_require_file(fpath):
+        with open(fpath, 'r') as f:
+            for line in f:
+                line = line.strip()
+                if line and not line.startswith('#'):
+                    for info in parse_line(line):
+                        yield info
+
+    def gen_packages_items():
+        if exists(require_fpath):
+            for info in parse_require_file(require_fpath):
+                parts = [info['package']]
+                if with_version and 'version' in info:
+                    parts.extend(info['version'])
+                platform_deps = info.get('platform_deps')
+                if platform_deps is not None:
+                    parts.append(';' + platform_deps)
+                yield ''.join(parts)
+
+    return list(gen_packages_items())
+
+
+def add_mim_extension():
+    """Add extra files that are required to support MIM into the package.
+
+    These files will be added by creating a symlink to the originals if the
+    package is installed in `editable` mode (e.g. pip install -e .), or by
+    copying from the originals otherwise. Nothing is done for any command
+    other than `develop`, `sdist` and `bdist_wheel`.
+    """
+
+    # parse installment mode
+    if 'develop' in sys.argv:
+        # installed by `pip install -e .`
+        # symlink fails on Windows, so fall back to `copy` mode there.
+        mode = 'copy' if platform.system() == 'Windows' else 'symlink'
+    elif 'sdist' in sys.argv or 'bdist_wheel' in sys.argv:
+        # installed by `pip install .`
+        # or create source distribution by `python setup.py sdist`
+        mode = 'copy'
+    else:
+        return
+
+    filenames = ['tools', 'configs', 'demo', 'model-index.yml']
+    repo_path = osp.dirname(__file__)
+    mim_path = osp.join(repo_path, 'mmdet', '.mim')
+    os.makedirs(mim_path, exist_ok=True)
+
+    for filename in filenames:
+        src_path = osp.join(repo_path, filename)
+        tar_path = osp.join(mim_path, filename)
+        # Test the path that is actually linked/copied, so the result does
+        # not depend on the current working directory.
+        if not osp.exists(src_path):
+            continue
+
+        # Clear any file, link or directory already present at the target.
+        if osp.isfile(tar_path) or osp.islink(tar_path):
+            os.remove(tar_path)
+        elif osp.isdir(tar_path):
+            shutil.rmtree(tar_path)
+
+        if mode == 'symlink':
+            # Link through a path relative to the target's own directory.
+            src_relpath = osp.relpath(src_path, osp.dirname(tar_path))
+            os.symlink(src_relpath, tar_path)
+        elif osp.isfile(src_path):
+            shutil.copyfile(src_path, tar_path)
+        elif osp.isdir(src_path):
+            shutil.copytree(src_path, tar_path)
+        else:
+            warnings.warn(f'Cannot copy file {src_path}.')
+
+
+if __name__ == '__main__':
+    add_mim_extension()  # adds the MIM extras under mmdet/.mim
+    setup(
+        name='mmdet',
+        version=get_version(),  # read from mmdet/version.py
+        description='OpenMMLab Detection Toolbox and Benchmark',
+        long_description=readme(),  # contents of README.md
+        long_description_content_type='text/markdown',
+        author='MMDetection Contributors',
+        author_email='openmmlab@gmail.com',
+        keywords='computer vision, object detection',
+        url='https://github.com/open-mmlab/mmdetection',
+        packages=find_packages(exclude=('configs', 'tools', 'demo')),
+        include_package_data=True,
+        classifiers=[
+            'Development Status :: 5 - Production/Stable',
+            'License :: OSI Approved :: Apache Software License',
+            'Operating System :: OS Independent',
+            'Programming Language :: Python :: 3',
+            'Programming Language :: Python :: 3.7',
+            'Programming Language :: Python :: 3.8',
+            'Programming Language :: Python :: 3.9',
+        ],
+        license='Apache License 2.0',
+        install_requires=parse_requirements('requirements/runtime.txt'),
+        extras_require={
+            'all': parse_requirements('requirements.txt'),
+            'tests': parse_requirements('requirements/tests.txt'),
+            'build': parse_requirements('requirements/build.txt'),
+            'optional': parse_requirements('requirements/optional.txt'),
+            'mim': parse_requirements('requirements/mminstall.txt'),
+        },
+        ext_modules=[],  # no extension modules are declared here
+        cmdclass={'build_ext': BuildExtension},
+        zip_safe=False)
diff --git a/tests/data/VOCdevkit/VOC2007/Annotations/000001.xml b/tests/data/VOCdevkit/VOC2007/Annotations/000001.xml
new file mode 100755
index 0000000..795d398
--- /dev/null
+++ b/tests/data/VOCdevkit/VOC2007/Annotations/000001.xml
@@ -0,0 +1,44 @@
+<annotation>
+	<folder>VOC2007</folder>
+	<filename>000001.jpg</filename>
+	<source>
+		<database>The VOC2007 Database</database>
+		<annotation>PASCAL VOC2007</annotation>
+		<image>flickr</image>
+		<flickrid>341012865</flickrid>
+	</source>
+	<owner>
+		<flickrid>Fried Camels</flickrid>
+		<name>Jinky the Fruit Bat</name>
+	</owner>
+	<size>
+		<width>353</width>
+		<height>500</height>
+		<depth>3</depth>
+	</size>
+	<segmented>0</segmented>
+	<object>
+		<name>dog</name>
+		<pose>Left</pose>
+		<truncated>1</truncated>
+		<difficult>0</difficult>
+		<bndbox>
+			<xmin>48</xmin>
+			<ymin>240</ymin>
+			<xmax>195</xmax>
+			<ymax>371</ymax>
+		</bndbox>
+	</object>
+	<object>
+		<name>person</name>
+		<pose>Left</pose>
+		<truncated>1</truncated>
+		<difficult>0</difficult>
+		<bndbox>
+			<xmin>8</xmin>
+			<ymin>12</ymin>
+			<xmax>352</xmax>
+			<ymax>498</ymax>
+		</bndbox>
+	</object>
+</annotation>
diff --git a/tests/data/VOCdevkit/VOC2007/ImageSets/Main/test.txt b/tests/data/VOCdevkit/VOC2007/ImageSets/Main/test.txt
new file mode 100755
index 0000000..a12b836
--- /dev/null
+++ b/tests/data/VOCdevkit/VOC2007/ImageSets/Main/test.txt
@@ -0,0 +1 @@
+000001
diff --git a/tests/data/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt b/tests/data/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt
new file mode 100755
index 0000000..a12b836
--- /dev/null
+++ b/tests/data/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt
@@ -0,0 +1 @@
+000001
diff --git a/tests/data/VOCdevkit/VOC2007/JPEGImages/000001.jpg b/tests/data/VOCdevkit/VOC2007/JPEGImages/000001.jpg
new file mode 100755
index 0000000..8f96ee5
Binary files /dev/null and b/tests/data/VOCdevkit/VOC2007/JPEGImages/000001.jpg differ
diff --git a/tests/data/VOCdevkit/VOC2012/Annotations/000001.xml b/tests/data/VOCdevkit/VOC2012/Annotations/000001.xml
new file mode 100755
index 0000000..81e70d9
--- /dev/null
+++ b/tests/data/VOCdevkit/VOC2012/Annotations/000001.xml
@@ -0,0 +1,32 @@
+<annotation>
+	<folder>VOC2007</folder>
+	<filename>000002.jpg</filename>
+	<source>
+		<database>The VOC2007 Database</database>
+		<annotation>PASCAL VOC2007</annotation>
+		<image>flickr</image>
+		<flickrid>329145082</flickrid>
+	</source>
+	<owner>
+		<flickrid>hiromori2</flickrid>
+		<name>Hiroyuki Mori</name>
+	</owner>
+	<size>
+		<width>335</width>
+		<height>500</height>
+		<depth>3</depth>
+	</size>
+	<segmented>0</segmented>
+	<object>
+		<name>train</name>
+		<pose>Unspecified</pose>
+		<truncated>0</truncated>
+		<difficult>0</difficult>
+		<bndbox>
+			<xmin>139</xmin>
+			<ymin>200</ymin>
+			<xmax>207</xmax>
+			<ymax>301</ymax>
+		</bndbox>
+	</object>
+</annotation>
diff --git a/tests/data/VOCdevkit/VOC2012/ImageSets/Main/test.txt b/tests/data/VOCdevkit/VOC2012/ImageSets/Main/test.txt
new file mode 100755
index 0000000..a12b836
--- /dev/null
+++ b/tests/data/VOCdevkit/VOC2012/ImageSets/Main/test.txt
@@ -0,0 +1 @@
+000001
diff --git a/tests/data/VOCdevkit/VOC2012/ImageSets/Main/trainval.txt b/tests/data/VOCdevkit/VOC2012/ImageSets/Main/trainval.txt
new file mode 100755
index 0000000..a12b836
--- /dev/null
+++ b/tests/data/VOCdevkit/VOC2012/ImageSets/Main/trainval.txt
@@ -0,0 +1 @@
+000001
diff --git a/tests/data/VOCdevkit/VOC2012/JPEGImages/000001.jpg b/tests/data/VOCdevkit/VOC2012/JPEGImages/000001.jpg
new file mode 100755
index 0000000..81aafa2
Binary files /dev/null and b/tests/data/VOCdevkit/VOC2012/JPEGImages/000001.jpg differ
diff --git a/tests/data/coco_sample.json b/tests/data/coco_sample.json
new file mode 100755
index 0000000..b66cdf3
--- /dev/null
+++ b/tests/data/coco_sample.json
@@ -0,0 +1,77 @@
+{
+    "images": [
+        {
+            "file_name": "fake1.jpg",
+            "height": 800,
+            "width": 800,
+            "id": 0
+        },
+        {
+            "file_name": "fake2.jpg",
+            "height": 800,
+            "width": 800,
+            "id": 1
+        },
+        {
+            "file_name": "fake3.jpg",
+            "height": 800,
+            "width": 800,
+            "id": 2
+        }
+    ],
+    "annotations": [
+        {
+            "bbox": [
+                0,
+                0,
+                20,
+                20
+            ],
+            "area": 400.00,
+            "score": 1.0,
+            "category_id": 1,
+            "id": 1,
+            "image_id": 0
+        },
+        {
+            "bbox": [
+                0,
+                0,
+                20,
+                20
+            ],
+            "area": 400.00,
+            "score": 1.0,
+            "category_id": 2,
+            "id": 2,
+            "image_id": 0
+        },
+        {
+            "bbox": [
+                0,
+                0,
+                20,
+                20
+            ],
+            "area": 400.00,
+            "score": 1.0,
+            "category_id": 1,
+            "id": 3,
+            "image_id": 1
+        }
+    ],
+    "categories": [
+        {
+            "id": 1,
+            "name": "bus",
+            "supercategory": "none"
+        },
+        {
+            "id": 2,
+            "name": "car",
+            "supercategory": "none"
+        }
+    ],
+    "licenses": [],
+    "info": null
+}
diff --git a/tests/data/color.jpg b/tests/data/color.jpg
new file mode 100755
index 0000000..05d62b8
Binary files /dev/null and b/tests/data/color.jpg differ
diff --git a/tests/data/configs_mmtrack/faster_rcnn_r50_dc5.py b/tests/data/configs_mmtrack/faster_rcnn_r50_dc5.py
new file mode 100755
index 0000000..d0d2dc2
--- /dev/null
+++ b/tests/data/configs_mmtrack/faster_rcnn_r50_dc5.py
@@ -0,0 +1,113 @@
+model = dict(
+    detector=dict(
+        type='FasterRCNN',
+        backbone=dict(
+            type='ResNet',
+            depth=18,
+            base_channels=2,
+            num_stages=4,
+            out_indices=(3, ),
+            strides=(1, 2, 2, 1),
+            dilations=(1, 1, 1, 2),
+            frozen_stages=1,
+            norm_cfg=dict(type='BN', requires_grad=True),
+            norm_eval=True,
+            style='pytorch'),
+        neck=dict(
+            type='ChannelMapper',
+            in_channels=[16],
+            out_channels=16,
+            kernel_size=3),
+        rpn_head=dict(
+            type='RPNHead',
+            in_channels=16,
+            feat_channels=16,
+            anchor_generator=dict(
+                type='AnchorGenerator',
+                scales=[4, 8, 16, 32],
+                ratios=[0.5, 1.0, 2.0],
+                strides=[16]),
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[.0, .0, .0, .0],
+                target_stds=[1.0, 1.0, 1.0, 1.0]),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox=dict(
+                type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+        roi_head=dict(
+            type='StandardRoIHead',
+            bbox_roi_extractor=dict(
+                type='SingleRoIExtractor',
+                roi_layer=dict(
+                    type='RoIAlign', output_size=7, sampling_ratio=2),
+                out_channels=16,
+                featmap_strides=[16]),
+            bbox_head=dict(
+                type='Shared2FCBBoxHead',
+                in_channels=16,
+                fc_out_channels=32,
+                roi_feat_size=7,
+                num_classes=30,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.2, 0.2, 0.2, 0.2]),
+                reg_class_agnostic=False,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0))),
+        # detector training and testing settings
+        train_cfg=dict(
+            rpn=dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.7,
+                    neg_iou_thr=0.3,
+                    min_pos_iou=0.3,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=256,
+                    pos_fraction=0.5,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=False),
+                allowed_border=0,
+                pos_weight=-1,
+                debug=False),
+            rpn_proposal=dict(
+                nms_pre=6000,
+                max_per_img=600,
+                nms=dict(type='nms', iou_threshold=0.7),
+                min_bbox_size=0),
+            rcnn=dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.5,
+                    min_pos_iou=0.5,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=256,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                pos_weight=-1,
+                debug=False)),
+        test_cfg=dict(
+            rpn=dict(
+                nms_pre=6000,
+                max_per_img=300,
+                nms=dict(type='nms', iou_threshold=0.7),
+                min_bbox_size=0),
+            rcnn=dict(
+                score_thr=0.0001,
+                nms=dict(type='nms', iou_threshold=0.5),
+                max_per_img=100))
+        # soft-nms is also supported for rcnn testing
+        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
+    ))
diff --git a/tests/data/configs_mmtrack/faster_rcnn_r50_fpn.py b/tests/data/configs_mmtrack/faster_rcnn_r50_fpn.py
new file mode 100755
index 0000000..09de216
--- /dev/null
+++ b/tests/data/configs_mmtrack/faster_rcnn_r50_fpn.py
@@ -0,0 +1,109 @@
+model = dict(
+    detector=dict(
+        type='FasterRCNN',
+        backbone=dict(
+            type='ResNet',
+            depth=18,
+            base_channels=2,
+            num_stages=4,
+            out_indices=(0, 1, 2, 3),
+            frozen_stages=1,
+            norm_cfg=dict(type='BN', requires_grad=True),
+            norm_eval=True,
+            style='pytorch'),
+        neck=dict(
+            type='FPN', in_channels=[2, 4, 8, 16], out_channels=16,
+            num_outs=5),
+        rpn_head=dict(
+            type='RPNHead',
+            in_channels=16,
+            feat_channels=16,
+            anchor_generator=dict(
+                type='AnchorGenerator',
+                scales=[8],
+                ratios=[0.5, 1.0, 2.0],
+                strides=[4, 8, 16, 32, 64]),
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[.0, .0, .0, .0],
+                target_stds=[1.0, 1.0, 1.0, 1.0]),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox=dict(
+                type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+        roi_head=dict(
+            type='StandardRoIHead',
+            bbox_roi_extractor=dict(
+                type='SingleRoIExtractor',
+                roi_layer=dict(
+                    type='RoIAlign', output_size=7, sampling_ratio=0),
+                out_channels=16,
+                featmap_strides=[4, 8, 16, 32]),
+            bbox_head=dict(
+                type='Shared2FCBBoxHead',
+                in_channels=16,
+                fc_out_channels=32,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=False,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', loss_weight=1.0))),
+        train_cfg=dict(
+            rpn=dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.7,
+                    neg_iou_thr=0.3,
+                    min_pos_iou=0.3,
+                    match_low_quality=True,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=256,
+                    pos_fraction=0.5,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=False),
+                allowed_border=-1,
+                pos_weight=-1,
+                debug=False),
+            rpn_proposal=dict(
+                nms_pre=2000,
+                max_per_img=1000,
+                nms=dict(type='nms', iou_threshold=0.7),
+                min_bbox_size=0),
+            rcnn=dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.5,
+                    min_pos_iou=0.5,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                pos_weight=-1,
+                debug=False)),
+        test_cfg=dict(
+            rpn=dict(
+                nms_pre=1000,
+                max_per_img=1000,
+                nms=dict(type='nms', iou_threshold=0.7),
+                min_bbox_size=0),
+            rcnn=dict(
+                score_thr=0.05,
+                nms=dict(type='nms', iou_threshold=0.5),
+                max_per_img=100))
+        # soft-nms is also supported for rcnn testing
+        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
+    ))
diff --git a/tests/data/configs_mmtrack/mot_challenge.py b/tests/data/configs_mmtrack/mot_challenge.py
new file mode 100755
index 0000000..362a979
--- /dev/null
+++ b/tests/data/configs_mmtrack/mot_challenge.py
@@ -0,0 +1,74 @@
+# dataset settings
+dataset_type = 'MOTChallengeDataset'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadMultiImagesFromFile', to_float32=True),
+    dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True),
+    dict(
+        type='SeqResize',
+        img_scale=(1088, 1088),
+        share_params=True,
+        ratio_range=(0.8, 1.2),
+        keep_ratio=True,
+        bbox_clip_border=False),
+    dict(type='SeqPhotoMetricDistortion', share_params=True),
+    dict(
+        type='SeqRandomCrop',
+        share_params=False,
+        crop_size=(1088, 1088),
+        bbox_clip_border=False),
+    dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
+    dict(type='SeqNormalize', **img_norm_cfg),
+    dict(type='SeqPad', size_divisor=32),
+    dict(type='MatchInstances', skip_nomatch=True),
+    dict(
+        type='VideoCollect',
+        keys=[
+            'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices',
+            'gt_instance_ids'
+        ]),
+    dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1088, 1088),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='VideoCollect', keys=['img'])
+        ])
+]
+data_root = 'data/MOT17/'
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        visibility_thr=-1,
+        ann_file=data_root + 'annotations/half-train_cocoformat.json',
+        img_prefix=data_root + 'train',
+        ref_img_sampler=dict(
+            num_ref_imgs=1,
+            frame_range=10,
+            filter_key_img=True,
+            method='uniform'),
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/half-val_cocoformat.json',
+        img_prefix=data_root + 'train',
+        ref_img_sampler=None,
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/half-val_cocoformat.json',
+        img_prefix=data_root + 'train',
+        ref_img_sampler=None,
+        pipeline=test_pipeline))
diff --git a/tests/data/configs_mmtrack/selsa_faster_rcnn_r101_dc5_1x.py b/tests/data/configs_mmtrack/selsa_faster_rcnn_r101_dc5_1x.py
new file mode 100755
index 0000000..a0109d7
--- /dev/null
+++ b/tests/data/configs_mmtrack/selsa_faster_rcnn_r101_dc5_1x.py
@@ -0,0 +1,48 @@
+_base_ = [
+    './faster_rcnn_r50_dc5.py', './mot_challenge.py',
+    '../../../configs/_base_/default_runtime.py'
+]
+model = dict(
+    type='SELSA',
+    pretrains=None,
+    detector=dict(
+        backbone=dict(depth=18, base_channels=2),
+        roi_head=dict(
+            type='SelsaRoIHead',
+            bbox_head=dict(
+                type='SelsaBBoxHead',
+                num_shared_fcs=2,
+                aggregator=dict(
+                    type='SelsaAggregator',
+                    in_channels=32,
+                    num_attention_blocks=16)))))
+
+# dataset settings
+data = dict(
+    val=dict(
+        ref_img_sampler=dict(
+            _delete_=True,
+            num_ref_imgs=14,
+            frame_range=[-7, 7],
+            method='test_with_adaptive_stride')),
+    test=dict(
+        ref_img_sampler=dict(
+            _delete_=True,
+            num_ref_imgs=14,
+            frame_range=[-7, 7],
+            method='test_with_adaptive_stride')))
+
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[2, 5])
+# runtime settings
+total_epochs = 7
+evaluation = dict(metric=['bbox'], interval=7)
diff --git a/tests/data/configs_mmtrack/tracktor_faster-rcnn_r50_fpn_4e.py b/tests/data/configs_mmtrack/tracktor_faster-rcnn_r50_fpn_4e.py
new file mode 100755
index 0000000..e7d6111
--- /dev/null
+++ b/tests/data/configs_mmtrack/tracktor_faster-rcnn_r50_fpn_4e.py
@@ -0,0 +1,70 @@
+_base_ = [
+    './faster_rcnn_r50_fpn.py', './mot_challenge.py',
+    '../../../configs/_base_/default_runtime.py'
+]
+model = dict(
+    type='Tracktor',
+    pretrains=dict(
+        detector=  # noqa: E251
+        'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth',  # noqa: E501
+        reid=  # noqa: E251
+        'https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot17-4bf6b63d.pth'  # noqa: E501
+    ),
+    detector=dict(
+        rpn_head=dict(bbox_coder=dict(clip_border=False)),
+        roi_head=dict(
+            bbox_head=dict(bbox_coder=dict(
+                clip_border=False), num_classes=1))),
+    reid=dict(
+        type='BaseReID',
+        backbone=dict(
+            type='ResNet',
+            depth=18,
+            base_channels=2,
+            num_stages=4,
+            out_indices=(3, ),
+            style='pytorch'),
+        neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1),
+        head=dict(
+            type='LinearReIDHead',
+            num_fcs=1,
+            in_channels=16,
+            fc_channels=32,
+            out_channels=16,
+            num_classes=8,
+            loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
+            loss_pairwise=dict(
+                type='TripletLoss', margin=0.3, loss_weight=1.0),
+            norm_cfg=dict(type='BN1d'),
+            act_cfg=dict(type='ReLU'))),
+    motion=dict(
+        type='CameraMotionCompensation',
+        warp_mode='cv2.MOTION_EUCLIDEAN',
+        num_iters=100,
+        stop_eps=0.00001),
+    tracker=dict(
+        type='TracktorTracker',
+        obj_score_thr=0.5,
+        regression=dict(
+            obj_score_thr=0.5,
+            nms=dict(type='nms', iou_threshold=0.6),
+            match_iou_thr=0.3),
+        reid=dict(
+            num_samples=10,
+            img_scale=(256, 128),
+            img_norm_cfg=None,
+            match_score_thr=2.0,
+            match_iou_thr=0.2),
+        momentums=None,
+        num_frames_retain=10))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=100,
+    warmup_ratio=1.0 / 100,
+    step=[3])
+# runtime settings
+total_epochs = 4
+evaluation = dict(metric=['bbox', 'track'], interval=1)
+search_metrics = ['MOTA', 'IDF1', 'FN', 'FP', 'IDs', 'MT', 'ML']
diff --git a/tests/data/custom_dataset/images/000001.jpg b/tests/data/custom_dataset/images/000001.jpg
new file mode 100755
index 0000000..8f96ee5
Binary files /dev/null and b/tests/data/custom_dataset/images/000001.jpg differ
diff --git a/tests/data/custom_dataset/images/000001.xml b/tests/data/custom_dataset/images/000001.xml
new file mode 100755
index 0000000..795d398
--- /dev/null
+++ b/tests/data/custom_dataset/images/000001.xml
@@ -0,0 +1,44 @@
+<annotation>
+	<folder>VOC2007</folder>
+	<filename>000001.jpg</filename>
+	<source>
+		<database>The VOC2007 Database</database>
+		<annotation>PASCAL VOC2007</annotation>
+		<image>flickr</image>
+		<flickrid>341012865</flickrid>
+	</source>
+	<owner>
+		<flickrid>Fried Camels</flickrid>
+		<name>Jinky the Fruit Bat</name>
+	</owner>
+	<size>
+		<width>353</width>
+		<height>500</height>
+		<depth>3</depth>
+	</size>
+	<segmented>0</segmented>
+	<object>
+		<name>dog</name>
+		<pose>Left</pose>
+		<truncated>1</truncated>
+		<difficult>0</difficult>
+		<bndbox>
+			<xmin>48</xmin>
+			<ymin>240</ymin>
+			<xmax>195</xmax>
+			<ymax>371</ymax>
+		</bndbox>
+	</object>
+	<object>
+		<name>person</name>
+		<pose>Left</pose>
+		<truncated>1</truncated>
+		<difficult>0</difficult>
+		<bndbox>
+			<xmin>8</xmin>
+			<ymin>12</ymin>
+			<xmax>352</xmax>
+			<ymax>498</ymax>
+		</bndbox>
+	</object>
+</annotation>
diff --git a/tests/data/custom_dataset/test.txt b/tests/data/custom_dataset/test.txt
new file mode 100755
index 0000000..a12b836
--- /dev/null
+++ b/tests/data/custom_dataset/test.txt
@@ -0,0 +1 @@
+000001
diff --git a/tests/data/custom_dataset/trainval.txt b/tests/data/custom_dataset/trainval.txt
new file mode 100755
index 0000000..a12b836
--- /dev/null
+++ b/tests/data/custom_dataset/trainval.txt
@@ -0,0 +1 @@
+000001
diff --git a/tests/data/gray.jpg b/tests/data/gray.jpg
new file mode 100755
index 0000000..94edd73
Binary files /dev/null and b/tests/data/gray.jpg differ
diff --git a/tests/test_data/test_datasets/test_coco_dataset.py b/tests/test_data/test_datasets/test_coco_dataset.py
new file mode 100755
index 0000000..77edfdf
--- /dev/null
+++ b/tests/test_data/test_datasets/test_coco_dataset.py
@@ -0,0 +1,58 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import tempfile
+
+import mmcv
+import pytest
+
+from mmdet.datasets import CocoDataset
+
+
+def _create_ids_error_coco_json(json_name):
+    image = {
+        'id': 0,
+        'width': 640,
+        'height': 640,
+        'file_name': 'fake_name.jpg',
+    }
+
+    annotation_1 = {
+        'id': 1,
+        'image_id': 0,
+        'category_id': 0,
+        'area': 400,
+        'bbox': [50, 60, 20, 20],
+        'iscrowd': 0,
+    }
+
+    annotation_2 = {
+        'id': 1,
+        'image_id': 0,
+        'category_id': 0,
+        'area': 900,
+        'bbox': [100, 120, 30, 30],
+        'iscrowd': 0,
+    }
+
+    categories = [{
+        'id': 0,
+        'name': 'car',
+        'supercategory': 'car',
+    }]
+
+    fake_json = {
+        'images': [image],
+        'annotations': [annotation_1, annotation_2],
+        'categories': categories
+    }
+    mmcv.dump(fake_json, json_name)
+
+
+def test_coco_annotation_ids_unique():
+    """CocoDataset must reject an annotation file with duplicated ids."""
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        fake_json_file = osp.join(tmp_dir, 'fake_data.json')
+        _create_ids_error_coco_json(fake_json_file)
+        with pytest.raises(AssertionError):
+            CocoDataset(
+                ann_file=fake_json_file, classes=('car', ), pipeline=[])
diff --git a/tests/test_data/test_datasets/test_coco_occluded.py b/tests/test_data/test_datasets/test_coco_occluded.py
new file mode 100755
index 0000000..8e612d0
--- /dev/null
+++ b/tests/test_data/test_datasets/test_coco_occluded.py
@@ -0,0 +1,38 @@
+import os.path as osp
+from tempfile import TemporaryDirectory
+
+import mmcv
+import numpy as np
+
+from mmdet.datasets import OccludedSeparatedCocoDataset
+
+
+def test_occluded_separated_coco_dataset():
+    ann = [[
+        'fake1.jpg', 'person', 8, [219.9, 176.12, 11.14, 34.23], {
+            'size': [480, 640],
+            'counts': b'nYW31n>2N2FNbA48Kf=?XBDe=m0OM3M4YOPB8_>L4JXao5'
+        }
+    ]] * 3
+    dummy_mask = np.zeros((10, 10), dtype=np.uint8)
+    dummy_mask[:5, :5] = 1
+    rle = {
+        'size': [480, 640],
+        'counts': b'nYW31n>2N2FNbA48Kf=?XBDe=m0OM3M4YOPB8_>L4JXao5'
+    }
+    res = [([np.array([[50, 60, 70, 80, 0.77]])] * 2, [[rle]] * 2)] * 3
+
+    tempdir = TemporaryDirectory()
+    ann_path = osp.join(tempdir.name, 'coco_occluded.pkl')
+    mmcv.dump(ann, ann_path)
+
+    dataset = OccludedSeparatedCocoDataset(
+        ann_file='tests/data/coco_sample.json',
+        occluded_ann=ann_path,
+        separated_ann=ann_path,
+        pipeline=[],
+        test_mode=True)
+    eval_res = dataset.evaluate(res)
+    assert isinstance(eval_res, dict)
+    assert eval_res['occluded_recall'] == 100
+    assert eval_res['separated_recall'] == 100
diff --git a/tests/test_data/test_datasets/test_common.py b/tests/test_data/test_datasets/test_common.py
new file mode 100755
index 0000000..e3070da
--- /dev/null
+++ b/tests/test_data/test_datasets/test_common.py
@@ -0,0 +1,369 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import logging
+import os.path as osp
+import tempfile
+from unittest.mock import MagicMock, patch
+
+import mmcv
+import numpy as np
+import pytest
+import torch
+import torch.nn as nn
+from mmcv.runner import EpochBasedRunner
+from torch.utils.data import DataLoader
+
+from mmdet.core.evaluation import DistEvalHook, EvalHook
+from mmdet.datasets import DATASETS, CocoDataset, CustomDataset, build_dataset
+
+
+def _create_dummy_coco_json(json_name):
+    image = {
+        'id': 0,
+        'width': 640,
+        'height': 640,
+        'file_name': 'fake_name.jpg',
+    }
+
+    annotation_1 = {
+        'id': 1,
+        'image_id': 0,
+        'category_id': 0,
+        'area': 400,
+        'bbox': [50, 60, 20, 20],
+        'iscrowd': 0,
+    }
+
+    annotation_2 = {
+        'id': 2,
+        'image_id': 0,
+        'category_id': 0,
+        'area': 900,
+        'bbox': [100, 120, 30, 30],
+        'iscrowd': 0,
+    }
+
+    annotation_3 = {
+        'id': 3,
+        'image_id': 0,
+        'category_id': 0,
+        'area': 1600,
+        'bbox': [150, 160, 40, 40],
+        'iscrowd': 0,
+    }
+
+    annotation_4 = {
+        'id': 4,
+        'image_id': 0,
+        'category_id': 0,
+        'area': 10000,
+        'bbox': [250, 260, 100, 100],
+        'iscrowd': 0,
+    }
+
+    categories = [{
+        'id': 0,
+        'name': 'car',
+        'supercategory': 'car',
+    }]
+
+    fake_json = {
+        'images': [image],
+        'annotations':
+        [annotation_1, annotation_2, annotation_3, annotation_4],
+        'categories': categories
+    }
+
+    mmcv.dump(fake_json, json_name)
+
+
+def _create_dummy_custom_pkl(pkl_name):
+    fake_pkl = [{
+        'filename': 'fake_name.jpg',
+        'width': 640,
+        'height': 640,
+        'ann': {
+            'bboxes':
+            np.array([[50, 60, 70, 80], [100, 120, 130, 150],
+                      [150, 160, 190, 200], [250, 260, 350, 360]]),
+            'labels':
+            np.array([0, 0, 0, 0])
+        }
+    }]
+    mmcv.dump(fake_pkl, pkl_name)
+
+
+def _create_dummy_results():
+    boxes = [
+        np.array([[50, 60, 70, 80, 1.0], [100, 120, 130, 150, 0.98],
+                  [150, 160, 190, 200, 0.96], [250, 260, 350, 360, 0.95]])
+    ]
+    return [boxes]
+
+
+@pytest.mark.parametrize('config_path',
+                         ['./configs/_base_/datasets/voc0712.py'])
+def test_dataset_init(config_path, monkeypatch):
+    data_config = mmcv.Config.fromfile(config_path)
+    if 'data' not in data_config:
+        return
+
+    monkeypatch.chdir('./tests/')  # to use ./tests/data
+    stage_names = ['train', 'val', 'test']
+    for stage_name in stage_names:
+        dataset_config = copy.deepcopy(data_config.data.get(stage_name))
+        dataset = build_dataset(dataset_config)
+        dataset[0]
+
+
+def test_dataset_evaluation():
+    tmp_dir = tempfile.TemporaryDirectory()
+    # create dummy data
+    fake_json_file = osp.join(tmp_dir.name, 'fake_data.json')
+    _create_dummy_coco_json(fake_json_file)
+
+    # test single coco dataset evaluation
+    coco_dataset = CocoDataset(
+        ann_file=fake_json_file, classes=('car', ), pipeline=[])
+    fake_results = _create_dummy_results()
+    eval_results = coco_dataset.evaluate(fake_results, classwise=True)
+    assert eval_results['bbox_mAP'] == 1
+    assert eval_results['bbox_mAP_50'] == 1
+    assert eval_results['bbox_mAP_75'] == 1
+
+    # test concat dataset evaluation
+    fake_concat_results = _create_dummy_results() + _create_dummy_results()
+
+    # build concat dataset through two config dict
+    coco_cfg = dict(
+        type='CocoDataset',
+        ann_file=fake_json_file,
+        classes=('car', ),
+        pipeline=[])
+    concat_cfgs = [coco_cfg, coco_cfg]
+    concat_dataset = build_dataset(concat_cfgs)
+    eval_results = concat_dataset.evaluate(fake_concat_results)
+    assert eval_results['0_bbox_mAP'] == 1
+    assert eval_results['0_bbox_mAP_50'] == 1
+    assert eval_results['0_bbox_mAP_75'] == 1
+    assert eval_results['1_bbox_mAP'] == 1
+    assert eval_results['1_bbox_mAP_50'] == 1
+    assert eval_results['1_bbox_mAP_75'] == 1
+
+    # build concat dataset through concatenated ann_file
+    coco_cfg = dict(
+        type='CocoDataset',
+        ann_file=[fake_json_file, fake_json_file],
+        classes=('car', ),
+        pipeline=[])
+    concat_dataset = build_dataset(coco_cfg)
+    eval_results = concat_dataset.evaluate(fake_concat_results)
+    assert eval_results['0_bbox_mAP'] == 1
+    assert eval_results['0_bbox_mAP_50'] == 1
+    assert eval_results['0_bbox_mAP_75'] == 1
+    assert eval_results['1_bbox_mAP'] == 1
+    assert eval_results['1_bbox_mAP_50'] == 1
+    assert eval_results['1_bbox_mAP_75'] == 1
+
+    # create dummy data
+    fake_pkl_file = osp.join(tmp_dir.name, 'fake_data.pkl')
+    _create_dummy_custom_pkl(fake_pkl_file)
+
+    # test single custom dataset evaluation
+    custom_dataset = CustomDataset(
+        ann_file=fake_pkl_file, classes=('car', ), pipeline=[])
+    fake_results = _create_dummy_results()
+    eval_results = custom_dataset.evaluate(fake_results)
+    assert eval_results['mAP'] == 1
+
+    # test concat dataset evaluation
+    fake_concat_results = _create_dummy_results() + _create_dummy_results()
+
+    # build concat dataset through two config dict
+    custom_cfg = dict(
+        type='CustomDataset',
+        ann_file=fake_pkl_file,
+        classes=('car', ),
+        pipeline=[])
+    concat_cfgs = [custom_cfg, custom_cfg]
+    concat_dataset = build_dataset(concat_cfgs)
+    eval_results = concat_dataset.evaluate(fake_concat_results)
+    assert eval_results['0_mAP'] == 1
+    assert eval_results['1_mAP'] == 1
+
+    # build concat dataset through concatenated ann_file
+    concat_cfg = dict(
+        type='CustomDataset',
+        ann_file=[fake_pkl_file, fake_pkl_file],
+        classes=('car', ),
+        pipeline=[])
+    concat_dataset = build_dataset(concat_cfg)
+    eval_results = concat_dataset.evaluate(fake_concat_results)
+    assert eval_results['0_mAP'] == 1
+    assert eval_results['1_mAP'] == 1
+
+    # build concat dataset through explicit type
+    concat_cfg = dict(
+        type='ConcatDataset',
+        datasets=[custom_cfg, custom_cfg],
+        separate_eval=False)
+    concat_dataset = build_dataset(concat_cfg)
+    eval_results = concat_dataset.evaluate(fake_concat_results, metric='mAP')
+    assert eval_results['mAP'] == 1
+    assert len(concat_dataset.datasets[0].data_infos) == \
+        len(concat_dataset.datasets[1].data_infos)
+    assert len(concat_dataset.datasets[0].data_infos) == 1
+    tmp_dir.cleanup()
+
+
+@patch('mmdet.apis.single_gpu_test', MagicMock)
+@patch('mmdet.apis.multi_gpu_test', MagicMock)
+@pytest.mark.parametrize('EvalHookParam', (EvalHook, DistEvalHook))
+def test_evaluation_hook(EvalHookParam):
+    # create dummy data
+    dataloader = DataLoader(torch.ones((5, 2)))
+
+    # 0.1. dataloader is not a DataLoader object
+    with pytest.raises(TypeError):
+        EvalHookParam(dataloader=MagicMock(), interval=-1)
+
+    # 0.2. negative interval
+    with pytest.raises(ValueError):
+        EvalHookParam(dataloader, interval=-1)
+
+    # 1. start=None, interval=1: perform evaluation after each epoch.
+    runner = _build_demo_runner()
+    evalhook = EvalHookParam(dataloader, interval=1)
+    evalhook.evaluate = MagicMock()
+    runner.register_hook(evalhook)
+    runner.run([dataloader], [('train', 1)], 2)
+    assert evalhook.evaluate.call_count == 2  # after epoch 1 & 2
+
+    # 2. start=1, interval=1: perform evaluation after each epoch.
+    runner = _build_demo_runner()
+
+    evalhook = EvalHookParam(dataloader, start=1, interval=1)
+    evalhook.evaluate = MagicMock()
+    runner.register_hook(evalhook)
+    runner.run([dataloader], [('train', 1)], 2)
+    assert evalhook.evaluate.call_count == 2  # after epoch 1 & 2
+
+    # 3. start=None, interval=2: perform evaluation after epoch 2, 4, 6, etc
+    runner = _build_demo_runner()
+    evalhook = EvalHookParam(dataloader, interval=2)
+    evalhook.evaluate = MagicMock()
+    runner.register_hook(evalhook)
+    runner.run([dataloader], [('train', 1)], 2)
+    assert evalhook.evaluate.call_count == 1  # after epoch 2
+
+    # 4. start=1, interval=2: perform evaluation after epoch 1, 3, 5, etc
+    runner = _build_demo_runner()
+    evalhook = EvalHookParam(dataloader, start=1, interval=2)
+    evalhook.evaluate = MagicMock()
+    runner.register_hook(evalhook)
+    runner.run([dataloader], [('train', 1)], 3)
+    assert evalhook.evaluate.call_count == 2  # after epoch 1 & 3
+
+    # 5. start=0, interval=1: perform evaluation after each epoch and
+    #    before epoch 1.
+    runner = _build_demo_runner()
+    evalhook = EvalHookParam(dataloader, start=0)
+    evalhook.evaluate = MagicMock()
+    runner.register_hook(evalhook)
+    runner.run([dataloader], [('train', 1)], 2)
+    assert evalhook.evaluate.call_count == 3  # before epoch1 and after e1 & e2
+
+    # 6. start=0, interval=2, dynamic_intervals=[(3, 1)]: the evaluation
+    # interval is 2 when it is less than 3 epoch, otherwise it is 1.
+    runner = _build_demo_runner()
+    evalhook = EvalHookParam(
+        dataloader, start=0, interval=2, dynamic_intervals=[(3, 1)])
+    evalhook.evaluate = MagicMock()
+    runner.register_hook(evalhook)
+    runner.run([dataloader], [('train', 1)], 4)
+    assert evalhook.evaluate.call_count == 3
+
+    # the evaluation start epoch cannot be less than 0
+    runner = _build_demo_runner()
+    with pytest.raises(ValueError):
+        EvalHookParam(dataloader, start=-2)
+
+    evalhook = EvalHookParam(dataloader, start=0)
+    evalhook.evaluate = MagicMock()
+    runner.register_hook(evalhook)
+    runner.run([dataloader], [('train', 1)], 2)
+    assert evalhook.evaluate.call_count == 3  # before epoch1 and after e1 & e2
+
+    # 7. resuming from epoch i, start = x (x<=i), interval =1: perform
+    #    evaluation after each epoch and before the first epoch.
+    runner = _build_demo_runner()
+    evalhook = EvalHookParam(dataloader, start=1)
+    evalhook.evaluate = MagicMock()
+    runner.register_hook(evalhook)
+    runner._epoch = 2
+    runner.run([dataloader], [('train', 1)], 3)
+    assert evalhook.evaluate.call_count == 2  # before & after epoch 3
+
+    # 8. resuming from epoch i, start = i+1/None, interval =1: perform
+    #    evaluation after each epoch.
+    runner = _build_demo_runner()
+    evalhook = EvalHookParam(dataloader, start=2)
+    evalhook.evaluate = MagicMock()
+    runner.register_hook(evalhook)
+    runner._epoch = 1
+    runner.run([dataloader], [('train', 1)], 3)
+    assert evalhook.evaluate.call_count == 2  # after epoch 2 & 3
+
+
+def _build_demo_runner():
+
+    class Model(nn.Module):
+
+        def __init__(self):
+            super().__init__()
+            self.linear = nn.Linear(2, 1)
+
+        def forward(self, x):
+            return self.linear(x)
+
+        def train_step(self, x, optimizer, **kwargs):
+            return dict(loss=self(x))
+
+        def val_step(self, x, optimizer, **kwargs):
+            return dict(loss=self(x))
+
+    model = Model()
+    tmp_dir = tempfile.mkdtemp()
+
+    runner = EpochBasedRunner(
+        model=model, work_dir=tmp_dir, logger=logging.getLogger())
+    return runner
+
+
+@pytest.mark.parametrize('classes, expected_length', [(['bus'], 2),
+                                                      (['car'], 1),
+                                                      (['bus', 'car'], 2)])
+def test_allow_empty_images(classes, expected_length):
+    dataset_class = DATASETS.get('CocoDataset')
+    # Filter empty images
+    filtered_dataset = dataset_class(
+        ann_file='tests/data/coco_sample.json',
+        img_prefix='tests/data',
+        pipeline=[],
+        classes=classes,
+        filter_empty_gt=True)
+
+    # Get all
+    full_dataset = dataset_class(
+        ann_file='tests/data/coco_sample.json',
+        img_prefix='tests/data',
+        pipeline=[],
+        classes=classes,
+        filter_empty_gt=False)
+
+    assert len(filtered_dataset) == expected_length
+    assert len(filtered_dataset.img_ids) == expected_length
+    assert len(full_dataset) == 3
+    assert len(full_dataset.img_ids) == 3
+    assert filtered_dataset.CLASSES == classes
+    assert full_dataset.CLASSES == classes
diff --git a/tests/test_data/test_datasets/test_custom_dataset.py b/tests/test_data/test_datasets/test_custom_dataset.py
new file mode 100755
index 0000000..4dae464
--- /dev/null
+++ b/tests/test_data/test_datasets/test_custom_dataset.py
@@ -0,0 +1,138 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import unittest
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from mmdet.datasets import DATASETS
+
+
+@patch('mmdet.datasets.CocoDataset.load_annotations', MagicMock())
+@patch('mmdet.datasets.CustomDataset.load_annotations', MagicMock())
+@patch('mmdet.datasets.XMLDataset.load_annotations', MagicMock())
+@patch('mmdet.datasets.CityscapesDataset.load_annotations', MagicMock())
+@patch('mmdet.datasets.CocoDataset._filter_imgs', MagicMock)
+@patch('mmdet.datasets.CustomDataset._filter_imgs', MagicMock)
+@patch('mmdet.datasets.XMLDataset._filter_imgs', MagicMock)
+@patch('mmdet.datasets.CityscapesDataset._filter_imgs', MagicMock)
+@pytest.mark.parametrize('dataset',
+                         ['CocoDataset', 'VOCDataset', 'CityscapesDataset'])
+def test_custom_classes_override_default(dataset):
+    dataset_class = DATASETS.get(dataset)
+    if dataset in ['CocoDataset', 'CityscapesDataset']:
+        dataset_class.coco = MagicMock()
+        dataset_class.cat_ids = MagicMock()
+
+    original_classes = dataset_class.CLASSES
+
+    # Test setting classes as a tuple
+    custom_dataset = dataset_class(
+        ann_file=MagicMock(),
+        pipeline=[],
+        classes=('bus', 'car'),
+        test_mode=True,
+        img_prefix='VOC2007' if dataset == 'VOCDataset' else '')
+
+    assert custom_dataset.CLASSES != original_classes
+    assert custom_dataset.CLASSES == ('bus', 'car')
+    print(custom_dataset)
+
+    # Test setting classes as a list
+    custom_dataset = dataset_class(
+        ann_file=MagicMock(),
+        pipeline=[],
+        classes=['bus', 'car'],
+        test_mode=True,
+        img_prefix='VOC2007' if dataset == 'VOCDataset' else '')
+
+    assert custom_dataset.CLASSES != original_classes
+    assert custom_dataset.CLASSES == ['bus', 'car']
+    print(custom_dataset)
+
+    # Test overriding not a subset
+    custom_dataset = dataset_class(
+        ann_file=MagicMock(),
+        pipeline=[],
+        classes=['foo'],
+        test_mode=True,
+        img_prefix='VOC2007' if dataset == 'VOCDataset' else '')
+
+    assert custom_dataset.CLASSES != original_classes
+    assert custom_dataset.CLASSES == ['foo']
+    print(custom_dataset)
+
+    # Test default behavior
+    custom_dataset = dataset_class(
+        ann_file=MagicMock(),
+        pipeline=[],
+        classes=None,
+        test_mode=True,
+        img_prefix='VOC2007' if dataset == 'VOCDataset' else '')
+
+    assert custom_dataset.CLASSES == original_classes
+    print(custom_dataset)
+
+    # Test sending file path (dataset is built while the temp dir exists)
+    import tempfile
+    with tempfile.TemporaryDirectory() as tmpdir:
+        path = osp.join(tmpdir, 'classes.txt')
+        with open(path, 'w') as f:
+            f.write('bus\ncar\n')
+        custom_dataset = dataset_class(
+            ann_file=MagicMock(),
+            pipeline=[],
+            classes=path,
+            test_mode=True,
+            img_prefix='VOC2007' if dataset == 'VOCDataset' else '')
+
+    assert custom_dataset.CLASSES != original_classes
+    assert custom_dataset.CLASSES == ['bus', 'car']
+    print(custom_dataset)
+
+
+class CustomDatasetTests(unittest.TestCase):
+
+    def setUp(self):
+        super().setUp()
+        self.data_dir = osp.join(
+            osp.dirname(osp.dirname(osp.dirname(__file__))), 'data')
+        self.dataset_class = DATASETS.get('XMLDataset')
+
+    def test_data_infos__default_db_directories(self):
+        """Test correct data read having a Pascal VOC directory structure."""
+        test_dataset_root = osp.join(self.data_dir, 'VOCdevkit', 'VOC2007')
+        custom_ds = self.dataset_class(
+            data_root=test_dataset_root,
+            ann_file=osp.join(test_dataset_root, 'ImageSets', 'Main',
+                              'trainval.txt'),
+            pipeline=[],
+            classes=('person', 'dog'),
+            test_mode=True)
+
+        self.assertListEqual([{
+            'id': '000001',
+            'filename': osp.join('JPEGImages', '000001.jpg'),
+            'width': 353,
+            'height': 500
+        }], custom_ds.data_infos)
+
+    def test_data_infos__overridden_db_subdirectories(self):
+        """Test correct data read having a customized directory structure."""
+        test_dataset_root = osp.join(self.data_dir, 'custom_dataset')
+        custom_ds = self.dataset_class(
+            data_root=test_dataset_root,
+            ann_file=osp.join(test_dataset_root, 'trainval.txt'),
+            pipeline=[],
+            classes=('person', 'dog'),
+            test_mode=True,
+            img_prefix='',
+            img_subdir='images',
+            ann_subdir='images')
+
+        self.assertListEqual([{
+            'id': '000001',
+            'filename': osp.join('images', '000001.jpg'),
+            'width': 353,
+            'height': 500
+        }], custom_ds.data_infos)
diff --git a/tests/test_data/test_datasets/test_dataset_wrapper.py b/tests/test_data/test_datasets/test_dataset_wrapper.py
new file mode 100755
index 0000000..ad29678
--- /dev/null
+++ b/tests/test_data/test_datasets/test_dataset_wrapper.py
@@ -0,0 +1,209 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import bisect
+import math
+from collections import defaultdict
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+
+from mmdet.datasets import (ClassBalancedDataset, ConcatDataset, CustomDataset,
+                            MultiImageMixDataset, RepeatDataset)
+
+
+def test_dataset_wrapper():
+    CustomDataset.load_annotations = MagicMock()
+    CustomDataset.__getitem__ = MagicMock(side_effect=lambda idx: idx)
+    dataset_a = CustomDataset(
+        ann_file=MagicMock(), pipeline=[], test_mode=True, img_prefix='')
+    len_a = 10
+    cat_ids_list_a = [
+        np.random.randint(0, 80, num).tolist()
+        for num in np.random.randint(1, 20, len_a)
+    ]
+    ann_info_list_a = []
+    for _ in range(len_a):
+        height = np.random.randint(10, 30)
+        weight = np.random.randint(10, 30)
+        img = np.ones((height, weight, 3))
+        gt_bbox = np.concatenate([
+            np.random.randint(1, 5, (2, 2)),
+            np.random.randint(1, 5, (2, 2)) + 5
+        ],
+                                 axis=1)
+        gt_labels = np.random.randint(0, 80, 2)
+        ann_info_list_a.append(
+            dict(gt_bboxes=gt_bbox, gt_labels=gt_labels, img=img))
+    dataset_a.data_infos = MagicMock()
+    dataset_a.data_infos.__len__.return_value = len_a
+    dataset_a.get_cat_ids = MagicMock(
+        side_effect=lambda idx: cat_ids_list_a[idx])
+    dataset_a.get_ann_info = MagicMock(
+        side_effect=lambda idx: ann_info_list_a[idx])
+    dataset_b = CustomDataset(
+        ann_file=MagicMock(), pipeline=[], test_mode=True, img_prefix='')
+    len_b = 20
+    cat_ids_list_b = [
+        np.random.randint(0, 80, num).tolist()
+        for num in np.random.randint(1, 20, len_b)
+    ]
+    ann_info_list_b = []
+    for _ in range(len_b):
+        height = np.random.randint(10, 30)
+        weight = np.random.randint(10, 30)
+        img = np.ones((height, weight, 3))
+        gt_bbox = np.concatenate([
+            np.random.randint(1, 5, (2, 2)),
+            np.random.randint(1, 5, (2, 2)) + 5
+        ],
+                                 axis=1)
+        gt_labels = np.random.randint(0, 80, 2)
+        ann_info_list_b.append(
+            dict(gt_bboxes=gt_bbox, gt_labels=gt_labels, img=img))
+    dataset_b.data_infos = MagicMock()
+    dataset_b.data_infos.__len__.return_value = len_b
+    dataset_b.get_cat_ids = MagicMock(
+        side_effect=lambda idx: cat_ids_list_b[idx])
+    dataset_b.get_ann_info = MagicMock(
+        side_effect=lambda idx: ann_info_list_b[idx])
+
+    concat_dataset = ConcatDataset([dataset_a, dataset_b])
+    assert concat_dataset[5] == 5
+    assert concat_dataset[25] == 15
+    assert concat_dataset.get_cat_ids(5) == cat_ids_list_a[5]
+    assert concat_dataset.get_cat_ids(25) == cat_ids_list_b[15]
+    assert concat_dataset.get_ann_info(5) == ann_info_list_a[5]
+    assert concat_dataset.get_ann_info(25) == ann_info_list_b[15]
+    assert len(concat_dataset) == len(dataset_a) + len(dataset_b)
+
+    # Test if ConcatDataset allows dataset classes without the PALETTE
+    # attribute
+    palette_backup = CustomDataset.PALETTE
+    delattr(CustomDataset, 'PALETTE')
+    concat_dataset = ConcatDataset([dataset_a, dataset_b])
+    assert concat_dataset.PALETTE is None
+    CustomDataset.PALETTE = palette_backup
+
+    repeat_dataset = RepeatDataset(dataset_a, 10)
+    assert repeat_dataset[5] == 5
+    assert repeat_dataset[15] == 5
+    assert repeat_dataset[27] == 7
+    assert repeat_dataset.get_cat_ids(5) == cat_ids_list_a[5]
+    assert repeat_dataset.get_cat_ids(15) == cat_ids_list_a[5]
+    assert repeat_dataset.get_cat_ids(27) == cat_ids_list_a[7]
+    assert repeat_dataset.get_ann_info(5) == ann_info_list_a[5]
+    assert repeat_dataset.get_ann_info(15) == ann_info_list_a[5]
+    assert repeat_dataset.get_ann_info(27) == ann_info_list_a[7]
+    assert len(repeat_dataset) == 10 * len(dataset_a)
+
+    # Test if RepeatDataset allows dataset classes without the PALETTE
+    # attribute
+    delattr(CustomDataset, 'PALETTE')
+    repeat_dataset = RepeatDataset(dataset_a, 10)
+    assert repeat_dataset.PALETTE is None
+    CustomDataset.PALETTE = palette_backup
+
+    category_freq = defaultdict(int)
+    for cat_ids in cat_ids_list_a:
+        cat_ids = set(cat_ids)
+        for cat_id in cat_ids:
+            category_freq[cat_id] += 1
+    for k, v in category_freq.items():
+        category_freq[k] = v / len(cat_ids_list_a)
+
+    mean_freq = np.mean(list(category_freq.values()))
+    repeat_thr = mean_freq
+
+    category_repeat = {
+        cat_id: max(1.0, math.sqrt(repeat_thr / cat_freq))
+        for cat_id, cat_freq in category_freq.items()
+    }
+
+    repeat_factors = []
+    for cat_ids in cat_ids_list_a:
+        cat_ids = set(cat_ids)
+        repeat_factor = max({category_repeat[cat_id] for cat_id in cat_ids})
+        repeat_factors.append(math.ceil(repeat_factor))
+    repeat_factors_cumsum = np.cumsum(repeat_factors)
+    repeat_factor_dataset = ClassBalancedDataset(dataset_a, repeat_thr)
+    assert len(repeat_factor_dataset) == repeat_factors_cumsum[-1]
+    for idx in np.random.randint(0, len(repeat_factor_dataset), 3):
+        assert repeat_factor_dataset[idx] == bisect.bisect_right(
+            repeat_factors_cumsum, idx)
+        assert repeat_factor_dataset.get_ann_info(idx) == ann_info_list_a[
+            bisect.bisect_right(repeat_factors_cumsum, idx)]
+    # Test if ClassBalancedDataset allows dataset classes without the PALETTE
+    # attribute
+    delattr(CustomDataset, 'PALETTE')
+    repeat_factor_dataset = ClassBalancedDataset(dataset_a, repeat_thr)
+    assert repeat_factor_dataset.PALETTE is None
+    CustomDataset.PALETTE = palette_backup
+
+    img_scale = (60, 60)
+    pipeline = [
+        dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
+        dict(
+            type='RandomAffine',
+            scaling_ratio_range=(0.1, 2),
+            border=(-img_scale[0] // 2, -img_scale[1] // 2)),
+        dict(
+            type='MixUp',
+            img_scale=img_scale,
+            ratio_range=(0.8, 1.6),
+            pad_val=114.0),
+        dict(type='RandomFlip', flip_ratio=0.5),
+        dict(type='Resize', img_scale=img_scale, keep_ratio=True),
+        dict(type='Pad', pad_to_square=True, pad_val=114.0),
+    ]
+
+    CustomDataset.load_annotations = MagicMock()
+    results = []
+    for _ in range(2):
+        height = np.random.randint(10, 30)
+        weight = np.random.randint(10, 30)
+        img = np.ones((height, weight, 3))
+        gt_bbox = np.concatenate([
+            np.random.randint(1, 5, (2, 2)),
+            np.random.randint(1, 5, (2, 2)) + 5
+        ],
+                                 axis=1)
+        gt_labels = np.random.randint(0, 80, 2)
+        results.append(dict(gt_bboxes=gt_bbox, gt_labels=gt_labels, img=img))
+
+    CustomDataset.__getitem__ = MagicMock(side_effect=lambda idx: results[idx])
+    dataset_a = CustomDataset(
+        ann_file=MagicMock(), pipeline=[], test_mode=True, img_prefix='')
+    len_a = 2
+    cat_ids_list_a = [
+        np.random.randint(0, 80, num).tolist()
+        for num in np.random.randint(1, 20, len_a)
+    ]
+    dataset_a.data_infos = MagicMock()
+    dataset_a.data_infos.__len__.return_value = len_a
+    dataset_a.get_cat_ids = MagicMock(
+        side_effect=lambda idx: cat_ids_list_a[idx])
+
+    # test dynamic_scale deprecated
+    with pytest.raises(RuntimeError):
+        MultiImageMixDataset(dataset_a, pipeline, (80, 80))
+
+    multi_image_mix_dataset = MultiImageMixDataset(dataset_a, pipeline)
+    for idx in range(len_a):
+        results_ = multi_image_mix_dataset[idx]
+        assert results_['img'].shape == (img_scale[0], img_scale[1], 3)
+
+    # test skip_type_keys
+    multi_image_mix_dataset = MultiImageMixDataset(
+        dataset_a,
+        pipeline,
+        skip_type_keys=('MixUp', 'RandomFlip', 'Resize', 'Pad'))
+    for idx in range(len_a):
+        results_ = multi_image_mix_dataset[idx]
+        assert results_['img'].shape == (img_scale[0], img_scale[1], 3)
+
+    # Test if MultiImageMixDataset allows dataset classes without the PALETTE
+    # attribute
+    delattr(CustomDataset, 'PALETTE')
+    multi_image_mix_dataset = MultiImageMixDataset(dataset_a, pipeline)
+    assert multi_image_mix_dataset.PALETTE is None
+    CustomDataset.PALETTE = palette_backup
diff --git a/tests/test_data/test_datasets/test_objects365.py b/tests/test_data/test_datasets/test_objects365.py
new file mode 100755
index 0000000..7445188
--- /dev/null
+++ b/tests/test_data/test_datasets/test_objects365.py
@@ -0,0 +1,155 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import tempfile
+
+import mmcv
+import pytest
+
+from mmdet.datasets import Objects365V1Dataset, Objects365V2Dataset
+
+
+def _create_objects365_json(json_name):
+    images = [{
+        'file_name': 'fake1.jpg',
+        'height': 800,
+        'width': 800,
+        'id': 0
+    }, {
+        'file_name': 'fake2.jpg',
+        'height': 800,
+        'width': 800,
+        'id': 1
+    }, {
+        'file_name': 'patch16/objects365_v2_00908726.jpg',
+        'height': 800,
+        'width': 800,
+        'id': 2
+    }]
+
+    annotations = [{
+        'bbox': [0, 0, 20, 20],
+        'area': 400.00,
+        'score': 1.0,
+        'category_id': 1,
+        'id': 1,
+        'image_id': 0
+    }, {
+        'bbox': [0, 0, 20, 20],
+        'area': 400.00,
+        'score': 1.0,
+        'category_id': 2,
+        'id': 2,
+        'image_id': 0
+    }, {
+        'bbox': [0, 0, 20, 20],
+        'area': 400.00,
+        'score': 1.0,
+        'category_id': 1,
+        'id': 3,
+        'image_id': 1
+    }, {
+        'bbox': [0, 0, 20, 20],
+        'area': 400.00,
+        'score': 1.0,
+        'category_id': 1,
+        'id': 4,
+        'image_id': 2
+    }]
+
+    categories = [{
+        'id': 1,
+        'name': 'bus',
+        'supercategory': 'none'
+    }, {
+        'id': 2,
+        'name': 'car',
+        'supercategory': 'none'
+    }]
+
+    fake_json = {
+        'images': images,
+        'annotations': annotations,
+        'categories': categories
+    }
+    print(fake_json)
+    mmcv.dump(fake_json, json_name)
+
+
+def _create_ids_error_coco_json(json_name):
+    image = {
+        'id': 0,
+        'width': 640,
+        'height': 640,
+        'file_name': 'fake_name.jpg',
+    }
+
+    annotation_1 = {
+        'id': 1,
+        'image_id': 0,
+        'category_id': 0,
+        'area': 400,
+        'bbox': [50, 60, 20, 20],
+        'iscrowd': 0,
+    }
+
+    annotation_2 = {
+        'id': 1,
+        'image_id': 0,
+        'category_id': 0,
+        'area': 900,
+        'bbox': [100, 120, 30, 30],
+        'iscrowd': 0,
+    }
+
+    categories = [{
+        'id': 0,
+        'name': 'car',
+        'supercategory': 'car',
+    }]
+
+    fake_json = {
+        'images': [image],
+        'annotations': [annotation_1, annotation_2],
+        'categories': categories
+    }
+    mmcv.dump(fake_json, json_name)
+
+
+@pytest.mark.parametrize('datasets',
+                         [Objects365V1Dataset, Objects365V2Dataset])
+def test_annotation_ids_unique(datasets):
+    tmp_dir = tempfile.TemporaryDirectory()
+    fake_json_file = osp.join(tmp_dir.name, 'fake_data.json')
+    _create_ids_error_coco_json(fake_json_file)
+
+    # test annotation ids not unique error
+    with pytest.raises(AssertionError):
+        datasets(ann_file=fake_json_file, classes=('car', ), pipeline=[])
+
+    tmp_dir.cleanup()
+
+
+def test_load_objects365v1_annotations():
+    tmp_dir = tempfile.TemporaryDirectory()
+    fake_json_file = osp.join(tmp_dir.name, 'fake_data.json')
+    _create_objects365_json(fake_json_file)
+
+    dataset = Objects365V1Dataset(
+        ann_file=fake_json_file, classes=('bus', 'car'), pipeline=[])
+
+    # The Objects365V1Dataset does not filter the `objv2_ignore_list`
+    assert len(dataset.data_infos) == 3
+    tmp_dir.cleanup()
+
+
+def test_load_objects365v2_annotations():
+    tmp_dir = tempfile.TemporaryDirectory()
+    fake_json_file = osp.join(tmp_dir.name, 'fake_data.json')
+    _create_objects365_json(fake_json_file)
+
+    dataset = Objects365V2Dataset(
+        ann_file=fake_json_file, classes=('bus', 'car'), pipeline=[])
+
+    # The Objects365V2Dataset needs to filter the `objv2_ignore_list`
+    assert len(dataset.data_infos) == 2
+    tmp_dir.cleanup()
diff --git a/tests/test_data/test_datasets/test_openimages_dataset.py b/tests/test_data/test_datasets/test_openimages_dataset.py
new file mode 100755
index 0000000..af87e96
--- /dev/null
+++ b/tests/test_data/test_datasets/test_openimages_dataset.py
@@ -0,0 +1,367 @@
+import csv
+import os.path as osp
+import tempfile
+
+import mmcv
+import numpy as np
+import pytest
+
+from mmdet.datasets import OpenImagesChallengeDataset, OpenImagesDataset
+
+
+def _create_ids_error_oid_csv(
+    label_file,
+    fake_csv_file,
+):
+    label_description = ['/m/000002', 'Football']
+    # `newline=''` is used to avoid an out-of-bounds index error
+    # on Windows systems
+    with open(label_file, 'w', newline='') as f:
+        f_csv = csv.writer(f)
+        f_csv.writerow(label_description)
+
+    header = [
+        'ImageID', 'Source', 'LabelName', 'Confidence', 'XMin', 'XMax', 'YMin',
+        'YMax', 'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction',
+        'IsInside'
+    ]
+    annotations = [[
+        'color', 'xclick', '/m/000002', '1', '0.022673031', '0.9642005',
+        '0.07103825', '0.80054647', '0', '0', '0', '0', '0'
+    ],
+                   [
+                       '000595fe6fee6369', 'xclick', '/m/000000', '1', '0',
+                       '1', '0', '1', '0', '0', '1', '0', '0'
+                   ]]
+    # `newline=''` is used to avoid an out-of-bounds index error
+    # on Windows systems
+    with open(fake_csv_file, 'w', newline='') as f:
+        f_csv = csv.writer(f)
+        f_csv.writerow(header)
+        f_csv.writerows(annotations)
+
+
+def _create_oid_style_ann(label_file, csv_file, label_level_file):
+    label_description = [['/m/000000', 'Sports equipment'],
+                         ['/m/000001', 'Ball'], ['/m/000002', 'Football'],
+                         ['/m/000004', 'Bicycle']]
+    with open(label_file, 'w', newline='') as f:
+        f_csv = csv.writer(f)
+        f_csv.writerows(label_description)
+
+    header = [
+        'ImageID', 'Source', 'LabelName', 'Confidence', 'XMin', 'XMax', 'YMin',
+        'YMax', 'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction',
+        'IsInside'
+    ]
+    annotations = [
+        [
+            'color', 'xclick', '/m/000002', 1, 0.0333333, 0.1, 0.0333333, 0.1,
+            0, 0, 1, 0, 0
+        ],
+        [
+            'color', 'xclick', '/m/000002', 1, 0.1, 0.166667, 0.1, 0.166667, 0,
+            0, 0, 0, 0
+        ],
+    ]
+    # `newline=''` is used to avoid an out-of-bounds index error
+    # on Windows systems
+    with open(csv_file, 'w', newline='') as f:
+        f_csv = csv.writer(f)
+        f_csv.writerow(header)
+        f_csv.writerows(annotations)
+
+    header = ['ImageID', 'Source', 'LabelName', 'Confidence']
+    annotations = [['color', 'xclick', '/m/000002', '1'],
+                   ['color', 'xclick', '/m/000004', '0']]
+    # `newline=''` is used to avoid an out-of-bounds index error
+    # on Windows systems
+    with open(label_level_file, 'w', newline='') as f:
+        f_csv = csv.writer(f)
+        f_csv.writerow(header)
+        f_csv.writerows(annotations)
+
+
+def _create_hierarchy_json(hierarchy_name):
+    fake_hierarchy = \
+        {'LabelName':  '/m/0bl9f',      # entity label
+         'Subcategory': [
+             {
+                 'LabelName': '/m/000000',
+                 'Subcategory':
+                     [
+                         {'LabelName': '/m/000001',
+                          'Subcategory':
+                              [
+                                  {
+                                      'LabelName': '/m/000002'
+                                  }
+                              ]
+                          },
+                         {
+                             'LabelName': '/m/000004'
+                         }
+                     ]
+             }
+         ]
+         }
+
+    mmcv.dump(fake_hierarchy, hierarchy_name)
+
+
+def _create_hierarchy_np(hierarchy_name):
+    fake_hierarchy = np.array([[0, 1, 0, 0, 0], [0, 1, 1, 0,
+                                                 0], [0, 1, 1, 1, 0],
+                               [0, 1, 0, 0, 1], [0, 0, 0, 0, 0]])
+    with open(hierarchy_name, 'wb') as f:
+        np.save(f, fake_hierarchy)
+
+
+def _create_dummy_results():
+    boxes = [
+        np.zeros((0, 5)),
+        np.zeros((0, 5)),
+        np.array([[10, 10, 15, 15, 1.0], [15, 15, 30, 30, 0.98],
+                  [10, 10, 25, 25, 0.98], [28, 28, 35, 35, 0.97],
+                  [30, 30, 51, 51, 0.96], [100, 110, 120, 130, 0.15]]),
+        np.array([[30, 30, 50, 50, 0.51]]),
+    ]
+    return [boxes]
+
+
+def _creat_oid_challenge_style_ann(txt_file, label_file, label_level_file):
+    bboxes = [
+        'validation/color.jpg\n',
+        '4 29\n',
+        '2\n',
+        '1 0.0333333 0.1 0.0333333 0.1 1\n',
+        '1 0.1 0.166667 0.1 0.166667 0\n',
+    ]
+    # `newline=''` is used to avoid an out-of-bounds index error
+    # on Windows systems
+    with open(txt_file, 'w', newline='') as f:
+        f.writelines(bboxes)
+        f.close()
+
+    label_description = [['/m/000000', 'Sports equipment', 1],
+                         ['/m/000001', 'Ball', 2],
+                         ['/m/000002', 'Football', 3],
+                         ['/m/000004', 'Bicycle', 4]]
+    # `newline=''` is used to avoid an out-of-bounds index error
+    # on Windows systems
+    with open(label_file, 'w', newline='') as f:
+        f_csv = csv.writer(f)
+        f_csv.writerows(label_description)
+
+    header = ['ImageID', 'LabelName', 'Confidence']
+    annotations = [['color', '/m/000001', '1'], ['color', '/m/000000', '0']]
+    # `newline=''` is used to avoid an out-of-bounds index error
+    # on Windows systems
+    with open(label_level_file, 'w', newline='') as f:
+        f_csv = csv.writer(f)
+        f_csv.writerow(header)
+        f_csv.writerows(annotations)
+
+
+def _create_metas(meta_file):
+
+    fake_meta = [{
+        'filename': 'data/OpenImages/OpenImages/validation/color.jpg',
+        'ori_shape': (300, 300, 3)
+    }]
+    mmcv.dump(fake_meta, meta_file)
+
+
+def test_oid_annotation_ids_unique():
+    # create fake ann files
+    tmp_dir = tempfile.TemporaryDirectory()
+    fake_label_file = osp.join(tmp_dir.name, 'fake_label.csv')
+    fake_ann_file = osp.join(tmp_dir.name, 'fake_ann.csv')
+    _create_ids_error_oid_csv(fake_label_file, fake_ann_file)
+
+    # test annotation ids not unique error
+    with pytest.raises(AssertionError):
+        OpenImagesDataset(
+            ann_file=fake_ann_file, label_file=fake_label_file, pipeline=[])
+    tmp_dir.cleanup()
+
+
+def test_openimages_dataset():
+    # create fake ann files
+    tmp_dir = tempfile.TemporaryDirectory()
+    label_file = osp.join(tmp_dir.name, 'label_file.csv')
+    ann_file = osp.join(tmp_dir.name, 'ann_file.csv')
+    label_level_file = osp.join(tmp_dir.name, 'label_level_file.csv')
+    _create_oid_style_ann(label_file, ann_file, label_level_file)
+
+    hierarchy_json = osp.join(tmp_dir.name, 'hierarchy.json')
+    _create_hierarchy_json(hierarchy_json)
+
+    # test that hierarchy_file must not be None when
+    # get_parent_classes is True
+    with pytest.raises(AssertionError):
+        OpenImagesDataset(
+            ann_file=ann_file,
+            label_file=label_file,
+            image_level_ann_file=label_level_file,
+            pipeline=[])
+
+    dataset = OpenImagesDataset(
+        ann_file=ann_file,
+        label_file=label_file,
+        image_level_ann_file=label_level_file,
+        hierarchy_file=hierarchy_json,
+        pipeline=[])
+    ann = dataset.get_ann_info(0)
+    # two legal detection bboxes with `group_of` parameter
+    assert ann['bboxes'].shape[0] == ann['labels'].shape[0] == \
+           ann['gt_is_group_ofs'].shape[0] == 2
+
+    # test load metas from pipeline
+    img_norm_cfg = dict(
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        to_rgb=True)
+    test_pipeline = [
+        dict(type='LoadImageFromFile'),
+        dict(
+            type='MultiScaleFlipAug',
+            img_scale=(128, 128),
+            flip=False,
+            transforms=[
+                dict(type='Resize', keep_ratio=True),
+                dict(type='RandomFlip'),
+                dict(type='Normalize', **img_norm_cfg),
+                dict(type='Pad', size_divisor=32),
+                dict(type='ImageToTensor', keys=['img']),
+                dict(type='Collect', keys=['img']),
+            ])
+    ]
+    dataset = OpenImagesDataset(
+        ann_file=ann_file,
+        img_prefix='tests/data',
+        label_file=label_file,
+        image_level_ann_file=label_level_file,
+        load_from_file=False,
+        hierarchy_file=hierarchy_json,
+        pipeline=test_pipeline)
+    dataset.prepare_test_img(0)
+    assert len(dataset.test_img_metas) == 1
+    result = _create_dummy_results()
+    dataset.evaluate(result)
+
+    # test get hierarchy for classes
+    hierarchy_json = osp.join(tmp_dir.name, 'hierarchy.json')
+    _create_hierarchy_json(hierarchy_json)
+
+    # test with hierarchy file wrong suffix
+    with pytest.raises(AssertionError):
+        fake_path = osp.join(tmp_dir.name, 'hierarchy.csv')
+        OpenImagesDataset(
+            ann_file=ann_file,
+            img_prefix='tests/data',
+            label_file=label_file,
+            image_level_ann_file=label_level_file,
+            load_from_file=False,
+            hierarchy_file=fake_path,
+            pipeline=test_pipeline)
+
+    # test load hierarchy file successfully
+    hierarchy = dataset.get_relation_matrix(hierarchy_json)
+    hierarchy_gt = np.array([[1, 0, 0, 0], [1, 1, 0, 0], [1, 1, 1, 0],
+                             [1, 0, 0, 1]])
+    assert np.equal(hierarchy, hierarchy_gt).all()
+
+    # test evaluation
+    # create fake metas
+    meta_file = osp.join(tmp_dir.name, 'meta.pkl')
+    _create_metas(meta_file)
+
+    dataset = OpenImagesDataset(
+        ann_file=ann_file,
+        label_file=label_file,
+        image_level_ann_file=label_level_file,
+        hierarchy_file=hierarchy_json,
+        meta_file=meta_file,
+        pipeline=[])
+    # test evaluation using group_of, adding parent classes to
+    # GT and annotations, and considering image-level labels.
+    # In the first label (Sports equipment): tp = [0, 1, 0, 0, 1],
+    # fp = [1, 0, 1, 1, 0]
+    # In the second label (Ball), tp = [0, 1, 0, 1], fp = [1, 0, 1, 0].
+    # In the third label (Football), tp = [0, 1, 0, 1], fp = [1, 0, 1, 0].
+    # In the fourth label (Bicycle), tp = [0], fp = [1].
+    result = _create_dummy_results()
+    parsed_results = dataset.evaluate(result)
+    assert np.isclose(parsed_results['mAP'], 0.8333, 1e-4)
+
+    dataset = OpenImagesDataset(
+        ann_file=ann_file,
+        label_file=label_file,
+        load_image_level_labels=False,
+        image_level_ann_file=label_level_file,
+        hierarchy_file=hierarchy_json,
+        meta_file=meta_file,
+        pipeline=[])
+
+    # test evaluation using group_of, adding parent classes to
+    # GT and annotations, and not considering image-level labels.
+    # In the first label (Sports equipment): tp = [0, 1, 0, 0, 1],
+    # fp = [1, 0, 1, 1, 0]
+    # In the second label (Ball), tp = [0, 1, 0, 1], fp = [1, 0, 1, 0].
+    # In the third label (Football), tp = [0, 1, 0, 1], fp = [1, 0, 1, 0].
+    # In the fourth label (Bicycle), tp = [], fp = [].
+    result = _create_dummy_results()
+    parsed_results = dataset.evaluate(result)
+    assert np.isclose(parsed_results['mAP'], 0.8333, 1e-4)
+    tmp_dir.cleanup()
+
+
+def test_openimages_challenge_dataset():
+    # create fake ann files
+    tmp_dir = tempfile.TemporaryDirectory()
+    ann_file = osp.join(tmp_dir.name, 'ann_file.txt')
+    label_file = osp.join(tmp_dir.name, 'label_file.csv')
+    label_level_file = osp.join(tmp_dir.name, 'label_level_file.csv')
+    _creat_oid_challenge_style_ann(ann_file, label_file, label_level_file)
+
+    dataset = OpenImagesChallengeDataset(
+        ann_file=ann_file,
+        label_file=label_file,
+        load_image_level_labels=False,
+        get_supercategory=False,
+        pipeline=[])
+    ann = dataset.get_ann_info(0)
+
+    # two legal detection bboxes with `group_of` parameter
+    assert ann['bboxes'].shape[0] == ann['labels'].shape[0] == \
+           ann['gt_is_group_ofs'].shape[0] == 2
+
+    dataset.prepare_train_img(0)
+    dataset.prepare_test_img(0)
+
+    meta_file = osp.join(tmp_dir.name, 'meta.pkl')
+    _create_metas(meta_file)
+
+    result = _create_dummy_results()
+    with pytest.raises(AssertionError):
+        fake_json = osp.join(tmp_dir.name, 'hierarchy.json')
+        OpenImagesChallengeDataset(
+            ann_file=ann_file,
+            label_file=label_file,
+            image_level_ann_file=label_level_file,
+            hierarchy_file=fake_json,
+            meta_file=meta_file,
+            pipeline=[])
+
+    hierarchy_file = osp.join(tmp_dir.name, 'hierarchy.np')
+    _create_hierarchy_np(hierarchy_file)
+    dataset = OpenImagesChallengeDataset(
+        ann_file=ann_file,
+        label_file=label_file,
+        image_level_ann_file=label_level_file,
+        hierarchy_file=hierarchy_file,
+        meta_file=meta_file,
+        pipeline=[])
+    dataset.evaluate(result)
+    tmp_dir.cleanup()
diff --git a/tests/test_data/test_datasets/test_panoptic_dataset.py b/tests/test_data/test_datasets/test_panoptic_dataset.py
new file mode 100755
index 0000000..376270d
--- /dev/null
+++ b/tests/test_data/test_datasets/test_panoptic_dataset.py
@@ -0,0 +1,456 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import tempfile
+
+import mmcv
+import numpy as np
+
+from mmdet.core import encode_mask_results
+from mmdet.datasets.api_wrappers import pq_compute_single_core
+from mmdet.datasets.coco_panoptic import INSTANCE_OFFSET, CocoPanopticDataset
+
+try:
+    from panopticapi.utils import id2rgb
+except ImportError:
+    id2rgb = None
+
+
+def _create_panoptic_style_json(json_name):
+    image1 = {
+        'id': 0,
+        'width': 640,
+        'height': 640,
+        'file_name': 'fake_name1.jpg',
+    }
+
+    image2 = {
+        'id': 1,
+        'width': 640,
+        'height': 800,
+        'file_name': 'fake_name2.jpg',
+    }
+
+    images = [image1, image2]
+
+    annotations = [
+        {
+            'segments_info': [{
+                'id': 1,
+                'category_id': 0,
+                'area': 400,
+                'bbox': [50, 60, 20, 20],
+                'iscrowd': 0
+            }, {
+                'id': 2,
+                'category_id': 1,
+                'area': 900,
+                'bbox': [100, 120, 30, 30],
+                'iscrowd': 0
+            }, {
+                'id': 3,
+                'category_id': 2,
+                'iscrowd': 0,
+                'bbox': [1, 189, 612, 285],
+                'area': 70036
+            }],
+            'file_name':
+            'fake_name1.jpg',
+            'image_id':
+            0
+        },
+        {
+            'segments_info': [
+                {
+                    # Different to instance style json, there
+                    # are duplicate ids in panoptic style json
+                    'id': 1,
+                    'category_id': 0,
+                    'area': 400,
+                    'bbox': [50, 60, 20, 20],
+                    'iscrowd': 0
+                },
+                {
+                    'id': 4,
+                    'category_id': 1,
+                    'area': 900,
+                    'bbox': [100, 120, 30, 30],
+                    'iscrowd': 1
+                },
+                {
+                    'id': 5,
+                    'category_id': 2,
+                    'iscrowd': 0,
+                    'bbox': [100, 200, 200, 300],
+                    'area': 66666
+                },
+                {
+                    'id': 6,
+                    'category_id': 0,
+                    'iscrowd': 0,
+                    'bbox': [1, 189, -10, 285],
+                    'area': 70036
+                }
+            ],
+            'file_name':
+            'fake_name2.jpg',
+            'image_id':
+            1
+        }
+    ]
+
+    categories = [{
+        'id': 0,
+        'name': 'car',
+        'supercategory': 'car',
+        'isthing': 1
+    }, {
+        'id': 1,
+        'name': 'person',
+        'supercategory': 'person',
+        'isthing': 1
+    }, {
+        'id': 2,
+        'name': 'wall',
+        'supercategory': 'wall',
+        'isthing': 0
+    }]
+
+    fake_json = {
+        'images': images,
+        'annotations': annotations,
+        'categories': categories
+    }
+    mmcv.dump(fake_json, json_name)
+
+    return fake_json
+
+
+def test_load_panoptic_style_json():
+    tmp_dir = tempfile.TemporaryDirectory()
+    fake_json_file = osp.join(tmp_dir.name, 'fake_data.json')
+    fake_json = _create_panoptic_style_json(fake_json_file)
+
+    dataset = CocoPanopticDataset(
+        ann_file=fake_json_file,
+        classes=[cat['name'] for cat in fake_json['categories']],
+        pipeline=[])
+
+    ann = dataset.get_ann_info(0)
+
+    # two legal instances
+    assert ann['bboxes'].shape[0] == ann['labels'].shape[0] == 2
+    # three masks for both foreground and background
+    assert len(ann['masks']) == 3
+
+    ann = dataset.get_ann_info(1)
+
+    # one legal instance, one illegal instance,
+    # one crowd instance and one background mask
+    assert ann['bboxes'].shape[0] == ann['labels'].shape[0] == 1
+    assert ann['bboxes_ignore'].shape[0] == 1
+    assert len(ann['masks']) == 3
+
+
+def _create_panoptic_gt_annotations(ann_file):
+    categories = [{
+        'id': 0,
+        'name': 'person',
+        'supercategory': 'person',
+        'isthing': 1
+    }, {
+        'id': 1,
+        'name': 'dog',
+        'supercategory': 'dog',
+        'isthing': 1
+    }, {
+        'id': 2,
+        'name': 'wall',
+        'supercategory': 'wall',
+        'isthing': 0
+    }]
+
+    images = [{
+        'id': 0,
+        'width': 80,
+        'height': 60,
+        'file_name': 'fake_name1.jpg',
+    }]
+
+    annotations = [{
+        'segments_info': [{
+            'id': 1,
+            'category_id': 0,
+            'area': 400,
+            'bbox': [10, 10, 10, 40],
+            'iscrowd': 0
+        }, {
+            'id': 2,
+            'category_id': 0,
+            'area': 400,
+            'bbox': [30, 10, 10, 40],
+            'iscrowd': 0
+        }, {
+            'id': 3,
+            'category_id': 1,
+            'iscrowd': 0,
+            'bbox': [50, 10, 10, 5],
+            'area': 50
+        }, {
+            'id': 4,
+            'category_id': 2,
+            'iscrowd': 0,
+            'bbox': [0, 0, 80, 60],
+            'area': 3950
+        }],
+        'file_name':
+        'fake_name1.png',
+        'image_id':
+        0
+    }]
+
+    gt_json = {
+        'images': images,
+        'annotations': annotations,
+        'categories': categories
+    }
+
+    # 4 is the id of the background class annotation.
+    gt = np.zeros((60, 80), dtype=np.int64) + 4
+    gt_bboxes = np.array([[10, 10, 10, 40], [30, 10, 10, 40], [50, 10, 10, 5]],
+                         dtype=np.int64)
+    for i in range(3):
+        x, y, w, h = gt_bboxes[i]
+        gt[y:y + h, x:x + w] = i + 1  # id starts from 1
+
+    gt = id2rgb(gt).astype(np.uint8)
+    img_path = osp.join(osp.dirname(ann_file), 'fake_name1.png')
+    mmcv.imwrite(gt[:, :, ::-1], img_path)
+
+    mmcv.dump(gt_json, ann_file)
+    return gt_json
+
+
+def test_panoptic_evaluation():
+    if id2rgb is None:
+        return
+
+    # TP for background class, IoU=3576/4324=0.827
+    # 2 is the category id of the background class
+    pred = np.zeros((60, 80), dtype=np.int64) + 2
+    pred_bboxes = np.array(
+        [
+            [11, 11, 10, 40],  # TP IoU=351/449=0.78
+            [38, 10, 10, 40],  # FP
+            [51, 10, 10, 5]  # TP IoU=45/55=0.818
+        ],
+        dtype=np.int64)
+    pred_labels = np.array([0, 0, 1], dtype=np.int64)
+    for i in range(3):
+        x, y, w, h = pred_bboxes[i]
+        pred[y:y + h, x:x + w] = (i + 1) * INSTANCE_OFFSET + pred_labels[i]
+
+    tmp_dir = tempfile.TemporaryDirectory()
+    ann_file = osp.join(tmp_dir.name, 'panoptic.json')
+    gt_json = _create_panoptic_gt_annotations(ann_file)
+
+    results = [{'pan_results': pred}]
+
+    dataset = CocoPanopticDataset(
+        ann_file=ann_file,
+        seg_prefix=tmp_dir.name,
+        classes=[cat['name'] for cat in gt_json['categories']],
+        pipeline=[])
+
+    # For 'person', sq = 0.78 / 1, rq = 1 / (1 tp + 0.5 * (1 fn + 1 fp)) = 0.5
+    # For 'dog', sq = 0.818, rq = 1 / 1
+    # For 'wall', sq = 0.827, rq = 1 / 1
+    # Here are the results for all classes:
+    # +--------+--------+--------+---------+------------+
+    # |        | PQ     | SQ     | RQ      | categories |
+    # +--------+--------+--------+---------+------------+
+    # | All    | 67.869 | 80.898 | 83.333  |      3     |
+    # | Things | 60.453 | 79.996 | 75.000  |      2     |
+    # | Stuff  | 82.701 | 82.701 | 100.000 |      1     |
+    # +--------+--------+--------+---------+------------+
+    parsed_results = dataset.evaluate(results)
+    assert np.isclose(parsed_results['PQ'], 67.869)
+    assert np.isclose(parsed_results['SQ'], 80.898)
+    assert np.isclose(parsed_results['RQ'], 83.333)
+    assert np.isclose(parsed_results['PQ_th'], 60.453)
+    assert np.isclose(parsed_results['SQ_th'], 79.996)
+    assert np.isclose(parsed_results['RQ_th'], 75.000)
+    assert np.isclose(parsed_results['PQ_st'], 82.701)
+    assert np.isclose(parsed_results['SQ_st'], 82.701)
+    assert np.isclose(parsed_results['RQ_st'], 100.000)
+
+    # test jsonfile_prefix
+    outfile_prefix = osp.join(tmp_dir.name, 'results')
+    parsed_results = dataset.evaluate(results, jsonfile_prefix=outfile_prefix)
+    assert np.isclose(parsed_results['PQ'], 67.869)
+    assert np.isclose(parsed_results['SQ'], 80.898)
+    assert np.isclose(parsed_results['RQ'], 83.333)
+    assert np.isclose(parsed_results['PQ_th'], 60.453)
+    assert np.isclose(parsed_results['SQ_th'], 79.996)
+    assert np.isclose(parsed_results['RQ_th'], 75.000)
+    assert np.isclose(parsed_results['PQ_st'], 82.701)
+    assert np.isclose(parsed_results['SQ_st'], 82.701)
+    assert np.isclose(parsed_results['RQ_st'], 100.000)
+
+    # test classwise
+    parsed_results = dataset.evaluate(results, classwise=True)
+    assert np.isclose(parsed_results['PQ'], 67.869)
+    assert np.isclose(parsed_results['SQ'], 80.898)
+    assert np.isclose(parsed_results['RQ'], 83.333)
+    assert np.isclose(parsed_results['PQ_th'], 60.453)
+    assert np.isclose(parsed_results['SQ_th'], 79.996)
+    assert np.isclose(parsed_results['RQ_th'], 75.000)
+    assert np.isclose(parsed_results['PQ_st'], 82.701)
+    assert np.isclose(parsed_results['SQ_st'], 82.701)
+    assert np.isclose(parsed_results['RQ_st'], 100.000)
+
+    # test the api wrapper of `pq_compute_single_core`
+    # The code below is adapted from `coco_panoptic.py`
+    result_files, _ = dataset.format_results(
+        results, jsonfile_prefix=outfile_prefix)
+
+    imgs = dataset.coco.imgs
+    gt_json = dataset.coco.img_ann_map  # image to annotations
+    gt_json = [{
+        'image_id': k,
+        'segments_info': v,
+        'file_name': imgs[k]['segm_file']
+    } for k, v in gt_json.items()]
+    pred_json = mmcv.load(result_files['panoptic'])
+    pred_json = dict((el['image_id'], el) for el in pred_json['annotations'])
+
+    # match the gt_anns and pred_anns in the same image
+    matched_annotations_list = []
+    for gt_ann in gt_json:
+        img_id = gt_ann['image_id']
+        matched_annotations_list.append((gt_ann, pred_json[img_id]))
+    gt_folder = dataset.seg_prefix
+    pred_folder = osp.join(osp.dirname(outfile_prefix), 'panoptic')
+
+    pq_stat = pq_compute_single_core(0, matched_annotations_list, gt_folder,
+                                     pred_folder, dataset.categories)
+    pq_all = pq_stat.pq_average(dataset.categories, isthing=None)[0]
+    assert np.isclose(pq_all['pq'] * 100, 67.869)
+    assert np.isclose(pq_all['sq'] * 100, 80.898)
+    assert np.isclose(pq_all['rq'] * 100, 83.333)
+    assert pq_all['n'] == 3
+
+
+def _create_instance_segmentation_gt_annotations(ann_file):
+    categories = [{
+        'id': 0,
+        'name': 'person',
+        'supercategory': 'person',
+        'isthing': 1
+    }, {
+        'id': 1,
+        'name': 'dog',
+        'supercategory': 'dog',
+        'isthing': 1
+    }, {
+        'id': 2,
+        'name': 'wall',
+        'supercategory': 'wall',
+        'isthing': 0
+    }]
+
+    images = [{
+        'id': 0,
+        'width': 80,
+        'height': 60,
+        'file_name': 'fake_name1.jpg',
+    }]
+
+    person1_polygon = [10, 10, 20, 10, 20, 50, 10, 50, 10, 10]
+    person2_polygon = [30, 10, 40, 10, 40, 50, 30, 50, 30, 10]
+    dog_polygon = [50, 10, 60, 10, 60, 15, 50, 15, 50, 10]
+
+    annotations = [
+        {
+            'id': 0,
+            'image_id': 0,
+            'category_id': 0,
+            'segmentation': [person1_polygon],
+            'area': 400,
+            'bbox': [10, 10, 10, 40],
+            'iscrowd': 0
+        },
+        {
+            'id': 1,
+            'image_id': 0,
+            'category_id': 0,
+            'segmentation': [person2_polygon],
+            'area': 400,
+            'bbox': [30, 10, 10, 40],
+            'iscrowd': 0
+        },
+        {
+            'id': 2,
+            'image_id': 0,
+            'category_id': 1,
+            'segmentation': [dog_polygon],
+            'area': 50,
+            'bbox': [50, 10, 10, 5],
+            'iscrowd': 0
+        },
+    ]
+
+    gt_json = {
+        'images': images,
+        'annotations': annotations,
+        'categories': categories
+    }
+
+    mmcv.dump(gt_json, ann_file)
+
+
+def test_instance_segmentation_evaluation():
+    pred_bbox = [
+        np.array([[11, 10, 20, 50, 0.8], [31, 10, 40, 50, 0.8]]),
+        np.array([[51, 10, 60, 15, 0.7]])
+    ]
+
+    person1_mask = np.zeros((60, 80), dtype=bool)
+    person1_mask[20:50, 11:20] = True
+    person2_mask = np.zeros((60, 80), dtype=bool)
+    person2_mask[20:50, 31:40] = True
+    dog_mask = np.zeros((60, 80), dtype=bool)
+    dog_mask[10:15, 51:60] = True
+
+    pred_mask = [[person1_mask, person2_mask], [
+        dog_mask,
+    ]]
+    results = [{'ins_results': (pred_bbox, encode_mask_results(pred_mask))}]
+
+    tmp_dir = tempfile.TemporaryDirectory()
+    pan_ann_file = osp.join(tmp_dir.name, 'panoptic.json')
+    ins_ann_file = osp.join(tmp_dir.name, 'instance.json')
+    _create_panoptic_gt_annotations(pan_ann_file)
+    _create_instance_segmentation_gt_annotations(ins_ann_file)
+
+    dataset = CocoPanopticDataset(
+        ann_file=pan_ann_file,
+        ins_ann_file=ins_ann_file,
+        seg_prefix=tmp_dir.name,
+        pipeline=[])
+    dataset.THING_CLASSES = ['person', 'dog']
+    dataset.STUFF_CLASSES = ['wall']
+    dataset.CLASSES = dataset.THING_CLASSES + dataset.STUFF_CLASSES
+    parsed_results = dataset.evaluate(results, metric=['segm', 'bbox'])
+
+    # Here are the results for instance segmentation:
+    # {
+    #     'segm_mAP': 0.5005, 'segm_mAP_50': 0.626, 'segm_mAP_75': 0.5,
+    #     'segm_mAP_s': 0.5, 'segm_mAP_m': -1.0, 'segm_mAP_l': -1.0,
+    #     'segm_mAP_copypaste': '0.500 0.626 0.500 0.500 -1.000 -1.000',
+    #     'bbox_mAP': 0.5636, 'bbox_mAP_50': 0.626, 'bbox_mAP_75': 0.626,
+    #     'bbox_mAP_s': 0.564, 'bbox_mAP_m': -1.0, 'bbox_mAP_l': -1.0,
+    #     'bbox_mAP_copypaste': '0.564 0.626 0.626 0.564 -1.000 -1.000'
+    # }
+
+    assert np.isclose(parsed_results['segm_mAP'], 0.5005)
+    assert np.isclose(parsed_results['bbox_mAP'], 0.5636)
diff --git a/tests/test_data/test_datasets/test_xml_dataset.py b/tests/test_data/test_datasets/test_xml_dataset.py
new file mode 100755
index 0000000..f72f13d
--- /dev/null
+++ b/tests/test_data/test_datasets/test_xml_dataset.py
@@ -0,0 +1,23 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+
+from mmdet.datasets import DATASETS
+
+
+def test_xml_dataset():
+    dataconfig = {
+        'ann_file': 'data/VOCdevkit/VOC2007/ImageSets/Main/test.txt',
+        'img_prefix': 'data/VOCdevkit/VOC2007/',
+        'pipeline': [{
+            'type': 'LoadImageFromFile'
+        }]
+    }
+    XMLDataset = DATASETS.get('XMLDataset')
+
+    class XMLDatasetSubClass(XMLDataset):
+        CLASSES = None
+
+    # get_ann_info and _filter_imgs of XMLDataset use self.CLASSES, so
+    # building the dataset with CLASSES = None must raise AssertionError
+    with pytest.raises(AssertionError):
+        XMLDatasetSubClass(**dataconfig)
diff --git a/tests/test_data/test_pipelines/test_formatting.py b/tests/test_data/test_pipelines/test_formatting.py
new file mode 100755
index 0000000..2e22898
--- /dev/null
+++ b/tests/test_data/test_pipelines/test_formatting.py
@@ -0,0 +1,24 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+
+from mmcv.utils import build_from_cfg
+
+from mmdet.datasets.builder import PIPELINES
+
+
+def test_default_format_bundle():
+    results = dict(
+        img_prefix=osp.join(osp.dirname(__file__), '../../data'),
+        img_info=dict(filename='color.jpg'))
+    load = dict(type='LoadImageFromFile')
+    load = build_from_cfg(load, PIPELINES)
+    bundle = dict(type='DefaultFormatBundle')
+    bundle = build_from_cfg(bundle, PIPELINES)
+    results = load(results)
+    assert 'pad_shape' not in results
+    assert 'scale_factor' not in results
+    assert 'img_norm_cfg' not in results
+    results = bundle(results)
+    assert 'pad_shape' in results
+    assert 'scale_factor' in results
+    assert 'img_norm_cfg' in results
diff --git a/tests/test_data/test_pipelines/test_loading.py b/tests/test_data/test_pipelines/test_loading.py
new file mode 100755
index 0000000..27ecccf
--- /dev/null
+++ b/tests/test_data/test_pipelines/test_loading.py
@@ -0,0 +1,132 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os.path as osp
+
+import mmcv
+import numpy as np
+import pytest
+
+from mmdet.core.mask import BitmapMasks, PolygonMasks
+from mmdet.datasets.pipelines import (FilterAnnotations, LoadImageFromFile,
+                                      LoadImageFromWebcam,
+                                      LoadMultiChannelImageFromFiles)
+
+
+class TestLoading:
+
+    @classmethod
+    def setup_class(cls):
+        cls.data_prefix = osp.join(osp.dirname(__file__), '../../data')
+
+    def test_load_img(self):
+        results = dict(
+            img_prefix=self.data_prefix, img_info=dict(filename='color.jpg'))
+        transform = LoadImageFromFile()
+        results = transform(copy.deepcopy(results))
+        assert results['filename'] == osp.join(self.data_prefix, 'color.jpg')
+        assert results['ori_filename'] == 'color.jpg'
+        assert results['img'].shape == (288, 512, 3)
+        assert results['img'].dtype == np.uint8
+        assert results['img_shape'] == (288, 512, 3)
+        assert results['ori_shape'] == (288, 512, 3)
+        assert repr(transform) == transform.__class__.__name__ + \
+            "(to_float32=False, color_type='color', channel_order='bgr', " + \
+            "file_client_args={'backend': 'disk'})"
+
+        # no img_prefix
+        results = dict(
+            img_prefix=None, img_info=dict(filename='tests/data/color.jpg'))
+        transform = LoadImageFromFile()
+        results = transform(copy.deepcopy(results))
+        assert results['filename'] == 'tests/data/color.jpg'
+        assert results['ori_filename'] == 'tests/data/color.jpg'
+        assert results['img'].shape == (288, 512, 3)
+
+        # to_float32
+        transform = LoadImageFromFile(to_float32=True)
+        results = transform(copy.deepcopy(results))
+        assert results['img'].dtype == np.float32
+
+        # gray image
+        results = dict(
+            img_prefix=self.data_prefix, img_info=dict(filename='gray.jpg'))
+        transform = LoadImageFromFile()
+        results = transform(copy.deepcopy(results))
+        assert results['img'].shape == (288, 512, 3)
+        assert results['img'].dtype == np.uint8
+
+        transform = LoadImageFromFile(color_type='unchanged')
+        results = transform(copy.deepcopy(results))
+        assert results['img'].shape == (288, 512)
+        assert results['img'].dtype == np.uint8
+
+    def test_load_multi_channel_img(self):
+        results = dict(
+            img_prefix=self.data_prefix,
+            img_info=dict(filename=['color.jpg', 'color.jpg']))
+        transform = LoadMultiChannelImageFromFiles()
+        results = transform(copy.deepcopy(results))
+        assert results['filename'] == [
+            osp.join(self.data_prefix, 'color.jpg'),
+            osp.join(self.data_prefix, 'color.jpg')
+        ]
+        assert results['ori_filename'] == ['color.jpg', 'color.jpg']
+        assert results['img'].shape == (288, 512, 3, 2)
+        assert results['img'].dtype == np.uint8
+        assert results['img_shape'] == (288, 512, 3, 2)
+        assert results['ori_shape'] == (288, 512, 3, 2)
+        assert results['pad_shape'] == (288, 512, 3, 2)
+        assert results['scale_factor'] == 1.0
+        assert repr(transform) == transform.__class__.__name__ + \
+            "(to_float32=False, color_type='unchanged', " + \
+            "file_client_args={'backend': 'disk'})"
+
+    def test_load_webcam_img(self):
+        img = mmcv.imread(osp.join(self.data_prefix, 'color.jpg'))
+        results = dict(img=img)
+        transform = LoadImageFromWebcam()
+        results = transform(copy.deepcopy(results))
+        assert results['filename'] is None
+        assert results['ori_filename'] is None
+        assert results['img'].shape == (288, 512, 3)
+        assert results['img'].dtype == np.uint8
+        assert results['img_shape'] == (288, 512, 3)
+        assert results['ori_shape'] == (288, 512, 3)
+
+
+def _build_filter_annotations_args():
+    kwargs = (dict(min_gt_bbox_wh=(100, 100)),
+              dict(min_gt_bbox_wh=(100, 100), keep_empty=False),
+              dict(min_gt_bbox_wh=(1, 1)), dict(min_gt_bbox_wh=(.01, .01)),
+              dict(min_gt_bbox_wh=(.01, .01),
+                   by_mask=True), dict(by_mask=True),
+              dict(by_box=False, by_mask=True))
+    targets = (None, 0, 1, 2, 1, 1, 1)
+
+    return list(zip(targets, kwargs))
+
+
+@pytest.mark.parametrize('target, kwargs', _build_filter_annotations_args())
+def test_filter_annotations(target, kwargs):
+    filter_ann = FilterAnnotations(**kwargs)
+    bboxes = np.array([[2., 10., 4., 14.], [2., 10., 2.1, 10.1]])
+    raw_masks = np.zeros((2, 24, 24))
+    raw_masks[0, 10:14, 2:4] = 1
+    bitmap_masks = BitmapMasks(raw_masks, 24, 24)
+    results = dict(gt_bboxes=bboxes, gt_masks=bitmap_masks)
+    results = filter_ann(results)
+    if results is not None:
+        results = results['gt_bboxes'].shape[0]
+    assert results == target
+
+    polygons = [[np.array([2.0, 10.0, 4.0, 10.0, 4.0, 14.0, 2.0, 14.0])],
+                [np.array([2.0, 10.0, 2.1, 10.0, 2.1, 10.1, 2.0, 10.1])]]
+    polygon_masks = PolygonMasks(polygons, 24, 24)
+
+    results = dict(gt_bboxes=bboxes, gt_masks=polygon_masks)
+    results = filter_ann(results)
+
+    if results is not None:
+        results = len(results.get('gt_masks').masks)
+
+    assert results == target
diff --git a/tests/test_data/test_pipelines/test_sampler.py b/tests/test_data/test_pipelines/test_sampler.py
new file mode 100755
index 0000000..8ff9398
--- /dev/null
+++ b/tests/test_data/test_pipelines/test_sampler.py
@@ -0,0 +1,329 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet.core.bbox.assigners import MaxIoUAssigner
+from mmdet.core.bbox.samplers import (OHEMSampler, RandomSampler,
+                                      ScoreHLRSampler)
+
+
+def test_random_sampler():
+    assigner = MaxIoUAssigner(
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        ignore_iof_thr=0.5,
+        ignore_wrt_candidates=False,
+    )
+    bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 9],
+        [0, 10, 10, 19],
+    ])
+    gt_labels = torch.LongTensor([1, 2])
+    gt_bboxes_ignore = torch.Tensor([
+        [30, 30, 40, 40],
+    ])
+    assign_result = assigner.assign(
+        bboxes,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_bboxes_ignore,
+        gt_labels=gt_labels)
+
+    sampler = RandomSampler(
+        num=10, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=True)
+
+    sample_result = sampler.sample(assign_result, bboxes, gt_bboxes, gt_labels)
+
+    assert len(sample_result.pos_bboxes) == len(sample_result.pos_inds)
+    assert len(sample_result.neg_bboxes) == len(sample_result.neg_inds)
+
+
+def test_random_sampler_empty_gt():
+    assigner = MaxIoUAssigner(
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        ignore_iof_thr=0.5,
+        ignore_wrt_candidates=False,
+    )
+    bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+    gt_bboxes = torch.empty(0, 4)
+    gt_labels = torch.empty(0, ).long()
+    assign_result = assigner.assign(bboxes, gt_bboxes, gt_labels=gt_labels)
+
+    sampler = RandomSampler(
+        num=10, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=True)
+
+    sample_result = sampler.sample(assign_result, bboxes, gt_bboxes, gt_labels)
+
+    assert len(sample_result.pos_bboxes) == len(sample_result.pos_inds)
+    assert len(sample_result.neg_bboxes) == len(sample_result.neg_inds)
+
+
+def test_random_sampler_empty_pred():
+    assigner = MaxIoUAssigner(
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        ignore_iof_thr=0.5,
+        ignore_wrt_candidates=False,
+    )
+    bboxes = torch.empty(0, 4)
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 9],
+        [0, 10, 10, 19],
+    ])
+    gt_labels = torch.LongTensor([1, 2])
+    assign_result = assigner.assign(bboxes, gt_bboxes, gt_labels=gt_labels)
+
+    sampler = RandomSampler(
+        num=10, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=True)
+
+    sample_result = sampler.sample(assign_result, bboxes, gt_bboxes, gt_labels)
+
+    assert len(sample_result.pos_bboxes) == len(sample_result.pos_inds)
+    assert len(sample_result.neg_bboxes) == len(sample_result.neg_inds)
+
+
+def _context_for_ohem():
+    import sys
+    from os.path import dirname
+    sys.path.insert(0, dirname(dirname(dirname(__file__))))
+    from test_models.test_forward import _get_detector_cfg
+
+    model = _get_detector_cfg(
+        'faster_rcnn/faster_rcnn_r50_fpn_ohem_1x_coco.py')
+    model['pretrained'] = None
+
+    from mmdet.models import build_detector
+    context = build_detector(model).roi_head
+    return context
+
+
+def test_ohem_sampler():
+
+    assigner = MaxIoUAssigner(
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        ignore_iof_thr=0.5,
+        ignore_wrt_candidates=False,
+    )
+    bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 9],
+        [0, 10, 10, 19],
+    ])
+    gt_labels = torch.LongTensor([1, 2])
+    gt_bboxes_ignore = torch.Tensor([
+        [30, 30, 40, 40],
+    ])
+    assign_result = assigner.assign(
+        bboxes,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_bboxes_ignore,
+        gt_labels=gt_labels)
+
+    context = _context_for_ohem()
+
+    sampler = OHEMSampler(
+        num=10,
+        pos_fraction=0.5,
+        context=context,
+        neg_pos_ub=-1,
+        add_gt_as_proposals=True)
+
+    feats = [torch.rand(1, 256, int(2**i), int(2**i)) for i in [6, 5, 4, 3, 2]]
+    sample_result = sampler.sample(
+        assign_result, bboxes, gt_bboxes, gt_labels, feats=feats)
+
+    assert len(sample_result.pos_bboxes) == len(sample_result.pos_inds)
+    assert len(sample_result.neg_bboxes) == len(sample_result.neg_inds)
+
+
+def test_ohem_sampler_empty_gt():
+
+    assigner = MaxIoUAssigner(
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        ignore_iof_thr=0.5,
+        ignore_wrt_candidates=False,
+    )
+    bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+    gt_bboxes = torch.empty(0, 4)
+    gt_labels = torch.LongTensor([])
+    gt_bboxes_ignore = torch.Tensor([])
+    assign_result = assigner.assign(
+        bboxes,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_bboxes_ignore,
+        gt_labels=gt_labels)
+
+    context = _context_for_ohem()
+
+    sampler = OHEMSampler(
+        num=10,
+        pos_fraction=0.5,
+        context=context,
+        neg_pos_ub=-1,
+        add_gt_as_proposals=True)
+
+    feats = [torch.rand(1, 256, int(2**i), int(2**i)) for i in [6, 5, 4, 3, 2]]
+
+    sample_result = sampler.sample(
+        assign_result, bboxes, gt_bboxes, gt_labels, feats=feats)
+
+    assert len(sample_result.pos_bboxes) == len(sample_result.pos_inds)
+    assert len(sample_result.neg_bboxes) == len(sample_result.neg_inds)
+
+
+def test_ohem_sampler_empty_pred():
+    assigner = MaxIoUAssigner(
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        ignore_iof_thr=0.5,
+        ignore_wrt_candidates=False,
+    )
+    bboxes = torch.empty(0, 4)
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+    gt_labels = torch.LongTensor([1, 2, 2, 3])
+    gt_bboxes_ignore = torch.Tensor([])
+    assign_result = assigner.assign(
+        bboxes,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_bboxes_ignore,
+        gt_labels=gt_labels)
+
+    context = _context_for_ohem()
+
+    sampler = OHEMSampler(
+        num=10,
+        pos_fraction=0.5,
+        context=context,
+        neg_pos_ub=-1,
+        add_gt_as_proposals=True)
+
+    feats = [torch.rand(1, 256, int(2**i), int(2**i)) for i in [6, 5, 4, 3, 2]]
+
+    sample_result = sampler.sample(
+        assign_result, bboxes, gt_bboxes, gt_labels, feats=feats)
+
+    assert len(sample_result.pos_bboxes) == len(sample_result.pos_inds)
+    assert len(sample_result.neg_bboxes) == len(sample_result.neg_inds)
+
+
+def test_random_sample_result():
+    from mmdet.core.bbox.samplers.sampling_result import SamplingResult
+    SamplingResult.random(num_gts=0, num_preds=0)
+    SamplingResult.random(num_gts=0, num_preds=3)
+    SamplingResult.random(num_gts=3, num_preds=3)
+    SamplingResult.random(num_gts=0, num_preds=3)
+    SamplingResult.random(num_gts=7, num_preds=7)
+    SamplingResult.random(num_gts=7, num_preds=64)
+    SamplingResult.random(num_gts=24, num_preds=3)
+
+    for i in range(3):
+        SamplingResult.random(rng=i)
+
+
+def test_score_hlr_sampler_empty_pred():
+    assigner = MaxIoUAssigner(
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        ignore_iof_thr=0.5,
+        ignore_wrt_candidates=False,
+    )
+    context = _context_for_ohem()
+    sampler = ScoreHLRSampler(
+        num=10,
+        pos_fraction=0.5,
+        context=context,
+        neg_pos_ub=-1,
+        add_gt_as_proposals=True)
+    gt_bboxes_ignore = torch.Tensor([])
+    feats = [torch.rand(1, 256, int(2**i), int(2**i)) for i in [6, 5, 4, 3, 2]]
+
+    # empty bbox
+    bboxes = torch.empty(0, 4)
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+    gt_labels = torch.LongTensor([1, 2, 2, 3])
+    assign_result = assigner.assign(
+        bboxes,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_bboxes_ignore,
+        gt_labels=gt_labels)
+    sample_result, _ = sampler.sample(
+        assign_result, bboxes, gt_bboxes, gt_labels, feats=feats)
+    assert len(sample_result.neg_inds) == 0
+    assert len(sample_result.pos_bboxes) == len(sample_result.pos_inds)
+    assert len(sample_result.neg_bboxes) == len(sample_result.neg_inds)
+
+    # empty gt
+    bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+    gt_bboxes = torch.empty(0, 4)
+    gt_labels = torch.LongTensor([])
+    assign_result = assigner.assign(
+        bboxes,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_bboxes_ignore,
+        gt_labels=gt_labels)
+    sample_result, _ = sampler.sample(
+        assign_result, bboxes, gt_bboxes, gt_labels, feats=feats)
+    assert len(sample_result.pos_inds) == 0
+    assert len(sample_result.pos_bboxes) == len(sample_result.pos_inds)
+    assert len(sample_result.neg_bboxes) == len(sample_result.neg_inds)
+
+    # non-empty input
+    bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+    gt_labels = torch.LongTensor([1, 2, 2, 3])
+    assign_result = assigner.assign(
+        bboxes,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_bboxes_ignore,
+        gt_labels=gt_labels)
+    sample_result, _ = sampler.sample(
+        assign_result, bboxes, gt_bboxes, gt_labels, feats=feats)
+    assert len(sample_result.pos_bboxes) == len(sample_result.pos_inds)
+    assert len(sample_result.neg_bboxes) == len(sample_result.neg_inds)
diff --git a/tests/test_data/test_pipelines/test_transform/__init__.py b/tests/test_data/test_pipelines/test_transform/__init__.py
new file mode 100755
index 0000000..d499031
--- /dev/null
+++ b/tests/test_data/test_pipelines/test_transform/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .utils import check_result_same, construct_toy_data, create_random_bboxes
+
+__all__ = ['create_random_bboxes', 'construct_toy_data', 'check_result_same']
diff --git a/tests/test_data/test_pipelines/test_transform/test_img_augment.py b/tests/test_data/test_pipelines/test_transform/test_img_augment.py
new file mode 100755
index 0000000..f28030e
--- /dev/null
+++ b/tests/test_data/test_pipelines/test_transform/test_img_augment.py
@@ -0,0 +1,175 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import mmcv
+import numpy as np
+from mmcv.utils import build_from_cfg
+from numpy.testing import assert_array_equal
+
+from mmdet.datasets.builder import PIPELINES
+from .utils import construct_toy_data
+
+
+def test_adjust_color():
+    results = construct_toy_data()
+    # test without aug
+    transform = dict(type='ColorTransform', prob=0, level=10)
+    transform_module = build_from_cfg(transform, PIPELINES)
+    results_transformed = transform_module(copy.deepcopy(results))
+    assert_array_equal(results_transformed['img'], results['img'])
+
+    # test with factor 1
+    img = results['img']
+    transform = dict(type='ColorTransform', prob=1, level=10)
+    transform_module = build_from_cfg(transform, PIPELINES)
+    results_transformed = transform_module(copy.deepcopy(results))
+    assert_array_equal(results_transformed['img'], img)
+
+    # test with factor 0
+    transform_module.factor = 0
+    img_gray = mmcv.bgr2gray(img.copy())
+    img_r = np.stack([img_gray, img_gray, img_gray], axis=-1)
+    results_transformed = transform_module(copy.deepcopy(results))
+    assert_array_equal(results_transformed['img'], img_r)
+
+    # test with factor 0.5
+    transform_module.factor = 0.5
+    results_transformed = transform_module(copy.deepcopy(results))
+    img = results['img']
+    assert_array_equal(
+        results_transformed['img'],
+        np.round(np.clip((img * 0.5 + img_r * 0.5), 0, 255)).astype(img.dtype))
+
+
+def test_imequalize(nb_rand_test=100):
+
+    def _imequalize(img):
+        # equalize the image using PIL.ImageOps.equalize
+        from PIL import Image, ImageOps
+        img = Image.fromarray(img)
+        equalized_img = np.asarray(ImageOps.equalize(img))
+        return equalized_img
+
+    results = construct_toy_data()
+    # test without aug
+    transform = dict(type='EqualizeTransform', prob=0)
+    transform_module = build_from_cfg(transform, PIPELINES)
+    results_transformed = transform_module(copy.deepcopy(results))
+    assert_array_equal(results_transformed['img'], results['img'])
+
+    # test equalize with case step=0
+    transform = dict(type='EqualizeTransform', prob=1.)
+    transform_module = build_from_cfg(transform, PIPELINES)
+    img = np.array([[0, 0, 0], [120, 120, 120], [255, 255, 255]],
+                   dtype=np.uint8)
+    img = np.stack([img, img, img], axis=-1)
+    results['img'] = img
+    results_transformed = transform_module(copy.deepcopy(results))
+    assert_array_equal(results_transformed['img'], img)
+
+    # test equalize with randomly sampled images.
+    for _ in range(nb_rand_test):
+        img = np.clip(np.random.uniform(0, 1, (1000, 1200, 3)) * 260, 0,
+                      255).astype(np.uint8)
+        results['img'] = img
+        results_transformed = transform_module(copy.deepcopy(results))
+        assert_array_equal(results_transformed['img'], _imequalize(img))
+
+
+def test_adjust_brightness(nb_rand_test=100):
+
+    def _adjust_brightness(img, factor):
+        # adjust the brightness of image using
+        # PIL.ImageEnhance.Brightness
+        from PIL import Image
+        from PIL.ImageEnhance import Brightness
+        img = Image.fromarray(img)
+        brightened_img = Brightness(img).enhance(factor)
+        return np.asarray(brightened_img)
+
+    results = construct_toy_data()
+    # test without aug
+    transform = dict(type='BrightnessTransform', level=10, prob=0)
+    transform_module = build_from_cfg(transform, PIPELINES)
+    results_transformed = transform_module(copy.deepcopy(results))
+    assert_array_equal(results_transformed['img'], results['img'])
+
+    # test case with factor 1.0
+    transform = dict(type='BrightnessTransform', level=10, prob=1.)
+    transform_module = build_from_cfg(transform, PIPELINES)
+    transform_module.factor = 1.0
+    results_transformed = transform_module(copy.deepcopy(results))
+    assert_array_equal(results_transformed['img'], results['img'])
+
+    # test case with factor 0.0
+    transform_module.factor = 0.0
+    results_transformed = transform_module(copy.deepcopy(results))
+    assert_array_equal(results_transformed['img'],
+                       np.zeros_like(results['img']))
+
+    # test with randomly sampled images and factors.
+    for _ in range(nb_rand_test):
+        img = np.clip(np.random.uniform(0, 1, (1000, 1200, 3)) * 260, 0,
+                      255).astype(np.uint8)
+        factor = np.random.uniform()
+        transform_module.factor = factor
+        results['img'] = img
+        np.testing.assert_allclose(
+            transform_module(copy.deepcopy(results))['img'].astype(np.int32),
+            _adjust_brightness(img, factor).astype(np.int32),
+            rtol=0,
+            atol=1)
+
+
+def test_adjust_contrast(nb_rand_test=100):
+
+    def _adjust_contrast(img, factor):
+        from PIL import Image
+        from PIL.ImageEnhance import Contrast
+
+        # Image.fromarray expects RGB channel order by default, not BGR,
+        # so convert from BGR to RGB first
+        img = Image.fromarray(img[..., ::-1], mode='RGB')
+        contrasted_img = Contrast(img).enhance(factor)
+        # convert from RGB to BGR
+        return np.asarray(contrasted_img)[..., ::-1]
+
+    results = construct_toy_data()
+    # test without aug
+    transform = dict(type='ContrastTransform', level=10, prob=0)
+    transform_module = build_from_cfg(transform, PIPELINES)
+    results_transformed = transform_module(copy.deepcopy(results))
+    assert_array_equal(results_transformed['img'], results['img'])
+
+    # test case with factor 1.0
+    transform = dict(type='ContrastTransform', level=10, prob=1.)
+    transform_module = build_from_cfg(transform, PIPELINES)
+    transform_module.factor = 1.0
+    results_transformed = transform_module(copy.deepcopy(results))
+    assert_array_equal(results_transformed['img'], results['img'])
+
+    # test case with factor 0.0
+    transform_module.factor = 0.0
+    results_transformed = transform_module(copy.deepcopy(results))
+    np.testing.assert_allclose(
+        results_transformed['img'],
+        _adjust_contrast(results['img'], 0.),
+        rtol=0,
+        atol=1)
+
+    # test adjust_contrast with randomly sampled images and factors.
+    for _ in range(nb_rand_test):
+        img = np.clip(np.random.uniform(0, 1, (1200, 1000, 3)) * 260, 0,
+                      255).astype(np.uint8)
+        factor = np.random.uniform()
+        transform_module.factor = factor
+        results['img'] = img
+        results_transformed = transform_module(copy.deepcopy(results))
+        # Note: the difference (at most 1) between PIL.ImageEnhance.Contrast
+        # and mmcv.adjust_contrast comes from how mmcv and PIL each convert
+        # a color image to a gray image.
+        np.testing.assert_allclose(
+            results_transformed['img'].astype(np.int32),
+            _adjust_contrast(results['img'], factor).astype(np.int32),
+            rtol=0,
+            atol=1)
diff --git a/tests/test_data/test_pipelines/test_transform/test_models_aug_test.py b/tests/test_data/test_pipelines/test_transform/test_models_aug_test.py
new file mode 100755
index 0000000..5eba1ef
--- /dev/null
+++ b/tests/test_data/test_pipelines/test_transform/test_models_aug_test.py
@@ -0,0 +1,131 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+
+import mmcv
+import torch
+from mmcv.parallel import collate
+from mmcv.utils import build_from_cfg
+
+from mmdet.datasets.builder import PIPELINES
+from mmdet.models import build_detector
+
+
+def model_aug_test_template(cfg_file):
+    """Build the detector from ``cfg_file`` and run it on 12 augmented views.
+
+    Returns the raw output of the model called with ``return_loss=False``.
+    """
+    cfg = mmcv.Config.fromfile(cfg_file)
+    # no pretrained weights and no training settings are needed here
+    cfg.model.pretrained = None
+    cfg.model.train_cfg = None
+    detector = build_detector(cfg.model)
+
+    # test pipeline holds two steps: the loader and the multi-scale aug
+    load_cfg, multi_scale_cfg = cfg.test_pipeline
+    multi_scale_cfg['flip'] = True
+    multi_scale_cfg['flip_direction'] = ['horizontal', 'vertical', 'diagonal']
+    multi_scale_cfg['img_scale'] = [(1333, 800), (800, 600), (640, 480)]
+    loader = build_from_cfg(load_cfg, PIPELINES)
+    aug_transform = build_from_cfg(multi_scale_cfg, PIPELINES)
+
+    data_root = osp.join(osp.dirname(__file__), '../../../data')
+    sample = dict(img_prefix=data_root, img_info=dict(filename='color.jpg'))
+    results = aug_transform(loader(sample))
+    # 3 scales x (no flip + 3 flip directions) = 12 views
+    assert len(results['img']) == 12
+    assert len(results['img_metas']) == 12
+
+    results['img'] = [collate([view]) for view in results['img']]
+    results['img_metas'] = [collate([m]).data[0] for m in results['img_metas']]
+    detector.eval()
+    with torch.no_grad():
+        return detector(return_loss=False, rescale=True, **results)
+
+
+def test_aug_test_size():
+    """MultiScaleFlipAug yields one image per (scale, flip) combination."""
+    results = dict(
+        img_prefix=osp.join(osp.dirname(__file__), '../../../data'),
+        img_info=dict(filename='color.jpg'))
+
+    # Define simple pipeline
+    load = dict(type='LoadImageFromFile')
+    load = build_from_cfg(load, PIPELINES)
+
+    # multi-scale flip augmentation without any inner transforms
+    transform = dict(
+        type='MultiScaleFlipAug',
+        transforms=[],
+        img_scale=[(1333, 800), (800, 600), (640, 480)],
+        flip=True,
+        flip_direction=['horizontal', 'vertical', 'diagonal'])
+    multi_aug_test_module = build_from_cfg(transform, PIPELINES)
+
+    results = multi_aug_test_module(load(results))
+    # len(["original", "horizontal", "vertical", "diagonal"]) *
+    # len([(1333, 800), (800, 600), (640, 480)])
+    assert len(results['img']) == 12
+
+
+def test_cascade_rcnn_aug_test():
+    """Aug-test Cascade R-CNN: the first result holds 80 entries."""
+    cfg_path = 'configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py'
+    assert len(model_aug_test_template(cfg_path)[0]) == 80
+
+
+def test_mask_rcnn_aug_test():
+    """Aug-test Mask R-CNN: first result is a pair of 80-entry sequences."""
+    cfg_path = 'configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py'
+    first_result = model_aug_test_template(cfg_path)[0]
+    assert len(first_result) == 2
+    assert len(first_result[0]) == 80 and len(first_result[1]) == 80
+
+
+def test_htc_aug_test():
+    """Aug-test HTC: the first result is a pair of 80-entry sequences."""
+    first = model_aug_test_template('configs/htc/htc_r50_fpn_1x_coco.py')[0]
+    assert len(first) == 2
+    assert len(first[0]) == 80 and len(first[1]) == 80
+
+
+def test_scnet_aug_test():
+    """Aug-test SCNet: the first result is a pair of 80-entry sequences."""
+    cfg_path = 'configs/scnet/scnet_r50_fpn_1x_coco.py'
+    first_result = model_aug_test_template(cfg_path)[0]
+    assert len(first_result) == 2
+    assert len(first_result[0]) == 80 and len(first_result[1]) == 80
+
+
+def test_cornernet_aug_test():
+    """Run CornerNet on 12 augmented views built from three scale factors."""
+    cfg = mmcv.Config.fromfile(
+        'configs/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco.py')
+    # no pretrained weights and no training settings are needed here
+    cfg.model.pretrained = None
+    cfg.model.train_cfg = None
+    detector = build_detector(cfg.model)
+
+    # test pipeline holds two steps: the loader and the multi-scale aug
+    load_cfg, multi_scale_cfg = cfg.test_pipeline
+    multi_scale_cfg['flip'] = True
+    multi_scale_cfg['flip_direction'] = ['horizontal', 'vertical', 'diagonal']
+    multi_scale_cfg['scale_factor'] = [0.5, 1.0, 2.0]
+    loader = build_from_cfg(load_cfg, PIPELINES)
+    aug_transform = build_from_cfg(multi_scale_cfg, PIPELINES)
+
+    data_root = osp.join(osp.dirname(__file__), '../../../data')
+    sample = dict(img_prefix=data_root, img_info=dict(filename='color.jpg'))
+    results = aug_transform(loader(sample))
+    # 3 scale factors x (no flip + 3 flip directions) = 12 views
+    assert len(results['img']) == 12
+    assert len(results['img_metas']) == 12
+
+    results['img'] = [collate([view]) for view in results['img']]
+    results['img_metas'] = [collate([m]).data[0] for m in results['img_metas']]
+
+    # aug test the model without tracking gradients
+    detector.eval()
+    with torch.no_grad():
+        aug_result = detector(return_loss=False, rescale=True, **results)
+    assert len(aug_result[0]) == 80
diff --git a/tests/test_data/test_pipelines/test_transform/test_rotate.py b/tests/test_data/test_pipelines/test_transform/test_rotate.py
new file mode 100755
index 0000000..93f7749
--- /dev/null
+++ b/tests/test_data/test_pipelines/test_transform/test_rotate.py
@@ -0,0 +1,172 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import numpy as np
+import pytest
+from mmcv.utils import build_from_cfg
+
+from mmdet.core.mask import BitmapMasks, PolygonMasks
+from mmdet.datasets.builder import PIPELINES
+from .utils import check_result_same, construct_toy_data
+
+
+def test_rotate():
+    """Check Rotate argument validation and its results on toy data."""
+    # test assertion for invalid type of max_rotate_angle
+    with pytest.raises(AssertionError):
+        transform = dict(type='Rotate', level=1, max_rotate_angle=(30, ))
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for invalid type of scale
+    with pytest.raises(AssertionError):
+        transform = dict(type='Rotate', level=2, scale=(1.2, ))
+        build_from_cfg(transform, PIPELINES)
+
+    # test ValueError for invalid type of img_fill_val
+    with pytest.raises(ValueError):
+        transform = dict(type='Rotate', level=2, img_fill_val=[128])
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for invalid number of elements in center
+    with pytest.raises(AssertionError):
+        transform = dict(type='Rotate', level=2, center=(0.5, ))
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for invalid type of center
+    with pytest.raises(AssertionError):
+        transform = dict(type='Rotate', level=2, center=[0, 0])
+        build_from_cfg(transform, PIPELINES)
+
+    # test case when no rotate aug (level=0)
+    results = construct_toy_data()
+    img_fill_val = (104, 116, 124)
+    seg_ignore_label = 255
+    transform = dict(
+        type='Rotate',
+        level=0,
+        prob=1.,
+        img_fill_val=img_fill_val,
+        seg_ignore_label=seg_ignore_label,
+    )
+    rotate_module = build_from_cfg(transform, PIPELINES)
+    results_wo_rotate = rotate_module(copy.deepcopy(results))
+    check_result_same(results, results_wo_rotate)
+
+    # test case when no rotate aug (prob<=0)
+    transform = dict(
+        type='Rotate', level=10, prob=0., img_fill_val=img_fill_val, scale=0.6)
+    rotate_module = build_from_cfg(transform, PIPELINES)
+    results_wo_rotate = rotate_module(copy.deepcopy(results))
+    check_result_same(results, results_wo_rotate)
+
+    # test clockwise rotation with angle 90
+    results = construct_toy_data()
+    img_fill_val = 128
+    transform = dict(
+        type='Rotate',
+        level=10,
+        max_rotate_angle=90,
+        img_fill_val=img_fill_val,
+        # set random_negative_prob to 0 for clockwise rotation
+        random_negative_prob=0.,
+        prob=1.)
+    rotate_module = build_from_cfg(transform, PIPELINES)
+    results_rotated = rotate_module(copy.deepcopy(results))
+    img_r = np.array([[img_fill_val, 6, 2, img_fill_val],
+                      [img_fill_val, 7, 3, img_fill_val]]).astype(np.uint8)
+    img_r = np.stack([img_r, img_r, img_r], axis=-1)
+    results_gt = copy.deepcopy(results)
+    results_gt['img'] = img_r
+    results_gt['gt_bboxes'] = np.array([[1., 0., 2., 1.]], dtype=np.float32)
+    results_gt['gt_bboxes_ignore'] = np.empty((0, 4), dtype=np.float32)
+    gt_masks = np.array([[0, 1, 1, 0], [0, 0, 1, 0]],
+                        dtype=np.uint8)[None, :, :]
+    results_gt['gt_masks'] = BitmapMasks(gt_masks, 2, 4)
+    results_gt['gt_semantic_seg'] = np.array(
+        [[255, 6, 2, 255], [255, 7, 3,
+                            255]]).astype(results['gt_semantic_seg'].dtype)
+    check_result_same(results_gt, results_rotated)
+
+    # test clockwise rotation with angle 90, PolygonMasks
+    results = construct_toy_data(poly2mask=False)
+    results_rotated = rotate_module(copy.deepcopy(results))
+    gt_masks = [[np.array([2, 0, 2, 1, 1, 1, 1, 0], dtype=np.float64)]]
+    results_gt['gt_masks'] = PolygonMasks(gt_masks, 2, 4)
+    check_result_same(results_gt, results_rotated)
+
+    # test counter-clockwise rotation with angle 90,
+    # and specify the rotation center
+    img_fill_val = (104, 116, 124)
+    transform = dict(
+        type='Rotate',
+        level=10,
+        max_rotate_angle=90,
+        center=(0, 0),
+        img_fill_val=img_fill_val,
+        # set random_negative_prob to 1 for counter-clockwise rotation
+        random_negative_prob=1.,
+        prob=1.)
+    results = construct_toy_data()
+    rotate_module = build_from_cfg(transform, PIPELINES)
+    results_rotated = rotate_module(copy.deepcopy(results))
+    results_gt = copy.deepcopy(results)
+    h, w = results['img'].shape[:2]
+    img_r = np.stack([
+        np.ones((h, w)) * img_fill_val[0],
+        np.ones((h, w)) * img_fill_val[1],
+        np.ones((h, w)) * img_fill_val[2]
+    ],
+                     axis=-1).astype(np.uint8)
+    img_r[0, 0, :] = 1
+    img_r[0, 1, :] = 5
+    results_gt['img'] = img_r
+    results_gt['gt_bboxes'] = np.empty((0, 4), dtype=np.float32)
+    results_gt['gt_bboxes_ignore'] = np.empty((0, 4), dtype=np.float32)
+    results_gt['gt_labels'] = np.empty((0, ), dtype=np.int64)
+    gt_masks = np.empty((0, h, w), dtype=np.uint8)
+    results_gt['gt_masks'] = BitmapMasks(gt_masks, h, w)
+    gt_seg = (np.ones((h, w)) * 255).astype(results['gt_semantic_seg'].dtype)
+    gt_seg[0, 0], gt_seg[0, 1] = 1, 5
+    results_gt['gt_semantic_seg'] = gt_seg
+    check_result_same(results_gt, results_rotated)
+
+    # same counter-clockwise rotation, with the center given as a single
+    # number instead of a tuple; the expected result does not change
+    transform = dict(
+        type='Rotate',
+        level=10,
+        max_rotate_angle=90,
+        center=(0),
+        img_fill_val=img_fill_val,
+        random_negative_prob=1.,
+        prob=1.)
+    rotate_module = build_from_cfg(transform, PIPELINES)
+    results_rotated = rotate_module(copy.deepcopy(results))
+    check_result_same(results_gt, results_rotated)
+
+    # test counter-clockwise rotation with angle 90,
+    # and specify the rotation center, PolygonMasks
+    results = construct_toy_data(poly2mask=False)
+    results_rotated = rotate_module(copy.deepcopy(results))
+    gt_masks = [[np.array([0, 0, 0, 0, 1, 0, 1, 0], dtype=np.float64)]]
+    results_gt['gt_masks'] = PolygonMasks(gt_masks, 2, 4)
+    check_result_same(results_gt, results_rotated)
+
+    # test AutoAugment equipped with Rotate
+    policies = [[dict(type='Rotate', level=10, prob=1.)]]
+    autoaug = dict(type='AutoAugment', policies=policies)
+    autoaug_module = build_from_cfg(autoaug, PIPELINES)
+    autoaug_module(copy.deepcopy(results))
+
+    policies = [[
+        dict(type='Rotate', level=10, prob=1.),
+        dict(
+            type='Rotate',
+            level=8,
+            max_rotate_angle=90,
+            center=(0),
+            img_fill_val=img_fill_val)
+    ]]
+    autoaug = dict(type='AutoAugment', policies=policies)
+    autoaug_module = build_from_cfg(autoaug, PIPELINES)
+    autoaug_module(copy.deepcopy(results))
diff --git a/tests/test_data/test_pipelines/test_transform/test_shear.py b/tests/test_data/test_pipelines/test_transform/test_shear.py
new file mode 100755
index 0000000..215d9a3
--- /dev/null
+++ b/tests/test_data/test_pipelines/test_transform/test_shear.py
@@ -0,0 +1,164 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import numpy as np
+import pytest
+from mmcv.utils import build_from_cfg
+
+from mmdet.core.mask import BitmapMasks, PolygonMasks
+from mmdet.datasets.builder import PIPELINES
+from .utils import check_result_same, construct_toy_data
+
+
+def test_shear():
+    """Check Shear argument validation and its results on toy data."""
+    # test assertion for invalid type of max_shear_magnitude
+    with pytest.raises(AssertionError):
+        transform = dict(type='Shear', level=1, max_shear_magnitude=(0.5, ))
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for invalid value of max_shear_magnitude
+    with pytest.raises(AssertionError):
+        transform = dict(type='Shear', level=2, max_shear_magnitude=1.2)
+        build_from_cfg(transform, PIPELINES)
+
+    # test ValueError for invalid type of img_fill_val
+    with pytest.raises(ValueError):
+        transform = dict(type='Shear', level=2, img_fill_val=[128])
+        build_from_cfg(transform, PIPELINES)
+
+    results = construct_toy_data()
+    # test case when no shear aug (level=0, direction='horizontal')
+    img_fill_val = (104, 116, 124)
+    seg_ignore_label = 255
+    transform = dict(
+        type='Shear',
+        level=0,
+        prob=1.,
+        img_fill_val=img_fill_val,
+        seg_ignore_label=seg_ignore_label,
+        direction='horizontal')
+    shear_module = build_from_cfg(transform, PIPELINES)
+    results_wo_shear = shear_module(copy.deepcopy(results))
+    check_result_same(results, results_wo_shear)
+
+    # test case when no shear aug (level=0, direction='vertical')
+    transform = dict(
+        type='Shear',
+        level=0,
+        prob=1.,
+        img_fill_val=img_fill_val,
+        seg_ignore_label=seg_ignore_label,
+        direction='vertical')
+    shear_module = build_from_cfg(transform, PIPELINES)
+    results_wo_shear = shear_module(copy.deepcopy(results))
+    check_result_same(results, results_wo_shear)
+
+    # test case when no shear aug (prob<=0)
+    transform = dict(
+        type='Shear',
+        level=10,
+        prob=0.,
+        img_fill_val=img_fill_val,
+        direction='vertical')
+    shear_module = build_from_cfg(transform, PIPELINES)
+    results_wo_shear = shear_module(copy.deepcopy(results))
+    check_result_same(results, results_wo_shear)
+
+    # test shear horizontally, magnitude=1
+    transform = dict(
+        type='Shear',
+        level=10,
+        prob=1.,
+        img_fill_val=img_fill_val,
+        direction='horizontal',
+        max_shear_magnitude=1.,
+        random_negative_prob=0.)
+    shear_module = build_from_cfg(transform, PIPELINES)
+    results_sheared = shear_module(copy.deepcopy(results))
+    results_gt = copy.deepcopy(results)
+    img_s = np.array([[1, 2, 3, 4], [0, 5, 6, 7]], dtype=np.uint8)
+    img_s = np.stack([img_s, img_s, img_s], axis=-1)
+    img_s[1, 0, :] = np.array(img_fill_val)
+    results_gt['img'] = img_s
+    results_gt['gt_bboxes'] = np.array([[0., 0., 3., 1.]], dtype=np.float32)
+    results_gt['gt_bboxes_ignore'] = np.array([[2., 0., 4., 1.]],
+                                              dtype=np.float32)
+    gt_masks = np.array([[0, 1, 1, 0], [0, 0, 1, 0]],
+                        dtype=np.uint8)[None, :, :]
+    results_gt['gt_masks'] = BitmapMasks(gt_masks, 2, 4)
+    results_gt['gt_semantic_seg'] = np.array(
+        [[1, 2, 3, 4], [255, 5, 6, 7]], dtype=results['gt_semantic_seg'].dtype)
+    check_result_same(results_gt, results_sheared)
+
+    # test PolygonMasks with shear horizontally, magnitude=1
+    results = construct_toy_data(poly2mask=False)
+    results_sheared = shear_module(copy.deepcopy(results))
+    gt_masks = [[np.array([0, 0, 2, 0, 3, 1, 1, 1], dtype=np.float64)]]
+    results_gt['gt_masks'] = PolygonMasks(gt_masks, 2, 4)
+    check_result_same(results_gt, results_sheared)
+
+    # test shear vertically, magnitude=-1
+    img_fill_val = 128
+    results = construct_toy_data()
+    transform = dict(
+        type='Shear',
+        level=10,
+        prob=1.,
+        img_fill_val=img_fill_val,
+        direction='vertical',
+        max_shear_magnitude=1.,
+        random_negative_prob=1.)
+    shear_module = build_from_cfg(transform, PIPELINES)
+    results_sheared = shear_module(copy.deepcopy(results))
+    results_gt = copy.deepcopy(results)
+    img_s = np.array([[1, 6, img_fill_val, img_fill_val],
+                      [5, img_fill_val, img_fill_val, img_fill_val]],
+                     dtype=np.uint8)
+    img_s = np.stack([img_s, img_s, img_s], axis=-1)
+    results_gt['img'] = img_s
+    results_gt['gt_bboxes'] = np.empty((0, 4), dtype=np.float32)
+    results_gt['gt_labels'] = np.empty((0, ), dtype=np.int64)
+    results_gt['gt_bboxes_ignore'] = np.empty((0, 4), dtype=np.float32)
+    gt_masks = np.array([[0, 1, 0, 0], [0, 0, 0, 0]],
+                        dtype=np.uint8)[None, :, :]
+    results_gt['gt_masks'] = BitmapMasks(gt_masks, 2, 4)
+    results_gt['gt_semantic_seg'] = np.array(
+        [[1, 6, 255, 255], [5, 255, 255, 255]],
+        dtype=results['gt_semantic_seg'].dtype)
+    check_result_same(results_gt, results_sheared)
+
+    # test PolygonMasks with shear vertically, magnitude=-1
+    results = construct_toy_data(poly2mask=False)
+    results_sheared = shear_module(copy.deepcopy(results))
+    gt_masks = [[np.array([0, 0, 2, 0, 2, 0, 0, 1], dtype=np.float64)]]
+    results_gt['gt_masks'] = PolygonMasks(gt_masks, 2, 4)
+    check_result_same(results_gt, results_sheared)
+
+    results = construct_toy_data()
+    # same mask for BitmapMasks and PolygonMasks
+    results['gt_masks'] = BitmapMasks(
+        np.array([[0, 1, 1, 0], [0, 1, 1, 0]], dtype=np.uint8)[None, :, :], 2,
+        4)
+    results['gt_bboxes'] = np.array([[1., 0., 2., 1.]], dtype=np.float32)
+    results_sheared_bitmap = shear_module(copy.deepcopy(results))
+    check_result_same(results_sheared_bitmap, results_sheared)
+
+    # test AutoAugment equipped with Shear
+    policies = [[dict(type='Shear', level=10, prob=1.)]]
+    autoaug = dict(type='AutoAugment', policies=policies)
+    autoaug_module = build_from_cfg(autoaug, PIPELINES)
+    autoaug_module(copy.deepcopy(results))
+
+    policies = [[
+        dict(type='Shear', level=10, prob=1.),
+        dict(
+            type='Shear',
+            level=8,
+            img_fill_val=img_fill_val,
+            direction='vertical',
+            max_shear_magnitude=1.)
+    ]]
+    autoaug = dict(type='AutoAugment', policies=policies)
+    autoaug_module = build_from_cfg(autoaug, PIPELINES)
+    autoaug_module(copy.deepcopy(results))
diff --git a/tests/test_data/test_pipelines/test_transform/test_transform.py b/tests/test_data/test_pipelines/test_transform/test_transform.py
new file mode 100755
index 0000000..1ebc4f3
--- /dev/null
+++ b/tests/test_data/test_pipelines/test_transform/test_transform.py
@@ -0,0 +1,1118 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os.path as osp
+
+import mmcv
+import numpy as np
+import pytest
+import torch
+from mmcv.utils import build_from_cfg
+
+from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
+from mmdet.datasets.builder import PIPELINES
+from .utils import create_full_masks, create_random_bboxes
+
+
+def test_resize():
+    """Check Resize argument validation and resizing of image/seg fields."""
+    # test assertion if img_scale is a list
+    with pytest.raises(AssertionError):
+        transform = dict(type='Resize', img_scale=[1333, 800], keep_ratio=True)
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion if several img_scale are given while ratio_range is set
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='Resize',
+            img_scale=[(1333, 800), (1333, 600)],
+            ratio_range=(0.9, 1.1),
+            keep_ratio=True)
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for invalid multiscale_mode
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='Resize',
+            img_scale=[(1333, 800), (1333, 600)],
+            keep_ratio=True,
+            multiscale_mode='2333')
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion if both scale and scale_factor are set
+    with pytest.raises(AssertionError):
+        results = dict(
+            img_prefix=osp.join(osp.dirname(__file__), '../../../data'),
+            img_info=dict(filename='color.jpg'))
+        load = dict(type='LoadImageFromFile')
+        load = build_from_cfg(load, PIPELINES)
+        transform = dict(type='Resize', img_scale=(1333, 800), keep_ratio=True)
+        transform = build_from_cfg(transform, PIPELINES)
+        results = load(results)
+        results['scale'] = (1333, 800)
+        results['scale_factor'] = 1.0
+        results = transform(results)
+
+    transform = dict(type='Resize', img_scale=(1333, 800), keep_ratio=True)
+    resize_module = build_from_cfg(transform, PIPELINES)
+
+    img = mmcv.imread(
+        osp.join(osp.dirname(__file__), '../../../data/color.jpg'), 'color')
+    results = dict(img=img)
+    results['img2'] = copy.deepcopy(img)
+    results['img_shape'] = img.shape
+    results['ori_shape'] = img.shape
+    # Set initial values for default meta_keys
+    results['pad_shape'] = img.shape
+    results['img_fields'] = ['img', 'img2']
+
+    results = resize_module(results)
+    assert np.equal(results['img'], results['img2']).all()
+
+    results.pop('scale')
+    results.pop('scale_factor')
+    transform = dict(
+        type='Resize',
+        img_scale=(1280, 800),
+        multiscale_mode='value',
+        keep_ratio=False)
+    resize_module = build_from_cfg(transform, PIPELINES)
+    results = resize_module(results)
+    assert np.equal(results['img'], results['img2']).all()
+    assert results['img_shape'] == (800, 1280, 3)
+    assert results['img'].dtype == results['img2'].dtype == np.uint8
+
+    results_seg = {
+        'img': img,
+        'img_shape': img.shape,
+        'ori_shape': img.shape,
+        'gt_semantic_seg': copy.deepcopy(img),
+        'gt_seg': copy.deepcopy(img),
+        'seg_fields': ['gt_semantic_seg', 'gt_seg']
+    }
+    transform = dict(
+        type='Resize',
+        img_scale=(640, 400),
+        multiscale_mode='value',
+        keep_ratio=False)
+    resize_module = build_from_cfg(transform, PIPELINES)
+    results_seg = resize_module(results_seg)
+    assert results_seg['gt_semantic_seg'].shape == results_seg['gt_seg'].shape
+    assert results_seg['img_shape'] == (400, 640, 3)
+    assert results_seg['img_shape'] != results_seg['ori_shape']
+    assert results_seg['gt_semantic_seg'].shape == results_seg['img_shape']
+    assert np.equal(results_seg['gt_semantic_seg'],
+                    results_seg['gt_seg']).all()
+
+
+def test_flip():
+    """Check RandomFlip argument validation and its results on an image.
+
+    Covers rejected configs, a double flip that must restore the image,
+    and single flips with a float or a list ``flip_ratio``.
+    """
+    # each config below must be rejected when the transform is built:
+    # invalid flip_ratio, sum(flip_ratio) outside [0, 1], mismatch between
+    # the number of flip_ratio and direction, and invalid direction
+    invalid_cfgs = [
+        dict(type='RandomFlip', flip_ratio=1.5),
+        dict(
+            type='RandomFlip',
+            flip_ratio=[0.7, 0.8],
+            direction=['horizontal', 'vertical']),
+        dict(type='RandomFlip', flip_ratio=[0.4, 0.5]),
+        dict(type='RandomFlip', flip_ratio=1., direction='horizonta'),
+    ]
+    for invalid_cfg in invalid_cfgs:
+        with pytest.raises(AssertionError):
+            build_from_cfg(invalid_cfg, PIPELINES)
+
+    img_path = osp.join(osp.dirname(__file__), '../../../data/color.jpg')
+
+    # flip twice with flip_ratio=1. and two image fields
+    always_flip_cfg = dict(type='RandomFlip', flip_ratio=1.)
+    first_flip = build_from_cfg(always_flip_cfg, PIPELINES)
+
+    img = mmcv.imread(img_path, 'color')
+    original_img = copy.deepcopy(img)
+    results = dict(
+        img=img,
+        img2=copy.deepcopy(img),
+        img_shape=img.shape,
+        ori_shape=img.shape,
+        # Set initial values for default meta_keys
+        pad_shape=img.shape,
+        scale_factor=1.0,
+        img_fields=['img', 'img2'])
+
+    results = first_flip(results)
+    # both image fields must stay identical after the flip
+    assert np.equal(results['img'], results['img2']).all()
+
+    second_flip = build_from_cfg(always_flip_cfg, PIPELINES)
+    results = second_flip(results)
+    assert np.equal(results['img'], results['img2']).all()
+    # the second flip must bring back the image as it was read
+    assert np.equal(original_img, results['img']).all()
+
+    # test flip_ratio is float, direction is list
+    float_ratio_cfg = dict(
+        type='RandomFlip',
+        flip_ratio=0.9,
+        direction=['horizontal', 'vertical', 'diagonal'])
+    flip_module = build_from_cfg(float_ratio_cfg, PIPELINES)
+
+    img = mmcv.imread(img_path, 'color')
+    original_img = copy.deepcopy(img)
+    results = dict(
+        img=img,
+        img_shape=img.shape,
+        ori_shape=img.shape,
+        # Set initial values for default meta_keys
+        pad_shape=img.shape,
+        scale_factor=1.0,
+        img_fields=['img'])
+
+    results = flip_module(results)
+    # the expected image depends on whether the module decided to flip
+    expected_img = original_img
+    if results['flip']:
+        expected_img = mmcv.imflip(original_img, results['flip_direction'])
+    assert np.array_equal(expected_img, results['img'])
+
+    # test flip_ratio is list, direction is list
+    list_ratio_cfg = dict(
+        type='RandomFlip',
+        flip_ratio=[0.3, 0.3, 0.2],
+        direction=['horizontal', 'vertical', 'diagonal'])
+    flip_module = build_from_cfg(list_ratio_cfg, PIPELINES)
+
+    img = mmcv.imread(img_path, 'color')
+    original_img = copy.deepcopy(img)
+    results = dict(
+        img=img,
+        img_shape=img.shape,
+        ori_shape=img.shape,
+        # Set initial values for default meta_keys
+        pad_shape=img.shape,
+        scale_factor=1.0,
+        img_fields=['img'])
+
+    results = flip_module(results)
+    # the expected image depends on whether the module decided to flip
+    expected_img = original_img
+    if results['flip']:
+        expected_img = mmcv.imflip(original_img, results['flip_direction'])
+    assert np.array_equal(expected_img, results['img'])
+
+
+def test_random_crop():
+    # test assertion for invalid crop_size (non-positive values)
+    with pytest.raises(AssertionError):
+        transform = dict(type='RandomCrop', crop_size=(-1, 0))
+        build_from_cfg(transform, PIPELINES)
+
+    results = dict()
+    img = mmcv.imread(
+        osp.join(osp.dirname(__file__), '../../../data/color.jpg'), 'color')
+    results['img'] = img
+
+    results['img_shape'] = img.shape
+    results['ori_shape'] = img.shape
+    # TODO: add img_fields test
+    results['bbox_fields'] = ['gt_bboxes', 'gt_bboxes_ignore']
+    # Set initial values for default meta_keys
+    results['pad_shape'] = img.shape
+    results['scale_factor'] = 1.0
+
+    h, w, _ = img.shape
+    gt_bboxes = create_random_bboxes(8, w, h)
+    gt_bboxes_ignore = create_random_bboxes(2, w, h)
+    results['gt_labels'] = np.ones(gt_bboxes.shape[0], dtype=np.int64)
+    results['gt_bboxes'] = gt_bboxes
+    results['gt_bboxes_ignore'] = gt_bboxes_ignore
+    transform = dict(type='RandomCrop', crop_size=(h - 20, w - 20))
+    crop_module = build_from_cfg(transform, PIPELINES)
+    results = crop_module(results)
+    assert results['img'].shape[:2] == (h - 20, w - 20)
+    # All bboxes should be preserved after crop
+    assert results['img_shape'][:2] == (h - 20, w - 20)
+    assert results['gt_labels'].shape[0] == results['gt_bboxes'].shape[0]
+    assert results['gt_labels'].dtype == np.int64
+    assert results['gt_bboxes'].dtype == np.float32
+    assert results['gt_bboxes'].shape[0] == 8
+    assert results['gt_bboxes_ignore'].shape[0] == 2
+
+    def area(bboxes):
+        return np.prod(bboxes[:, 2:4] - bboxes[:, 0:2], axis=1)
+
+    assert (area(results['gt_bboxes']) <= area(gt_bboxes)).all()
+    assert (area(results['gt_bboxes_ignore']) <= area(gt_bboxes_ignore)).all()
+    assert results['gt_bboxes'].dtype == np.float32
+    assert results['gt_bboxes_ignore'].dtype == np.float32
+
+    # test assertion for invalid crop_type
+    with pytest.raises(ValueError):
+        transform = dict(
+            type='RandomCrop', crop_size=(1, 1), crop_type='unknown')
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for invalid crop_size with crop_type 'relative'
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='RandomCrop', crop_type='relative', crop_size=(0, 0))
+        build_from_cfg(transform, PIPELINES)
+
+    def _construct_toy_data():
+        img = np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.uint8)
+        img = np.stack([img, img, img], axis=-1)
+        results = dict()
+        # image
+        results['img'] = img
+        results['img_shape'] = img.shape
+        results['img_fields'] = ['img']
+        # bboxes
+        results['bbox_fields'] = ['gt_bboxes', 'gt_bboxes_ignore']
+        results['gt_bboxes'] = np.array([[0., 0., 2., 1.]], dtype=np.float32)
+        results['gt_bboxes_ignore'] = np.array([[2., 0., 3., 1.]],
+                                               dtype=np.float32)
+        # labels
+        results['gt_labels'] = np.array([1], dtype=np.int64)
+        return results
+
+    # test crop_type "relative_range"
+    results = _construct_toy_data()
+    transform = dict(
+        type='RandomCrop',
+        crop_type='relative_range',
+        crop_size=(0.3, 0.7),
+        allow_negative_crop=True)
+    transform_module = build_from_cfg(transform, PIPELINES)
+    results_transformed = transform_module(copy.deepcopy(results))
+    h, w = results_transformed['img_shape'][:2]
+    assert int(2 * 0.3 + 0.5) <= h <= int(2 * 1 + 0.5)
+    assert int(4 * 0.7 + 0.5) <= w <= int(4 * 1 + 0.5)
+    assert results_transformed['gt_bboxes'].dtype == np.float32
+    assert results_transformed['gt_bboxes_ignore'].dtype == np.float32
+
+    # test crop_type "relative"
+    transform = dict(
+        type='RandomCrop',
+        crop_type='relative',
+        crop_size=(0.3, 0.7),
+        allow_negative_crop=True)
+    transform_module = build_from_cfg(transform, PIPELINES)
+    results_transformed = transform_module(copy.deepcopy(results))
+    h, w = results_transformed['img_shape'][:2]
+    assert h == int(2 * 0.3 + 0.5) and w == int(4 * 0.7 + 0.5)
+    assert results_transformed['gt_bboxes'].dtype == np.float32
+    assert results_transformed['gt_bboxes_ignore'].dtype == np.float32
+
+    # test crop_type "absolute"
+    transform = dict(
+        type='RandomCrop',
+        crop_type='absolute',
+        crop_size=(1, 2),
+        allow_negative_crop=True)
+    transform_module = build_from_cfg(transform, PIPELINES)
+    results_transformed = transform_module(copy.deepcopy(results))
+    h, w = results_transformed['img_shape'][:2]
+    assert h == 1 and w == 2
+    assert results_transformed['gt_bboxes'].dtype == np.float32
+    assert results_transformed['gt_bboxes_ignore'].dtype == np.float32
+
+    # test crop_type "absolute_range"
+    transform = dict(
+        type='RandomCrop',
+        crop_type='absolute_range',
+        crop_size=(1, 20),
+        allow_negative_crop=True)
+    transform_module = build_from_cfg(transform, PIPELINES)
+    results_transformed = transform_module(copy.deepcopy(results))
+    h, w = results_transformed['img_shape'][:2]
+    assert 1 <= h <= 2 and 1 <= w <= 4
+    assert results_transformed['gt_bboxes'].dtype == np.float32
+    assert results_transformed['gt_bboxes_ignore'].dtype == np.float32
+
+
+def test_min_iou_random_crop():
+    """Check MinIoURandomCrop output dtypes and the IoU of the kept boxes."""
+    img_path = osp.join(osp.dirname(__file__), '../../../data/color.jpg')
+    img = mmcv.imread(img_path, 'color')
+    h, w, _ = img.shape
+    gt_bboxes = create_random_bboxes(1, w, h)
+    gt_bboxes_ignore = create_random_bboxes(1, w, h)
+    results = dict(
+        img=img,
+        img_shape=img.shape,
+        ori_shape=img.shape,
+        bbox_fields=['gt_bboxes', 'gt_bboxes_ignore'],
+        # Set initial values for default meta_keys
+        pad_shape=img.shape,
+        scale_factor=1.0,
+        gt_labels=np.ones(gt_bboxes.shape[0], dtype=np.int64),
+        gt_bboxes=gt_bboxes,
+        gt_bboxes_ignore=gt_bboxes_ignore)
+    crop_module = build_from_cfg(dict(type='MinIoURandomCrop'), PIPELINES)
+
+    # more than one image field must be rejected by the transform
+    two_img_results = copy.deepcopy(results)
+    two_img_results['img1'] = two_img_results['img']
+    two_img_results['img_fields'] = ['img', 'img1']
+    with pytest.raises(AssertionError):
+        crop_module(two_img_results)
+
+    results = crop_module(results)
+    assert results['gt_labels'].shape[0] == results['gt_bboxes'].shape[0]
+    assert results['gt_labels'].dtype == np.int64
+    for bbox_key in ('gt_bboxes', 'gt_bboxes_ignore'):
+        assert results[bbox_key].dtype == np.float32
+
+    # the cropped patch spans the whole output image: (0, 0, width, height)
+    out_h, out_w = results['img_shape'][:2]
+    patch = np.array([0, 0, out_w, out_h]).reshape(-1, 4)
+    ious = bbox_overlaps(patch, results['gt_bboxes']).reshape(-1)
+    ious_ignore = bbox_overlaps(patch, results['gt_bboxes_ignore']).reshape(-1)
+    min_iou = crop_module.mode
+    if min_iou != 1:
+        assert (ious >= min_iou).all()
+        assert (ious_ignore >= min_iou).all()
+    else:
+        assert np.equal(results['gt_bboxes'], gt_bboxes).all()
+        assert np.equal(results['gt_bboxes_ignore'], gt_bboxes_ignore).all()
+
+
+def test_pad():
+    # test assertion if both size_divisor and size are None
+    with pytest.raises(AssertionError):
+        transform = dict(type='Pad')
+        build_from_cfg(transform, PIPELINES)
+
+    transform = dict(type='Pad', size_divisor=32)
+    transform = build_from_cfg(transform, PIPELINES)
+    results = dict()
+    img = mmcv.imread(
+        osp.join(osp.dirname(__file__), '../../../data/color.jpg'), 'color')
+    original_img = copy.deepcopy(img)
+    results['img'] = img
+    results['img2'] = copy.deepcopy(img)
+    results['img_shape'] = img.shape
+    results['ori_shape'] = img.shape
+    # Set initial values for default meta_keys
+    results['pad_shape'] = img.shape
+    results['scale_factor'] = 1.0
+    results['img_fields'] = ['img', 'img2']
+
+    results = transform(results)
+    assert np.equal(results['img'], results['img2']).all()
+    # original img is already divisible by 32, so padding must not change it
+    assert np.equal(results['img'], original_img).all()
+    img_shape = results['img'].shape
+    assert img_shape[0] % 32 == 0
+    assert img_shape[1] % 32 == 0
+
+    resize_transform = dict(
+        type='Resize', img_scale=(1333, 800), keep_ratio=True)
+    resize_module = build_from_cfg(resize_transform, PIPELINES)
+    results = resize_module(results)
+    results = transform(results)
+    img_shape = results['img'].shape
+    assert np.equal(results['img'], results['img2']).all()
+    assert img_shape[0] % 32 == 0
+    assert img_shape[1] % 32 == 0
+
+    # test that size and size_divisor must be None when pad_to_square is True
+    with pytest.raises(AssertionError):
+        transform = dict(type='Pad', size_divisor=32, pad_to_square=True)
+        build_from_cfg(transform, PIPELINES)
+
+    transform = dict(type='Pad', pad_to_square=True)
+    transform = build_from_cfg(transform, PIPELINES)
+    results['img'] = img
+    results = transform(results)
+    assert results['img'].shape[0] == results['img'].shape[1]
+
+    # int pad_val: expect a deprecation warning and conversion to a dict
+    transform = dict(type='Pad', size_divisor=32, pad_val=0)
+    with pytest.deprecated_call():
+        transform = build_from_cfg(transform, PIPELINES)
+
+    assert isinstance(transform.pad_val, dict)
+    results = transform(results)
+    img_shape = results['img'].shape
+    assert img_shape[0] % 32 == 0
+    assert img_shape[1] % 32 == 0
+
+
+def test_normalize():
+    img_norm_cfg = dict(
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        to_rgb=True)
+    transform = dict(type='Normalize', **img_norm_cfg)
+    transform = build_from_cfg(transform, PIPELINES)
+    results = dict()
+    img = mmcv.imread(
+        osp.join(osp.dirname(__file__), '../../../data/color.jpg'), 'color')
+    original_img = copy.deepcopy(img)
+    results['img'] = img
+    results['img2'] = copy.deepcopy(img)
+    results['img_shape'] = img.shape
+    results['ori_shape'] = img.shape
+    # Set initial values for default meta_keys
+    results['pad_shape'] = img.shape
+    results['scale_factor'] = 1.0
+    results['img_fields'] = ['img', 'img2']
+
+    results = transform(results)
+    assert np.equal(results['img'], results['img2']).all()
+    # expected value: reversed channel order, then (x - mean) / std
+    mean = np.array(img_norm_cfg['mean'])
+    std = np.array(img_norm_cfg['std'])
+    converted_img = (original_img[..., ::-1] - mean) / std
+    assert np.allclose(results['img'], converted_img)
+
+
+def test_albu_transform():
+    results = dict(
+        img_prefix=osp.join(osp.dirname(__file__), '../../../data'),
+        img_info=dict(filename='color.jpg'))
+
+    # Define a simple pipeline: load -> Albu(ChannelShuffle) -> Normalize
+    load = dict(type='LoadImageFromFile')
+    load = build_from_cfg(load, PIPELINES)
+
+    albu_transform = dict(
+        type='Albu', transforms=[dict(type='ChannelShuffle', p=1)])
+    albu_transform = build_from_cfg(albu_transform, PIPELINES)
+    # NOTE(review): std is all zeros below; confirm this is intentional
+    normalize = dict(type='Normalize', mean=[0] * 3, std=[0] * 3, to_rgb=True)
+    normalize = build_from_cfg(normalize, PIPELINES)
+
+    # Execute transforms; only the resulting dtype is checked below
+    results = load(results)
+    results = albu_transform(results)
+    results = normalize(results)
+
+    assert results['img'].dtype == np.float32
+
+
+def test_random_center_crop_pad():
+    # test assertion for invalid crop_size (-1, 0) while test_mode=False
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='RandomCenterCropPad',
+            crop_size=(-1, 0),
+            test_mode=False,
+            test_pad_mode=None)
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for non-tuple ratios (1.0) while test_mode=False
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='RandomCenterCropPad',
+            crop_size=(511, 511),
+            ratios=(1.0),
+            test_mode=False,
+            test_pad_mode=None)
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for mean, std and to_rgb all being None
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='RandomCenterCropPad',
+            crop_size=(511, 511),
+            mean=None,
+            std=None,
+            to_rgb=None,
+            test_mode=False,
+            test_pad_mode=None)
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for a given crop_size while test_mode=True
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='RandomCenterCropPad',
+            crop_size=(511, 511),
+            ratios=None,
+            border=None,
+            mean=[123.675, 116.28, 103.53],
+            std=[58.395, 57.12, 57.375],
+            to_rgb=True,
+            test_mode=True,
+            test_pad_mode=('logical_or', 127))
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for given ratios while test_mode=True
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='RandomCenterCropPad',
+            crop_size=None,
+            ratios=(0.9, 1.0, 1.1),
+            border=None,
+            mean=[123.675, 116.28, 103.53],
+            std=[58.395, 57.12, 57.375],
+            to_rgb=True,
+            test_mode=True,
+            test_pad_mode=('logical_or', 127))
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for a given border while test_mode=True
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='RandomCenterCropPad',
+            crop_size=None,
+            ratios=None,
+            border=128,
+            mean=[123.675, 116.28, 103.53],
+            std=[58.395, 57.12, 57.375],
+            to_rgb=True,
+            test_mode=True,
+            test_pad_mode=('logical_or', 127))
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for unknown test_pad_mode while test_mode=True
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='RandomCenterCropPad',
+            crop_size=None,
+            ratios=None,
+            border=None,
+            mean=[123.675, 116.28, 103.53],
+            std=[58.395, 57.12, 57.375],
+            to_rgb=True,
+            test_mode=True,
+            test_pad_mode=('do_nothing', 100))
+        build_from_cfg(transform, PIPELINES)
+
+    results = dict(
+        img_prefix=osp.join(osp.dirname(__file__), '../../../data'),
+        img_info=dict(filename='color.jpg'))
+
+    load = dict(type='LoadImageFromFile', to_float32=True)
+    load = build_from_cfg(load, PIPELINES)
+    results = load(results)
+    test_results = copy.deepcopy(results)
+
+    h, w, _ = results['img_shape']
+    gt_bboxes = create_random_bboxes(8, w, h)
+    gt_bboxes_ignore = create_random_bboxes(2, w, h)
+    results['gt_bboxes'] = gt_bboxes
+    results['gt_bboxes_ignore'] = gt_bboxes_ignore
+    train_transform = dict(
+        type='RandomCenterCropPad',
+        crop_size=(h - 20, w - 20),
+        ratios=(1.0, ),
+        border=128,
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        to_rgb=True,
+        test_mode=False,
+        test_pad_mode=None)
+    crop_module = build_from_cfg(train_transform, PIPELINES)
+    train_results = crop_module(results)
+    assert train_results['img'].shape[:2] == (h - 20, w - 20)
+    # All bboxes should be preserved after crop
+    assert train_results['pad_shape'][:2] == (h - 20, w - 20)
+    assert train_results['gt_bboxes'].shape[0] == 8
+    assert train_results['gt_bboxes_ignore'].shape[0] == 2
+    assert train_results['gt_bboxes'].dtype == np.float32
+    assert train_results['gt_bboxes_ignore'].dtype == np.float32
+
+    test_transform = dict(
+        type='RandomCenterCropPad',
+        crop_size=None,
+        ratios=None,
+        border=None,
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        to_rgb=True,
+        test_mode=True,
+        test_pad_mode=('logical_or', 127))
+    crop_module = build_from_cfg(test_transform, PIPELINES)
+
+    test_results = crop_module(test_results)
+    assert test_results['img'].shape[:2] == (h | 127, w | 127)
+    assert test_results['pad_shape'][:2] == (h | 127, w | 127)
+    assert 'border' in test_results
+
+
+def test_multi_scale_flip_aug():
+    # test assertion if both scale_factor and img_scale are given
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='MultiScaleFlipAug',
+            scale_factor=1.0,
+            img_scale=[(1333, 800)],
+            transforms=[dict(type='Resize')])
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion if both scale_factor and img_scale are None
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='MultiScaleFlipAug',
+            scale_factor=None,
+            img_scale=None,
+            transforms=[dict(type='Resize')])
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion if img_scale is not a tuple or a list of tuples
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='MultiScaleFlipAug',
+            img_scale=[1333, 800],
+            transforms=[dict(type='Resize')])
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion if flip_direction is not a str or a list of str
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='MultiScaleFlipAug',
+            img_scale=[(1333, 800)],
+            flip_direction=1,
+            transforms=[dict(type='Resize')])
+        build_from_cfg(transform, PIPELINES)
+
+    scale_transform = dict(
+        type='MultiScaleFlipAug',
+        img_scale=[(1333, 800), (1333, 640)],
+        transforms=[dict(type='Resize', keep_ratio=True)])
+    transform = build_from_cfg(scale_transform, PIPELINES)
+
+    results = dict()
+    img = mmcv.imread(
+        osp.join(osp.dirname(__file__), '../../../data/color.jpg'), 'color')
+    results['img'] = img
+    results['img_shape'] = img.shape
+    results['ori_shape'] = img.shape
+    # Set initial values for default meta_keys
+    results['pad_shape'] = img.shape
+    results['img_fields'] = ['img']
+
+    scale_results = transform(copy.deepcopy(results))
+    assert len(scale_results['img']) == 2
+    assert scale_results['img'][0].shape == (750, 1333, 3)
+    assert scale_results['img_shape'][0] == (750, 1333, 3)
+    assert scale_results['img'][1].shape == (640, 1138, 3)
+    assert scale_results['img_shape'][1] == (640, 1138, 3)
+
+    scale_factor_transform = dict(
+        type='MultiScaleFlipAug',
+        scale_factor=[0.8, 1.0, 1.2],
+        transforms=[dict(type='Resize', keep_ratio=False)])
+    transform = build_from_cfg(scale_factor_transform, PIPELINES)
+    scale_factor_results = transform(copy.deepcopy(results))
+    assert len(scale_factor_results['img']) == 3
+    assert scale_factor_results['img'][0].shape == (230, 409, 3)
+    assert scale_factor_results['img_shape'][0] == (230, 409, 3)
+    assert scale_factor_results['img'][1].shape == (288, 512, 3)
+    assert scale_factor_results['img_shape'][1] == (288, 512, 3)
+    assert scale_factor_results['img'][2].shape == (345, 614, 3)
+    assert scale_factor_results['img_shape'][2] == (345, 614, 3)
+
+    # test pipeline of coco_detection (path presumably relative to the cwd)
+    results = dict(
+        img_prefix=osp.join(osp.dirname(__file__), '../../../data'),
+        img_info=dict(filename='color.jpg'))
+    load_cfg, multi_scale_cfg = mmcv.Config.fromfile(
+        'configs/_base_/datasets/coco_detection.py').test_pipeline
+    load = build_from_cfg(load_cfg, PIPELINES)
+    transform = build_from_cfg(multi_scale_cfg, PIPELINES)
+    results = transform(load(results))
+    assert len(results['img']) == 1
+    assert len(results['img_metas']) == 1
+    assert isinstance(results['img'][0], torch.Tensor)
+    assert isinstance(results['img_metas'][0], mmcv.parallel.DataContainer)
+    assert results['img_metas'][0].data['ori_shape'] == (288, 512, 3)
+    assert results['img_metas'][0].data['img_shape'] == (750, 1333, 3)
+    assert results['img_metas'][0].data['pad_shape'] == (768, 1344, 3)
+    assert results['img_metas'][0].data['scale_factor'].tolist() == [
+        2.603515625, 2.6041667461395264, 2.603515625, 2.6041667461395264
+    ]
+
+
+def test_cutout():
+    # test invalid n_holes: a decreasing pair and a 3-tuple
+    with pytest.raises(AssertionError):
+        transform = dict(type='CutOut', n_holes=(5, 3), cutout_shape=(8, 8))
+        build_from_cfg(transform, PIPELINES)
+    with pytest.raises(AssertionError):
+        transform = dict(type='CutOut', n_holes=(3, 4, 5), cutout_shape=(8, 8))
+        build_from_cfg(transform, PIPELINES)
+    # test invalid cutout_shape and cutout_ratio (scalars instead of tuples)
+    with pytest.raises(AssertionError):
+        transform = dict(type='CutOut', n_holes=1, cutout_shape=8)
+        build_from_cfg(transform, PIPELINES)
+    with pytest.raises(AssertionError):
+        transform = dict(type='CutOut', n_holes=1, cutout_ratio=0.2)
+        build_from_cfg(transform, PIPELINES)
+    # exactly one of cutout_shape and cutout_ratio should be given
+    with pytest.raises(AssertionError):
+        transform = dict(type='CutOut', n_holes=1)
+        build_from_cfg(transform, PIPELINES)
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='CutOut',
+            n_holes=1,
+            cutout_shape=(2, 2),
+            cutout_ratio=(0.4, 0.4))
+        build_from_cfg(transform, PIPELINES)
+
+    results = dict()
+    img = mmcv.imread(
+        osp.join(osp.dirname(__file__), '../../../data/color.jpg'), 'color')
+
+    results['img'] = img
+    results['img_shape'] = img.shape
+    results['ori_shape'] = img.shape
+    results['pad_shape'] = img.shape
+    results['img_fields'] = ['img']
+
+    transform = dict(type='CutOut', n_holes=1, cutout_shape=(10, 10))
+    cutout_module = build_from_cfg(transform, PIPELINES)
+    cutout_result = cutout_module(copy.deepcopy(results))
+    assert cutout_result['img'].sum() < img.sum()
+
+    transform = dict(type='CutOut', n_holes=1, cutout_ratio=(0.8, 0.8))
+    cutout_module = build_from_cfg(transform, PIPELINES)
+    cutout_result = cutout_module(copy.deepcopy(results))
+    assert cutout_result['img'].sum() < img.sum()
+
+    transform = dict(
+        type='CutOut',
+        n_holes=(2, 4),
+        cutout_shape=[(10, 10), (15, 15)],
+        fill_in=(255, 255, 255))
+    cutout_module = build_from_cfg(transform, PIPELINES)
+    cutout_result = cutout_module(copy.deepcopy(results))
+    assert cutout_result['img'].sum() > img.sum()
+
+    transform = dict(
+        type='CutOut',
+        n_holes=1,
+        cutout_ratio=(0.8, 0.8),
+        fill_in=(255, 255, 255))
+    cutout_module = build_from_cfg(transform, PIPELINES)
+    cutout_result = cutout_module(copy.deepcopy(results))
+    assert cutout_result['img'].sum() > img.sum()
+
+
+def test_random_shift():
+    # test assertion for invalid shift_ratio (1.5)
+    with pytest.raises(AssertionError):
+        transform = dict(type='RandomShift', shift_ratio=1.5)
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for invalid (negative) max_shift_px
+    with pytest.raises(AssertionError):
+        transform = dict(type='RandomShift', max_shift_px=-1)
+        build_from_cfg(transform, PIPELINES)
+
+    results = dict()
+    img = mmcv.imread(
+        osp.join(osp.dirname(__file__), '../../../data/color.jpg'), 'color')
+    results['img'] = img
+    # TODO: add img_fields test
+    results['bbox_fields'] = ['gt_bboxes', 'gt_bboxes_ignore']
+
+    h, w, _ = img.shape
+    gt_bboxes = create_random_bboxes(8, w, h)
+    gt_bboxes_ignore = create_random_bboxes(2, w, h)
+    results['gt_labels'] = np.ones(gt_bboxes.shape[0], dtype=np.int64)
+    results['gt_bboxes'] = gt_bboxes
+    results['gt_bboxes_ignore'] = gt_bboxes_ignore
+    transform = dict(type='RandomShift', shift_ratio=1.0)
+    random_shift_module = build_from_cfg(transform, PIPELINES)
+    results = random_shift_module(results)
+
+    assert results['img'].shape[:2] == (h, w)  # image size is unchanged
+    assert results['gt_labels'].shape[0] == results['gt_bboxes'].shape[0]
+    assert results['gt_labels'].dtype == np.int64
+    assert results['gt_bboxes'].dtype == np.float32
+    assert results['gt_bboxes_ignore'].dtype == np.float32
+
+
+def test_random_affine():
+    # test assertion for invalid max_translate_ratio
+    with pytest.raises(AssertionError):
+        transform = dict(type='RandomAffine', max_translate_ratio=1.5)
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for invalid scaling_ratio_range (min > max, then min 0)
+    with pytest.raises(AssertionError):
+        transform = dict(type='RandomAffine', scaling_ratio_range=(1.5, 0.5))
+        build_from_cfg(transform, PIPELINES)
+
+    with pytest.raises(AssertionError):
+        transform = dict(type='RandomAffine', scaling_ratio_range=(0, 0.5))
+        build_from_cfg(transform, PIPELINES)
+
+    results = dict()
+    img = mmcv.imread(
+        osp.join(osp.dirname(__file__), '../../../data/color.jpg'), 'color')
+    results['img'] = img
+    results['bbox_fields'] = ['gt_bboxes', 'gt_bboxes_ignore']
+
+    h, w, _ = img.shape
+    gt_bboxes = create_random_bboxes(8, w, h)
+    gt_bboxes_ignore = create_random_bboxes(2, w, h)
+    results['gt_labels'] = np.ones(gt_bboxes.shape[0], dtype=np.int64)
+    results['gt_bboxes'] = gt_bboxes
+    results['gt_bboxes_ignore'] = gt_bboxes_ignore
+    transform = dict(type='RandomAffine')
+    random_affine_module = build_from_cfg(transform, PIPELINES)
+    results = random_affine_module(results)
+
+    assert results['img'].shape[:2] == (h, w)  # image size is unchanged
+    assert results['gt_labels'].shape[0] == results['gt_bboxes'].shape[0]
+    assert results['gt_labels'].dtype == np.int64
+    assert results['gt_bboxes'].dtype == np.float32
+    assert results['gt_bboxes_ignore'].dtype == np.float32
+
+    # test filter bbox: both boxes are expected to be removed
+    gt_bboxes = np.array([[0, 0, 1, 1], [0, 0, 3, 100]], dtype=np.float32)
+    results['gt_labels'] = np.ones(gt_bboxes.shape[0], dtype=np.int64)
+    results['gt_bboxes'] = gt_bboxes
+    transform = dict(
+        type='RandomAffine',
+        max_rotate_degree=0.,
+        max_translate_ratio=0.,
+        scaling_ratio_range=(1., 1.),
+        max_shear_degree=0.,
+        border=(0, 0),
+        min_bbox_size=2,
+        max_aspect_ratio=20,
+        skip_filter=False)
+    random_affine_module = build_from_cfg(transform, PIPELINES)
+
+    results = random_affine_module(results)
+
+    assert results['gt_bboxes'].shape[0] == 0
+    assert results['gt_labels'].shape[0] == 0
+    assert results['gt_labels'].shape[0] == results['gt_bboxes'].shape[0]
+    assert results['gt_labels'].dtype == np.int64
+    assert results['gt_bboxes'].dtype == np.float32
+    assert results['gt_bboxes_ignore'].dtype == np.float32
+
+
+def test_mosaic():
+    # test assertion for invalid img_scale (an int, not a tuple)
+    with pytest.raises(AssertionError):
+        transform = dict(type='Mosaic', img_scale=640)
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for invalid probability (prob=1.5)
+    with pytest.raises(AssertionError):
+        transform = dict(type='Mosaic', prob=1.5)
+        build_from_cfg(transform, PIPELINES)
+
+    results = dict()
+    img = mmcv.imread(
+        osp.join(osp.dirname(__file__), '../../../data/color.jpg'), 'color')
+    results['img'] = img
+    # TODO: add img_fields test
+    results['bbox_fields'] = ['gt_bboxes', 'gt_bboxes_ignore']
+
+    h, w, _ = img.shape
+    gt_bboxes = create_random_bboxes(8, w, h)
+    gt_bboxes_ignore = create_random_bboxes(2, w, h)
+    results['gt_labels'] = np.ones(gt_bboxes.shape[0], dtype=np.int64)
+    results['gt_bboxes'] = gt_bboxes
+    results['gt_bboxes_ignore'] = gt_bboxes_ignore
+    transform = dict(type='Mosaic', img_scale=(10, 12))
+    mosaic_module = build_from_cfg(transform, PIPELINES)
+
+    # test assertion when results has no 'mix_results' key
+    with pytest.raises(AssertionError):
+        mosaic_module(results)
+
+    results['mix_results'] = [copy.deepcopy(results)] * 3
+    results = mosaic_module(results)
+    assert results['img'].shape[:2] == (20, 24)  # twice img_scale=(10, 12)
+    assert results['gt_labels'].shape[0] == results['gt_bboxes'].shape[0]
+    assert results['gt_labels'].dtype == np.int64
+    assert results['gt_bboxes'].dtype == np.float32
+    assert results['gt_bboxes_ignore'].dtype == np.float32
+
+
+def test_mixup():
+    # test assertion for invalid img_scale (an int, not a tuple)
+    with pytest.raises(AssertionError):
+        transform = dict(type='MixUp', img_scale=640)
+        build_from_cfg(transform, PIPELINES)
+
+    results = dict()
+    img = mmcv.imread(
+        osp.join(osp.dirname(__file__), '../../../data/color.jpg'), 'color')
+    results['img'] = img
+    # TODO: add img_fields test
+    results['bbox_fields'] = ['gt_bboxes', 'gt_bboxes_ignore']
+
+    h, w, _ = img.shape
+    gt_bboxes = create_random_bboxes(8, w, h)
+    gt_bboxes_ignore = create_random_bboxes(2, w, h)
+    results['gt_labels'] = np.ones(gt_bboxes.shape[0], dtype=np.int64)
+    results['gt_bboxes'] = gt_bboxes
+    results['gt_bboxes_ignore'] = gt_bboxes_ignore
+    transform = dict(type='MixUp', img_scale=(10, 12))
+    mixup_module = build_from_cfg(transform, PIPELINES)
+
+    # test assertion for invalid mix_results: missing, then two entries
+    with pytest.raises(AssertionError):
+        mixup_module(results)
+
+    with pytest.raises(AssertionError):
+        results['mix_results'] = [copy.deepcopy(results)] * 2
+        mixup_module(results)
+
+    results['mix_results'] = [copy.deepcopy(results)]
+    results = mixup_module(results)
+    assert results['img'].shape[:2] == (288, 512)
+    assert results['gt_labels'].shape[0] == results['gt_bboxes'].shape[0]
+    assert results['gt_labels'].dtype == np.int64
+    assert results['gt_bboxes'].dtype == np.float32
+    assert results['gt_bboxes_ignore'].dtype == np.float32
+
+    # test filter bbox (the same 2 boxes are set on results and mix_results):
+    # boxes with sides 1 and 3 fall below min_bbox_size=5; 2 of the 4 remain
+    gt_bboxes = np.array([[0, 0, 1, 1], [0, 0, 3, 3]], dtype=np.float32)
+    results['gt_labels'] = np.ones(gt_bboxes.shape[0], dtype=np.int64)
+    results['gt_bboxes'] = gt_bboxes
+    results['gt_bboxes_ignore'] = np.array([], dtype=np.float32)
+    mixresults = results['mix_results'][0]
+    mixresults['gt_labels'] = copy.deepcopy(results['gt_labels'])
+    mixresults['gt_bboxes'] = copy.deepcopy(results['gt_bboxes'])
+    mixresults['gt_bboxes_ignore'] = copy.deepcopy(results['gt_bboxes_ignore'])
+    transform = dict(
+        type='MixUp',
+        img_scale=(10, 12),
+        ratio_range=(1.5, 1.5),
+        min_bbox_size=5,
+        skip_filter=False)
+    mixup_module = build_from_cfg(transform, PIPELINES)
+
+    results = mixup_module(results)
+
+    assert results['gt_bboxes'].shape[0] == 2
+    assert results['gt_labels'].shape[0] == 2
+    assert results['gt_labels'].shape[0] == results['gt_bboxes'].shape[0]
+    assert results['gt_labels'].dtype == np.int64
+    assert results['gt_bboxes'].dtype == np.float32
+    assert results['gt_bboxes_ignore'].dtype == np.float32
+
+
+def test_photo_metric_distortion():
+    img = mmcv.imread(
+        osp.join(osp.dirname(__file__), '../../../data/color.jpg'), 'color')
+    transform = dict(type='PhotoMetricDistortion')
+    distortion_module = build_from_cfg(transform, PIPELINES)
+
+    # test assertion for more than one entry in img_fields
+    with pytest.raises(AssertionError):
+        results = dict()
+        results['img'] = img
+        results['img2'] = img
+        results['img_fields'] = ['img', 'img2']
+        distortion_module(results)
+
+    # test uint8 input: the output is expected to be float32
+    results = dict()
+    results['img'] = img
+    results = distortion_module(results)
+    assert results['img'].dtype == np.float32
+
+    # test float32 input: the dtype is expected to stay float32
+    results = dict()
+    results['img'] = img.astype(np.float32)
+    results = distortion_module(results)
+    assert results['img'].dtype == np.float32
+
+
+def test_copypaste():
+    dst_results, src_results = dict(), dict()
+    img = mmcv.imread(
+        osp.join(osp.dirname(__file__), '../../../data/color.jpg'), 'color')
+    dst_results['img'] = img.copy()
+    src_results['img'] = img.copy()
+
+    h, w, _ = img.shape
+
+    dst_bboxes = np.array([[0.2 * w, 0.2 * h, 0.4 * w, 0.4 * h],
+                           [0.5 * w, 0.5 * h, 0.6 * w, 0.6 * h]],
+                          dtype=np.float32)
+    src_bboxes = np.array([[0.1 * w, 0.1 * h, 0.3 * w, 0.5 * h],
+                           [0.4 * w, 0.4 * h, 0.7 * w, 0.7 * h],
+                           [0.8 * w, 0.8 * h, 0.9 * w, 0.9 * h]],
+                          dtype=np.float32)
+    dst_labels = np.ones(dst_bboxes.shape[0], dtype=np.int64)
+    src_labels = np.ones(src_bboxes.shape[0], dtype=np.int64) * 2
+    dst_masks = create_full_masks(dst_bboxes, w, h)
+    src_masks = create_full_masks(src_bboxes, w, h)
+    dst_results['gt_bboxes'] = dst_bboxes.copy()
+    src_results['gt_bboxes'] = src_bboxes.copy()
+    dst_results['gt_labels'] = dst_labels.copy()
+    src_results['gt_labels'] = src_labels.copy()
+    dst_results['gt_masks'] = copy.deepcopy(dst_masks)
+    src_results['gt_masks'] = copy.deepcopy(src_masks)
+
+    results = copy.deepcopy(dst_results)
+
+    transform = dict(type='CopyPaste', selected=False)
+    copypaste_module = build_from_cfg(transform, PIPELINES)
+
+    # test assertion when results has no 'mix_results' key
+    with pytest.raises(AssertionError):
+        copypaste_module(results)
+
+    results['mix_results'] = [copy.deepcopy(src_results)]
+    results = copypaste_module(results)
+    assert results['img'].shape[:2] == (h, w)
+    # one object of destination image is totally occluded
+    assert results['gt_bboxes'].shape[0] == \
+           dst_bboxes.shape[0] + src_bboxes.shape[0] - 1
+    assert results['gt_labels'].shape[0] == \
+           dst_labels.shape[0] + src_labels.shape[0] - 1
+    assert results['gt_masks'].masks.shape[0] == \
+           dst_masks.masks.shape[0] + src_masks.masks.shape[0] - 1
+
+    assert results['gt_labels'].dtype == np.int64
+    assert results['gt_bboxes'].dtype == np.float32
+    # the first object of destination image is partially occluded
+    ori_bbox = dst_bboxes[0]
+    occ_bbox = results['gt_bboxes'][0]
+    ori_mask = dst_masks.masks[0]
+    occ_mask = results['gt_masks'].masks[0]
+    assert ori_mask.sum() > occ_mask.sum()
+    assert np.all(np.abs(occ_bbox - ori_bbox) <=
+                  copypaste_module.bbox_occluded_thr) or \
+        occ_mask.sum() > copypaste_module.mask_occluded_thr
+    # test copypaste with default `selected` (smoke test, no assertions)
+    transform = dict(type='CopyPaste')
+    copypaste_module = build_from_cfg(transform, PIPELINES)
+    results = copy.deepcopy(dst_results)
+    results['mix_results'] = [copy.deepcopy(src_results)]
+    copypaste_module(results)
+    # test copypaste with an empty source image (smoke test, no assertions)
+    results = copy.deepcopy(dst_results)
+    valid_inds = [False] * src_bboxes.shape[0]
+    src_results['gt_bboxes'] = src_bboxes[valid_inds]
+    src_results['gt_labels'] = src_labels[valid_inds]
+    src_results['gt_masks'] = src_masks[valid_inds]
+    results['mix_results'] = [copy.deepcopy(src_results)]
+    copypaste_module(results)
+    # test copy_paste based on bbox only (gt_masks removed from both inputs)
+    dst_results.pop('gt_masks')
+    src_results.pop('gt_masks')
+    dst_bboxes = dst_results['gt_bboxes']
+    src_bboxes = src_results['gt_bboxes']
+    dst_masks = create_full_masks(dst_bboxes, w, h)
+    src_masks = create_full_masks(src_bboxes, w, h)
+    results = copy.deepcopy(dst_results)
+    results['mix_results'] = [copy.deepcopy(src_results)]
+    results = copypaste_module(results)
+    result_masks = create_full_masks(results['gt_bboxes'], w, h)
+    result_masks_np = np.where(result_masks.to_ndarray().sum(0) > 0, 1, 0)
+    masks_np = np.where(
+        (src_masks.to_ndarray().sum(0) + dst_masks.to_ndarray().sum(0)) > 0, 1,
+        0)
+    assert np.all(result_masks_np == masks_np)
+    assert 'gt_masks' not in results
diff --git a/tests/test_data/test_pipelines/test_transform/test_translate.py b/tests/test_data/test_pipelines/test_transform/test_translate.py
new file mode 100755
index 0000000..8a1f9dd
--- /dev/null
+++ b/tests/test_data/test_pipelines/test_transform/test_translate.py
@@ -0,0 +1,516 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import numpy as np
+import pycocotools.mask as maskUtils
+import pytest
+from mmcv.utils import build_from_cfg
+
+from mmdet.core.mask import BitmapMasks, PolygonMasks
+from mmdet.datasets.builder import PIPELINES
+
+
+def _check_keys(results, results_translated):
+    """Assert that both result dicts contain exactly the same keys."""
+    src_keys, dst_keys = set(results.keys()), set(results_translated.keys())
+    assert len(src_keys - dst_keys) == 0
+    assert len(dst_keys - src_keys) == 0
+
+
+def _pad(h, w, c, pad_val, axis=-1, dtype=np.float32):
+    """Build ``c`` constant ``(h, w)`` planes stacked along ``axis``."""
+    assert isinstance(pad_val, (int, float, tuple))
+    if not isinstance(pad_val, tuple):
+        pad_val = (pad_val, ) * c
+    assert len(pad_val) == c
+    planes = [np.ones((h, w)) * val for val in pad_val]
+    return np.stack(planes, axis=axis).astype(dtype)
+
+
+def _construct_img(results):
+    """Fill ``results`` with a random ``uint8`` image and its shape keys."""
+    img_info = results['img_info']
+    shape = (img_info['height'], img_info['width'], 3)
+    img = (np.random.uniform(0, 1, shape) * 255).astype(np.uint8)
+    # Both shape keys hold the generated image's (h, w, 3) shape.
+    results.update(
+        img=img, img_shape=img.shape, ori_shape=img.shape, img_fields=['img'])
+
+
+def _construct_ann_info(h=427, w=640, c=3):  # NOTE: h, w, c are never read
+    bboxes = np.array(  # three float32 boxes, four values each
+        [[222.62, 217.82, 241.81, 238.93], [50.5, 329.7, 130.23, 384.96],
+         [175.47, 331.97, 254.8, 389.26]],
+        dtype=np.float32)
+    labels = np.array([9, 2, 2], dtype=np.int64)  # one label per bbox row
+    bboxes_ignore = np.array([[59., 253., 311., 337.]], dtype=np.float32)
+    masks = [  # one entry per bbox; each entry is a list of flat polygons
+        [[222.62, 217.82, 222.62, 238.93, 241.81, 238.93, 240.85, 218.78]],
+        [[
+            69.19, 332.17, 82.39, 330.25, 97.24, 329.7, 114.01, 331.35, 116.76,
+            337.39, 119.78, 343.17, 128.03, 344.54, 128.86, 347.84, 124.18,
+            350.59, 129.96, 358.01, 130.23, 366.54, 129.13, 377.81, 125.28,
+            382.48, 119.78, 381.93, 117.31, 377.54, 116.21, 379.46, 114.83,
+            382.21, 107.14, 383.31, 105.49, 378.36, 77.99, 377.54, 75.79,
+            381.11, 69.74, 381.93, 66.72, 378.91, 65.07, 377.81, 63.15, 379.19,
+            62.32, 383.31, 52.7, 384.96, 50.5, 379.46, 51.32, 375.61, 51.6,
+            370.11, 51.6, 364.06, 53.52, 354.99, 56.27, 344.54, 59.57, 336.29,
+            66.45, 332.72
+        ]],
+        [[
+            175.47, 386.86, 175.87, 376.44, 177.08, 351.2, 189.1, 332.77,
+            194.31, 331.97, 236.37, 332.77, 244.79, 342.39, 246.79, 346.79,
+            248.39, 345.99, 251.6, 345.59, 254.8, 348.0, 254.8, 351.6, 250.0,
+            352.0, 250.0, 354.81, 251.6, 358.41, 251.6, 364.42, 251.6, 370.03,
+            252.8, 378.04, 252.8, 384.05, 250.8, 387.26, 246.39, 387.66,
+            245.19, 386.46, 242.38, 388.86, 233.97, 389.26, 232.77, 388.06,
+            232.77, 383.65, 195.91, 381.25, 195.91, 384.86, 191.1, 384.86,
+            187.49, 385.26, 186.69, 382.85, 184.29, 382.45, 183.09, 387.26,
+            178.68, 388.46, 176.28, 387.66
+        ]]
+    ]
+    return dict(  # keys read back by the _load_* helpers via 'ann_info'
+        bboxes=bboxes, labels=labels, bboxes_ignore=bboxes_ignore, masks=masks)
+
+
+def _load_bboxes(results):
+    """Copy bboxes (and optional ignore bboxes) out of ``ann_info``."""
+    ann = results['ann_info']
+    results.update(gt_bboxes=ann['bboxes'].copy(), bbox_fields=['gt_bboxes'])
+    ignored = ann.get('bboxes_ignore')
+    if ignored is not None:
+        results['gt_bboxes_ignore'] = ignored.copy()
+        results['bbox_fields'] += ['gt_bboxes_ignore']
+
+
+def _load_labels(results):  # stores a copy of the annotation labels
+    results['gt_labels'] = results['ann_info']['labels'].copy()
+
+
+def _poly2mask(mask_ann, img_h, img_w):
+    """Decode a polygon or RLE annotation into a mask with pycocotools."""
+    if isinstance(mask_ann, list):
+        # Polygon input: one object may consist of several parts, so the
+        # per-part RLEs are merged into a single RLE.
+        part_rles = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+        rle = maskUtils.merge(part_rles)
+    elif isinstance(mask_ann['counts'], list):
+        # Uncompressed RLE: convert it with ``frPyObjects`` first.
+        rle = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+    else:
+        # Anything else is passed to ``decode`` as an RLE unchanged.
+        rle = mask_ann
+    return maskUtils.decode(rle)
+
+
+def _process_polygons(polygons):
+    """Convert polygons to arrays; keep even-length ones with >= 6 values."""
+    as_arrays = [np.array(poly) for poly in polygons]
+    return [
+        poly for poly in as_arrays
+        if len(poly) % 2 == 0 and len(poly) >= 6
+    ]
+
+
+def _load_masks(results, poly2mask=True):
+    """Store annotation masks as BitmapMasks or PolygonMasks 'gt_masks'."""
+    h, w = results['img_info']['height'], results['img_info']['width']
+    raw_masks = results['ann_info']['masks']
+    if poly2mask:
+        bitmaps = [_poly2mask(mask, h, w) for mask in raw_masks]
+        results['gt_masks'] = BitmapMasks(bitmaps, h, w)
+    else:
+        polygons = [_process_polygons(polys) for polys in raw_masks]
+        results['gt_masks'] = PolygonMasks(polygons, h, w)
+    results['mask_fields'] = ['gt_masks']
+
+
+def _construct_semantic_seg(results):
+    """Attach a random uint8 segmentation map as 'gt_semantic_seg'."""
+    size = (results['img_info']['height'], results['img_info']['width'])
+    seg = (np.random.uniform(0, 1, size) * 255).astype(np.uint8)
+    results.update(gt_semantic_seg=seg, seg_fields=['gt_semantic_seg'])
+
+
+def construct_toy_data(poly2mask=True):
+    """Assemble a toy ``results`` dict: image, boxes, labels, masks, seg."""
+    info = dict(height=427, width=640)
+    annotations = _construct_ann_info(h=info['height'], w=info['width'])
+    results = dict(img_info=info, ann_info=annotations)
+    # The image comes first (similar to 'LoadImageFromFile'); the remaining
+    # helpers add what 'LoadAnnotations' would: bboxes, labels, masks, seg.
+    for loader in (_construct_img, _load_bboxes, _load_labels):
+        loader(results)
+    _load_masks(results, poly2mask)
+    _construct_semantic_seg(results)
+    return results
+
+
+def test_translate():  # config validation, then checks of 'Translate' output
+    # test assertion for invalid value of level
+    with pytest.raises(AssertionError):
+        transform = dict(type='Translate', level=-1)
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for invalid type of level
+    with pytest.raises(AssertionError):
+        transform = dict(type='Translate', level=[1])
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for invalid prob
+    with pytest.raises(AssertionError):
+        transform = dict(type='Translate', level=1, prob=-0.5)
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for the num of elements in tuple img_fill_val
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='Translate', level=1, img_fill_val=(128, 128, 128, 128))
+        build_from_cfg(transform, PIPELINES)
+
+    # test ValueError for invalid type of img_fill_val
+    with pytest.raises(ValueError):
+        transform = dict(
+            type='Translate', level=1, img_fill_val=[128, 128, 128])
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for invalid value of img_fill_val
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='Translate', level=1, img_fill_val=(128, -1, 256))
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for invalid value of direction
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='Translate', level=1, img_fill_val=128, direction='diagonal')
+        build_from_cfg(transform, PIPELINES)
+
+    # test assertion for invalid type of max_translate_offset
+    with pytest.raises(AssertionError):
+        transform = dict(
+            type='Translate',
+            level=1,
+            img_fill_val=128,
+            max_translate_offset=(250., ))
+        build_from_cfg(transform, PIPELINES)
+
+    # construct toy data example for unit test (BitmapMasks variant)
+    results = construct_toy_data()
+
+    def _check_bbox_mask(results,
+                         results_translated,
+                         offset,
+                         direction,
+                         min_size=0.):
+        # The key correspondence from bboxes to labels and masks.
+        bbox2label = {
+            'gt_bboxes': 'gt_labels',
+            'gt_bboxes_ignore': 'gt_labels_ignore'
+        }
+        bbox2mask = {
+            'gt_bboxes': 'gt_masks',
+            'gt_bboxes_ignore': 'gt_masks_ignore'
+        }
+
+        def _translate_bbox(bboxes, offset, direction, max_h, max_w):
+            if direction == 'horizontal':
+                bboxes[:, 0::2] = bboxes[:, 0::2] + offset
+            elif direction == 'vertical':
+                bboxes[:, 1::2] = bboxes[:, 1::2] + offset
+            else:
+                raise ValueError
+            bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, max_w)
+            bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, max_h)
+            return bboxes  # same array object, shifted and clipped in place
+
+        h, w, c = results_translated['img'].shape
+        for key in results_translated.get('bbox_fields', []):
+            label_key, mask_key = bbox2label[key], bbox2mask[key]
+            # translated bboxes must stay aligned with their labels / masks
+            if label_key in results:
+                assert len(results_translated[key]) == len(
+                    results_translated[label_key])
+            if mask_key in results:
+                assert len(results_translated[key]) == len(
+                    results_translated[mask_key])
+            # construct expected bboxes: shift, clip, drop sizes <= min_size
+            gt_bboxes = _translate_bbox(
+                copy.deepcopy(results[key]), offset, direction, h, w)
+            valid_inds = (gt_bboxes[:, 2] - gt_bboxes[:, 0] > min_size) & (
+                gt_bboxes[:, 3] - gt_bboxes[:, 1] > min_size)
+            gt_bboxes = gt_bboxes[valid_inds]
+            # check bbox
+            assert np.equal(gt_bboxes, results_translated[key]).all()
+
+            # construct expected masks by slicing and zero padding
+            if mask_key not in results:
+                # e.g. 'gt_masks_ignore'
+                continue
+            masks, masks_translated = results[mask_key].to_ndarray(
+            ), results_translated[mask_key].to_ndarray()
+            assert masks.dtype == masks_translated.dtype
+            if direction == 'horizontal':
+                masks_pad = _pad(
+                    h,
+                    abs(offset),
+                    masks.shape[0],
+                    0,
+                    axis=0,
+                    dtype=masks.dtype)
+                if offset <= 0:
+                    # left shift
+                    gt_masks = np.concatenate(
+                        (masks[:, :, -offset:], masks_pad), axis=-1)
+                else:
+                    # right shift
+                    gt_masks = np.concatenate(
+                        (masks_pad, masks[:, :, :-offset]), axis=-1)
+            else:
+                masks_pad = _pad(
+                    abs(offset),
+                    w,
+                    masks.shape[0],
+                    0,
+                    axis=0,
+                    dtype=masks.dtype)
+                if offset <= 0:
+                    # top shift
+                    gt_masks = np.concatenate(
+                        (masks[:, -offset:, :], masks_pad), axis=1)
+                else:
+                    # bottom shift
+                    gt_masks = np.concatenate(
+                        (masks_pad, masks[:, :-offset, :]), axis=1)
+            gt_masks = gt_masks[valid_inds]  # same filter as for the bboxes
+            # check masks
+            assert np.equal(gt_masks, masks_translated).all()
+
+    def _check_img_seg(results, results_translated, keys, offset, fill_val,
+                       direction):
+        for key in keys:
+            assert isinstance(results_translated[key], type(results[key]))
+            # i.e. the output keeps the input's type (or a subclass of it)
+            data, data_translated = results[key], results_translated[key]
+            if 'mask' in key:
+                data, data_translated = data.to_ndarray(
+                ), data_translated.to_ndarray()
+            assert data.dtype == data_translated.dtype
+            if 'img' in key:
+                data, data_translated = data.transpose(
+                    (2, 0, 1)), data_translated.transpose((2, 0, 1))
+            elif 'seg' in key:
+                data, data_translated = data[None, :, :], data_translated[
+                    None, :, :]
+            c, h, w = data.shape  # channel-first layout from here on
+            if direction == 'horizontal':
+                data_pad = _pad(
+                    h, abs(offset), c, fill_val, axis=0, dtype=data.dtype)
+                if offset <= 0:
+                    # left shift
+                    data_gt = np.concatenate((data[:, :, -offset:], data_pad),
+                                             axis=-1)
+                else:
+                    # right shift
+                    data_gt = np.concatenate((data_pad, data[:, :, :-offset]),
+                                             axis=-1)
+            else:
+                data_pad = _pad(
+                    abs(offset), w, c, fill_val, axis=0, dtype=data.dtype)
+                if offset <= 0:
+                    # top shift
+                    data_gt = np.concatenate((data[:, -offset:, :], data_pad),
+                                             axis=1)
+                else:
+                    # bottom shift
+                    data_gt = np.concatenate((data_pad, data[:, :-offset, :]),
+                                             axis=1)
+            if 'mask' in key:
+                # TODO assertion here. ``data_translated`` must be a subset
+                # (or equal) of ``data_gt``
+                pass
+            else:
+                assert np.equal(data_gt, data_translated).all()
+
+    def check_translate(results,
+                        results_translated,
+                        offset,
+                        img_fill_val,
+                        seg_ignore_label,
+                        direction,
+                        min_size=0):
+        # check keys
+        _check_keys(results, results_translated)
+        # check image
+        _check_img_seg(results, results_translated,
+                       results.get('img_fields', ['img']), offset,
+                       img_fill_val, direction)
+        # check segmentation map
+        _check_img_seg(results, results_translated,
+                       results.get('seg_fields', []), offset, seg_ignore_label,
+                       direction)
+        # check masks and bboxes
+        _check_bbox_mask(results, results_translated, offset, direction,
+                         min_size)
+
+    # test case when level=0 (without translate aug)
+    img_fill_val = (104, 116, 124)
+    seg_ignore_label = 255
+    transform = dict(
+        type='Translate',
+        level=0,
+        prob=1.0,
+        img_fill_val=img_fill_val,
+        seg_ignore_label=seg_ignore_label)
+    translate_module = build_from_cfg(transform, PIPELINES)
+    results_wo_translate = translate_module(copy.deepcopy(results))
+    check_translate(
+        copy.deepcopy(results),
+        results_wo_translate,
+        0,
+        img_fill_val,
+        seg_ignore_label,
+        'horizontal',
+    )
+
+    # test case when level>0 and translate horizontally (left shift).
+    transform = dict(
+        type='Translate',
+        level=8,
+        prob=1.0,
+        img_fill_val=img_fill_val,
+        random_negative_prob=1.0,
+        seg_ignore_label=seg_ignore_label)
+    translate_module = build_from_cfg(transform, PIPELINES)
+    offset = translate_module.offset
+    results_translated = translate_module(copy.deepcopy(results))
+    check_translate(
+        copy.deepcopy(results),
+        results_translated,
+        -offset,  # random_negative_prob=1.0: a negated offset is expected
+        img_fill_val,
+        seg_ignore_label,
+        'horizontal',
+    )
+
+    # test case when level>0 and translate horizontally (right shift).
+    translate_module.random_negative_prob = 0.0
+    results_translated = translate_module(copy.deepcopy(results))
+    check_translate(
+        copy.deepcopy(results),
+        results_translated,
+        offset,
+        img_fill_val,
+        seg_ignore_label,
+        'horizontal',
+    )
+
+    # test case when level>0 and translate vertically (top shift).
+    transform = dict(
+        type='Translate',
+        level=10,
+        prob=1.0,
+        img_fill_val=img_fill_val,
+        seg_ignore_label=seg_ignore_label,
+        random_negative_prob=1.0,
+        direction='vertical')
+    translate_module = build_from_cfg(transform, PIPELINES)
+    offset = translate_module.offset
+    results_translated = translate_module(copy.deepcopy(results))
+    check_translate(
+        copy.deepcopy(results), results_translated, -offset, img_fill_val,
+        seg_ignore_label, 'vertical')
+
+    # test case when level>0 and translate vertically (bottom shift).
+    translate_module.random_negative_prob = 0.0
+    results_translated = translate_module(copy.deepcopy(results))
+    check_translate(
+        copy.deepcopy(results), results_translated, offset, img_fill_val,
+        seg_ignore_label, 'vertical')
+
+    # test case when no translation is called (prob<=0); smoke test only
+    transform = dict(
+        type='Translate',
+        level=8,
+        prob=0.0,
+        img_fill_val=img_fill_val,
+        random_negative_prob=0.0,
+        seg_ignore_label=seg_ignore_label)
+    translate_module = build_from_cfg(transform, PIPELINES)
+    results_translated = translate_module(copy.deepcopy(results))
+
+    # test translate vertically with PolygonMasks (top shift)
+    results = construct_toy_data(False)
+    transform = dict(
+        type='Translate',
+        level=10,
+        prob=1.0,
+        img_fill_val=img_fill_val,
+        seg_ignore_label=seg_ignore_label,
+        direction='vertical')
+    translate_module = build_from_cfg(transform, PIPELINES)
+    offset = translate_module.offset
+    translate_module.random_negative_prob = 1.0
+    results_translated = translate_module(copy.deepcopy(results))
+
+    def _translated_gt(masks, direction, offset, out_shape):
+        translated_masks = []
+        for poly_per_obj in masks:
+            translated_poly_per_obj = []
+            for p in poly_per_obj:
+                p = p.copy()  # leave the input polygon untouched
+                if direction == 'horizontal':
+                    p[0::2] = np.clip(p[0::2] + offset, 0, out_shape[1])
+                elif direction == 'vertical':
+                    p[1::2] = np.clip(p[1::2] + offset, 0, out_shape[0])
+                if PolygonMasks([[p]], *out_shape).areas[0] > 0:
+                    # filter invalid (area=0)
+                    translated_poly_per_obj.append(p)
+            if len(translated_poly_per_obj):
+                translated_masks.append(translated_poly_per_obj)
+        translated_masks = PolygonMasks(translated_masks, *out_shape)
+        return translated_masks
+
+    h, w = results['img_shape'][:2]
+    for key in results.get('mask_fields', []):
+        masks = results[key]
+        translated_gt = _translated_gt(masks, 'vertical', -offset, (h, w))
+        assert np.equal(results_translated[key].to_ndarray(),
+                        translated_gt.to_ndarray()).all()
+
+    # test translate horizontally with PolygonMasks (right shift)
+    results = construct_toy_data(False)
+    transform = dict(
+        type='Translate',
+        level=8,
+        prob=1.0,
+        img_fill_val=img_fill_val,
+        random_negative_prob=0.0,
+        seg_ignore_label=seg_ignore_label)
+    translate_module = build_from_cfg(transform, PIPELINES)
+    offset = translate_module.offset
+    results_translated = translate_module(copy.deepcopy(results))
+    h, w = results['img_shape'][:2]
+    for key in results.get('mask_fields', []):
+        masks = results[key]
+        translated_gt = _translated_gt(masks, 'horizontal', offset, (h, w))
+        assert np.equal(results_translated[key].to_ndarray(),
+                        translated_gt.to_ndarray()).all()
+
+    # test AutoAugment equipped with Translate (only checks that it runs)
+    policies = [[dict(type='Translate', level=10, prob=1.)]]
+    autoaug = dict(type='AutoAugment', policies=policies)
+    autoaug_module = build_from_cfg(autoaug, PIPELINES)
+    autoaug_module(copy.deepcopy(results))
+
+    policies = [[  # one policy chaining two Translate configs
+        dict(type='Translate', level=10, prob=1.),
+        dict(
+            type='Translate',
+            level=8,
+            img_fill_val=img_fill_val,
+            direction='vertical')
+    ]]
+    autoaug = dict(type='AutoAugment', policies=policies)
+    autoaug_module = build_from_cfg(autoaug, PIPELINES)
+    autoaug_module(copy.deepcopy(results))
diff --git a/tests/test_data/test_pipelines/test_transform/utils.py b/tests/test_data/test_pipelines/test_transform/utils.py
new file mode 100755
index 0000000..c3b3920
--- /dev/null
+++ b/tests/test_data/test_pipelines/test_transform/utils.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+
+from mmdet.core.mask import BitmapMasks, PolygonMasks
+
+
+def _check_fields(results, pipeline_results, keys):
+    """Assert the ``keys`` fields of two results dicts hold equal data."""
+    for key in keys:
+        expected, actual = results[key], pipeline_results[key]
+        if isinstance(expected, (BitmapMasks, PolygonMasks)):
+            assert np.equal(expected.to_ndarray(), actual.to_ndarray()).all()
+            continue
+        assert np.equal(expected, actual).all()
+        assert expected.dtype == actual.dtype
+
+
+def check_result_same(results, pipeline_results):
+    """Assert that ``pipeline_results`` matches the predefined ``results``.
+
+    Args:
+        results (dict): Predefined results which should be the standard output
+            of the transform pipeline.
+        pipeline_results (dict): Results processed by the transform pipeline.
+
+    Raises:
+        AssertionError: If a compared field or the labels differ.
+    """
+    # Field-list key -> keys to fall back to when that list is missing.
+    field_groups = (
+        ('img_fields', ['img']),
+        ('bbox_fields', []),
+        ('mask_fields', []),
+        ('seg_fields', []),
+    )
+    for group, fallback in field_groups:
+        _check_fields(results, pipeline_results, results.get(group, fallback))
+    if 'gt_labels' in results:
+        expected_labels = results['gt_labels']
+        assert np.equal(expected_labels, pipeline_results['gt_labels']).all()
+
+def construct_toy_data(poly2mask=True):
+    """Build a tiny 2x4 ``results`` dict with bitmap or polygon masks."""
+    img = np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.uint8)
+    img = np.stack([img, img, img], axis=-1)
+    # image
+    results = dict(img=img, img_shape=img.shape, img_fields=['img'])
+    # bboxes
+    results['bbox_fields'] = ['gt_bboxes', 'gt_bboxes_ignore']
+    results['gt_bboxes'] = np.array([[0., 0., 2., 1.]], dtype=np.float32)
+    results['gt_bboxes_ignore'] = np.array([[2., 0., 3., 1.]],
+                                           dtype=np.float32)
+    # labels
+    results['gt_labels'] = np.array([1], dtype=np.int64)
+    # masks
+    results['mask_fields'] = ['gt_masks']
+    if poly2mask:
+        gt_masks = np.array([[0, 1, 1, 0], [0, 1, 0, 0]],
+                            dtype=np.uint8)[None, :, :]
+        results['gt_masks'] = BitmapMasks(gt_masks, 2, 4)
+    else:
+        # ``np.float`` was removed in NumPy 1.24; ``np.float64`` is the dtype
+        # that alias (the builtin ``float``) resolved to.
+        raw_masks = [[np.array([0, 0, 2, 0, 2, 1, 0, 1], dtype=np.float64)]]
+        results['gt_masks'] = PolygonMasks(raw_masks, 2, 4)
+    # segmentations
+    results['seg_fields'] = ['gt_semantic_seg']
+    results['gt_semantic_seg'] = img[..., 0]
+    return results
+
+
+def create_random_bboxes(num_bboxes, img_w, img_h):
+    """Sample ``num_bboxes`` random float32 boxes scaled to the image size."""
+    top_left = np.random.uniform(0, 0.5, size=(num_bboxes, 2))
+    bottom_right = np.random.uniform(0.5, 1, size=(num_bboxes, 2))
+    scale = np.array([img_w, img_h, img_w, img_h])
+    normalized = np.concatenate((top_left, bottom_right), 1)
+    return (normalized * scale).astype(np.float32)
+
+
+def create_full_masks(gt_bboxes, img_w, img_h):
+    """Rasterize each bbox into a filled ``BitmapMasks`` rectangle."""
+    gt_masks = np.zeros((len(gt_bboxes), img_h, img_w), dtype=np.uint8)
+    for mask, bbox in zip(gt_masks, gt_bboxes):
+        # int() on scalars: converting 1-element arrays is deprecated.
+        xmin, ymin, xmax, ymax = (int(v) for v in bbox[:4])
+        mask[ymin:ymax, xmin:xmax] = 1
+    return BitmapMasks(gt_masks, img_h, img_w)
diff --git a/tests/test_data/test_utils.py b/tests/test_data/test_utils.py
new file mode 100755
index 0000000..289df32
--- /dev/null
+++ b/tests/test_data/test_utils.py
@@ -0,0 +1,80 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+
+from mmdet.datasets import get_loading_pipeline, replace_ImageToTensor
+
+
+def test_replace_ImageToTensor():  # nested and flat pipeline layouts
+    # with MultiScaleFlipAug: 'ImageToTensor' sits in the nested transforms
+    pipelines = [
+        dict(type='LoadImageFromFile'),
+        dict(
+            type='MultiScaleFlipAug',
+            img_scale=(1333, 800),
+            flip=False,
+            transforms=[
+                dict(type='Resize', keep_ratio=True),
+                dict(type='RandomFlip'),
+                dict(type='Normalize'),
+                dict(type='Pad', size_divisor=32),
+                dict(type='ImageToTensor', keys=['img']),
+                dict(type='Collect', keys=['img']),
+            ])
+    ]
+    expected_pipelines = [
+        dict(type='LoadImageFromFile'),
+        dict(
+            type='MultiScaleFlipAug',
+            img_scale=(1333, 800),
+            flip=False,
+            transforms=[
+                dict(type='Resize', keep_ratio=True),
+                dict(type='RandomFlip'),
+                dict(type='Normalize'),
+                dict(type='Pad', size_divisor=32),
+                dict(type='DefaultFormatBundle'),  # replaces 'ImageToTensor'
+                dict(type='Collect', keys=['img']),
+            ])
+    ]
+    with pytest.warns(UserWarning):  # the replacement must warn the user
+        assert expected_pipelines == replace_ImageToTensor(pipelines)
+
+    # without MultiScaleFlipAug: 'ImageToTensor' is a top-level step
+    pipelines = [
+        dict(type='LoadImageFromFile'),
+        dict(type='Resize', keep_ratio=True),
+        dict(type='RandomFlip'),
+        dict(type='Normalize'),
+        dict(type='Pad', size_divisor=32),
+        dict(type='ImageToTensor', keys=['img']),
+        dict(type='Collect', keys=['img']),
+    ]
+    expected_pipelines = [
+        dict(type='LoadImageFromFile'),
+        dict(type='Resize', keep_ratio=True),
+        dict(type='RandomFlip'),
+        dict(type='Normalize'),
+        dict(type='Pad', size_divisor=32),
+        dict(type='DefaultFormatBundle'),  # replaces 'ImageToTensor'
+        dict(type='Collect', keys=['img']),
+    ]
+    with pytest.warns(UserWarning):  # the replacement must warn the user
+        assert expected_pipelines == replace_ImageToTensor(pipelines)
+
+
+def test_get_loading_pipeline():
+    """Only the two loading steps should survive ``get_loading_pipeline``."""
+    full_pipeline = [
+        dict(type='LoadImageFromFile'),
+        dict(type='LoadAnnotations', with_bbox=True),
+        dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+        dict(type='RandomFlip', flip_ratio=0.5),
+        dict(type='Pad', size_divisor=32),
+        dict(type='DefaultFormatBundle'),
+        dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+    ]
+    loading_only = [
+        dict(type='LoadImageFromFile'),
+        dict(type='LoadAnnotations', with_bbox=True)
+    ]
+    assert get_loading_pipeline(full_pipeline) == loading_only
diff --git a/tests/test_downstream/test_mmtrack.py b/tests/test_downstream/test_mmtrack.py
new file mode 100755
index 0000000..b709d5b
--- /dev/null
+++ b/tests/test_downstream/test_mmtrack.py
@@ -0,0 +1,230 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from collections import defaultdict
+
+import numpy as np
+import pytest
+import torch
+from mmcv import Config
+
+
+@pytest.mark.parametrize(
+    'cfg_file',
+    ['./tests/data/configs_mmtrack/selsa_faster_rcnn_r101_dc5_1x.py'])
+def test_vid_fgfa_style_forward(cfg_file):  # two train passes, one test pass
+    config = Config.fromfile(cfg_file)
+    model = copy.deepcopy(config.model)  # keep the loaded config untouched
+    model.pretrains = None
+    model.detector.pretrained = None
+
+    from mmtrack.models import build_model  # imported inside the test body
+    detector = build_model(model)
+
+    # Test forward train with a non-empty truth batch
+    input_shape = (1, 3, 256, 256)
+    mm_inputs = _demo_mm_inputs(input_shape, num_items=[10])
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+    img_metas[0]['is_video_data'] = True
+    gt_bboxes = mm_inputs['gt_bboxes']
+    gt_labels = mm_inputs['gt_labels']
+    gt_masks = mm_inputs['gt_masks']
+
+    ref_input_shape = (2, 3, 256, 256)  # two reference images
+    ref_mm_inputs = _demo_mm_inputs(ref_input_shape, num_items=[9, 11])
+    ref_img = ref_mm_inputs.pop('imgs')[None]  # prepend a new leading dim
+    ref_img_metas = ref_mm_inputs.pop('img_metas')
+    ref_img_metas[0]['is_video_data'] = True
+    ref_img_metas[1]['is_video_data'] = True
+    ref_gt_bboxes = ref_mm_inputs['gt_bboxes']
+    ref_gt_labels = ref_mm_inputs['gt_labels']
+    ref_gt_masks = ref_mm_inputs['gt_masks']
+
+    losses = detector.forward(
+        img=imgs,
+        img_metas=img_metas,
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels,
+        ref_img=ref_img,
+        ref_img_metas=[ref_img_metas],
+        ref_gt_bboxes=ref_gt_bboxes,
+        ref_gt_labels=ref_gt_labels,
+        gt_masks=gt_masks,
+        ref_gt_masks=ref_gt_masks,
+        return_loss=True)
+    assert isinstance(losses, dict)
+    loss, _ = detector._parse_losses(losses)
+    loss.requires_grad_(True)
+    assert float(loss.item()) > 0
+    loss.backward()
+
+    # Test forward train with an empty truth batch
+    mm_inputs = _demo_mm_inputs(input_shape, num_items=[0])
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+    img_metas[0]['is_video_data'] = True
+    gt_bboxes = mm_inputs['gt_bboxes']
+    gt_labels = mm_inputs['gt_labels']
+    gt_masks = mm_inputs['gt_masks']
+
+    ref_mm_inputs = _demo_mm_inputs(ref_input_shape, num_items=[0, 0])
+    ref_imgs = ref_mm_inputs.pop('imgs')[None]  # prepend a new leading dim
+    ref_img_metas = ref_mm_inputs.pop('img_metas')
+    ref_img_metas[0]['is_video_data'] = True
+    ref_img_metas[1]['is_video_data'] = True
+    ref_gt_bboxes = ref_mm_inputs['gt_bboxes']
+    ref_gt_labels = ref_mm_inputs['gt_labels']
+    ref_gt_masks = ref_mm_inputs['gt_masks']
+
+    losses = detector.forward(
+        img=imgs,
+        img_metas=img_metas,
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels,
+        ref_img=ref_imgs,
+        ref_img_metas=[ref_img_metas],
+        ref_gt_bboxes=ref_gt_bboxes,
+        ref_gt_labels=ref_gt_labels,
+        gt_masks=gt_masks,
+        ref_gt_masks=ref_gt_masks,
+        return_loss=True)
+    assert isinstance(losses, dict)
+    loss, _ = detector._parse_losses(losses)
+    loss.requires_grad_(True)
+    assert float(loss.item()) > 0
+    loss.backward()
+
+    # Test forward test with frame_stride=1 and frame_range=[-1,0]
+    with torch.no_grad():
+        imgs = torch.cat([imgs, imgs.clone()], dim=0)  # two identical frames
+        img_list = [g[None, :] for g in imgs]
+        img_metas.extend(copy.deepcopy(img_metas))
+        for i in range(len(img_metas)):
+            img_metas[i]['frame_id'] = i
+            img_metas[i]['num_left_ref_imgs'] = 1
+            img_metas[i]['frame_stride'] = 1
+        ref_imgs = [ref_imgs.clone(), imgs[[0]][None].clone()]
+        ref_img_metas = [
+            copy.deepcopy(ref_img_metas),
+            copy.deepcopy([img_metas[0]])
+        ]
+        results = defaultdict(list)  # collected per key; never asserted on
+        for one_img, one_meta, ref_img, ref_img_meta in zip(
+                img_list, img_metas, ref_imgs, ref_img_metas):
+            result = detector.forward([one_img], [[one_meta]],
+                                      ref_img=[ref_img],
+                                      ref_img_metas=[[ref_img_meta]],
+                                      return_loss=False)
+            for k, v in result.items():
+                results[k].append(v)
+
+
+@pytest.mark.parametrize('cfg_file', [
+    './tests/data/configs_mmtrack/tracktor_faster-rcnn_r50_fpn_4e.py',
+])
+def test_tracktor_forward(cfg_file):
+    """Run the model built from ``cfg_file`` on two frames in eval mode."""
+    model_cfg = copy.deepcopy(Config.fromfile(cfg_file).model)
+    model_cfg.pretrains = None
+    model_cfg.detector.pretrained = None
+
+    from mmtrack.models import build_model
+    tracker = build_model(model_cfg)
+    tracker.eval()
+
+    demo = _demo_mm_inputs((1, 3, 256, 256), num_items=[10], with_track=True)
+    frame, metas = demo.pop('imgs'), demo.pop('img_metas')
+    with torch.no_grad():
+        # Duplicate the frame and mark the copy's meta as frame 1.
+        frames = torch.cat([frame, frame.clone()], dim=0)
+        frame_list = [f[None, :] for f in frames]
+        second_meta = copy.deepcopy(metas)
+        second_meta[0]['frame_id'] = 1
+        metas.extend(second_meta)
+        outputs = defaultdict(list)
+        # Feed the frames one at a time and gather every output per key.
+        for single, meta in zip(frame_list, metas):
+            out = tracker.forward([single], [[meta]], return_loss=False)
+            for name, value in out.items():
+                outputs[name].append(value)
+
+
+def _demo_mm_inputs(
+        input_shape=(1, 3, 300, 300),
+        num_items=None,
+        num_classes=10,
+        with_track=False):
+    """Create a superset of inputs needed to run test or train batches.
+
+    Args:
+        input_shape (tuple):
+            input batch dimensions
+        num_items (None | List[int]):
+            specifies the number of boxes in each batch item
+        num_classes (int):
+            number of different labels a box might have
+        with_track (bool):
+            whether to also return ``gt_match_indices``
+
+    Returns:
+        dict: imgs, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore, gt_masks
+    """
+    from mmdet.core import BitmapMasks
+
+    (N, C, H, W) = input_shape
+    rng = np.random.RandomState(0)
+    imgs = rng.rand(*input_shape)
+
+    img_metas = [{
+        'img_shape': (H, W, C),
+        'ori_shape': (H, W, C),
+        'pad_shape': (H, W, C),
+        'filename': '<demo>.png',
+        'scale_factor': 1.0,
+        'flip': False,
+        'frame_id': 0,
+        'img_norm_cfg': {
+            'mean': (128.0, 128.0, 128.0),
+            'std': (10.0, 10.0, 10.0)
+        }
+    } for _ in range(N)]
+
+    gt_bboxes = []
+    gt_labels = []
+    gt_match_indices = []
+
+    for batch_idx in range(N):
+        if num_items is None:
+            num_boxes = rng.randint(1, 10)
+        else:
+            num_boxes = num_items[batch_idx]
+        cx, cy, bw, bh = rng.rand(num_boxes, 4).T
+        tl_x = ((cx * W) - (W * bw / 2)).clip(0, W)
+        tl_y = ((cy * H) - (H * bh / 2)).clip(0, H)
+        br_x = ((cx * W) + (W * bw / 2)).clip(0, W)
+        br_y = ((cy * H) + (H * bh / 2)).clip(0, H)
+        boxes = np.vstack([tl_x, tl_y, br_x, br_y]).T
+        class_idxs = rng.randint(1, num_classes, size=num_boxes)
+
+        gt_bboxes.append(torch.FloatTensor(boxes))
+        gt_labels.append(torch.LongTensor(class_idxs))
+        if with_track:
+            gt_match_indices.append(torch.arange(boxes.shape[0]))
+
+    # One mask set per batch item, sized by that item's box count. Drawn from
+    # ``rng`` after the loop so the box / label random stream is unchanged.
+    gt_masks = [
+        BitmapMasks(rng.randint(0, 2, (len(b), H, W), dtype=np.uint8), H, W)
+        for b in gt_bboxes
+    ]
+
+    mm_inputs = {
+        'imgs': torch.FloatTensor(imgs).requires_grad_(True),
+        'img_metas': img_metas,
+        'gt_bboxes': gt_bboxes,
+        'gt_labels': gt_labels,
+        'gt_bboxes_ignore': None,
+        'gt_masks': gt_masks,
+    }
+    if with_track:
+        mm_inputs['gt_match_indices'] = gt_match_indices
+    return mm_inputs
diff --git a/tests/test_metrics/test_box_overlap.py b/tests/test_metrics/test_box_overlap.py
new file mode 100755
index 0000000..1d03253
--- /dev/null
+++ b/tests/test_metrics/test_box_overlap.py
@@ -0,0 +1,134 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+from mmdet.core import BboxOverlaps2D, bbox_overlaps
+from mmdet.core.evaluation.bbox_overlaps import \
+    bbox_overlaps as recall_overlaps
+
+
+def test_bbox_overlaps_2d(eps=1e-7):
+
+    def _construct_bbox(num_bbox=None):
+        img_h = int(np.random.randint(3, 1000))
+        img_w = int(np.random.randint(3, 1000))
+        if num_bbox is None:
+            num_bbox = np.random.randint(1, 10)
+        x1y1 = torch.rand((num_bbox, 2))
+        x2y2 = torch.max(torch.rand((num_bbox, 2)), x1y1)
+        bboxes = torch.cat((x1y1, x2y2), -1)
+        bboxes[:, 0::2] *= img_w
+        bboxes[:, 1::2] *= img_h
+        return bboxes, num_bbox
+
+    # is_aligned is True, bboxes.size(-1) == 5 (include score)
+    self = BboxOverlaps2D()
+    bboxes1, num_bbox = _construct_bbox()
+    bboxes2, _ = _construct_bbox(num_bbox)
+    bboxes1 = torch.cat((bboxes1, torch.rand((num_bbox, 1))), 1)
+    bboxes2 = torch.cat((bboxes2, torch.rand((num_bbox, 1))), 1)
+    gious = self(bboxes1, bboxes2, 'giou', True)
+    assert gious.size() == (num_bbox, ), gious.size()
+    assert torch.all(gious >= -1) and torch.all(gious <= 1)
+
+    # is_aligned is True, bboxes1.size(-2) == 0
+    bboxes1 = torch.empty((0, 4))
+    bboxes2 = torch.empty((0, 4))
+    gious = self(bboxes1, bboxes2, 'giou', True)
+    assert gious.size() == (0, ), gious.size()
+    assert torch.all(gious == torch.empty((0, )))
+    assert torch.all(gious >= -1) and torch.all(gious <= 1)
+
+    # is_aligned is True, and bboxes.ndims > 2
+    bboxes1, num_bbox = _construct_bbox()
+    bboxes2, _ = _construct_bbox(num_bbox)
+    bboxes1 = bboxes1.unsqueeze(0).repeat(2, 1, 1)
+    # test assertion when batch dim is not the same
+    with pytest.raises(AssertionError):
+        self(bboxes1, bboxes2.unsqueeze(0).repeat(3, 1, 1), 'giou', True)
+    bboxes2 = bboxes2.unsqueeze(0).repeat(2, 1, 1)
+    gious = self(bboxes1, bboxes2, 'giou', True)
+    assert torch.all(gious >= -1) and torch.all(gious <= 1)
+    assert gious.size() == (2, num_bbox)
+    bboxes1 = bboxes1.unsqueeze(0).repeat(2, 1, 1, 1)
+    bboxes2 = bboxes2.unsqueeze(0).repeat(2, 1, 1, 1)
+    gious = self(bboxes1, bboxes2, 'giou', True)
+    assert torch.all(gious >= -1) and torch.all(gious <= 1)
+    assert gious.size() == (2, 2, num_bbox)
+
+    # is_aligned is False
+    bboxes1, num_bbox1 = _construct_bbox()
+    bboxes2, num_bbox2 = _construct_bbox()
+    gious = self(bboxes1, bboxes2, 'giou')
+    assert torch.all(gious >= -1) and torch.all(gious <= 1)
+    assert gious.size() == (num_bbox1, num_bbox2)
+
+    # is_aligned is False, and bboxes.ndims > 2
+    bboxes1 = bboxes1.unsqueeze(0).repeat(2, 1, 1)
+    bboxes2 = bboxes2.unsqueeze(0).repeat(2, 1, 1)
+    gious = self(bboxes1, bboxes2, 'giou')
+    assert torch.all(gious >= -1) and torch.all(gious <= 1)
+    assert gious.size() == (2, num_bbox1, num_bbox2)
+    bboxes1 = bboxes1.unsqueeze(0)
+    bboxes2 = bboxes2.unsqueeze(0)
+    gious = self(bboxes1, bboxes2, 'giou')
+    assert torch.all(gious >= -1) and torch.all(gious <= 1)
+    assert gious.size() == (1, 2, num_bbox1, num_bbox2)
+
+    # is_aligned is False, bboxes1.size(-2) == 0
+    gious = self(torch.empty(1, 2, 0, 4), bboxes2, 'giou')
+    assert torch.all(gious == torch.empty(1, 2, 0, bboxes2.size(-2)))
+    assert torch.all(gious >= -1) and torch.all(gious <= 1)
+
+    # test allclose between bbox_overlaps and the original official
+    # implementation.
+    bboxes1 = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [32, 32, 38, 42],
+    ])
+    bboxes2 = torch.FloatTensor([
+        [0, 0, 10, 20],
+        [0, 10, 10, 19],
+        [10, 10, 20, 20],
+    ])
+    gious = bbox_overlaps(bboxes1, bboxes2, 'giou', is_aligned=True, eps=eps)
+    gious = gious.numpy().round(4)
+    # the expected values were computed to four decimal places.
+    expected_gious = np.array([0.5000, -0.0500, -0.8214])
+    assert np.allclose(gious, expected_gious, rtol=0, atol=eps)
+
+    # test mode 'iof'
+    ious = bbox_overlaps(bboxes1, bboxes2, 'iof', is_aligned=True, eps=eps)
+    assert torch.all(ious >= -1) and torch.all(ious <= 1)
+    assert ious.size() == (bboxes1.size(0), )
+    ious = bbox_overlaps(bboxes1, bboxes2, 'iof', eps=eps)
+    assert torch.all(ious >= -1) and torch.all(ious <= 1)
+    assert ious.size() == (bboxes1.size(0), bboxes2.size(0))
+
+
+def test_voc_recall_overlaps():
+
+    def _construct_bbox(num_bbox=None):
+        img_h = int(np.random.randint(3, 1000))
+        img_w = int(np.random.randint(3, 1000))
+        if num_bbox is None:
+            num_bbox = np.random.randint(1, 10)
+        x1y1 = torch.rand((num_bbox, 2))
+        x2y2 = torch.max(torch.rand((num_bbox, 2)), x1y1)
+        bboxes = torch.cat((x1y1, x2y2), -1)
+        bboxes[:, 0::2] *= img_w
+        bboxes[:, 1::2] *= img_h
+        return bboxes.numpy(), num_bbox
+
+    bboxes1, num_bbox = _construct_bbox()
+    bboxes2, _ = _construct_bbox(num_bbox)
+    ious = recall_overlaps(
+        bboxes1, bboxes2, 'iou', use_legacy_coordinate=False)
+    assert ious.shape == (num_bbox, num_bbox)
+    assert np.all(ious >= -1) and np.all(ious <= 1)
+
+    ious = recall_overlaps(bboxes1, bboxes2, 'iou', use_legacy_coordinate=True)
+    assert ious.shape == (num_bbox, num_bbox)
+    assert np.all(ious >= -1) and np.all(ious <= 1)
diff --git a/tests/test_metrics/test_losses.py b/tests/test_metrics/test_losses.py
new file mode 100755
index 0000000..06fe43d
--- /dev/null
+++ b/tests/test_metrics/test_losses.py
@@ -0,0 +1,241 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet.models import Accuracy, build_loss
+
+
+def test_ce_loss():
+    # use_mask and use_sigmoid cannot be true at the same time
+    with pytest.raises(AssertionError):
+        loss_cfg = dict(
+            type='CrossEntropyLoss',
+            use_mask=True,
+            use_sigmoid=True,
+            loss_weight=1.0)
+        build_loss(loss_cfg)
+
+    # test loss with class weights
+    loss_cls_cfg = dict(
+        type='CrossEntropyLoss',
+        use_sigmoid=False,
+        class_weight=[0.8, 0.2],
+        loss_weight=1.0)
+    loss_cls = build_loss(loss_cls_cfg)
+    fake_pred = torch.Tensor([[100, -100]])
+    fake_label = torch.Tensor([1]).long()
+    assert torch.allclose(loss_cls(fake_pred, fake_label), torch.tensor(40.))
+
+    loss_cls_cfg = dict(
+        type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)
+    loss_cls = build_loss(loss_cls_cfg)
+    assert torch.allclose(loss_cls(fake_pred, fake_label), torch.tensor(200.))
+
+
+def test_varifocal_loss():
+    # only sigmoid version of VarifocalLoss is implemented
+    with pytest.raises(AssertionError):
+        loss_cfg = dict(
+            type='VarifocalLoss', use_sigmoid=False, loss_weight=1.0)
+        build_loss(loss_cfg)
+
+    # test that alpha should be greater than 0
+    with pytest.raises(AssertionError):
+        loss_cfg = dict(
+            type='VarifocalLoss',
+            alpha=-0.75,
+            gamma=2.0,
+            use_sigmoid=True,
+            loss_weight=1.0)
+        build_loss(loss_cfg)
+
+    # test that pred and target should be of the same size
+    loss_cls_cfg = dict(
+        type='VarifocalLoss',
+        use_sigmoid=True,
+        alpha=0.75,
+        gamma=2.0,
+        iou_weighted=True,
+        reduction='mean',
+        loss_weight=1.0)
+    loss_cls = build_loss(loss_cls_cfg)
+    with pytest.raises(AssertionError):
+        fake_pred = torch.Tensor([[100.0, -100.0]])
+        fake_target = torch.Tensor([[1.0]])
+        loss_cls(fake_pred, fake_target)
+
+    # test the calculation
+    loss_cls = build_loss(loss_cls_cfg)
+    fake_pred = torch.Tensor([[100.0, -100.0]])
+    fake_target = torch.Tensor([[1.0, 0.0]])
+    assert torch.allclose(loss_cls(fake_pred, fake_target), torch.tensor(0.0))
+
+    # test the loss with weights
+    loss_cls = build_loss(loss_cls_cfg)
+    fake_pred = torch.Tensor([[0.0, 100.0]])
+    fake_target = torch.Tensor([[1.0, 1.0]])
+    fake_weight = torch.Tensor([0.0, 1.0])
+    assert torch.allclose(
+        loss_cls(fake_pred, fake_target, fake_weight), torch.tensor(0.0))
+
+
+def test_kd_loss():
+    # test that temperature T must be at least 1 (T=0.5 is rejected)
+    with pytest.raises(AssertionError):
+        loss_cfg = dict(
+            type='KnowledgeDistillationKLDivLoss', loss_weight=1.0, T=0.5)
+        build_loss(loss_cfg)
+
+    # test that pred and target should be of the same size
+    loss_cls_cfg = dict(
+        type='KnowledgeDistillationKLDivLoss', loss_weight=1.0, T=1)
+    loss_cls = build_loss(loss_cls_cfg)
+    with pytest.raises(AssertionError):
+        fake_pred = torch.Tensor([[100, -100]])
+        fake_label = torch.Tensor([1]).long()
+        loss_cls(fake_pred, fake_label)
+
+    # test the calculation
+    loss_cls = build_loss(loss_cls_cfg)
+    fake_pred = torch.Tensor([[100.0, 100.0]])
+    fake_target = torch.Tensor([[1.0, 1.0]])
+    assert torch.allclose(loss_cls(fake_pred, fake_target), torch.tensor(0.0))
+
+    # test the loss with weights
+    loss_cls = build_loss(loss_cls_cfg)
+    fake_pred = torch.Tensor([[100.0, -100.0], [100.0, 100.0]])
+    fake_target = torch.Tensor([[1.0, 0.0], [1.0, 1.0]])
+    fake_weight = torch.Tensor([0.0, 1.0])
+    assert torch.allclose(
+        loss_cls(fake_pred, fake_target, fake_weight), torch.tensor(0.0))
+
+
+def test_seesaw_loss():
+    # only softmax version of Seesaw Loss is implemented
+    with pytest.raises(AssertionError):
+        loss_cfg = dict(type='SeesawLoss', use_sigmoid=True, loss_weight=1.0)
+        build_loss(loss_cfg)
+
+    # test that cls_score.size(-1) == num_classes + 2
+    loss_cls_cfg = dict(
+        type='SeesawLoss', p=0.0, q=0.0, loss_weight=1.0, num_classes=2)
+    loss_cls = build_loss(loss_cls_cfg)
+    # the length of fake_pred should be num_classes + 2 = 4
+    with pytest.raises(AssertionError):
+        fake_pred = torch.Tensor([[-100, 100]])
+        fake_label = torch.Tensor([1]).long()
+        loss_cls(fake_pred, fake_label)
+    # the length of fake_pred should be num_classes + 2 = 4
+    with pytest.raises(AssertionError):
+        fake_pred = torch.Tensor([[-100, 100, -100]])
+        fake_label = torch.Tensor([1]).long()
+        loss_cls(fake_pred, fake_label)
+
+    # test the calculation without p and q
+    loss_cls_cfg = dict(
+        type='SeesawLoss', p=0.0, q=0.0, loss_weight=1.0, num_classes=2)
+    loss_cls = build_loss(loss_cls_cfg)
+    fake_pred = torch.Tensor([[-100, 100, -100, 100]])
+    fake_label = torch.Tensor([1]).long()
+    loss = loss_cls(fake_pred, fake_label)
+    assert torch.allclose(loss['loss_cls_objectness'], torch.tensor(200.))
+    assert torch.allclose(loss['loss_cls_classes'], torch.tensor(0.))
+
+    # test the calculation with p and without q
+    loss_cls_cfg = dict(
+        type='SeesawLoss', p=1.0, q=0.0, loss_weight=1.0, num_classes=2)
+    loss_cls = build_loss(loss_cls_cfg)
+    fake_pred = torch.Tensor([[-100, 100, -100, 100]])
+    fake_label = torch.Tensor([0]).long()
+    loss_cls.cum_samples[0] = torch.exp(torch.Tensor([20]))
+    loss = loss_cls(fake_pred, fake_label)
+    assert torch.allclose(loss['loss_cls_objectness'], torch.tensor(200.))
+    assert torch.allclose(loss['loss_cls_classes'], torch.tensor(180.))
+
+    # test the calculation with q and without p
+    loss_cls_cfg = dict(
+        type='SeesawLoss', p=0.0, q=1.0, loss_weight=1.0, num_classes=2)
+    loss_cls = build_loss(loss_cls_cfg)
+    fake_pred = torch.Tensor([[-100, 100, -100, 100]])
+    fake_label = torch.Tensor([0]).long()
+    loss = loss_cls(fake_pred, fake_label)
+    assert torch.allclose(loss['loss_cls_objectness'], torch.tensor(200.))
+    assert torch.allclose(loss['loss_cls_classes'],
+                          torch.tensor(200.) + torch.tensor(100.).log())
+
+    # test the others
+    loss_cls_cfg = dict(
+        type='SeesawLoss',
+        p=0.0,
+        q=1.0,
+        loss_weight=1.0,
+        num_classes=2,
+        return_dict=False)
+    loss_cls = build_loss(loss_cls_cfg)
+    fake_pred = torch.Tensor([[100, -100, 100, -100]])
+    fake_label = torch.Tensor([0]).long()
+    loss = loss_cls(fake_pred, fake_label)
+    acc = loss_cls.get_accuracy(fake_pred, fake_label)
+    act = loss_cls.get_activation(fake_pred)
+    assert torch.allclose(loss, torch.tensor(0.))
+    assert torch.allclose(acc['acc_objectness'], torch.tensor(100.))
+    assert torch.allclose(acc['acc_classes'], torch.tensor(100.))
+    assert torch.allclose(act, torch.tensor([1., 0., 0.]))
+
+
+def test_accuracy():
+    # test for empty pred
+    pred = torch.empty(0, 4)
+    label = torch.empty(0)
+    accuracy = Accuracy(topk=1)
+    acc = accuracy(pred, label)
+    assert acc.item() == 0
+
+    pred = torch.Tensor([[0.2, 0.3, 0.6, 0.5], [0.1, 0.1, 0.2, 0.6],
+                         [0.9, 0.0, 0.0, 0.1], [0.4, 0.7, 0.1, 0.1],
+                         [0.0, 0.0, 0.99, 0]])
+    # test for top1
+    true_label = torch.Tensor([2, 3, 0, 1, 2]).long()
+    accuracy = Accuracy(topk=1)
+    acc = accuracy(pred, true_label)
+    assert acc.item() == 100
+
+    # test for top1 with score thresh=0.8
+    true_label = torch.Tensor([2, 3, 0, 1, 2]).long()
+    accuracy = Accuracy(topk=1, thresh=0.8)
+    acc = accuracy(pred, true_label)
+    assert acc.item() == 40
+
+    # test for top2
+    accuracy = Accuracy(topk=2)
+    label = torch.Tensor([3, 2, 0, 0, 2]).long()
+    acc = accuracy(pred, label)
+    assert acc.item() == 100
+
+    # test for both top1 and top2
+    accuracy = Accuracy(topk=(1, 2))
+    true_label = torch.Tensor([2, 3, 0, 1, 2]).long()
+    acc = accuracy(pred, true_label)
+    for a in acc:
+        assert a.item() == 100
+
+    # topk is larger than pred class number
+    with pytest.raises(AssertionError):
+        accuracy = Accuracy(topk=5)
+        accuracy(pred, true_label)
+
+    # wrong topk type
+    with pytest.raises(AssertionError):
+        accuracy = Accuracy(topk='wrong type')
+        accuracy(pred, true_label)
+
+    # label size is larger than required
+    with pytest.raises(AssertionError):
+        label = torch.Tensor([2, 3, 0, 1, 2, 0]).long()  # size mismatch
+        accuracy = Accuracy()
+        accuracy(pred, label)
+
+    # wrong pred dimension
+    with pytest.raises(AssertionError):
+        accuracy = Accuracy()
+        accuracy(pred[:, :, None], true_label)
diff --git a/tests/test_metrics/test_mean_ap.py b/tests/test_metrics/test_mean_ap.py
new file mode 100755
index 0000000..5faa7a0
--- /dev/null
+++ b/tests/test_metrics/test_mean_ap.py
@@ -0,0 +1,187 @@
+import numpy as np
+
+from mmdet.core.evaluation.mean_ap import (eval_map, tpfp_default,
+                                           tpfp_imagenet, tpfp_openimages)
+
+det_bboxes = np.array([
+    [0, 0, 10, 10],
+    [10, 10, 20, 20],
+    [32, 32, 38, 42],
+])
+gt_bboxes = np.array([[0, 0, 10, 20], [0, 10, 10, 19], [10, 10, 20, 20]])
+gt_ignore = np.array([[5, 5, 10, 20], [6, 10, 10, 19]])
+
+
+def test_tpfp_imagenet():
+
+    result = tpfp_imagenet(
+        det_bboxes,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_ignore,
+        use_legacy_coordinate=True)
+    tp = result[0]
+    fp = result[1]
+    assert tp.shape == (1, 3)
+    assert fp.shape == (1, 3)
+    assert (tp == np.array([[1, 1, 0]])).all()
+    assert (fp == np.array([[0, 0, 1]])).all()
+
+    result = tpfp_imagenet(
+        det_bboxes,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_ignore,
+        use_legacy_coordinate=False)
+    tp = result[0]
+    fp = result[1]
+    assert tp.shape == (1, 3)
+    assert fp.shape == (1, 3)
+    assert (tp == np.array([[1, 1, 0]])).all()
+    assert (fp == np.array([[0, 0, 1]])).all()
+
+
+def test_tpfp_default():
+
+    result = tpfp_default(
+        det_bboxes,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_ignore,
+        use_legacy_coordinate=True)
+
+    tp = result[0]
+    fp = result[1]
+    assert tp.shape == (1, 3)
+    assert fp.shape == (1, 3)
+    assert (tp == np.array([[1, 1, 0]])).all()
+    assert (fp == np.array([[0, 0, 1]])).all()
+    result = tpfp_default(
+        det_bboxes,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_ignore,
+        use_legacy_coordinate=False)
+
+    tp = result[0]
+    fp = result[1]
+    assert tp.shape == (1, 3)
+    assert fp.shape == (1, 3)
+    assert (tp == np.array([[1, 1, 0]])).all()
+    assert (fp == np.array([[0, 0, 1]])).all()
+
+
+def test_eval_map():
+
+    # 2 images and 2 classes
+    det_results = [[det_bboxes, det_bboxes], [det_bboxes, det_bboxes]]
+
+    labels = np.array([0, 1, 1])
+    labels_ignore = np.array([0, 1])
+    gt_info = {
+        'bboxes': gt_bboxes,
+        'bboxes_ignore': gt_ignore,
+        'labels': labels,
+        'labels_ignore': labels_ignore
+    }
+    annotations = [gt_info, gt_info]
+    mean_ap, eval_results = eval_map(
+        det_results, annotations, use_legacy_coordinate=True)
+    assert 0.291 < mean_ap < 0.293
+    mean_ap, eval_results = eval_map(
+        det_results, annotations, use_legacy_coordinate=False)
+    assert 0.291 < mean_ap < 0.293
+
+    # 1 image and 2 classes
+    det_results = [[det_bboxes, det_bboxes]]
+
+    labels = np.array([0, 1, 1])
+    labels_ignore = np.array([0, 1])
+    gt_info = {
+        'bboxes': gt_bboxes,
+        'bboxes_ignore': gt_ignore,
+        'labels': labels,
+        'labels_ignore': labels_ignore
+    }
+    annotations = [gt_info]
+    mean_ap, eval_results = eval_map(
+        det_results, annotations, use_legacy_coordinate=True)
+    assert 0.291 < mean_ap < 0.293
+    mean_ap, eval_results = eval_map(
+        det_results, annotations, use_legacy_coordinate=False)
+    assert 0.291 < mean_ap < 0.293
+
+
+def test_tpfp_openimages():
+
+    det_bboxes = np.array([[10, 10, 15, 15, 1.0], [15, 15, 30, 30, 0.98],
+                           [10, 10, 25, 25, 0.98], [28, 28, 35, 35, 0.97],
+                           [30, 30, 51, 51, 0.96], [100, 110, 120, 130, 0.15]])
+    gt_bboxes = np.array([[10., 10., 30., 30.], [30., 30., 50., 50.]])
+    gt_groups_of = np.array([True, False], dtype=bool)
+    gt_ignore = np.zeros((0, 4))
+
+    # Open Images evaluation using group of.
+    result = tpfp_openimages(
+        det_bboxes,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_ignore,
+        gt_bboxes_group_of=gt_groups_of,
+        use_group_of=True,
+        ioa_thr=0.5)
+
+    tp = result[0]
+    fp = result[1]
+    cls_dets = result[2]
+
+    assert tp.shape == (1, 4)
+    assert fp.shape == (1, 4)
+    assert cls_dets.shape == (4, 5)
+
+    assert (tp == np.array([[0, 1, 0, 1]])).all()
+    assert (fp == np.array([[1, 0, 1, 0]])).all()
+    cls_dets_gt = np.array([[28., 28., 35., 35., 0.97],
+                            [30., 30., 51., 51., 0.96],
+                            [100., 110., 120., 130., 0.15],
+                            [10., 10., 15., 15., 1.]])
+    assert (cls_dets == cls_dets_gt).all()
+
+    # Open Images evaluation not using group of.
+    result = tpfp_openimages(
+        det_bboxes,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_ignore,
+        gt_bboxes_group_of=gt_groups_of,
+        use_group_of=False,
+        ioa_thr=0.5)
+    tp = result[0]
+    fp = result[1]
+    cls_dets = result[2]
+    assert tp.shape == (1, 6)
+    assert fp.shape == (1, 6)
+    assert cls_dets.shape == (6, 5)
+
+    # Open Images evaluation using group of, and gt is all group of bboxes.
+    gt_groups_of = np.array([True, True], dtype=bool)
+    result = tpfp_openimages(
+        det_bboxes,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_ignore,
+        gt_bboxes_group_of=gt_groups_of,
+        use_group_of=True,
+        ioa_thr=0.5)
+    tp = result[0]
+    fp = result[1]
+    cls_dets = result[2]
+    assert tp.shape == (1, 3)
+    assert fp.shape == (1, 3)
+    assert cls_dets.shape == (3, 5)
+
+    # Open Images evaluation with empty gt.
+    gt_bboxes = np.zeros((0, 4))
+    gt_groups_of = np.empty((0))
+    result = tpfp_openimages(
+        det_bboxes,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_ignore,
+        gt_bboxes_group_of=gt_groups_of,
+        use_group_of=True,
+        ioa_thr=0.5)
+    fp = result[1]
+    assert (fp == np.array([[1, 1, 1, 1, 1, 1]])).all()
diff --git a/tests/test_metrics/test_recall.py b/tests/test_metrics/test_recall.py
new file mode 100755
index 0000000..f2ca0b1
--- /dev/null
+++ b/tests/test_metrics/test_recall.py
@@ -0,0 +1,46 @@
+import numpy as np
+
+from mmdet.core.evaluation.recall import eval_recalls
+
+det_bboxes = np.array([
+    [0, 0, 10, 10],
+    [10, 10, 20, 20],
+    [32, 32, 38, 42],
+])
+gt_bboxes = np.array([[0, 0, 10, 20], [0, 10, 10, 19], [10, 10, 20, 20]])
+gt_ignore = np.array([[5, 5, 10, 20], [6, 10, 10, 19]])
+
+
+def test_eval_recalls():
+    gts = [gt_bboxes, gt_bboxes, gt_bboxes]
+    proposals = [det_bboxes, det_bboxes, det_bboxes]
+
+    recall = eval_recalls(
+        gts, proposals, proposal_nums=2, use_legacy_coordinate=True)
+    assert recall.shape == (1, 1)
+    assert 0.66 < recall[0][0] < 0.667
+    recall = eval_recalls(
+        gts, proposals, proposal_nums=2, use_legacy_coordinate=False)
+    assert recall.shape == (1, 1)
+    assert 0.66 < recall[0][0] < 0.667
+
+    recall = eval_recalls(
+        gts, proposals, proposal_nums=2, use_legacy_coordinate=True)
+    assert recall.shape == (1, 1)
+    assert 0.66 < recall[0][0] < 0.667
+    recall = eval_recalls(
+        gts,
+        proposals,
+        iou_thrs=[0.1, 0.9],
+        proposal_nums=2,
+        use_legacy_coordinate=False)
+    assert recall.shape == (1, 2)
+    assert recall[0][1] <= recall[0][0]
+    recall = eval_recalls(
+        gts,
+        proposals,
+        iou_thrs=[0.1, 0.9],
+        proposal_nums=2,
+        use_legacy_coordinate=True)
+    assert recall.shape == (1, 2)
+    assert recall[0][1] <= recall[0][0]
diff --git a/tests/test_models/test_backbones/__init__.py b/tests/test_models/test_backbones/__init__.py
new file mode 100755
index 0000000..eb431ba
--- /dev/null
+++ b/tests/test_models/test_backbones/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .utils import check_norm_state, is_block, is_norm
+
+__all__ = ['is_block', 'is_norm', 'check_norm_state']
diff --git a/tests/test_models/test_backbones/test_csp_darknet.py b/tests/test_models/test_backbones/test_csp_darknet.py
new file mode 100755
index 0000000..2a2ad41
--- /dev/null
+++ b/tests/test_models/test_backbones/test_csp_darknet.py
@@ -0,0 +1,116 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet.models.backbones.csp_darknet import CSPDarknet
+from .utils import check_norm_state, is_norm
+
+
+def test_csp_darknet_backbone():
+    with pytest.raises(ValueError):
+        # frozen_stages must be in range(-1, len(arch_setting) + 1)
+        CSPDarknet(frozen_stages=6)
+
+    with pytest.raises(AssertionError):
+        # out_indices must be in range(len(arch_setting) + 1)
+        CSPDarknet(out_indices=[6])
+
+    # Test CSPDarknet with first stage frozen
+    frozen_stages = 1
+    model = CSPDarknet(frozen_stages=frozen_stages)
+    model.train()
+
+    for mod in model.stem.modules():
+        for param in mod.parameters():
+            assert param.requires_grad is False
+    for i in range(1, frozen_stages + 1):
+        layer = getattr(model, f'stage{i}')
+        for mod in layer.modules():
+            if isinstance(mod, _BatchNorm):
+                assert mod.training is False
+        for param in layer.parameters():
+            assert param.requires_grad is False
+
+    # Test CSPDarknet with norm_eval=True
+    model = CSPDarknet(norm_eval=True)
+    model.train()
+
+    assert check_norm_state(model.modules(), False)
+
+    # Test CSPDarknet-P5 forward with widen_factor=0.25
+    model = CSPDarknet(arch='P5', widen_factor=0.25, out_indices=range(0, 5))
+    model.train()
+
+    imgs = torch.randn(1, 3, 64, 64)
+    feat = model(imgs)
+    assert len(feat) == 5
+    assert feat[0].shape == torch.Size((1, 16, 32, 32))
+    assert feat[1].shape == torch.Size((1, 32, 16, 16))
+    assert feat[2].shape == torch.Size((1, 64, 8, 8))
+    assert feat[3].shape == torch.Size((1, 128, 4, 4))
+    assert feat[4].shape == torch.Size((1, 256, 2, 2))
+
+    # Test CSPDarknet-P6 forward with widen_factor=0.25
+    model = CSPDarknet(
+        arch='P6',
+        widen_factor=0.25,
+        out_indices=range(0, 6),
+        spp_kernal_sizes=(3, 5, 7))
+    model.train()
+
+    imgs = torch.randn(1, 3, 128, 128)
+    feat = model(imgs)
+    assert feat[0].shape == torch.Size((1, 16, 64, 64))
+    assert feat[1].shape == torch.Size((1, 32, 32, 32))
+    assert feat[2].shape == torch.Size((1, 64, 16, 16))
+    assert feat[3].shape == torch.Size((1, 128, 8, 8))
+    assert feat[4].shape == torch.Size((1, 192, 4, 4))
+    assert feat[5].shape == torch.Size((1, 256, 2, 2))
+
+    # Test CSPDarknet forward with dict(type='ReLU')
+    model = CSPDarknet(
+        widen_factor=0.125, act_cfg=dict(type='ReLU'), out_indices=range(0, 5))
+    model.train()
+
+    imgs = torch.randn(1, 3, 64, 64)
+    feat = model(imgs)
+    assert len(feat) == 5
+    assert feat[0].shape == torch.Size((1, 8, 32, 32))
+    assert feat[1].shape == torch.Size((1, 16, 16, 16))
+    assert feat[2].shape == torch.Size((1, 32, 8, 8))
+    assert feat[3].shape == torch.Size((1, 64, 4, 4))
+    assert feat[4].shape == torch.Size((1, 128, 2, 2))
+
+    # Test CSPDarknet with BatchNorm forward
+    model = CSPDarknet(widen_factor=0.125, out_indices=range(0, 5))
+    for m in model.modules():
+        if is_norm(m):
+            assert isinstance(m, _BatchNorm)
+    model.train()
+
+    imgs = torch.randn(1, 3, 64, 64)
+    feat = model(imgs)
+    assert len(feat) == 5
+    assert feat[0].shape == torch.Size((1, 8, 32, 32))
+    assert feat[1].shape == torch.Size((1, 16, 16, 16))
+    assert feat[2].shape == torch.Size((1, 32, 8, 8))
+    assert feat[3].shape == torch.Size((1, 64, 4, 4))
+    assert feat[4].shape == torch.Size((1, 128, 2, 2))
+
+    # Test CSPDarknet with custom arch forward
+    arch_ovewrite = [[32, 56, 3, True, False], [56, 224, 2, True, False],
+                     [224, 512, 1, True, False]]
+    model = CSPDarknet(
+        arch_ovewrite=arch_ovewrite,
+        widen_factor=0.25,
+        out_indices=(0, 1, 2, 3))
+    model.train()
+
+    imgs = torch.randn(1, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size((1, 8, 16, 16))
+    assert feat[1].shape == torch.Size((1, 14, 8, 8))
+    assert feat[2].shape == torch.Size((1, 56, 4, 4))
+    assert feat[3].shape == torch.Size((1, 128, 2, 2))
diff --git a/tests/test_models/test_backbones/test_detectors_resnet.py b/tests/test_models/test_backbones/test_detectors_resnet.py
new file mode 100755
index 0000000..69f462a
--- /dev/null
+++ b/tests/test_models/test_backbones/test_detectors_resnet.py
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+
+from mmdet.models.backbones import DetectoRS_ResNet
+
+
+def test_detectorrs_resnet_backbone():
+    detectorrs_cfg = dict(
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        conv_cfg=dict(type='ConvAWS'),
+        sac=dict(type='SAC', use_deform=True),
+        stage_with_sac=(False, True, True, True),
+        output_img=True)
+    """Test init_weights config"""
+    with pytest.raises(AssertionError):
+        # pretrained and init_cfg cannot be specified at the same time
+        DetectoRS_ResNet(
+            **detectorrs_cfg, pretrained='Pretrained', init_cfg='Pretrained')
+
+    with pytest.raises(AssertionError):
+        # init_cfg must be a dict
+        DetectoRS_ResNet(
+            **detectorrs_cfg, pretrained=None, init_cfg=['Pretrained'])
+
+    with pytest.raises(KeyError):
+        # init_cfg must contain the key `type`
+        DetectoRS_ResNet(
+            **detectorrs_cfg,
+            pretrained=None,
+            init_cfg=dict(checkpoint='Pretrained'))
+
+    with pytest.raises(AssertionError):
+        # init_cfg only supports initializing from a pretrained model
+        DetectoRS_ResNet(
+            **detectorrs_cfg, pretrained=None, init_cfg=dict(type='Trained'))
+
+    with pytest.raises(TypeError):
+        # pretrained must be a str or None
+        model = DetectoRS_ResNet(
+            **detectorrs_cfg, pretrained=['Pretrained'], init_cfg=None)
+        model.init_weights()
diff --git a/tests/test_models/test_backbones/test_efficientnet.py b/tests/test_models/test_backbones/test_efficientnet.py
new file mode 100755
index 0000000..aa21770
--- /dev/null
+++ b/tests/test_models/test_backbones/test_efficientnet.py
@@ -0,0 +1,25 @@
+import pytest
+import torch
+
+from mmdet.models.backbones import EfficientNet
+
+
+def test_efficientnet_backbone():
+    """Test EfficientNet backbone."""
+    with pytest.raises(AssertionError):
+        # EfficientNet arch should be a key in EfficientNet.arch_settings
+        EfficientNet(arch='c3')
+
+    model = EfficientNet(arch='b0', out_indices=(0, 1, 2, 3, 4, 5, 6))
+    model.train()
+
+    imgs = torch.randn(2, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 7
+    assert feat[0].shape == torch.Size([2, 32, 16, 16])
+    assert feat[1].shape == torch.Size([2, 16, 16, 16])
+    assert feat[2].shape == torch.Size([2, 24, 8, 8])
+    assert feat[3].shape == torch.Size([2, 40, 4, 4])
+    assert feat[4].shape == torch.Size([2, 112, 2, 2])
+    assert feat[5].shape == torch.Size([2, 320, 1, 1])
+    assert feat[6].shape == torch.Size([2, 1280, 1, 1])
diff --git a/tests/test_models/test_backbones/test_hourglass.py b/tests/test_models/test_backbones/test_hourglass.py
new file mode 100755
index 0000000..c26f9c0
--- /dev/null
+++ b/tests/test_models/test_backbones/test_hourglass.py
@@ -0,0 +1,49 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet.models.backbones.hourglass import HourglassNet
+
+
+def test_hourglass_backbone():
+    with pytest.raises(AssertionError):
+        # HourglassNet's num_stacks should be larger than 0
+        HourglassNet(num_stacks=0)
+
+    with pytest.raises(AssertionError):
+        # len(stage_channels) should equal len(stage_blocks)
+        HourglassNet(
+            stage_channels=[256, 256, 384, 384, 384],
+            stage_blocks=[2, 2, 2, 2, 2, 4])
+
+    with pytest.raises(AssertionError):
+        # len(stage_channels) should be larger than downsample_times
+        HourglassNet(
+            downsample_times=5,
+            stage_channels=[256, 256, 384, 384, 384],
+            stage_blocks=[2, 2, 2, 2, 2])
+
+    # Test HourglassNet-52
+    model = HourglassNet(
+        num_stacks=1,
+        stage_channels=(64, 64, 96, 96, 96, 128),
+        feat_channel=64)
+    model.train()
+
+    imgs = torch.randn(1, 3, 256, 256)
+    feat = model(imgs)
+    assert len(feat) == 1
+    assert feat[0].shape == torch.Size([1, 64, 64, 64])
+
+    # Test HourglassNet-104
+    model = HourglassNet(
+        num_stacks=2,
+        stage_channels=(64, 64, 96, 96, 96, 128),
+        feat_channel=64)
+    model.train()
+
+    imgs = torch.randn(1, 3, 256, 256)
+    feat = model(imgs)
+    assert len(feat) == 2
+    assert feat[0].shape == torch.Size([1, 64, 64, 64])
+    assert feat[1].shape == torch.Size([1, 64, 64, 64])
diff --git a/tests/test_models/test_backbones/test_hrnet.py b/tests/test_models/test_backbones/test_hrnet.py
new file mode 100755
index 0000000..6ae367b
--- /dev/null
+++ b/tests/test_models/test_backbones/test_hrnet.py
@@ -0,0 +1,111 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet.models.backbones.hrnet import HRModule, HRNet
+from mmdet.models.backbones.resnet import BasicBlock, Bottleneck
+
+
+@pytest.mark.parametrize('block', [BasicBlock, Bottleneck])
+def test_hrmodule(block):
+    # Test multiscale forward: both branches keep their input shapes
+    num_channels = (32, 64)
+    in_channels = [c * block.expansion for c in num_channels]
+    hrmodule = HRModule(
+        num_branches=2,
+        blocks=block,
+        in_channels=in_channels,
+        num_blocks=(4, 4),
+        num_channels=num_channels,
+    )
+
+    feats = [
+        torch.randn(1, in_channels[0], 64, 64),
+        torch.randn(1, in_channels[1], 32, 32)
+    ]
+    feats = hrmodule(feats)
+
+    assert len(feats) == 2
+    assert feats[0].shape == torch.Size([1, in_channels[0], 64, 64])
+    assert feats[1].shape == torch.Size([1, in_channels[1], 32, 32])
+
+    # Test single scale forward: only the first branch is returned
+    num_channels = (32, 64)
+    in_channels = [c * block.expansion for c in num_channels]
+    hrmodule = HRModule(
+        num_branches=2,
+        blocks=block,
+        in_channels=in_channels,
+        num_blocks=(4, 4),
+        num_channels=num_channels,
+        multiscale_output=False,
+    )
+
+    feats = [
+        torch.randn(1, in_channels[0], 64, 64),
+        torch.randn(1, in_channels[1], 32, 32)
+    ]
+    feats = hrmodule(feats)
+
+    assert len(feats) == 1
+    assert feats[0].shape == torch.Size([1, in_channels[0], 64, 64])
+
+
+def test_hrnet_backbone():
+    # extra only defines 3 stages at first
+    extra = dict(
+        stage1=dict(
+            num_modules=1,
+            num_branches=1,
+            block='BOTTLENECK',
+            num_blocks=(4, ),
+            num_channels=(64, )),
+        stage2=dict(
+            num_modules=1,
+            num_branches=2,
+            block='BASIC',
+            num_blocks=(4, 4),
+            num_channels=(32, 64)),
+        stage3=dict(
+            num_modules=4,
+            num_branches=3,
+            block='BASIC',
+            num_blocks=(4, 4, 4),
+            num_channels=(32, 64, 128)))
+
+    with pytest.raises(AssertionError):
+        # HRNet currently only supports 4 stages
+        HRNet(extra=extra)
+    extra['stage4'] = dict(
+        num_modules=3,
+        num_branches=3,  # should be 4
+        block='BASIC',
+        num_blocks=(4, 4, 4, 4),
+        num_channels=(32, 64, 128, 256))
+
+    with pytest.raises(AssertionError):
+        # len(num_blocks) should equal num_branches
+        HRNet(extra=extra)
+
+    extra['stage4']['num_branches'] = 4
+
+    # Test hrnetv2p_w32
+    model = HRNet(extra=extra)
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 256, 256)
+    feats = model(imgs)
+    assert len(feats) == 4
+    assert feats[0].shape == torch.Size([1, 32, 64, 64])
+    assert feats[3].shape == torch.Size([1, 256, 8, 8])
+
+    # Test single scale output: only the first feature map is returned
+    model = HRNet(extra=extra, multiscale_output=False)
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 256, 256)
+    feats = model(imgs)
+    assert len(feats) == 1
+    assert feats[0].shape == torch.Size([1, 32, 64, 64])
diff --git a/tests/test_models/test_backbones/test_mobilenet_v2.py b/tests/test_models/test_backbones/test_mobilenet_v2.py
new file mode 100755
index 0000000..77df7ea
--- /dev/null
+++ b/tests/test_models/test_backbones/test_mobilenet_v2.py
@@ -0,0 +1,173 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+from torch.nn.modules import GroupNorm
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet.models.backbones.mobilenet_v2 import MobileNetV2
+from .utils import check_norm_state, is_block, is_norm
+
+
+def test_mobilenetv2_backbone():
+    with pytest.raises(ValueError):
+        # frozen_stages must be in range(-1, 8)
+        MobileNetV2(frozen_stages=8)
+
+    with pytest.raises(ValueError):
+        # out_indices must be in range(-1, 8)
+        MobileNetV2(out_indices=[8])
+
+    # Test MobileNetV2 with first stage frozen
+    frozen_stages = 1
+    model = MobileNetV2(frozen_stages=frozen_stages)
+    model.train()
+
+    for mod in model.conv1.modules():
+        for param in mod.parameters():
+            assert param.requires_grad is False
+    for i in range(1, frozen_stages + 1):
+        layer = getattr(model, f'layer{i}')
+        for mod in layer.modules():
+            if isinstance(mod, _BatchNorm):
+                assert mod.training is False
+        for param in layer.parameters():
+            assert param.requires_grad is False
+
+    # Test MobileNetV2 with norm_eval=True
+    model = MobileNetV2(norm_eval=True)
+    model.train()
+
+    assert check_norm_state(model.modules(), False)
+
+    # Test MobileNetV2 forward with widen_factor=1.0
+    model = MobileNetV2(widen_factor=1.0, out_indices=range(0, 8))
+    model.train()
+
+    assert check_norm_state(model.modules(), True)
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 8
+    assert feat[0].shape == torch.Size((1, 16, 112, 112))
+    assert feat[1].shape == torch.Size((1, 24, 56, 56))
+    assert feat[2].shape == torch.Size((1, 32, 28, 28))
+    assert feat[3].shape == torch.Size((1, 64, 14, 14))
+    assert feat[4].shape == torch.Size((1, 96, 14, 14))
+    assert feat[5].shape == torch.Size((1, 160, 7, 7))
+    assert feat[6].shape == torch.Size((1, 320, 7, 7))
+    assert feat[7].shape == torch.Size((1, 1280, 7, 7))
+
+    # Test MobileNetV2 forward with widen_factor=0.5
+    model = MobileNetV2(widen_factor=0.5, out_indices=range(0, 7))
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 7
+    assert feat[0].shape == torch.Size((1, 8, 112, 112))
+    assert feat[1].shape == torch.Size((1, 16, 56, 56))
+    assert feat[2].shape == torch.Size((1, 16, 28, 28))
+    assert feat[3].shape == torch.Size((1, 32, 14, 14))
+    assert feat[4].shape == torch.Size((1, 48, 14, 14))
+    assert feat[5].shape == torch.Size((1, 80, 7, 7))
+    assert feat[6].shape == torch.Size((1, 160, 7, 7))
+
+    # Test MobileNetV2 forward with widen_factor=2.0
+    model = MobileNetV2(widen_factor=2.0, out_indices=range(0, 8))
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert feat[0].shape == torch.Size((1, 32, 112, 112))
+    assert feat[1].shape == torch.Size((1, 48, 56, 56))
+    assert feat[2].shape == torch.Size((1, 64, 28, 28))
+    assert feat[3].shape == torch.Size((1, 128, 14, 14))
+    assert feat[4].shape == torch.Size((1, 192, 14, 14))
+    assert feat[5].shape == torch.Size((1, 320, 7, 7))
+    assert feat[6].shape == torch.Size((1, 640, 7, 7))
+    assert feat[7].shape == torch.Size((1, 2560, 7, 7))
+
+    # Test MobileNetV2 forward with act_cfg=dict(type='ReLU')
+    model = MobileNetV2(
+        widen_factor=1.0, act_cfg=dict(type='ReLU'), out_indices=range(0, 7))
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 7
+    assert feat[0].shape == torch.Size((1, 16, 112, 112))
+    assert feat[1].shape == torch.Size((1, 24, 56, 56))
+    assert feat[2].shape == torch.Size((1, 32, 28, 28))
+    assert feat[3].shape == torch.Size((1, 64, 14, 14))
+    assert feat[4].shape == torch.Size((1, 96, 14, 14))
+    assert feat[5].shape == torch.Size((1, 160, 7, 7))
+    assert feat[6].shape == torch.Size((1, 320, 7, 7))
+
+    # Test MobileNetV2 with BatchNorm forward
+    model = MobileNetV2(widen_factor=1.0, out_indices=range(0, 7))
+    for m in model.modules():
+        if is_norm(m):
+            assert isinstance(m, _BatchNorm)
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 7
+    assert feat[0].shape == torch.Size((1, 16, 112, 112))
+    assert feat[1].shape == torch.Size((1, 24, 56, 56))
+    assert feat[2].shape == torch.Size((1, 32, 28, 28))
+    assert feat[3].shape == torch.Size((1, 64, 14, 14))
+    assert feat[4].shape == torch.Size((1, 96, 14, 14))
+    assert feat[5].shape == torch.Size((1, 160, 7, 7))
+    assert feat[6].shape == torch.Size((1, 320, 7, 7))
+
+    # Test MobileNetV2 with GroupNorm forward
+    model = MobileNetV2(
+        widen_factor=1.0,
+        norm_cfg=dict(type='GN', num_groups=2, requires_grad=True),
+        out_indices=range(0, 7))
+    for m in model.modules():
+        if is_norm(m):
+            assert isinstance(m, GroupNorm)
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 7
+    assert feat[0].shape == torch.Size((1, 16, 112, 112))
+    assert feat[1].shape == torch.Size((1, 24, 56, 56))
+    assert feat[2].shape == torch.Size((1, 32, 28, 28))
+    assert feat[3].shape == torch.Size((1, 64, 14, 14))
+    assert feat[4].shape == torch.Size((1, 96, 14, 14))
+    assert feat[5].shape == torch.Size((1, 160, 7, 7))
+    assert feat[6].shape == torch.Size((1, 320, 7, 7))
+
+    # Test MobileNetV2 forward with outputs from layers 1, 3, 5
+    model = MobileNetV2(widen_factor=1.0, out_indices=(0, 2, 4))
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 3
+    assert feat[0].shape == torch.Size((1, 16, 112, 112))
+    assert feat[1].shape == torch.Size((1, 32, 28, 28))
+    assert feat[2].shape == torch.Size((1, 96, 14, 14))
+
+    # Test MobileNetV2 with checkpoint forward (with_cp=True on every block)
+    model = MobileNetV2(
+        widen_factor=1.0, with_cp=True, out_indices=range(0, 7))
+    for m in model.modules():
+        if is_block(m):
+            assert m.with_cp
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 7
+    assert feat[0].shape == torch.Size((1, 16, 112, 112))
+    assert feat[1].shape == torch.Size((1, 24, 56, 56))
+    assert feat[2].shape == torch.Size((1, 32, 28, 28))
+    assert feat[3].shape == torch.Size((1, 64, 14, 14))
+    assert feat[4].shape == torch.Size((1, 96, 14, 14))
+    assert feat[5].shape == torch.Size((1, 160, 7, 7))
+    assert feat[6].shape == torch.Size((1, 320, 7, 7))
diff --git a/tests/test_models/test_backbones/test_pvt.py b/tests/test_models/test_backbones/test_pvt.py
new file mode 100755
index 0000000..029fdb3
--- /dev/null
+++ b/tests/test_models/test_backbones/test_pvt.py
@@ -0,0 +1,103 @@
+import pytest
+import torch
+
+from mmdet.models.backbones.pvt import (PVTEncoderLayer,
+                                        PyramidVisionTransformer,
+                                        PyramidVisionTransformerV2)
+
+
+def test_pvt_block():
+    # Test PVTEncoderLayer structure and forward (output keeps input shape)
+    block = PVTEncoderLayer(
+        embed_dims=64, num_heads=4, feedforward_channels=256)
+    assert block.ffn.embed_dims == 64
+    assert block.attn.num_heads == 4
+    assert block.ffn.feedforward_channels == 256
+    x = torch.randn(1, 56 * 56, 64)
+    x_out = block(x, (56, 56))
+    assert x_out.shape == torch.Size([1, 56 * 56, 64])
+
+
+def test_pvt():
+    """Test PVT backbone."""
+
+    with pytest.raises(TypeError):
+        # Pretrained arg must be str or None.
+        PyramidVisionTransformer(pretrained=123)
+
+    # Test that a 3-element pretrain_img_size is rejected
+    with pytest.raises(AssertionError):
+        PyramidVisionTransformer(pretrain_img_size=(224, 224, 224))
+
+    # Test absolute position embedding
+    temp = torch.randn((1, 3, 224, 224))
+    model = PyramidVisionTransformer(
+        pretrain_img_size=224, use_abs_pos_embed=True)
+    model.init_weights()
+    model(temp)
+
+    # Test normal inference (32 x 32 input)
+    temp = torch.randn((1, 3, 32, 32))
+    model = PyramidVisionTransformer()
+    outs = model(temp)
+    assert outs[0].shape == (1, 64, 8, 8)
+    assert outs[1].shape == (1, 128, 4, 4)
+    assert outs[2].shape == (1, 320, 2, 2)
+    assert outs[3].shape == (1, 512, 1, 1)
+
+    # Test abnormal inference size (33 x 33 input)
+    temp = torch.randn((1, 3, 33, 33))
+    model = PyramidVisionTransformer()
+    outs = model(temp)
+    assert outs[0].shape == (1, 64, 8, 8)
+    assert outs[1].shape == (1, 128, 4, 4)
+    assert outs[2].shape == (1, 320, 2, 2)
+    assert outs[3].shape == (1, 512, 1, 1)
+
+    # Test abnormal inference size (non-square 112 x 137 input)
+    temp = torch.randn((1, 3, 112, 137))
+    model = PyramidVisionTransformer()
+    outs = model(temp)
+    assert outs[0].shape == (1, 64, 28, 34)
+    assert outs[1].shape == (1, 128, 14, 17)
+    assert outs[2].shape == (1, 320, 7, 8)
+    assert outs[3].shape == (1, 512, 3, 4)
+
+
+def test_pvtv2():
+    """Test PVTv2 backbone."""
+
+    with pytest.raises(TypeError):
+        # Pretrained arg must be str or None.
+        PyramidVisionTransformerV2(pretrained=123)
+
+    # Test that a 3-element pretrain_img_size is rejected
+    with pytest.raises(AssertionError):
+        PyramidVisionTransformerV2(pretrain_img_size=(224, 224, 224))
+
+    # Test normal inference (32 x 32 input)
+    temp = torch.randn((1, 3, 32, 32))
+    model = PyramidVisionTransformerV2()
+    outs = model(temp)
+    assert outs[0].shape == (1, 64, 8, 8)
+    assert outs[1].shape == (1, 128, 4, 4)
+    assert outs[2].shape == (1, 320, 2, 2)
+    assert outs[3].shape == (1, 512, 1, 1)
+
+    # Test abnormal inference size (31 x 31 input)
+    temp = torch.randn((1, 3, 31, 31))
+    model = PyramidVisionTransformerV2()
+    outs = model(temp)
+    assert outs[0].shape == (1, 64, 8, 8)
+    assert outs[1].shape == (1, 128, 4, 4)
+    assert outs[2].shape == (1, 320, 2, 2)
+    assert outs[3].shape == (1, 512, 1, 1)
+
+    # Test abnormal inference size (non-square 112 x 137 input)
+    temp = torch.randn((1, 3, 112, 137))
+    model = PyramidVisionTransformerV2()
+    outs = model(temp)
+    assert outs[0].shape == (1, 64, 28, 35)
+    assert outs[1].shape == (1, 128, 14, 18)
+    assert outs[2].shape == (1, 320, 7, 9)
+    assert outs[3].shape == (1, 512, 4, 5)
diff --git a/tests/test_models/test_backbones/test_regnet.py b/tests/test_models/test_backbones/test_regnet.py
new file mode 100755
index 0000000..2f94b11
--- /dev/null
+++ b/tests/test_models/test_backbones/test_regnet.py
@@ -0,0 +1,58 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet.models.backbones import RegNet
+
+regnet_test_data = [  # (arch_name, arch, out_channels) per test case
+    ('regnetx_400mf',
+     dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22,
+          bot_mul=1.0), [32, 64, 160, 384]),
+    ('regnetx_800mf',
+     dict(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16,
+          bot_mul=1.0), [64, 128, 288, 672]),
+    ('regnetx_1.6gf',
+     dict(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18,
+          bot_mul=1.0), [72, 168, 408, 912]),
+    ('regnetx_3.2gf',
+     dict(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25,
+          bot_mul=1.0), [96, 192, 432, 1008]),
+    ('regnetx_4.0gf',
+     dict(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23,
+          bot_mul=1.0), [80, 240, 560, 1360]),
+    ('regnetx_6.4gf',
+     dict(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17,
+          bot_mul=1.0), [168, 392, 784, 1624]),
+    ('regnetx_8.0gf',
+     dict(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23,
+          bot_mul=1.0), [80, 240, 720, 1920]),
+    ('regnetx_12gf',
+     dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19,
+          bot_mul=1.0), [224, 448, 896, 2240]),
+]
+
+
+@pytest.mark.parametrize('arch_name,arch,out_channels', regnet_test_data)
+def test_regnet_backbone(arch_name, arch, out_channels):
+    with pytest.raises(AssertionError):
+        # An unknown arch name must be rejected
+        RegNet(arch_name + '233')
+
+    # Test RegNet with arch_name
+    model = RegNet(arch_name)
+    model.train()
+
+    imgs = torch.randn(1, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, out_channels[0], 8, 8])
+    assert feat[1].shape == torch.Size([1, out_channels[1], 4, 4])
+    assert feat[2].shape == torch.Size([1, out_channels[2], 2, 2])
+    assert feat[3].shape == torch.Size([1, out_channels[3], 1, 1])
+
+    # Test RegNet built from the arch dict (fresh forward pass)
+    model = RegNet(arch)
+    model.train()
+    feat = model(imgs)
+    expected = [(1, c, s, s) for c, s in zip(out_channels, (8, 4, 2, 1))]
+    assert [tuple(f.shape) for f in feat] == expected
diff --git a/tests/test_models/test_backbones/test_renext.py b/tests/test_models/test_backbones/test_renext.py
new file mode 100755
index 0000000..4ce2ee6
--- /dev/null
+++ b/tests/test_models/test_backbones/test_renext.py
@@ -0,0 +1,105 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet.models.backbones import ResNeXt
+from mmdet.models.backbones.resnext import Bottleneck as BottleneckX
+from .utils import is_block
+
+
+def test_renext_bottleneck():
+    with pytest.raises(AssertionError):
+        # Style must be in ['pytorch', 'caffe']
+        BottleneckX(64, 64, groups=32, base_width=4, style='tensorflow')
+
+    # Test ResNeXt Bottleneck structure (stride, groups and width of conv2)
+    block = BottleneckX(
+        64, 64, groups=32, base_width=4, stride=2, style='pytorch')
+    assert block.conv2.stride == (2, 2)
+    assert block.conv2.groups == 32
+    assert block.conv2.out_channels == 128
+
+    # Test ResNeXt Bottleneck with DCN
+    dcn = dict(type='DCN', deform_groups=1, fallback_on_stride=False)
+    with pytest.raises(AssertionError):
+        # conv_cfg must be None if dcn is not None
+        BottleneckX(
+            64,
+            64,
+            groups=32,
+            base_width=4,
+            dcn=dcn,
+            conv_cfg=dict(type='Conv'))
+    BottleneckX(64, 64, dcn=dcn)
+
+    # Test ResNeXt Bottleneck forward (output keeps the input shape)
+    block = BottleneckX(64, 16, groups=32, base_width=4)
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+    # Test ResNeXt Bottleneck forward with plugins
+    plugins = [
+        dict(
+            cfg=dict(
+                type='GeneralizedAttention',
+                spatial_range=-1,
+                num_heads=8,
+                attention_type='0010',
+                kv_stride=2),
+            stages=(False, False, True, True),
+            position='after_conv2')
+    ]
+    block = BottleneckX(64, 16, groups=32, base_width=4, plugins=plugins)
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+
+def test_resnext_backbone():
+    with pytest.raises(KeyError):
+        # ResNeXt depth should be in [50, 101, 152]
+        ResNeXt(depth=18)
+
+    # Test ResNeXt with groups 32, base_width 4
+    model = ResNeXt(depth=50, groups=32, base_width=4)
+    for m in model.modules():
+        if is_block(m):
+            assert m.conv2.groups == 32
+    model.train()
+
+    imgs = torch.randn(1, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, 256, 8, 8])
+    assert feat[1].shape == torch.Size([1, 512, 4, 4])
+    assert feat[2].shape == torch.Size([1, 1024, 2, 2])
+    assert feat[3].shape == torch.Size([1, 2048, 1, 1])
+
+
+regnet_test_data = [  # NOTE(review): unused here; copy of test_regnet.py
+    ('regnetx_400mf',
+     dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22,
+          bot_mul=1.0), [32, 64, 160, 384]),
+    ('regnetx_800mf',
+     dict(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16,
+          bot_mul=1.0), [64, 128, 288, 672]),
+    ('regnetx_1.6gf',
+     dict(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18,
+          bot_mul=1.0), [72, 168, 408, 912]),
+    ('regnetx_3.2gf',
+     dict(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25,
+          bot_mul=1.0), [96, 192, 432, 1008]),
+    ('regnetx_4.0gf',
+     dict(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23,
+          bot_mul=1.0), [80, 240, 560, 1360]),
+    ('regnetx_6.4gf',
+     dict(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17,
+          bot_mul=1.0), [168, 392, 784, 1624]),
+    ('regnetx_8.0gf',
+     dict(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23,
+          bot_mul=1.0), [80, 240, 720, 1920]),
+    ('regnetx_12gf',
+     dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19,
+          bot_mul=1.0), [224, 448, 896, 2240]),
+]
diff --git a/tests/test_models/test_backbones/test_res2net.py b/tests/test_models/test_backbones/test_res2net.py
new file mode 100755
index 0000000..6757869
--- /dev/null
+++ b/tests/test_models/test_backbones/test_res2net.py
@@ -0,0 +1,62 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet.models.backbones import Res2Net
+from mmdet.models.backbones.res2net import Bottle2neck
+from .utils import is_block
+
+
+def test_res2net_bottle2neck():
+    with pytest.raises(AssertionError):
+        # Style must be in ['pytorch', 'caffe']
+        Bottle2neck(64, 64, base_width=26, scales=4, style='tensorflow')
+
+    with pytest.raises(AssertionError):
+        # scales must be larger than 1
+        Bottle2neck(64, 64, base_width=26, scales=1, style='pytorch')
+
+    # Test Res2Net Bottle2neck structure
+    block = Bottle2neck(
+        64, 64, base_width=26, stride=2, scales=4, style='pytorch')
+    assert block.scales == 4
+
+    # Test Res2Net Bottle2neck with DCN
+    dcn = dict(type='DCN', deform_groups=1, fallback_on_stride=False)
+    with pytest.raises(AssertionError):
+        # conv_cfg must be None if dcn is not None
+        Bottle2neck(
+            64,
+            64,
+            base_width=26,
+            scales=4,
+            dcn=dcn,
+            conv_cfg=dict(type='Conv'))
+    Bottle2neck(64, 64, dcn=dcn)
+
+    # Test Res2Net Bottle2neck forward (output keeps the input shape)
+    block = Bottle2neck(64, 16, base_width=26, scales=4)
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+
+def test_res2net_backbone():
+    with pytest.raises(KeyError):
+        # Res2Net depth should be in [50, 101, 152]
+        Res2Net(depth=18)
+
+    # Test Res2Net with scales 4, base_width 26 (every block has scales 4)
+    model = Res2Net(depth=50, scales=4, base_width=26)
+    for m in model.modules():
+        if is_block(m):
+            assert m.scales == 4
+    model.train()
+
+    imgs = torch.randn(1, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, 256, 8, 8])
+    assert feat[1].shape == torch.Size([1, 512, 4, 4])
+    assert feat[2].shape == torch.Size([1, 1024, 2, 2])
+    assert feat[3].shape == torch.Size([1, 2048, 1, 1])
diff --git a/tests/test_models/test_backbones/test_resnest.py b/tests/test_models/test_backbones/test_resnest.py
new file mode 100755
index 0000000..245fdfd
--- /dev/null
+++ b/tests/test_models/test_backbones/test_resnest.py
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet.models.backbones import ResNeSt
+from mmdet.models.backbones.resnest import Bottleneck as BottleneckS
+
+
+def test_resnest_bottleneck():
+    with pytest.raises(AssertionError):
+        # Style must be in ['pytorch', 'caffe']
+        BottleneckS(64, 64, radix=2, reduction_factor=4, style='tensorflow')
+
+    # Test ResNeSt Bottleneck structure (avd_layer stride, conv2 channels)
+    block = BottleneckS(
+        2, 4, radix=2, reduction_factor=4, stride=2, style='pytorch')
+    assert block.avd_layer.stride == 2
+    assert block.conv2.channels == 4
+
+    # Test ResNeSt Bottleneck forward (output keeps the input shape)
+    block = BottleneckS(16, 4, radix=2, reduction_factor=4)
+    x = torch.randn(2, 16, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([2, 16, 56, 56])
+
+
+def test_resnest_backbone():
+    with pytest.raises(KeyError):
+        # ResNeSt depth should be in [50, 101, 152, 200]
+        ResNeSt(depth=18)
+
+    # Test ResNeSt with radix 2, reduction_factor 4, base_channels 4
+    model = ResNeSt(
+        depth=50,
+        base_channels=4,
+        radix=2,
+        reduction_factor=4,
+        out_indices=(0, 1, 2, 3))
+    model.train()
+
+    imgs = torch.randn(2, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([2, 16, 8, 8])
+    assert feat[1].shape == torch.Size([2, 32, 4, 4])
+    assert feat[2].shape == torch.Size([2, 64, 2, 2])
+    assert feat[3].shape == torch.Size([2, 128, 1, 1])
diff --git a/tests/test_models/test_backbones/test_resnet.py b/tests/test_models/test_backbones/test_resnet.py
new file mode 100755
index 0000000..5448828
--- /dev/null
+++ b/tests/test_models/test_backbones/test_resnet.py
@@ -0,0 +1,632 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+from mmcv import assert_params_all_zeros
+from mmcv.ops import DeformConv2dPack
+from torch.nn.modules import AvgPool2d, GroupNorm
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet.models.backbones import ResNet, ResNetV1d
+from mmdet.models.backbones.resnet import BasicBlock, Bottleneck
+from mmdet.models.utils import ResLayer, SimplifiedBasicBlock
+from .utils import check_norm_state, is_block, is_norm
+
+
+def test_resnet_basic_block():
+    with pytest.raises(AssertionError):
+        # DCN in BasicBlock is not implemented yet.
+        dcn = dict(type='DCN', deform_groups=1, fallback_on_stride=False)
+        BasicBlock(64, 64, dcn=dcn)
+
+    with pytest.raises(AssertionError):
+        # Plugins in BasicBlock are not implemented yet.
+        plugins = [
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                position='after_conv3')
+        ]
+        BasicBlock(64, 64, plugins=plugins)
+
+    with pytest.raises(AssertionError):
+        # Plugins in BasicBlock are not implemented yet.
+        plugins = [
+            dict(
+                cfg=dict(
+                    type='GeneralizedAttention',
+                    spatial_range=-1,
+                    num_heads=8,
+                    attention_type='0010',
+                    kv_stride=2),
+                position='after_conv2')
+        ]
+        BasicBlock(64, 64, plugins=plugins)
+
+    # Test BasicBlock structure and forward
+    block = BasicBlock(64, 64)
+    assert block.conv1.in_channels == 64
+    assert block.conv1.out_channels == 64
+    assert block.conv1.kernel_size == (3, 3)
+    assert block.conv2.in_channels == 64
+    assert block.conv2.out_channels == 64
+    assert block.conv2.kernel_size == (3, 3)
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+    # Test BasicBlock with checkpoint forward
+    block = BasicBlock(64, 64, with_cp=True)
+    assert block.with_cp
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+
+def test_resnet_bottleneck():
+    with pytest.raises(AssertionError):
+        # Style must be in ['pytorch', 'caffe']
+        Bottleneck(64, 64, style='tensorflow')
+
+    with pytest.raises(AssertionError):
+        # Allowed positions are 'after_conv1', 'after_conv2', 'after_conv3'
+        plugins = [
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                position='after_conv4')
+        ]
+        Bottleneck(64, 16, plugins=plugins)
+
+    with pytest.raises(AssertionError):
+        # Need to specify different postfix to avoid duplicate plugin name
+        plugins = [
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                position='after_conv3'),
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                position='after_conv3')
+        ]
+        Bottleneck(64, 16, plugins=plugins)
+
+    with pytest.raises(KeyError):
+        # Plugin type is not supported
+        plugins = [dict(cfg=dict(type='WrongPlugin'), position='after_conv3')]
+        Bottleneck(64, 16, plugins=plugins)
+
+    # Test Bottleneck with checkpoint forward
+    block = Bottleneck(64, 16, with_cp=True)
+    assert block.with_cp
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+    # Test Bottleneck style: which of conv1 / conv2 carries the stride
+    block = Bottleneck(64, 64, stride=2, style='pytorch')
+    assert block.conv1.stride == (1, 1)
+    assert block.conv2.stride == (2, 2)
+    block = Bottleneck(64, 64, stride=2, style='caffe')
+    assert block.conv1.stride == (2, 2)
+    assert block.conv2.stride == (1, 1)
+
+    # Test Bottleneck DCN: conv_cfg must be None if dcn is not None
+    dcn = dict(type='DCN', deform_groups=1, fallback_on_stride=False)
+    with pytest.raises(AssertionError):
+        Bottleneck(64, 64, dcn=dcn, conv_cfg=dict(type='Conv'))
+    block = Bottleneck(64, 64, dcn=dcn)
+    assert isinstance(block.conv2, DeformConv2dPack)
+
+    # Test Bottleneck forward
+    block = Bottleneck(64, 16)
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+    # Test Bottleneck with 1 ContextBlock after conv3
+    plugins = [
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16),
+            position='after_conv3')
+    ]
+    block = Bottleneck(64, 16, plugins=plugins)
+    assert block.context_block.in_channels == 64
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+    # Test Bottleneck with 1 GeneralizedAttention after conv2
+    plugins = [
+        dict(
+            cfg=dict(
+                type='GeneralizedAttention',
+                spatial_range=-1,
+                num_heads=8,
+                attention_type='0010',
+                kv_stride=2),
+            position='after_conv2')
+    ]
+    block = Bottleneck(64, 16, plugins=plugins)
+    assert block.gen_attention_block.in_channels == 16
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+    # Test Bottleneck with 1 GeneralizedAttention after conv2, 1 NonLocal2d
+    # after conv2, 1 ContextBlock after conv3
+    plugins = [
+        dict(
+            cfg=dict(
+                type='GeneralizedAttention',
+                spatial_range=-1,
+                num_heads=8,
+                attention_type='0010',
+                kv_stride=2),
+            position='after_conv2'),
+        dict(cfg=dict(type='NonLocal2d'), position='after_conv2'),
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16),
+            position='after_conv3')
+    ]
+    block = Bottleneck(64, 16, plugins=plugins)
+    assert block.gen_attention_block.in_channels == 16
+    assert block.nonlocal_block.in_channels == 16
+    assert block.context_block.in_channels == 64
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+    # Test Bottleneck with 1 ContextBlock after conv2, 2 ContextBlocks after
+    # conv3; distinct postfix values keep the plugin names unique
+    plugins = [
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16, postfix=1),
+            position='after_conv2'),
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16, postfix=2),
+            position='after_conv3'),
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16, postfix=3),
+            position='after_conv3')
+    ]
+    block = Bottleneck(64, 16, plugins=plugins)
+    assert block.context_block1.in_channels == 16
+    assert block.context_block2.in_channels == 64
+    assert block.context_block3.in_channels == 64
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+
+def test_simplied_basic_block():
+    with pytest.raises(AssertionError):
+        # DCN in SimplifiedBasicBlock is not implemented yet.
+        dcn = dict(type='DCN', deform_groups=1, fallback_on_stride=False)
+        SimplifiedBasicBlock(64, 64, dcn=dcn)
+
+    with pytest.raises(AssertionError):
+        # Plugins in SimplifiedBasicBlock are not implemented yet.
+        plugins = [
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                position='after_conv3')
+        ]
+        SimplifiedBasicBlock(64, 64, plugins=plugins)
+
+    with pytest.raises(AssertionError):
+        # Plugins in SimplifiedBasicBlock are not implemented yet.
+        plugins = [
+            dict(
+                cfg=dict(
+                    type='GeneralizedAttention',
+                    spatial_range=-1,
+                    num_heads=8,
+                    attention_type='0010',
+                    kv_stride=2),
+                position='after_conv2')
+        ]
+        SimplifiedBasicBlock(64, 64, plugins=plugins)
+
+    with pytest.raises(AssertionError):
+        # with_cp in SimplifiedBasicBlock is not implemented yet.
+        SimplifiedBasicBlock(64, 64, with_cp=True)
+
+    # Test SimplifiedBasicBlock structure and forward
+    block = SimplifiedBasicBlock(64, 64)
+    assert block.conv1.in_channels == 64
+    assert block.conv1.out_channels == 64
+    assert block.conv1.kernel_size == (3, 3)
+    assert block.conv2.in_channels == 64
+    assert block.conv2.out_channels == 64
+    assert block.conv2.kernel_size == (3, 3)
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+    # Test SimplifiedBasicBlock without norm (reuses x from above)
+    block = SimplifiedBasicBlock(64, 64, norm_cfg=None)
+    assert block.norm1 is None
+    assert block.norm2 is None
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+
+def test_resnet_res_layer():
+    # Test ResLayer of 3 Bottleneck w\o downsample
+    layer = ResLayer(Bottleneck, 64, 16, 3)
+    assert len(layer) == 3
+    assert layer[0].conv1.in_channels == 64
+    assert layer[0].conv1.out_channels == 16
+    for i in range(1, len(layer)):
+        assert layer[i].conv1.in_channels == 64
+        assert layer[i].conv1.out_channels == 16
+    for i in range(len(layer)):
+        assert layer[i].downsample is None
+    x = torch.randn(1, 64, 56, 56)
+    x_out = layer(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+    # Test ResLayer of 3 Bottleneck with downsample
+    layer = ResLayer(Bottleneck, 64, 64, 3)
+    assert layer[0].downsample[0].out_channels == 256
+    for i in range(1, len(layer)):
+        assert layer[i].downsample is None
+    x = torch.randn(1, 64, 56, 56)
+    x_out = layer(x)
+    assert x_out.shape == torch.Size([1, 256, 56, 56])
+
+    # Test ResLayer of 3 Bottleneck with stride=2
+    layer = ResLayer(Bottleneck, 64, 64, 3, stride=2)
+    assert layer[0].downsample[0].out_channels == 256
+    assert layer[0].downsample[0].stride == (2, 2)
+    for i in range(1, len(layer)):
+        assert layer[i].downsample is None
+    x = torch.randn(1, 64, 56, 56)
+    x_out = layer(x)
+    assert x_out.shape == torch.Size([1, 256, 28, 28])
+
+    # Test ResLayer of 3 Bottleneck with stride=2 and average downsample
+    layer = ResLayer(Bottleneck, 64, 64, 3, stride=2, avg_down=True)
+    assert isinstance(layer[0].downsample[0], AvgPool2d)
+    assert layer[0].downsample[1].out_channels == 256
+    assert layer[0].downsample[1].stride == (1, 1)
+    for i in range(1, len(layer)):
+        assert layer[i].downsample is None
+    x = torch.randn(1, 64, 56, 56)
+    x_out = layer(x)
+    assert x_out.shape == torch.Size([1, 256, 28, 28])
+
+    # Test ResLayer of 3 BasicBlock with stride=2 and downsample_first=False
+    layer = ResLayer(BasicBlock, 64, 64, 3, stride=2, downsample_first=False)
+    assert layer[2].downsample[0].out_channels == 64
+    assert layer[2].downsample[0].stride == (2, 2)
+    for i in range(len(layer) - 1):
+        assert layer[i].downsample is None
+    x = torch.randn(1, 64, 56, 56)
+    x_out = layer(x)
+    assert x_out.shape == torch.Size([1, 64, 28, 28])
+
+
+def test_resnest_stem():
+    # Test default stem_channels
+    model = ResNet(50)
+    assert model.stem_channels == 64
+    assert model.conv1.out_channels == 64
+    assert model.norm1.num_features == 64
+
+    # Test default stem_channels, with base_channels=3
+    model = ResNet(50, base_channels=3)
+    assert model.stem_channels == 3
+    assert model.conv1.out_channels == 3
+    assert model.norm1.num_features == 3
+    assert model.layer1[0].conv1.in_channels == 3
+
+    # Test stem_channels=3
+    model = ResNet(50, stem_channels=3)
+    assert model.stem_channels == 3
+    assert model.conv1.out_channels == 3
+    assert model.norm1.num_features == 3
+    assert model.layer1[0].conv1.in_channels == 3
+
+    # Test stem_channels=3, with base_channels=2
+    model = ResNet(50, stem_channels=3, base_channels=2)
+    assert model.stem_channels == 3
+    assert model.conv1.out_channels == 3
+    assert model.norm1.num_features == 3
+    assert model.layer1[0].conv1.in_channels == 3
+
+    # Test V1d stem_channels
+    model = ResNetV1d(depth=50, stem_channels=6)
+    model.train()
+    assert model.stem[0].out_channels == 3
+    assert model.stem[1].num_features == 3
+    assert model.stem[3].out_channels == 3
+    assert model.stem[4].num_features == 3
+    assert model.stem[6].out_channels == 6
+    assert model.stem[7].num_features == 6
+    assert model.layer1[0].conv1.in_channels == 6
+
+
+def test_resnet_backbone():
+    """Test resnet backbone."""
+    with pytest.raises(KeyError):
+        # ResNet depth should be in [18, 34, 50, 101, 152]
+        ResNet(20)
+
+    with pytest.raises(AssertionError):
+        # In ResNet: 1 <= num_stages <= 4
+        ResNet(50, num_stages=0)
+
+    with pytest.raises(AssertionError):
+        # len(stage_with_dcn) == num_stages
+        dcn = dict(type='DCN', deform_groups=1, fallback_on_stride=False)
+        ResNet(50, dcn=dcn, stage_with_dcn=(True, ))
+
+    with pytest.raises(AssertionError):
+        # len(stage_with_plugin) == num_stages
+        plugins = [
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                stages=(False, True, True),
+                position='after_conv3')
+        ]
+        ResNet(50, plugins=plugins)
+
+    with pytest.raises(AssertionError):
+        # In ResNet: 1 <= num_stages <= 4
+        ResNet(50, num_stages=5)
+
+    with pytest.raises(AssertionError):
+        # len(strides) == len(dilations) == num_stages
+        ResNet(50, strides=(1, ), dilations=(1, 1), num_stages=3)
+
+    with pytest.raises(TypeError):
+        # pretrained must be a string path
+        model = ResNet(50, pretrained=0)
+
+    with pytest.raises(AssertionError):
+        # Style must be in ['pytorch', 'caffe']
+        ResNet(50, style='tensorflow')
+
+    # Test ResNet50 norm_eval=True
+    model = ResNet(50, norm_eval=True, base_channels=1)
+    model.train()
+    assert check_norm_state(model.modules(), False)
+
+    # Test ResNet50 with torchvision pretrained weight
+    model = ResNet(
+        depth=50, norm_eval=True, pretrained='torchvision://resnet50')
+    model.train()
+    assert check_norm_state(model.modules(), False)
+
+    # Test ResNet50 with first stage frozen
+    frozen_stages = 1
+    model = ResNet(50, frozen_stages=frozen_stages, base_channels=1)
+    model.train()
+    assert model.norm1.training is False
+    for layer in [model.conv1, model.norm1]:
+        for param in layer.parameters():
+            assert param.requires_grad is False
+    for i in range(1, frozen_stages + 1):
+        layer = getattr(model, f'layer{i}')
+        for mod in layer.modules():
+            if isinstance(mod, _BatchNorm):
+                assert mod.training is False
+        for param in layer.parameters():
+            assert param.requires_grad is False
+
+    # Test ResNet50V1d with first stage frozen
+    model = ResNetV1d(depth=50, frozen_stages=frozen_stages, base_channels=2)
+    assert len(model.stem) == 9
+    model.train()
+    assert check_norm_state(model.stem, False)
+    for param in model.stem.parameters():
+        assert param.requires_grad is False
+    for i in range(1, frozen_stages + 1):
+        layer = getattr(model, f'layer{i}')
+        for mod in layer.modules():
+            if isinstance(mod, _BatchNorm):
+                assert mod.training is False
+        for param in layer.parameters():
+            assert param.requires_grad is False
+
+    # Test ResNet18 forward
+    model = ResNet(18)
+    model.train()
+
+    imgs = torch.randn(1, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, 64, 8, 8])
+    assert feat[1].shape == torch.Size([1, 128, 4, 4])
+    assert feat[2].shape == torch.Size([1, 256, 2, 2])
+    assert feat[3].shape == torch.Size([1, 512, 1, 1])
+
+    # Test ResNet18 with checkpoint forward
+    model = ResNet(18, with_cp=True)
+    for m in model.modules():
+        if is_block(m):
+            assert m.with_cp
+
+    # Test ResNet50 with BatchNorm forward
+    model = ResNet(50, base_channels=1)
+    for m in model.modules():
+        if is_norm(m):
+            assert isinstance(m, _BatchNorm)
+    model.train()
+
+    imgs = torch.randn(1, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, 4, 8, 8])
+    assert feat[1].shape == torch.Size([1, 8, 4, 4])
+    assert feat[2].shape == torch.Size([1, 16, 2, 2])
+    assert feat[3].shape == torch.Size([1, 32, 1, 1])
+
+    # Test ResNet50 with layers 1, 2, 3 out forward
+    model = ResNet(50, out_indices=(0, 1, 2), base_channels=1)
+    model.train()
+
+    imgs = torch.randn(1, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 3
+    assert feat[0].shape == torch.Size([1, 4, 8, 8])
+    assert feat[1].shape == torch.Size([1, 8, 4, 4])
+    assert feat[2].shape == torch.Size([1, 16, 2, 2])
+
+    # Test ResNet50 with checkpoint forward
+    model = ResNet(50, with_cp=True, base_channels=1)
+    for m in model.modules():
+        if is_block(m):
+            assert m.with_cp
+    model.train()
+
+    imgs = torch.randn(1, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, 4, 8, 8])
+    assert feat[1].shape == torch.Size([1, 8, 4, 4])
+    assert feat[2].shape == torch.Size([1, 16, 2, 2])
+    assert feat[3].shape == torch.Size([1, 32, 1, 1])
+
+    # Test ResNet50 with GroupNorm forward
+    model = ResNet(
+        50,
+        base_channels=4,
+        norm_cfg=dict(type='GN', num_groups=2, requires_grad=True))
+    for m in model.modules():
+        if is_norm(m):
+            assert isinstance(m, GroupNorm)
+    model.train()
+
+    imgs = torch.randn(1, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, 16, 8, 8])
+    assert feat[1].shape == torch.Size([1, 32, 4, 4])
+    assert feat[2].shape == torch.Size([1, 64, 2, 2])
+    assert feat[3].shape == torch.Size([1, 128, 1, 1])
+
+    # Test ResNet50 with 1 GeneralizedAttention after conv2, 1 NonLocal2D
+    # after conv2, 1 ContextBlock after conv3 in layers 2, 3, 4
+    plugins = [
+        dict(
+            cfg=dict(
+                type='GeneralizedAttention',
+                spatial_range=-1,
+                num_heads=8,
+                attention_type='0010',
+                kv_stride=2),
+            stages=(False, True, True, True),
+            position='after_conv2'),
+        dict(cfg=dict(type='NonLocal2d'), position='after_conv2'),
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16),
+            stages=(False, True, True, False),
+            position='after_conv3')
+    ]
+    model = ResNet(50, plugins=plugins, base_channels=8)
+    for m in model.layer1.modules():
+        if is_block(m):
+            assert not hasattr(m, 'context_block')
+            assert not hasattr(m, 'gen_attention_block')
+            assert m.nonlocal_block.in_channels == 8
+    for m in model.layer2.modules():
+        if is_block(m):
+            assert m.nonlocal_block.in_channels == 16
+            assert m.gen_attention_block.in_channels == 16
+            assert m.context_block.in_channels == 64
+
+    for m in model.layer3.modules():
+        if is_block(m):
+            assert m.nonlocal_block.in_channels == 32
+            assert m.gen_attention_block.in_channels == 32
+            assert m.context_block.in_channels == 128
+
+    for m in model.layer4.modules():
+        if is_block(m):
+            assert m.nonlocal_block.in_channels == 64
+            assert m.gen_attention_block.in_channels == 64
+            assert not hasattr(m, 'context_block')
+    model.train()
+
+    imgs = torch.randn(1, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, 32, 8, 8])
+    assert feat[1].shape == torch.Size([1, 64, 4, 4])
+    assert feat[2].shape == torch.Size([1, 128, 2, 2])
+    assert feat[3].shape == torch.Size([1, 256, 1, 1])
+
+    # Test ResNet50 with 1 ContextBlock after conv2, 1 ContextBlock after
+    # conv3 in layers 2, 3, 4
+    plugins = [
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16, postfix=1),
+            stages=(False, True, True, False),
+            position='after_conv3'),
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16, postfix=2),
+            stages=(False, True, True, False),
+            position='after_conv3')
+    ]
+
+    model = ResNet(50, plugins=plugins, base_channels=8)
+    for m in model.layer1.modules():
+        if is_block(m):
+            assert not hasattr(m, 'context_block')
+            assert not hasattr(m, 'context_block1')
+            assert not hasattr(m, 'context_block2')
+    for m in model.layer2.modules():
+        if is_block(m):
+            assert not hasattr(m, 'context_block')
+            assert m.context_block1.in_channels == 64
+            assert m.context_block2.in_channels == 64
+
+    for m in model.layer3.modules():
+        if is_block(m):
+            assert not hasattr(m, 'context_block')
+            assert m.context_block1.in_channels == 128
+            assert m.context_block2.in_channels == 128
+
+    for m in model.layer4.modules():
+        if is_block(m):
+            assert not hasattr(m, 'context_block')
+            assert not hasattr(m, 'context_block1')
+            assert not hasattr(m, 'context_block2')
+    model.train()
+
+    imgs = torch.randn(1, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, 32, 8, 8])
+    assert feat[1].shape == torch.Size([1, 64, 4, 4])
+    assert feat[2].shape == torch.Size([1, 128, 2, 2])
+    assert feat[3].shape == torch.Size([1, 256, 1, 1])
+
+    # Test ResNet50 zero initialization of residual
+    model = ResNet(50, zero_init_residual=True, base_channels=1)
+    model.init_weights()
+    for m in model.modules():
+        if isinstance(m, Bottleneck):
+            assert assert_params_all_zeros(m.norm3)
+        elif isinstance(m, BasicBlock):
+            assert assert_params_all_zeros(m.norm2)
+    model.train()
+
+    imgs = torch.randn(1, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, 4, 8, 8])
+    assert feat[1].shape == torch.Size([1, 8, 4, 4])
+    assert feat[2].shape == torch.Size([1, 16, 2, 2])
+    assert feat[3].shape == torch.Size([1, 32, 1, 1])
+
+    # Test ResNetV1d forward
+    model = ResNetV1d(depth=50, base_channels=2)
+    model.train()
+
+    imgs = torch.randn(1, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, 8, 8, 8])
+    assert feat[1].shape == torch.Size([1, 16, 4, 4])
+    assert feat[2].shape == torch.Size([1, 32, 2, 2])
+    assert feat[3].shape == torch.Size([1, 64, 1, 1])
diff --git a/tests/test_models/test_backbones/test_swin.py b/tests/test_models/test_backbones/test_swin.py
new file mode 100755
index 0000000..5369ef2
--- /dev/null
+++ b/tests/test_models/test_backbones/test_swin.py
@@ -0,0 +1,87 @@
+import pytest
+import torch
+
+from mmdet.models.backbones.swin import SwinBlock, SwinTransformer
+
+
+def test_swin_block():
+    # test SwinBlock structure and forward
+    block = SwinBlock(embed_dims=64, num_heads=4, feedforward_channels=256)
+    assert block.ffn.embed_dims == 64
+    assert block.attn.w_msa.num_heads == 4
+    assert block.ffn.feedforward_channels == 256
+    x = torch.randn(1, 56 * 56, 64)
+    x_out = block(x, (56, 56))
+    assert x_out.shape == torch.Size([1, 56 * 56, 64])
+
+    # Test BasicBlock with checkpoint forward
+    block = SwinBlock(
+        embed_dims=64, num_heads=4, feedforward_channels=256, with_cp=True)
+    assert block.with_cp
+    x = torch.randn(1, 56 * 56, 64)
+    x_out = block(x, (56, 56))
+    assert x_out.shape == torch.Size([1, 56 * 56, 64])
+
+
+def test_swin_transformer():
+    """Test Swin Transformer backbone."""
+
+    with pytest.raises(TypeError):
+        # Pretrained arg must be str or None.
+        SwinTransformer(pretrained=123)
+
+    with pytest.raises(AssertionError):
+        # Because swin uses non-overlapping patch embed, so the stride of patch
+        # embed must be equal to patch size.
+        SwinTransformer(strides=(2, 2, 2, 2), patch_size=4)
+
+    # test pretrained image size
+    with pytest.raises(AssertionError):
+        SwinTransformer(pretrain_img_size=(224, 224, 224))
+
+    # Test absolute position embedding
+    temp = torch.randn((1, 3, 224, 224))
+    model = SwinTransformer(pretrain_img_size=224, use_abs_pos_embed=True)
+    model.init_weights()
+    model(temp)
+    # Test different inputs when use absolute position embedding
+    temp = torch.randn((1, 3, 112, 112))
+    model(temp)
+    temp = torch.randn((1, 3, 256, 256))
+    model(temp)
+
+    # Test patch norm
+    model = SwinTransformer(patch_norm=False)
+    model(temp)
+
+    # Test normal inference
+    temp = torch.randn((1, 3, 32, 32))
+    model = SwinTransformer()
+    outs = model(temp)
+    assert outs[0].shape == (1, 96, 8, 8)
+    assert outs[1].shape == (1, 192, 4, 4)
+    assert outs[2].shape == (1, 384, 2, 2)
+    assert outs[3].shape == (1, 768, 1, 1)
+
+    # Test abnormal inference size
+    temp = torch.randn((1, 3, 31, 31))
+    model = SwinTransformer()
+    outs = model(temp)
+    assert outs[0].shape == (1, 96, 8, 8)
+    assert outs[1].shape == (1, 192, 4, 4)
+    assert outs[2].shape == (1, 384, 2, 2)
+    assert outs[3].shape == (1, 768, 1, 1)
+
+    # Test abnormal inference size
+    temp = torch.randn((1, 3, 112, 137))
+    model = SwinTransformer()
+    outs = model(temp)
+    assert outs[0].shape == (1, 96, 28, 35)
+    assert outs[1].shape == (1, 192, 14, 18)
+    assert outs[2].shape == (1, 384, 7, 9)
+    assert outs[3].shape == (1, 768, 4, 5)
+
+    model = SwinTransformer(frozen_stages=4)
+    model.train()
+    for p in model.parameters():
+        assert not p.requires_grad
diff --git a/tests/test_models/test_backbones/test_trident_resnet.py b/tests/test_models/test_backbones/test_trident_resnet.py
new file mode 100755
index 0000000..a79b97e
--- /dev/null
+++ b/tests/test_models/test_backbones/test_trident_resnet.py
@@ -0,0 +1,180 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet.models.backbones import TridentResNet
+from mmdet.models.backbones.trident_resnet import TridentBottleneck
+
+
+def test_trident_resnet_bottleneck():
+    """Test TridentBottleneck argument checks, plugins and forward shape."""
+    trident_dilations = (1, 2, 3)
+    test_branch_idx = 1
+    concat_output = True
+    trident_build_config = (trident_dilations, test_branch_idx, concat_output)
+    with pytest.raises(AssertionError):
+        # Style must be in ['pytorch', 'caffe']
+        TridentBottleneck(
+            *trident_build_config, inplanes=64, planes=64, style='tensorflow')
+
+    with pytest.raises(AssertionError):
+        # Allowed positions are 'after_conv1', 'after_conv2', 'after_conv3'
+        plugins = [
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                position='after_conv4')
+        ]
+        TridentBottleneck(
+            *trident_build_config, inplanes=64, planes=16, plugins=plugins)
+
+    with pytest.raises(AssertionError):
+        # Need to specify different postfix to avoid duplicate plugin name
+        plugins = [
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                position='after_conv3'),
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                position='after_conv3')
+        ]
+        TridentBottleneck(
+            *trident_build_config, inplanes=64, planes=16, plugins=plugins)
+
+    with pytest.raises(KeyError):
+        # Plugin type is not supported
+        plugins = [dict(cfg=dict(type='WrongPlugin'), position='after_conv3')]
+        TridentBottleneck(
+            *trident_build_config, inplanes=64, planes=16, plugins=plugins)
+
+    # Test TridentBottleneck with checkpoint forward
+    block = TridentBottleneck(
+        *trident_build_config, inplanes=64, planes=16, with_cp=True)
+    assert block.with_cp
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([block.num_branch, 64, 56, 56])
+
+    # Test style: 'pytorch' strides conv2, 'caffe' strides conv1
+    block = TridentBottleneck(
+        *trident_build_config,
+        inplanes=64,
+        planes=64,
+        stride=2,
+        style='pytorch')
+    assert block.conv1.stride == (1, 1)
+    assert block.conv2.stride == (2, 2)
+    block = TridentBottleneck(
+        *trident_build_config, inplanes=64, planes=64, stride=2, style='caffe')
+    assert block.conv1.stride == (2, 2)
+    assert block.conv2.stride == (1, 1)
+
+    # Test TridentBottleneck forward without plugins
+    block = TridentBottleneck(*trident_build_config, inplanes=64, planes=16)
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([block.num_branch, 64, 56, 56])
+
+    # Test TridentBottleneck with 1 ContextBlock after conv3
+    plugins = [
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16),
+            position='after_conv3')
+    ]
+    block = TridentBottleneck(
+        *trident_build_config, inplanes=64, planes=16, plugins=plugins)
+    assert block.context_block.in_channels == 64
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([block.num_branch, 64, 56, 56])
+
+    # Test TridentBottleneck with 1 GeneralizedAttention after conv2
+    plugins = [
+        dict(
+            cfg=dict(
+                type='GeneralizedAttention',
+                spatial_range=-1,
+                num_heads=8,
+                attention_type='0010',
+                kv_stride=2),
+            position='after_conv2')
+    ]
+    block = TridentBottleneck(
+        *trident_build_config, inplanes=64, planes=16, plugins=plugins)
+    assert block.gen_attention_block.in_channels == 16
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([block.num_branch, 64, 56, 56])
+
+    # Test TridentBottleneck with 1 GeneralizedAttention after conv2,
+    # 1 NonLocal2d after conv2, 1 ContextBlock after conv3
+    plugins = [
+        dict(
+            cfg=dict(
+                type='GeneralizedAttention',
+                spatial_range=-1,
+                num_heads=8,
+                attention_type='0010',
+                kv_stride=2),
+            position='after_conv2'),
+        dict(cfg=dict(type='NonLocal2d'), position='after_conv2'),
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16),
+            position='after_conv3')
+    ]
+    block = TridentBottleneck(
+        *trident_build_config, inplanes=64, planes=16, plugins=plugins)
+    assert block.gen_attention_block.in_channels == 16
+    assert block.nonlocal_block.in_channels == 16
+    assert block.context_block.in_channels == 64
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([block.num_branch, 64, 56, 56])
+
+    # Test TridentBottleneck with 1 ContextBlock after conv2 and
+    # 2 ContextBlocks after conv3, told apart by their postfix
+    plugins = [
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16, postfix=1),
+            position='after_conv2'),
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16, postfix=2),
+            position='after_conv3'),
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16, postfix=3),
+            position='after_conv3')
+    ]
+    block = TridentBottleneck(
+        *trident_build_config, inplanes=64, planes=16, plugins=plugins)
+    assert block.context_block1.in_channels == 16
+    assert block.context_block2.in_channels == 64
+    assert block.context_block3.in_channels == 64
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([block.num_branch, 64, 56, 56])
+
+
+def test_trident_resnet_backbone():
+    tridentresnet_config = dict(
+        num_branch=3,
+        test_branch_idx=1,
+        strides=(1, 2, 2),
+        dilations=(1, 1, 1),
+        trident_dilations=(1, 2, 3),
+        out_indices=(2, ),
+    )
+    """Test tridentresnet backbone."""
+    with pytest.raises(AssertionError):
+        # TridentResNet depth should be in [50, 101, 152]
+        TridentResNet(18, **tridentresnet_config)
+
+    with pytest.raises(AssertionError):
+        # In TridentResNet: num_stages == 3
+        TridentResNet(50, num_stages=4, **tridentresnet_config)
+
+    model = TridentResNet(50, num_stages=3, **tridentresnet_config)
+    model.train()
+
+    imgs = torch.randn(1, 3, 32, 32)
+    feat = model(imgs)
+    assert len(feat) == 1
+    assert feat[0].shape == torch.Size([3, 1024, 2, 2])
diff --git a/tests/test_models/test_backbones/utils.py b/tests/test_models/test_backbones/utils.py
new file mode 100755
index 0000000..9baa994
--- /dev/null
+++ b/tests/test_models/test_backbones/utils.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from torch.nn.modules import GroupNorm
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet.models.backbones.res2net import Bottle2neck
+from mmdet.models.backbones.resnet import BasicBlock, Bottleneck
+from mmdet.models.backbones.resnext import Bottleneck as BottleneckX
+from mmdet.models.utils import SimplifiedBasicBlock
+
+
+def is_block(modules):
+    """Check if is ResNet building block."""
+    if isinstance(modules, (BasicBlock, Bottleneck, BottleneckX, Bottle2neck,
+                            SimplifiedBasicBlock)):
+        return True
+    return False
+
+
+def is_norm(modules):
+    """Check if is one of the norms."""
+    if isinstance(modules, (GroupNorm, _BatchNorm)):
+        return True
+    return False
+
+
+def check_norm_state(modules, train_state):
+    """Check if norm layer is in correct train state."""
+    for mod in modules:
+        if isinstance(mod, _BatchNorm):
+            if mod.training != train_state:
+                return False
+    return True
diff --git a/tests/test_models/test_dense_heads/test_anchor_head.py b/tests/test_models/test_dense_heads/test_anchor_head.py
new file mode 100755
index 0000000..7414be3
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_anchor_head.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads import AnchorHead
+
+
+def test_anchor_head_loss():
+    """Tests anchor head loss when truth is empty and non-empty."""
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+
+    cfg = mmcv.Config(
+        dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False))
+    self = AnchorHead(num_classes=4, in_channels=1, train_cfg=cfg)
+
+    # Anchor head expects a multiple levels of features per image
+    feat = [
+        torch.rand(1, 1, s // (2**(i + 2)), s // (2**(i + 2)))
+        for i in range(len(self.anchor_generator.strides))
+    ]
+    cls_scores, bbox_preds = self.forward(feat)
+
+    # Test that empty ground truth encourages the network to predict background
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+
+    gt_bboxes_ignore = None
+    empty_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                                img_metas, gt_bboxes_ignore)
+    # When there is no truth, the cls loss should be nonzero but there should
+    # be no box loss.
+    empty_cls_loss = sum(empty_gt_losses['loss_cls'])
+    empty_box_loss = sum(empty_gt_losses['loss_bbox'])
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+
+    # When truth is non-empty then both cls and box loss should be nonzero for
+    # random inputs
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                              img_metas, gt_bboxes_ignore)
+    onegt_cls_loss = sum(one_gt_losses['loss_cls'])
+    onegt_box_loss = sum(one_gt_losses['loss_bbox'])
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
diff --git a/tests/test_models/test_dense_heads/test_ascend_head.py b/tests/test_models/test_dense_heads/test_ascend_head.py
new file mode 100755
index 0000000..843a55f
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_ascend_head.py
@@ -0,0 +1,215 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads import (AscendAnchorHead, AscendRetinaHead,
+                                      AscendSSDHead)
+
+
+def test_ascend_anchor_head_loss():
+    """Check AscendAnchorHead losses for empty and non-empty ground truth.
+
+    The head is run once on random features and ``loss`` is evaluated twice
+    on the same predictions: first with no ground-truth boxes, then with one.
+    """
+    size = 256
+    shape = (size, size, 3)
+    img_metas = [dict(img_shape=shape, scale_factor=1, pad_shape=shape)]
+
+    assigner_cfg = dict(
+        type='AscendMaxIoUAssigner',
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.4,
+        min_pos_iou=0,
+        ignore_iof_thr=-1)
+    train_cfg = mmcv.Config(
+        dict(
+            assigner=assigner_cfg,
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False))
+    head = AscendAnchorHead(num_classes=4, in_channels=1, train_cfg=train_cfg)
+
+    # One random single-channel feature map per prior-generator level; level
+    # ``i`` has spatial size ``size // 2**(i + 2)``.
+    num_levels = len(head.prior_generator.strides)
+    sides = [size // (2**(level + 2)) for level in range(num_levels)]
+    feats = [torch.rand(1, 1, side, side) for side in sides]
+    cls_scores, bbox_preds = head.forward(feats)
+
+    # No ignore regions are passed in either case.
+    gt_bboxes_ignore = None
+
+    # Case 1: no ground truth. The classification loss must be positive and
+    # the box regression loss must be exactly zero.
+    no_gt_bboxes = [torch.empty((0, 4))]
+    no_gt_labels = [torch.LongTensor([])]
+    empty_gt_losses = head.loss(cls_scores, bbox_preds, no_gt_bboxes,
+                                no_gt_labels, img_metas, gt_bboxes_ignore)
+    empty_cls_loss = sum(empty_gt_losses['loss_cls'])
+    empty_box_loss = sum(empty_gt_losses['loss_bbox'])
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+
+    # Case 2: one ground-truth box. With random inputs both the classification
+    # loss and the box regression loss must be positive.
+    one_gt_bboxes = [torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])]
+    one_gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = head.loss(cls_scores, bbox_preds, one_gt_bboxes,
+                              one_gt_labels, img_metas, gt_bboxes_ignore)
+    onegt_cls_loss = sum(one_gt_losses['loss_cls'])
+    onegt_box_loss = sum(one_gt_losses['loss_bbox'])
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
+
+
+def test_ascend_retina_head_loss():
+    """Check AscendRetinaHead losses for empty and non-empty ground truth.
+
+    The head is run once on random features and ``loss`` is evaluated twice
+    on the same predictions: first with no ground-truth boxes, then with one.
+    """
+    num_classes, in_channels = 80, 256
+    img_shape = (800, 1067, 3)
+    pad_shape = (800, 1088, 3)
+    img_metas = [
+        dict(img_shape=img_shape, scale_factor=1, pad_shape=pad_shape)
+    ]
+
+    assigner_cfg = dict(
+        type='AscendMaxIoUAssigner',
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.4,
+        min_pos_iou=0,
+        ignore_iof_thr=-1)
+    train_cfg = mmcv.Config(
+        dict(
+            assigner=assigner_cfg,
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False))
+    head = AscendRetinaHead(
+        num_classes=num_classes, in_channels=in_channels, train_cfg=train_cfg)
+
+    # One random feature map per prior-generator level: the first two entries
+    # of the padded shape are floor-divided by the level's two stride entries.
+    feats = [
+        torch.rand(1, in_channels, pad_shape[0] // stride[0],
+                   pad_shape[1] // stride[1])
+        for stride in head.prior_generator.strides
+    ]
+    cls_scores, bbox_preds = head.forward(feats)
+
+    # No ignore regions are passed in either case.
+    gt_bboxes_ignore = None
+
+    # Case 1: no ground truth. The classification loss must be positive and
+    # the box regression loss must be exactly zero.
+    no_gt_bboxes = [torch.empty((0, 4))]
+    no_gt_labels = [torch.LongTensor([])]
+    empty_gt_losses = head.loss(cls_scores, bbox_preds, no_gt_bboxes,
+                                no_gt_labels, img_metas, gt_bboxes_ignore)
+    empty_cls_loss = sum(empty_gt_losses['loss_cls'])
+    empty_box_loss = sum(empty_gt_losses['loss_bbox'])
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+
+    # Case 2: one ground-truth box. With random inputs both the classification
+    # loss and the box regression loss must be positive.
+    one_gt_bboxes = [torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])]
+    one_gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = head.loss(cls_scores, bbox_preds, one_gt_bboxes,
+                              one_gt_labels, img_metas, gt_bboxes_ignore)
+    onegt_cls_loss = sum(one_gt_losses['loss_cls'])
+    onegt_box_loss = sum(one_gt_losses['loss_bbox'])
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
+
+
+def test_ascend_ssd_head_loss():
+    """Tests AscendSSDHead loss when truth is empty and non-empty.
+
+    A batch of two images is pushed through the head once and ``loss`` is
+    evaluated on the same predictions without and with ground-truth boxes.
+    """
+    img_shape = (320, 320, 3)
+    pad_shape = (320, 320, 3)
+    in_channels = (96, 1280, 512, 256, 256, 128)
+    img_metas = [
+        dict(img_shape=img_shape, scale_factor=1, pad_shape=pad_shape)
+        for _ in range(2)
+    ]
+
+    self = AscendSSDHead(
+        in_channels=in_channels,
+        num_classes=80,
+        use_depthwise=True,
+        norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+        act_cfg=dict(type='ReLU6'),
+        init_cfg=dict(type='Normal', layer='Conv2d', std=0.001),
+        anchor_generator=dict(
+            type='SSDAnchorGenerator',
+            scale_major=False,
+            strides=[16, 32, 64, 107, 160, 320],
+            ratios=[[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]],
+            min_sizes=[48, 100, 150, 202, 253, 304],
+            max_sizes=[100, 150, 202, 253, 304, 320]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        train_cfg=mmcv.Config(
+            dict(
+                assigner=dict(
+                    type='AscendMaxIoUAssigner',
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.5,
+                    min_pos_iou=0.,
+                    ignore_iof_thr=-1,
+                    gt_max_assign_all=False),
+                smoothl1_beta=1.,
+                allowed_border=-1,
+                pos_weight=-1,
+                neg_pos_ratio=3,
+                debug=False)))
+
+    # One random feature map per prior-generator level, with the spatial size
+    # obtained by rounding the padded size divided by that level's stride.
+    feat = [
+        torch.rand(2, in_channels[i],
+                   round(pad_shape[0] / self.prior_generator.strides[i][0]),
+                   round(pad_shape[1] / self.prior_generator.strides[i][1]))
+        for i in range(len(self.prior_generator.strides))
+    ]
+    cls_scores, bbox_preds = self.forward(feat)
+
+    # Test that empty ground truth encourages the network to predict background
+    gt_bboxes = [torch.empty((0, 4)), torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([]), torch.LongTensor([])]
+
+    gt_bboxes_ignore = None
+    empty_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                                img_metas, gt_bboxes_ignore)
+    # When there is no truth there should be no box loss; the cls loss is
+    # only required to be non-negative (the assertion allows zero).
+    empty_cls_loss = sum(empty_gt_losses['loss_cls'])
+    empty_box_loss = sum(empty_gt_losses['loss_bbox'])
+    assert empty_cls_loss.item() >= 0, 'cls loss should be non-negative'
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+
+    # When truth is non-empty then both cls and box loss should be nonzero for
+    # random inputs
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2]), torch.LongTensor([2])]
+    one_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                              img_metas, gt_bboxes_ignore)
+    onegt_cls_loss = sum(one_gt_losses['loss_cls'])
+    onegt_box_loss = sum(one_gt_losses['loss_bbox'])
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
diff --git a/tests/test_models/test_dense_heads/test_atss_head.py b/tests/test_models/test_dense_heads/test_atss_head.py
new file mode 100755
index 0000000..18597f4
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_atss_head.py
@@ -0,0 +1,77 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads import ATSSHead
+
+
+def test_atss_head_loss():
+    """Check ATSSHead losses for empty and non-empty ground truth."""
+    size = 256
+    shape = (size, size, 3)
+    img_metas = [dict(img_shape=shape, scale_factor=1, pad_shape=shape)]
+
+    anchor_generator_cfg = dict(
+        type='AnchorGenerator',
+        ratios=[1.0],
+        octave_base_scale=8,
+        scales_per_octave=1,
+        strides=[8, 16, 32, 64, 128])
+    loss_cls_cfg = dict(
+        type='FocalLoss',
+        use_sigmoid=True,
+        gamma=2.0,
+        alpha=0.25,
+        loss_weight=1.0)
+    loss_bbox_cfg = dict(type='GIoULoss', loss_weight=2.0)
+    train_cfg = mmcv.Config(
+        dict(
+            assigner=dict(type='ATSSAssigner', topk=9),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False))
+    head = ATSSHead(
+        num_classes=4,
+        in_channels=1,
+        train_cfg=train_cfg,
+        anchor_generator=anchor_generator_cfg,
+        loss_cls=loss_cls_cfg,
+        loss_bbox=loss_bbox_cfg)
+
+    # One random single-channel feature map per downsampling factor.
+    sides = [size // factor for factor in (4, 8, 16, 32, 64)]
+    feats = [torch.rand(1, 1, side, side) for side in sides]
+    cls_scores, bbox_preds, centernesses = head.forward(feats)
+
+    gt_bboxes_ignore = None
+
+    # Case 1: no ground truth. The classification loss must be positive while
+    # the box regression and centerness losses must be exactly zero.
+    no_gt_bboxes = [torch.empty((0, 4))]
+    no_gt_labels = [torch.LongTensor([])]
+    empty_gt_losses = head.loss(cls_scores, bbox_preds, centernesses,
+                                no_gt_bboxes, no_gt_labels, img_metas,
+                                gt_bboxes_ignore)
+    empty_cls_loss = sum(empty_gt_losses['loss_cls'])
+    empty_box_loss = sum(empty_gt_losses['loss_bbox'])
+    empty_centerness_loss = sum(empty_gt_losses['loss_centerness'])
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+    assert empty_centerness_loss.item() == 0, (
+        'there should be no centerness loss when there are no true boxes')
+
+    # Case 2: one ground-truth box. With random inputs all three losses must
+    # be positive.
+    one_gt_bboxes = [torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])]
+    one_gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = head.loss(cls_scores, bbox_preds, centernesses,
+                              one_gt_bboxes, one_gt_labels, img_metas,
+                              gt_bboxes_ignore)
+    onegt_cls_loss = sum(one_gt_losses['loss_cls'])
+    onegt_box_loss = sum(one_gt_losses['loss_bbox'])
+    onegt_centerness_loss = sum(one_gt_losses['loss_centerness'])
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
+    assert onegt_centerness_loss.item() > 0, (
+        'centerness loss should be non-zero')
diff --git a/tests/test_models/test_dense_heads/test_autoassign_head.py b/tests/test_models/test_dense_heads/test_autoassign_head.py
new file mode 100755
index 0000000..3c8491f
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_autoassign_head.py
@@ -0,0 +1,91 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads.autoassign_head import AutoAssignHead
+from mmdet.models.dense_heads.paa_head import levels_to_images
+
+
+def test_autoassign_head_loss():
+    """Tests autoassign head loss when truth is empty and non-empty.
+
+    Also smoke-tests ``levels_to_images`` and ``AutoAssignHead.get_bboxes``.
+    """
+    s = 256
+    shape = (s, s, 3)
+    img_metas = [dict(img_shape=shape, scale_factor=1, pad_shape=shape)]
+    train_cfg = mmcv.Config(
+        dict(assigner=None, allowed_border=-1, pos_weight=-1, debug=False))
+    self = AutoAssignHead(
+        num_classes=4,
+        in_channels=1,
+        train_cfg=train_cfg,
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3))
+    feat = [
+        torch.rand(1, 1, s // feat_size, s // feat_size)
+        for feat_size in [4, 8, 16, 32, 64]
+    ]
+    self.init_weights()
+    cls_scores, bbox_preds, objectnesses = self(feat)
+    # Test that empty ground truth encourages the network to predict background
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    gt_bboxes_ignore = None
+    empty_gt_losses = self.loss(cls_scores, bbox_preds, objectnesses,
+                                gt_bboxes, gt_labels, img_metas,
+                                gt_bboxes_ignore)
+    # When there is no truth, the negative loss should be nonzero but there
+    # should be no positive loss and no center loss.
+    empty_pos_loss = empty_gt_losses['loss_pos']
+    empty_neg_loss = empty_gt_losses['loss_neg']
+    empty_center_loss = empty_gt_losses['loss_center']
+    assert empty_neg_loss.item() > 0, 'neg loss should be non-zero'
+    assert empty_pos_loss.item() == 0, (
+        'there should be no pos loss when there are no true boxes')
+    assert empty_center_loss.item() == 0, (
+        'there should be no center loss when there are no true boxes')
+
+    # When truth is non-empty then the positive, negative and center losses
+    # should all be nonzero for random inputs
+    gt_bboxes = [torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])]
+    gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = self.loss(cls_scores, bbox_preds, objectnesses, gt_bboxes,
+                              gt_labels, img_metas, gt_bboxes_ignore)
+    onegt_pos_loss = one_gt_losses['loss_pos']
+    onegt_neg_loss = one_gt_losses['loss_neg']
+    onegt_center_loss = one_gt_losses['loss_center']
+    assert onegt_pos_loss.item() > 0, 'pos loss should be non-zero'
+    assert onegt_neg_loss.item() > 0, 'neg loss should be non-zero'
+    assert onegt_center_loss.item() > 0, 'center loss should be non-zero'
+
+    # levels_to_images should regroup 5 levels of (n, c, h, w) maps into n
+    # per-image tensors of shape (h * w * 5, c).
+    n, c, h, w = 10, 4, 20, 20
+    mlvl_tensor = [torch.ones(n, c, h, w) for _ in range(5)]
+    results = levels_to_images(mlvl_tensor)
+    assert len(results) == n
+    assert results[0].size() == (h * w * 5, c)
+
+    # Smoke test: get_bboxes should run on a single-level (stride 4) head.
+    self = AutoAssignHead(
+        num_classes=4,
+        in_channels=1,
+        train_cfg=train_cfg,
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        strides=(4, ))
+    cls_scores = [torch.ones(2, 4, 5, 5)]
+    bbox_preds = [torch.ones(2, 4, 5, 5)]
+    iou_preds = [torch.ones(2, 1, 5, 5)]
+    cfg = mmcv.Config(
+        dict(
+            nms_pre=1000,
+            min_bbox_size=0,
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.6),
+            max_per_img=100))
+    self.get_bboxes(
+        cls_scores, bbox_preds, iou_preds, img_metas, cfg, rescale=False)
diff --git a/tests/test_models/test_dense_heads/test_centernet_head.py b/tests/test_models/test_dense_heads/test_centernet_head.py
new file mode 100755
index 0000000..8993a48
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_centernet_head.py
@@ -0,0 +1,107 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from mmcv import ConfigDict
+
+from mmdet.models.dense_heads import CenterNetHead
+
+
+def test_center_head_loss():
+    """Check CenterNetHead losses for empty and non-empty ground truth."""
+    size = 256
+    shape = (size, size, 3)
+    img_metas = [dict(img_shape=shape, scale_factor=1, pad_shape=shape)]
+    # NOTE(review): the key is spelled ``topK`` here (kept as-is); confirm
+    # whether the head expects ``topk`` instead.
+    head = CenterNetHead(
+        num_classes=4,
+        in_channel=1,
+        feat_channel=4,
+        test_cfg=dict(topK=100, max_per_img=100))
+
+    # A single random feature map with the same spatial size as the image.
+    feats = [torch.rand(1, 1, size, size)]
+    center_out, wh_out, offset_out = head.forward(feats)
+    gt_bboxes_ignore = None
+
+    # Case 1: no ground truth. The center heatmap loss must be positive while
+    # the wh and offset losses must be exactly zero.
+    no_gt_bboxes = [torch.empty((0, 4))]
+    no_gt_labels = [torch.LongTensor([])]
+    empty_gt_losses = head.loss(center_out, wh_out, offset_out, no_gt_bboxes,
+                                no_gt_labels, img_metas, gt_bboxes_ignore)
+    heatmap_loss = empty_gt_losses['loss_center_heatmap']
+    wh_loss = empty_gt_losses['loss_wh']
+    offset_loss = empty_gt_losses['loss_offset']
+    assert heatmap_loss.item() > 0, 'loss_center should be non-zero'
+    assert wh_loss.item() == 0, (
+        'there should be no loss_wh when there are no true boxes')
+    assert offset_loss.item() == 0, (
+        'there should be no loss_offset when there are no true boxes')
+
+    # Case 2: one ground-truth box. All three losses must be positive.
+    one_gt_bboxes = [torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])]
+    one_gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = head.loss(center_out, wh_out, offset_out, one_gt_bboxes,
+                              one_gt_labels, img_metas, gt_bboxes_ignore)
+    heatmap_loss = one_gt_losses['loss_center_heatmap']
+    wh_loss = one_gt_losses['loss_wh']
+    offset_loss = one_gt_losses['loss_offset']
+    assert heatmap_loss.item() > 0, 'loss_center should be non-zero'
+    assert wh_loss.item() > 0, 'loss_wh should be non-zero'
+    assert offset_loss.item() > 0, 'loss_offset should be non-zero'
+
+
+def test_centernet_head_get_bboxes():
+    """Check CenterNetHead target assignment and target decoding."""
+    size = 256
+    meta = dict(
+        img_shape=(size, size, 3),
+        scale_factor=np.array([1., 1., 1., 1.]),
+        pad_shape=(size, size, 3),
+        batch_input_shape=(size, size),
+        border=(0, 0, 0, 0),
+        flip=False)
+    img_metas = [meta]
+    test_cfg = ConfigDict(
+        dict(topk=100, local_maximum_kernel=3, max_per_img=100))
+    gt_bboxes = [
+        torch.Tensor([[10, 20, 200, 240], [40, 50, 100, 200],
+                      [10, 20, 100, 240]])
+    ]
+    gt_labels = [torch.LongTensor([1, 1, 2])]
+
+    head = CenterNetHead(
+        num_classes=4, in_channel=1, feat_channel=4, test_cfg=test_cfg)
+    # Targets are generated for a feature map a quarter of the image size.
+    head.feat_shape = (1, 1, size // 4, size // 4)
+    targets, _ = head.get_targets(gt_bboxes, gt_labels, head.feat_shape,
+                                  meta['pad_shape'])
+    center_target = targets['center_heatmap_target']
+    wh_target = targets['wh_target']
+    offset_target = targets['offset_target']
+
+    # Each box, divided by 4, must mark its truncated centre in the heatmap of
+    # its label and store its size and fractional centre offset there.
+    for gt_bbox, label in zip(gt_bboxes[0], gt_labels[0]):
+        scaled = gt_bbox / 4
+        ctx, cty = sum(scaled[0::2]) / 2, sum(scaled[1::2]) / 2
+        int_ctx, int_cty = int(ctx), int(cty)
+        box_w, box_h = scaled[2] - scaled[0], scaled[3] - scaled[1]
+        off_x, off_y = ctx - int_ctx, cty - int_cty
+        assert center_target[0, label, int_cty, int_ctx] == 1
+        assert wh_target[0, 0, int_cty, int_ctx] == box_w
+        assert wh_target[0, 1, int_cty, int_ctx] == box_h
+        assert offset_target[0, 0, int_cty, int_ctx] == off_x
+        assert offset_target[0, 1, int_cty, int_ctx] == off_y
+
+    # Decoding the targets: each of the first three detections must coincide
+    # exactly with one of the ground-truth boxes.
+    detections = head.get_bboxes([center_target], [wh_target], [offset_target],
+                                 img_metas, rescale=True, with_nms=False)
+    out_bboxes = detections[0][0][:3]
+    out_clses = detections[0][1][:3]
+    for bbox, _ in zip(out_bboxes, out_clses):
+        matched = any((bbox[:4] == gt_bbox[:4]).all()
+                      for gt_bbox, _ in zip(gt_bboxes[0], gt_labels[0]))
+        assert matched, 'get_bboxes is wrong'
diff --git a/tests/test_models/test_dense_heads/test_corner_head.py b/tests/test_models/test_dense_heads/test_corner_head.py
new file mode 100755
index 0000000..0b549ff
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_corner_head.py
@@ -0,0 +1,167 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
+from mmdet.models.dense_heads import CornerHead
+
+
+def test_corner_head_loss():
+    """Check CornerHead losses with zero, one and two ground-truth boxes.
+
+    The head is run once on random features; ``loss`` is then evaluated on
+    the same predictions for each ground-truth configuration.
+    """
+    size = 256
+    shape = (size, size, 3)
+    img_metas = [dict(img_shape=shape, scale_factor=1, pad_shape=shape)]
+
+    head = CornerHead(num_classes=4, in_channels=1)
+
+    # One random single-channel feature map per feature level, each a quarter
+    # of the image size along both spatial dimensions.
+    feats = [
+        torch.rand(1, 1, size // 4, size // 4)
+        for _ in range(head.num_feat_levels)
+    ]
+    preds = head.forward(feats)
+    # Only the corner embeddings are accessed directly further down.
+    _, _, tl_embs, br_embs, _, _ = preds
+
+    # No ignore regions are passed in any of the cases.
+    gt_bboxes_ignore = None
+
+    # Case 1: no ground truth. Only the detection loss must be positive; the
+    # push, pull and offset losses must be exactly zero.
+    no_gt_bboxes = [torch.empty((0, 4))]
+    no_gt_labels = [torch.LongTensor([])]
+    empty_gt_losses = head.loss(*preds, no_gt_bboxes, no_gt_labels, img_metas,
+                                gt_bboxes_ignore)
+    empty_det_loss = sum(empty_gt_losses['det_loss'])
+    empty_push_loss = sum(empty_gt_losses['push_loss'])
+    empty_pull_loss = sum(empty_gt_losses['pull_loss'])
+    empty_off_loss = sum(empty_gt_losses['off_loss'])
+    assert empty_det_loss.item() > 0, 'det loss should be non-zero'
+    assert empty_push_loss.item() == 0, (
+        'there should be no push loss when there are no true boxes')
+    assert empty_pull_loss.item() == 0, (
+        'there should be no pull loss when there are no true boxes')
+    assert empty_off_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+
+    # Case 2: one ground-truth box. The push loss stays at zero with a single
+    # box, while the detection, pull and offset losses must be positive.
+    one_gt_bboxes = [torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])]
+    one_gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = head.loss(*preds, one_gt_bboxes, one_gt_labels, img_metas,
+                              gt_bboxes_ignore)
+    onegt_det_loss = sum(one_gt_losses['det_loss'])
+    onegt_push_loss = sum(one_gt_losses['push_loss'])
+    onegt_pull_loss = sum(one_gt_losses['pull_loss'])
+    onegt_off_loss = sum(one_gt_losses['off_loss'])
+    assert onegt_det_loss.item() > 0, 'det loss should be non-zero'
+    assert onegt_push_loss.item() == 0, (
+        'there should be no push loss when there are only one true box')
+    assert onegt_pull_loss.item() > 0, 'pull loss should be non-zero'
+    assert onegt_off_loss.item() > 0, 'off loss should be non-zero'
+
+    # Case 3: two ground-truth boxes, carrying the labels 2 and 3
+    # respectively.
+    two_gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874],
+                      [123.6667, 123.8757, 138.6326, 251.8874]]),
+    ]
+    two_gt_labels = [torch.LongTensor([2, 3])]
+
+    # Overwrite, in place, the embeddings at the first box's corners with
+    # those at the second box's corners (corner coordinates floor-divided by
+    # 4) so that both objects share embeddings and the push loss exceeds 0.
+    first, second = (two_gt_bboxes[0] // 4).int().tolist()
+    for tl_emb, br_emb in zip(tl_embs, br_embs):
+        tl_emb[:, :, first[1], first[0]] = tl_emb[:, :, second[1], second[0]]
+        br_emb[:, :, first[3], first[2]] = br_emb[:, :, second[3], second[2]]
+
+    # With the equalized embeddings all four losses must be positive.
+    two_gt_losses = head.loss(*preds, two_gt_bboxes, two_gt_labels, img_metas,
+                              gt_bboxes_ignore)
+    twogt_det_loss = sum(two_gt_losses['det_loss'])
+    twogt_push_loss = sum(two_gt_losses['push_loss'])
+    twogt_pull_loss = sum(two_gt_losses['pull_loss'])
+    twogt_off_loss = sum(two_gt_losses['off_loss'])
+    assert twogt_det_loss.item() > 0, 'det loss should be non-zero'
+    assert twogt_push_loss.item() > 0, 'push loss should be non-zero'
+    assert twogt_pull_loss.item() > 0, 'pull loss should be non-zero'
+    assert twogt_off_loss.item() > 0, 'off loss should be non-zero'
+
+
+def test_corner_head_encode_and_decode_heatmap():
+    """Check that decoding CornerHead targets recovers the ground truth."""
+    size = 256
+    feat_size = size // 4
+    img_meta = dict(
+        img_shape=(size, size, 3),
+        scale_factor=1,
+        pad_shape=(size, size, 3),
+        border=(0, 0, 0, 0))
+
+    gt_bboxes = [
+        torch.Tensor([[10, 20, 200, 240], [40, 50, 100, 200],
+                      [10, 20, 200, 240]])
+    ]
+    gt_labels = [torch.LongTensor([1, 1, 2])]
+
+    head = CornerHead(num_classes=4, in_channels=1, corner_emb_channels=1)
+
+    # Only the shape of the first random feature map is used below.
+    feats = [
+        torch.rand(1, 1, feat_size, feat_size)
+        for _ in range(head.num_feat_levels)
+    ]
+    targets = head.get_targets(
+        gt_bboxes, gt_labels, feats[0].shape, img_meta['pad_shape'],
+        with_corner_emb=head.with_corner_emb)
+    tl_heat = targets['topleft_heatmap']
+    br_heat = targets['bottomright_heatmap']
+    tl_off = targets['topleft_offset']
+    br_off = targets['bottomright_offset']
+
+    # One-hot embedding maps built from ``corner_embedding[0][0]``, which
+    # unpacks into a (top, left) and a (bottom, right) index pair.
+    [top, left], [bottom, right] = targets['corner_embedding'][0][0]
+    tl_emb = torch.zeros([1, 1, feat_size, feat_size])
+    br_emb = torch.zeros([1, 1, feat_size, feat_size])
+    tl_emb[0, 0, top, left] = 1
+    br_emb[0, 0, bottom, right] = 1
+
+    # Decode the generated targets in place of network predictions.
+    batch_bboxes, batch_scores, batch_clses = head.decode_heatmap(
+        tl_heat=tl_heat,
+        br_heat=br_heat,
+        tl_off=tl_off,
+        br_off=br_off,
+        tl_emb=tl_emb,
+        br_emb=br_emb,
+        img_meta=img_meta,
+        k=100, kernel=3, distance_threshold=0.5)
+
+    # Flatten the decoded predictions and sort them by descending score.
+    flat_bboxes = batch_bboxes.view(-1, 4)
+    flat_scores = batch_scores.view(-1, 1)
+    flat_clses = batch_clses.view(-1, 1)
+    order = flat_scores.argsort(dim=0, descending=True)
+    bboxes = flat_bboxes[order].view(-1, 4)
+    scores = flat_scores[order].view(-1)
+    clses = flat_clses[order].view(-1)
+
+    # Keep detections scoring above 0.05, then shift predicted and true boxes
+    # by ``label * (max_coordinate + 1)`` before computing the IoU matrix.
+    keep = torch.where(scores > 0.05)
+    valid_bboxes, valid_labels = bboxes[keep], clses[keep]
+    class_shift = valid_bboxes.max() + 1
+    offsets = valid_labels.to(valid_bboxes) * class_shift
+    gt_offsets = gt_labels[0].to(gt_bboxes[0]) * class_shift
+    offset_bboxes = valid_bboxes + offsets[:, None]
+    offset_gtbboxes = gt_bboxes[0] + gt_offsets[:, None]
+
+    # Exactly three (detection, ground truth) pairs must have an IoU of 1.
+    iou_matrix = bbox_overlaps(offset_bboxes.numpy(), offset_gtbboxes.numpy())
+    assert (iou_matrix == 1).sum() == 3
diff --git a/tests/test_models/test_dense_heads/test_ddod_head.py b/tests/test_models/test_dense_heads/test_ddod_head.py
new file mode 100755
index 0000000..c9e658e
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_ddod_head.py
@@ -0,0 +1,72 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads import DDODHead
+
+
+def test_ddod_head_loss():
+    """Check DDODHead losses for empty and non-empty ground truth."""
+    size = 256
+    shape = (size, size, 3)
+    img_metas = [dict(img_shape=shape, scale_factor=1, pad_shape=shape)]
+
+    # Two ATSS assigners are configured: ``assigner`` and ``reg_assigner``.
+    train_cfg = mmcv.Config(
+        dict(
+            assigner=dict(type='ATSSAssigner', topk=9, alpha=0.8),
+            reg_assigner=dict(type='ATSSAssigner', topk=9, alpha=0.5),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False))
+    anchor_generator_cfg = dict(
+        type='AnchorGenerator',
+        ratios=[1.0],
+        octave_base_scale=8,
+        scales_per_octave=1,
+        strides=[8, 16, 32, 64, 128])
+    loss_iou_cfg = dict(
+        type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)
+    head = DDODHead(
+        num_classes=4,
+        in_channels=1,
+        anchor_generator=anchor_generator_cfg,
+        train_cfg=train_cfg,
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+        loss_iou=loss_iou_cfg)
+
+    # One random single-channel feature map per downsampling factor.
+    sides = [size // factor for factor in (4, 8, 16, 32, 64)]
+    feats = [torch.rand(1, 1, side, side) for side in sides]
+    cls_scores, bbox_preds, iou_preds = head.forward(feats)
+    gt_bboxes_ignore = None
+
+    # Case 1: no ground truth. The classification loss must be positive while
+    # the box regression and IoU losses must be exactly zero.
+    no_gt_bboxes = [torch.empty((0, 4))]
+    no_gt_labels = [torch.LongTensor([])]
+    empty_gt_losses = head.loss(cls_scores, bbox_preds, iou_preds,
+                                no_gt_bboxes, no_gt_labels, img_metas,
+                                gt_bboxes_ignore)
+    empty_cls_loss = sum(empty_gt_losses['loss_cls'])
+    empty_box_loss = sum(empty_gt_losses['loss_bbox'])
+    empty_iou_loss = sum(empty_gt_losses['loss_iou'])
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+    assert empty_iou_loss.item() == 0, (
+        'there should be no iou loss when there are no true boxes')
+
+    # Case 2: one ground-truth box. With random inputs all three losses must
+    # be positive.
+    one_gt_bboxes = [torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])]
+    one_gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = head.loss(cls_scores, bbox_preds, iou_preds,
+                              one_gt_bboxes, one_gt_labels, img_metas,
+                              gt_bboxes_ignore)
+    onegt_cls_loss = sum(one_gt_losses['loss_cls'])
+    onegt_box_loss = sum(one_gt_losses['loss_bbox'])
+    onegt_iou_loss = sum(one_gt_losses['loss_iou'])
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
+    assert onegt_iou_loss.item() > 0, 'iou loss should be non-zero'
diff --git a/tests/test_models/test_dense_heads/test_dense_heads_attr.py b/tests/test_models/test_dense_heads/test_dense_heads_attr.py
new file mode 100755
index 0000000..d4a57de
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_dense_heads_attr.py
@@ -0,0 +1,44 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+from terminaltables import AsciiTable
+
+from mmdet.models import dense_heads
+from mmdet.models.dense_heads import *  # noqa: F401,F403
+
+
+def test_dense_heads_test_attr():
+    """Check that every exported dense head exposes the inference methods.
+
+    ``simple_test`` is mandatory (assertion); a missing ``aug_test`` only
+    triggers a warning. A summary table of all checked attributes is printed.
+    """
+    exceptions = ['FeatureAdaption']  # module used in head
+    all_dense_heads = [m for m in dense_heads.__all__ if m not in exceptions]
+    check_attributes = [
+        'simple_test', 'aug_test', 'simple_test_bboxes', 'simple_test_rpn',
+        'aug_test_rpn'
+    ]
+    table_data = [['head name'] + check_attributes]
+    not_found = {k: [] for k in check_attributes}
+    for target_head_name in all_dense_heads:
+        # Resolve through the package itself rather than this module's
+        # globals(), so the lookup does not depend on the wildcard import.
+        target_head_attributes = dir(getattr(dense_heads, target_head_name))
+        check_results = [target_head_name]
+        for check_attribute in check_attributes:
+            found = check_attribute in target_head_attributes
+            check_results.append(found)
+            if not found:
+                not_found[check_attribute].append(target_head_name)
+        table_data.append(check_results)
+    print()
+    print(AsciiTable(table_data).table)
+
+    # NOTE: this test just checks attributes.
+    # simple_test of RPN heads will not work now.
+    assert not not_found['simple_test'], \
+        f'simple_test not found in {not_found["simple_test"]}'
+    if not_found['aug_test']:
+        warnings.warn(f'aug_test not found in {not_found["aug_test"]}. '
+                      'Please implement it or raise NotImplementedError.')
diff --git a/tests/test_models/test_dense_heads/test_detr_head.py b/tests/test_models/test_dense_heads/test_detr_head.py
new file mode 100755
index 0000000..cc2da23
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_detr_head.py
@@ -0,0 +1,104 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv import ConfigDict
+
+from mmdet.models.dense_heads import DETRHead
+
+
+def test_detr_head_loss():
+    """Run DETRHead losses with no ground truth and with one box.
+
+    Also calls ``forward_train`` and ``get_bboxes`` on the same inputs and
+    predictions.
+    """
+    side = 256
+    img_metas = [
+        dict(
+            img_shape=(side, side, 3),
+            scale_factor=1,
+            pad_shape=(side, side, 3),
+            batch_input_shape=(side, side))
+    ]
+    # Attention settings common to the encoder and decoder layers; each
+    # consumer below receives its own copy.
+    attn = dict(
+        type='MultiheadAttention', embed_dims=256, num_heads=8, dropout=0.1)
+    encoder_layer = dict(
+        type='BaseTransformerLayer',
+        attn_cfgs=[dict(attn)],
+        feedforward_channels=2048,
+        ffn_dropout=0.1,
+        operation_order=('self_attn', 'norm', 'ffn', 'norm'))
+    decoder_layer = dict(
+        type='DetrTransformerDecoderLayer',
+        attn_cfgs=dict(attn),
+        feedforward_channels=2048,
+        ffn_dropout=0.1,
+        operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn',
+                         'norm'))
+    transformer = dict(
+        type='Transformer',
+        encoder=dict(
+            type='DetrTransformerEncoder',
+            num_layers=6,
+            transformerlayers=encoder_layer),
+        decoder=dict(
+            type='DetrTransformerDecoder',
+            return_intermediate=True,
+            num_layers=6,
+            transformerlayers=decoder_layer))
+    head_cfg = ConfigDict(
+        dict(
+            type='DETRHead',
+            num_classes=80,
+            in_channels=200,
+            transformer=transformer,
+            positional_encoding=dict(
+                type='SinePositionalEncoding', num_feats=128, normalize=True),
+            loss_cls=dict(
+                type='CrossEntropyLoss',
+                bg_cls_weight=0.1,
+                use_sigmoid=False,
+                loss_weight=1.0,
+                class_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+            loss_iou=dict(type='GIoULoss', loss_weight=2.0)))
+
+    # Build the head from the config, initialise it and run one forward pass
+    # on a single random feature map.
+    head = DETRHead(**head_cfg)
+    head.init_weights()
+    feat = [torch.rand(1, 200, 10, 10)]
+    cls_scores, bbox_preds = head.forward(feat, img_metas)
+
+    # Empty ground truth: cls terms positive, bbox and iou terms exactly zero.
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    gt_bboxes_ignore = None
+    empty_gt_losses = head.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                                img_metas, gt_bboxes_ignore)
+    for name, value in empty_gt_losses.items():
+        if 'cls' in name:
+            assert value.item() > 0, 'cls loss should be non-zero'
+        elif 'bbox' in name:
+            msg = 'there should be no box loss when there are no true boxes'
+            assert value.item() == 0, msg
+        elif 'iou' in name:
+            msg = 'there should be no iou loss when there are no true boxes'
+            assert value.item() == 0, msg
+
+    # One ground-truth box: every returned loss term must be positive for
+    # random inputs.
+    gt_bboxes = [torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])]
+    gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = head.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                              img_metas, gt_bboxes_ignore)
+    nonzero_msg = 'cls loss, or box loss, or iou loss should be non-zero'
+    for value in one_gt_losses.values():
+        assert value.item() > 0, nonzero_msg
+
+    # test forward_train
+    head.forward_train(feat, img_metas, gt_bboxes, gt_labels)
+
+    # test inference mode
+    head.get_bboxes(cls_scores, bbox_preds, img_metas, rescale=True)
diff --git a/tests/test_models/test_dense_heads/test_fcos_head.py b/tests/test_models/test_dense_heads/test_fcos_head.py
new file mode 100755
index 0000000..5fbe14f
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_fcos_head.py
@@ -0,0 +1,64 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads import FCOSHead
+
+
+def test_fcos_head_loss():
+    """Check FCOSHead losses for empty and for non-empty ground truth.
+
+    Feeds five random single-channel feature maps through the head.
+    """
+    side = 256
+    img_metas = [
+        dict(
+            img_shape=(side, side, 3),
+            scale_factor=1,
+            pad_shape=(side, side, 3))
+    ]
+    assigner = dict(
+        type='MaxIoUAssigner',
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.4,
+        min_pos_iou=0,
+        ignore_iof_thr=-1)
+    train_cfg = mmcv.Config(
+        dict(assigner=assigner, allowed_border=-1, pos_weight=-1, debug=False))
+
+    # Use CrossEntropyLoss since Focal Loss is not supported on CPU
+    head = FCOSHead(
+        num_classes=4,
+        in_channels=1,
+        train_cfg=train_cfg,
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0))
+
+    # One random single-channel map per downsampling factor.
+    downsample = [4, 8, 16, 32, 64]
+    feat = [torch.rand(1, 1, side // d, side // d) for d in downsample]
+    cls_scores, bbox_preds, centerness = head.forward(feat)
+
+    # Empty ground truth: the cls loss must be positive while the box loss
+    # must be exactly zero.
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    gt_bboxes_ignore = None
+    empty_gt_losses = head.loss(cls_scores, bbox_preds, centerness, gt_bboxes,
+                                gt_labels, img_metas, gt_bboxes_ignore)
+    empty_cls = empty_gt_losses['loss_cls']
+    empty_box = empty_gt_losses['loss_bbox']
+    assert empty_cls.item() > 0, 'cls loss should be non-zero'
+    assert empty_box.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+
+    # One ground-truth box: both cls and box loss must be positive for
+    # random inputs.
+    gt_bboxes = [torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])]
+    gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = head.loss(cls_scores, bbox_preds, centerness, gt_bboxes,
+                              gt_labels, img_metas, gt_bboxes_ignore)
+    single_cls = one_gt_losses['loss_cls']
+    single_box = one_gt_losses['loss_bbox']
+    assert single_cls.item() > 0, 'cls loss should be non-zero'
+    assert single_box.item() > 0, 'box loss should be non-zero'
diff --git a/tests/test_models/test_dense_heads/test_fsaf_head.py b/tests/test_models/test_dense_heads/test_fsaf_head.py
new file mode 100755
index 0000000..7851055
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_fsaf_head.py
@@ -0,0 +1,82 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads import FSAFHead
+
+
+def test_fsaf_head_loss():
+    """Tests fsaf head loss when truth is empty and non-empty (CUDA only)."""
+    import pytest  # function-scope: this test module has no pytest import
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+
+    cfg = dict(
+        reg_decoded_bbox=True,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=1,
+            scales_per_octave=1,
+            ratios=[1.0],
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(type='TBLRBBoxCoder', normalizer=4.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0,
+            reduction='none'),
+        loss_bbox=dict(
+            type='IoULoss', eps=1e-6, loss_weight=1.0, reduction='none'))
+
+    train_cfg = mmcv.Config(
+        dict(
+            assigner=dict(
+                type='CenterRegionAssigner',
+                pos_scale=0.2,
+                neg_scale=0.2,
+                min_pos_iof=0.01),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False))
+    head = FSAFHead(num_classes=4, in_channels=1, train_cfg=train_cfg, **cfg)
+    if not torch.cuda.is_available():
+        # Report a skip rather than passing without any loss check.
+        pytest.skip('requires CUDA support')
+    head.cuda()
+    # FSAF head expects multiple levels of features per image
+    feat = [
+        torch.rand(1, 1, s // (2**(i + 2)), s // (2**(i + 2))).cuda()
+        for i in range(len(head.anchor_generator.strides))
+    ]
+    cls_scores, bbox_preds = head.forward(feat)
+    gt_bboxes_ignore = None
+
+    # When truth is non-empty then both cls and box loss should be nonzero
+    # for random inputs
+    gt_bboxes = [torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]).cuda()]
+    gt_labels = [torch.LongTensor([2]).cuda()]
+    one_gt_losses = head.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                              img_metas, gt_bboxes_ignore)
+    onegt_cls_loss = sum(one_gt_losses['loss_cls'])
+    onegt_box_loss = sum(one_gt_losses['loss_bbox'])
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
+
+    # Test that empty ground truth encourages the network to predict background
+    gt_bboxes = [torch.empty((0, 4)).cuda()]
+    gt_labels = [torch.LongTensor([]).cuda()]
+    empty_gt_losses = head.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                                img_metas, gt_bboxes_ignore)
+    # When there is no truth, the cls loss should be nonzero but there
+    # should be no box loss.
+    empty_cls_loss = sum(empty_gt_losses['loss_cls'])
+    empty_box_loss = sum(empty_gt_losses['loss_bbox'])
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
diff --git a/tests/test_models/test_dense_heads/test_ga_anchor_head.py b/tests/test_models/test_dense_heads/test_ga_anchor_head.py
new file mode 100755
index 0000000..374f71b
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_ga_anchor_head.py
@@ -0,0 +1,91 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads import GuidedAnchorHead
+
+
+def test_ga_anchor_head_loss():
+    """Tests guided anchor head loss when truth is empty and non-empty."""
+    import pytest  # function-scope: this test module has no pytest import
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+
+    cfg = mmcv.Config(
+        dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            ga_assigner=dict(
+                type='ApproxMaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            ga_sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            center_ratio=0.2,
+            ignore_ratio=0.5,
+            pos_weight=-1,
+            debug=False))
+    head = GuidedAnchorHead(num_classes=4, in_channels=4, train_cfg=cfg)
+
+    if not torch.cuda.is_available():
+        # Report a skip rather than passing without any loss check.
+        pytest.skip('requires CUDA support')
+    head.cuda()
+    # Anchor head expects multiple levels of features per image
+    feat = [
+        torch.rand(1, 4, s // (2**(i + 2)), s // (2**(i + 2))).cuda()
+        for i in range(len(head.approx_anchor_generator.base_anchors))
+    ]
+    cls_scores, bbox_preds, shape_preds, loc_preds = head.forward(feat)
+
+    # Test that empty ground truth encourages the network to predict background
+    gt_bboxes = [torch.empty((0, 4)).cuda()]
+    gt_labels = [torch.LongTensor([]).cuda()]
+
+    gt_bboxes_ignore = None
+
+    empty_gt_losses = head.loss(cls_scores, bbox_preds, shape_preds, loc_preds,
+                                gt_bboxes, gt_labels, img_metas,
+                                gt_bboxes_ignore)
+
+    # When there is no truth, the cls loss should be nonzero but there
+    # should be no box loss.
+    empty_cls_loss = sum(empty_gt_losses['loss_cls'])
+    empty_box_loss = sum(empty_gt_losses['loss_bbox'])
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+
+    # When truth is non-empty then both cls and box loss should be nonzero
+    # for random inputs
+    gt_bboxes = [torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]).cuda()]
+    gt_labels = [torch.LongTensor([2]).cuda()]
+    one_gt_losses = head.loss(cls_scores, bbox_preds, shape_preds, loc_preds,
+                              gt_bboxes, gt_labels, img_metas,
+                              gt_bboxes_ignore)
+    onegt_cls_loss = sum(one_gt_losses['loss_cls'])
+    onegt_box_loss = sum(one_gt_losses['loss_bbox'])
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
diff --git a/tests/test_models/test_dense_heads/test_gfl_head.py b/tests/test_models/test_dense_heads/test_gfl_head.py
new file mode 100755
index 0000000..6c522fa
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_gfl_head.py
@@ -0,0 +1,74 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads import GFLHead
+
+
+def test_gfl_head_loss():
+    """Check GFLHead losses for empty and for non-empty ground truth.
+
+    Covers the classification, box regression and DFL loss terms.
+    """
+    side = 256
+    img_metas = [
+        dict(
+            img_shape=(side, side, 3),
+            scale_factor=1,
+            pad_shape=(side, side, 3))
+    ]
+    train_cfg = mmcv.Config(
+        dict(
+            assigner=dict(type='ATSSAssigner', topk=9),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False))
+    anchor_generator = dict(
+        type='AnchorGenerator',
+        ratios=[1.0],
+        octave_base_scale=8,
+        scales_per_octave=1,
+        strides=[8, 16, 32, 64, 128])
+    cls_loss = dict(
+        type='QualityFocalLoss', use_sigmoid=True, beta=2.0, loss_weight=1.0)
+    head = GFLHead(
+        num_classes=4,
+        in_channels=1,
+        train_cfg=train_cfg,
+        anchor_generator=anchor_generator,
+        loss_cls=cls_loss,
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0))
+
+    # One random single-channel map per downsampling factor.
+    downsample = [4, 8, 16, 32, 64]
+    feat = [torch.rand(1, 1, side // d, side // d) for d in downsample]
+    cls_scores, bbox_preds = head.forward(feat)
+
+    # Empty ground truth: the cls loss must be positive while the box and
+    # dfl losses must be exactly zero.
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    gt_bboxes_ignore = None
+    empty_gt_losses = head.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                                img_metas, gt_bboxes_ignore)
+    empty_cls, empty_box, empty_dfl = (
+        sum(empty_gt_losses[key])
+        for key in ('loss_cls', 'loss_bbox', 'loss_dfl'))
+    assert empty_cls.item() > 0, 'cls loss should be non-zero'
+    assert empty_box.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+    assert empty_dfl.item() == 0, (
+        'there should be no dfl loss when there are no true boxes')
+
+    # One ground-truth box: cls, box and dfl losses must all be positive
+    # for random inputs.
+    gt_bboxes = [torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])]
+    gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = head.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                              img_metas, gt_bboxes_ignore)
+    single_cls, single_box, single_dfl = (
+        sum(one_gt_losses[key])
+        for key in ('loss_cls', 'loss_bbox', 'loss_dfl'))
+    assert single_cls.item() > 0, 'cls loss should be non-zero'
+    assert single_box.item() > 0, 'box loss should be non-zero'
+    assert single_dfl.item() > 0, 'dfl loss should be non-zero'
diff --git a/tests/test_models/test_dense_heads/test_lad_head.py b/tests/test_models/test_dense_heads/test_lad_head.py
new file mode 100755
index 0000000..0ca45f4
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_lad_head.py
@@ -0,0 +1,149 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import torch
+
+from mmdet.models.dense_heads import LADHead, lad_head
+from mmdet.models.dense_heads.lad_head import levels_to_images
+
+
+def test_lad_head_loss():
+    """Tests lad head loss when truth is empty and non-empty.
+
+    Also checks ``levels_to_images`` and runs ``get_bboxes`` once.
+    """
+
+    class mock_skm:
+        """Stand-in assigned to ``lad_head.skm`` for this test."""
+
+        def GaussianMixture(self, *args, **kwargs):
+            return self
+
+        def fit(self, loss):
+            pass
+
+        def predict(self, loss):
+            # np.long is not available on every NumPy release; int64 is.
+            return np.zeros_like(loss, dtype=np.int64).reshape(-1)
+
+        def score_samples(self, loss):
+            return np.random.random(len(loss))
+
+    lad_head.skm = mock_skm()
+
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+    train_cfg = mmcv.Config(
+        dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.1,
+                neg_iou_thr=0.1,
+                min_pos_iou=0,
+                ignore_iof_thr=-1),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False))
+    # since Focal Loss is not supported on CPU
+    self = LADHead(
+        num_classes=4,
+        in_channels=1,
+        train_cfg=train_cfg,
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5))
+    teacher_model = LADHead(
+        num_classes=4,
+        in_channels=1,
+        train_cfg=train_cfg,
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5))
+    feat = [
+        torch.rand(1, 1, s // feat_size, s // feat_size)
+        for feat_size in [4, 8, 16, 32, 64]
+    ]
+    self.init_weights()
+    teacher_model.init_weights()
+
+    # Test that empty ground truth encourages the network to predict background
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    gt_bboxes_ignore = None
+
+    outs_teacher = teacher_model(feat)
+    label_assignment_results = teacher_model.get_label_assignment(
+        *outs_teacher, gt_bboxes, gt_labels, img_metas, gt_bboxes_ignore)
+    # NOTE(review): `outs` comes from teacher_model, not self - confirm intent.
+    outs = teacher_model(feat)
+    empty_gt_losses = self.loss(*outs, gt_bboxes, gt_labels, img_metas,
+                                gt_bboxes_ignore, label_assignment_results)
+    # When there is no truth, the cls loss should be nonzero but there should
+    # be no box loss.
+    empty_cls_loss = empty_gt_losses['loss_cls']
+    empty_box_loss = empty_gt_losses['loss_bbox']
+    empty_iou_loss = empty_gt_losses['loss_iou']
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+    assert empty_iou_loss.item() == 0, (
+        'there should be no iou loss when there are no true boxes')
+
+    # When truth is non-empty then both cls and box loss should be nonzero for
+    # random inputs
+    gt_bboxes = [torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])]
+    gt_labels = [torch.LongTensor([2])]
+    label_assignment_results = teacher_model.get_label_assignment(
+        *outs_teacher, gt_bboxes, gt_labels, img_metas, gt_bboxes_ignore)
+
+    one_gt_losses = self.loss(*outs, gt_bboxes, gt_labels, img_metas,
+                              gt_bboxes_ignore, label_assignment_results)
+    onegt_cls_loss = one_gt_losses['loss_cls']
+    onegt_box_loss = one_gt_losses['loss_bbox']
+    onegt_iou_loss = one_gt_losses['loss_iou']
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
+    assert onegt_iou_loss.item() > 0, 'iou loss should be non-zero'
+    # levels_to_images: 5 levels of (n, c, h, w) -> n tensors of (h * w * 5, c)
+    n, c, h, w = 10, 4, 20, 20
+    mlvl_tensor = [torch.ones(n, c, h, w) for _ in range(5)]
+    results = levels_to_images(mlvl_tensor)
+    assert len(results) == n
+    assert results[0].size() == (h * w * 5, c)
+    assert self.with_score_voting
+
+    self = LADHead(
+        num_classes=4,
+        in_channels=1,
+        train_cfg=train_cfg,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5))
+    cls_scores = [torch.ones(2, 4, 5, 5)]
+    bbox_preds = [torch.ones(2, 4, 5, 5)]
+    iou_preds = [torch.ones(2, 1, 5, 5)]
+    cfg = mmcv.Config(
+        dict(
+            nms_pre=1000,
+            min_bbox_size=0,
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.6),
+            max_per_img=100))
+    self.get_bboxes(
+        cls_scores, bbox_preds, iou_preds, img_metas, cfg, rescale=False)
diff --git a/tests/test_models/test_dense_heads/test_ld_head.py b/tests/test_models/test_dense_heads/test_ld_head.py
new file mode 100755
index 0000000..017135d
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_ld_head.py
@@ -0,0 +1,121 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads import GFLHead, LDHead
+
+
+def test_ld_head_loss():
+    """Check LDHead losses for empty, present and ignored ground truth.
+
+    A GFLHead acts as teacher: its box predictions on the same features are
+    passed to ``LDHead.loss`` as the soft target.
+    """
+    side = 256
+    img_metas = [
+        dict(
+            img_shape=(side, side, 3),
+            scale_factor=1,
+            pad_shape=(side, side, 3))
+    ]
+    train_cfg = mmcv.Config(
+        dict(
+            assigner=dict(type='ATSSAssigner', topk=9, ignore_iof_thr=0.1),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False))
+
+    def gfl_kwargs():
+        """Build fresh constructor arguments common to both heads.
+
+        A new set of dicts is created per call so that the student and the
+        teacher never share a config object other than ``train_cfg``.
+        """
+        return dict(
+            num_classes=4,
+            in_channels=1,
+            train_cfg=train_cfg,
+            loss_cls=dict(
+                type='QualityFocalLoss',
+                use_sigmoid=True,
+                beta=2.0,
+                loss_weight=1.0),
+            loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+            anchor_generator=dict(
+                type='AnchorGenerator',
+                ratios=[1.0],
+                octave_base_scale=8,
+                scales_per_octave=1,
+                strides=[8, 16, 32, 64, 128]))
+
+    # Student head: the common settings plus the distillation loss.
+    head = LDHead(
+        loss_ld=dict(type='KnowledgeDistillationKLDivLoss', loss_weight=1.0),
+        **gfl_kwargs())
+
+    # Teacher head: a plain GFLHead built from the common settings.
+    teacher_model = GFLHead(**gfl_kwargs())
+
+    # One random single-channel map per downsampling factor, fed to both
+    # heads.
+    downsample = [4, 8, 16, 32, 64]
+    feat = [torch.rand(1, 1, side // d, side // d) for d in downsample]
+    cls_scores, bbox_preds = head.forward(feat)
+    rand_soft_target = teacher_model.forward(feat)[1]
+
+    def compute_losses(boxes, labels, ignore):
+        """Evaluate the student loss for the given targets."""
+        return head.loss(cls_scores, bbox_preds, boxes, labels,
+                         rand_soft_target, img_metas, ignore)
+
+    def total(losses, key):
+        """Sum the entries stored under ``key``."""
+        return sum(losses[key])
+
+    # Empty ground truth: the cls loss must be positive, the ld loss
+    # non-negative and the box loss exactly zero.
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    gt_bboxes_ignore = None
+    empty_gt_losses = compute_losses(gt_bboxes, gt_labels, gt_bboxes_ignore)
+    empty_cls = total(empty_gt_losses, 'loss_cls')
+    empty_box = total(empty_gt_losses, 'loss_bbox')
+    empty_ld = total(empty_gt_losses, 'loss_ld')
+
+    assert empty_cls.item() > 0, 'cls loss should be non-zero'
+    assert empty_box.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+    assert empty_ld.item() >= 0, 'ld loss should be non-negative'
+
+    # One ground-truth box, nothing ignored: cls and box loss must both be
+    # positive for random inputs.
+    gt_bboxes = [torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])]
+    gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = compute_losses(gt_bboxes, gt_labels, gt_bboxes_ignore)
+    single_cls = total(one_gt_losses, 'loss_cls')
+    single_box = total(one_gt_losses, 'loss_bbox')
+
+    assert single_cls.item() > 0, 'cls loss should be non-zero'
+    assert single_box.item() > 0, 'box loss should be non-zero'
+
+    # The ground-truth box doubles as the ignore region: the cls loss stays
+    # positive but the box loss must be exactly zero.
+    gt_bboxes_ignore = gt_bboxes
+    ignore_gt_losses = compute_losses(gt_bboxes, gt_labels, gt_bboxes_ignore)
+    ignore_cls = total(ignore_gt_losses, 'loss_cls')
+    ignore_box = total(ignore_gt_losses, 'loss_bbox')
+
+    assert ignore_cls.item() > 0, 'cls loss should be non-zero'
+    assert ignore_box.item() == 0, 'gt bbox ignored loss should be zero'
+
+    # A random ignore box (drawn with torch.randn) instead: both cls and box
+    # loss must again be positive.
+    gt_bboxes_ignore = [torch.randn(1, 4)]
+    not_ignore_gt_losses = compute_losses(gt_bboxes, gt_labels,
+                                          gt_bboxes_ignore)
+    not_ignore_cls = total(not_ignore_gt_losses, 'loss_cls')
+    not_ignore_box = total(not_ignore_gt_losses, 'loss_bbox')
+
+    assert not_ignore_cls.item() > 0, 'cls loss should be non-zero'
+    assert not_ignore_box.item() > 0, (
+        'gt bbox not ignored loss should be non-zero')
diff --git a/tests/test_models/test_dense_heads/test_mask2former_head.py b/tests/test_models/test_dense_heads/test_mask2former_head.py
new file mode 100755
index 0000000..596a325
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_mask2former_head.py
@@ -0,0 +1,235 @@
+import numpy as np
+import pytest
+import torch
+from mmcv import ConfigDict
+
+from mmdet.core.mask import BitmapMasks
+from mmdet.models.dense_heads import Mask2FormerHead
+
+
+@pytest.mark.parametrize('num_stuff_classes, \
+     label_num', [(53, 100), (0, 80)])
+def test_mask2former_head_loss(num_stuff_classes, label_num):
+    """Test Mask2FormerHead losses when truth is empty and non-empty.
+
+    Covers panoptic (53 stuff classes) and instance (0 stuff classes) setups,
+    then forward_train (gt_semantic_seg given and None) and simple_test.
+    """
+    self = _init_model(num_stuff_classes)
+    img_metas = [{
+        'batch_input_shape': (128, 160),
+        'pad_shape': (128, 160, 3),
+        'img_shape': (126, 160, 3),
+        'ori_shape': (63, 80, 3)
+    }, {
+        'batch_input_shape': (128, 160),
+        'pad_shape': (128, 160, 3),
+        'img_shape': (120, 160, 3),
+        'ori_shape': (60, 80, 3)
+    }]
+    feats = [  # 4 levels: channels double while H and W halve per level
+        torch.rand((2, 64 * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i)))
+        for i in range(4)
+    ]
+    all_cls_scores, all_mask_preds = self.forward(feats, img_metas)
+    # Test that empty ground truth encourages the network to predict background
+    gt_labels_list = [torch.LongTensor([]), torch.LongTensor([])]
+    gt_masks_list = [
+        torch.zeros((0, 128, 160)).long(),
+        torch.zeros((0, 128, 160)).long()
+    ]
+
+    empty_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list,
+                                gt_masks_list, img_metas)
+    # When there is no truth, the cls loss should be nonzero but the mask and
+    # dice losses should both be exactly zero.
+    for key, loss in empty_gt_losses.items():
+        if 'cls' in key:
+            assert loss.item() > 0, 'cls loss should be non-zero'
+        elif 'mask' in key:
+            assert loss.item(
+            ) == 0, 'there should be no mask loss when there are no true mask'
+        elif 'dice' in key:
+            assert loss.item(
+            ) == 0, 'there should be no dice loss when there are no true mask'
+
+    # When truth is non-empty, the cls, mask and dice losses should all be
+    # nonzero for random inputs.
+    gt_labels_list = [
+        torch.tensor([10, label_num]).long(),
+        torch.tensor([label_num, 10]).long()
+    ]
+    mask1 = torch.zeros((2, 128, 160)).long()
+    mask1[0, :50] = 1
+    mask1[1, 50:] = 1
+    mask2 = torch.zeros((2, 128, 160)).long()
+    mask2[0, :, :50] = 1
+    mask2[1, :, 50:] = 1
+    gt_masks_list = [mask1, mask2]  # mask1 splits rows, mask2 splits columns
+    two_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list,
+                              gt_masks_list, img_metas)
+    for loss in two_gt_losses.values():
+        assert loss.item() > 0, 'all loss should be non-zero'
+
+    # Test forward_train with gt_masks and a gt_semantic_seg map per image.
+    gt_bboxes = None
+    gt_labels = [
+        torch.tensor([10]).long(),
+        torch.tensor([10]).long(),
+    ]
+    thing_mask1 = np.zeros((1, 128, 160), dtype=np.int32)
+    thing_mask1[0, :50] = 1
+    thing_mask2 = np.zeros((1, 128, 160), dtype=np.int32)
+    thing_mask2[0, :, 50:] = 1
+    gt_masks = [
+        BitmapMasks(thing_mask1, 128, 160),
+        BitmapMasks(thing_mask2, 128, 160),
+    ]
+    stuff_mask1 = torch.zeros((1, 128, 160)).long()
+    stuff_mask1[0, :50] = 10
+    stuff_mask1[0, 50:] = 100
+    stuff_mask2 = torch.zeros((1, 128, 160)).long()
+    stuff_mask2[0, :, 50:] = 10
+    stuff_mask2[0, :, :50] = 100
+    gt_semantic_seg = [stuff_mask1, stuff_mask2]
+
+    self.forward_train(feats, img_metas, gt_bboxes, gt_labels, gt_masks,
+                       gt_semantic_seg)
+
+    # Test forward_train again when gt_semantic_seg is None.
+    gt_semantic_seg = None
+    self.forward_train(feats, img_metas, gt_bboxes, gt_labels, gt_masks,
+                       gt_semantic_seg)
+
+    # Test inference mode; only checks that simple_test runs.
+    self.simple_test(feats, img_metas)
+
+
+def _init_model(num_stuff_classes):  # builds the Mask2FormerHead under test
+    base_channels = 64
+    num_things_classes = 80
+    num_classes = num_things_classes + num_stuff_classes
+    config = ConfigDict(
+        dict(
+            type='Mask2FormerHead',
+            in_channels=[base_channels * 2**i for i in range(4)],
+            feat_channels=base_channels,
+            out_channels=base_channels,
+            num_things_classes=num_things_classes,
+            num_stuff_classes=num_stuff_classes,
+            num_queries=100,
+            num_transformer_feat_level=3,
+            pixel_decoder=dict(
+                type='MSDeformAttnPixelDecoder',
+                num_outs=3,
+                norm_cfg=dict(type='GN', num_groups=32),
+                act_cfg=dict(type='ReLU'),
+                encoder=dict(
+                    type='DetrTransformerEncoder',
+                    num_layers=6,
+                    transformerlayers=dict(
+                        type='BaseTransformerLayer',
+                        attn_cfgs=dict(
+                            type='MultiScaleDeformableAttention',
+                            embed_dims=base_channels,
+                            num_heads=8,
+                            num_levels=3,
+                            num_points=4,
+                            im2col_step=64,
+                            dropout=0.0,
+                            batch_first=False,
+                            norm_cfg=None,
+                            init_cfg=None),
+                        ffn_cfgs=dict(
+                            type='FFN',
+                            embed_dims=base_channels,
+                            feedforward_channels=base_channels * 4,
+                            num_fcs=2,
+                            ffn_drop=0.0,
+                            act_cfg=dict(type='ReLU', inplace=True)),
+                        feedforward_channels=base_channels * 4,
+                        ffn_dropout=0.0,
+                        operation_order=('self_attn', 'norm', 'ffn', 'norm')),
+                    init_cfg=None),
+                positional_encoding=dict(
+                    type='SinePositionalEncoding',
+                    num_feats=base_channels // 2,
+                    normalize=True),
+                init_cfg=None),
+            enforce_decoder_input_project=False,
+            positional_encoding=dict(
+                type='SinePositionalEncoding',
+                num_feats=base_channels // 2,
+                normalize=True),
+            transformer_decoder=dict(
+                type='DetrTransformerDecoder',
+                return_intermediate=True,
+                num_layers=9,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=dict(
+                        type='MultiheadAttention',
+                        embed_dims=base_channels,
+                        num_heads=8,
+                        attn_drop=0.0,
+                        proj_drop=0.0,
+                        dropout_layer=None,
+                        batch_first=False),
+                    ffn_cfgs=dict(
+                        embed_dims=base_channels,
+                        feedforward_channels=base_channels * 8,
+                        num_fcs=2,
+                        act_cfg=dict(type='ReLU', inplace=True),
+                        ffn_drop=0.0,
+                        dropout_layer=None,
+                        add_identity=True),
+                    # The following parameter is not used; it is passed only
+                    # to keep the current API happy.
+                    feedforward_channels=base_channels * 8,
+                    operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
+                                     'ffn', 'norm')),
+                init_cfg=None),
+            loss_cls=dict(
+                type='CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=2.0,
+                reduction='mean',
+                class_weight=[1.0] * num_classes + [0.1]),
+            loss_mask=dict(
+                type='CrossEntropyLoss',
+                use_sigmoid=True,
+                reduction='mean',
+                loss_weight=5.0),
+            loss_dice=dict(
+                type='DiceLoss',
+                use_sigmoid=True,
+                activate=True,
+                reduction='mean',
+                naive_dice=True,
+                eps=1.0,
+                loss_weight=5.0),
+            train_cfg=dict(
+                num_points=256,
+                oversample_ratio=3.0,
+                importance_sample_ratio=0.75,
+                assigner=dict(
+                    type='MaskHungarianAssigner',
+                    cls_cost=dict(type='ClassificationCost', weight=2.0),
+                    mask_cost=dict(
+                        type='CrossEntropyLossCost',
+                        weight=5.0,
+                        use_sigmoid=True),
+                    dice_cost=dict(
+                        type='DiceCost', weight=5.0, pred_act=True, eps=1.0)),
+                sampler=dict(type='MaskPseudoSampler')),
+            test_cfg=dict(
+                panoptic_on=True,
+                semantic_on=False,
+                instance_on=True,
+                max_dets_per_image=100,
+                object_mask_thr=0.8,
+                iou_thr=0.8)))
+    self = Mask2FormerHead(**config)
+    self.init_weights()
+
+    return self
diff --git a/tests/test_models/test_dense_heads/test_maskformer_head.py b/tests/test_models/test_dense_heads/test_maskformer_head.py
new file mode 100755
index 0000000..c9bebee
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_maskformer_head.py
@@ -0,0 +1,206 @@
+import numpy as np
+import torch
+from mmcv import ConfigDict
+
+from mmdet.core.mask import BitmapMasks
+from mmdet.models.dense_heads import MaskFormerHead
+
+
+def test_maskformer_head_loss():
+    """Test MaskFormerHead losses when truth is empty and non-empty."""
+    base_channels = 64
+    # Both images below share a batch_input_shape of (128, 160).
+    img_metas = [{
+        'batch_input_shape': (128, 160),
+        'pad_shape': (128, 160, 3),
+        'img_shape': (126, 160, 3),
+        'ori_shape': (63, 80, 3)
+    }, {
+        'batch_input_shape': (128, 160),
+        'pad_shape': (128, 160, 3),
+        'img_shape': (120, 160, 3),
+        'ori_shape': (60, 80, 3)
+    }]
+    feats = [  # 4 levels: channels double while H and W halve per level
+        torch.rand((2, 64 * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i)))
+        for i in range(4)
+    ]
+    num_things_classes = 80
+    num_stuff_classes = 53
+    num_classes = num_things_classes + num_stuff_classes
+    config = ConfigDict(
+        dict(
+            type='MaskFormerHead',
+            in_channels=[base_channels * 2**i for i in range(4)],
+            feat_channels=base_channels,
+            out_channels=base_channels,
+            num_things_classes=num_things_classes,
+            num_stuff_classes=num_stuff_classes,
+            num_queries=100,
+            pixel_decoder=dict(
+                type='TransformerEncoderPixelDecoder',
+                norm_cfg=dict(type='GN', num_groups=32),
+                act_cfg=dict(type='ReLU'),
+                encoder=dict(
+                    type='DetrTransformerEncoder',
+                    num_layers=6,
+                    transformerlayers=dict(
+                        type='BaseTransformerLayer',
+                        attn_cfgs=dict(
+                            type='MultiheadAttention',
+                            embed_dims=base_channels,
+                            num_heads=8,
+                            attn_drop=0.1,
+                            proj_drop=0.1,
+                            dropout_layer=None,
+                            batch_first=False),
+                        ffn_cfgs=dict(
+                            embed_dims=base_channels,
+                            feedforward_channels=base_channels * 8,
+                            num_fcs=2,
+                            act_cfg=dict(type='ReLU', inplace=True),
+                            ffn_drop=0.1,
+                            dropout_layer=None,
+                            add_identity=True),
+                        operation_order=('self_attn', 'norm', 'ffn', 'norm'),
+                        norm_cfg=dict(type='LN'),
+                        init_cfg=None,
+                        batch_first=False),
+                    init_cfg=None),
+                positional_encoding=dict(
+                    type='SinePositionalEncoding',
+                    num_feats=base_channels // 2,
+                    normalize=True)),
+            enforce_decoder_input_project=False,
+            positional_encoding=dict(
+                type='SinePositionalEncoding',
+                num_feats=base_channels // 2,
+                normalize=True),
+            transformer_decoder=dict(
+                type='DetrTransformerDecoder',
+                return_intermediate=True,
+                num_layers=6,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=dict(
+                        type='MultiheadAttention',
+                        embed_dims=base_channels,
+                        num_heads=8,
+                        attn_drop=0.1,
+                        proj_drop=0.1,
+                        dropout_layer=None,
+                        batch_first=False),
+                    ffn_cfgs=dict(
+                        embed_dims=base_channels,
+                        feedforward_channels=base_channels * 8,
+                        num_fcs=2,
+                        act_cfg=dict(type='ReLU', inplace=True),
+                        ffn_drop=0.1,
+                        dropout_layer=None,
+                        add_identity=True),
+                    # The following parameter is not used; it is passed only
+                    # to keep the current API happy.
+                    feedforward_channels=base_channels * 8,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')),
+                init_cfg=None),
+            loss_cls=dict(
+                type='CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=1.0,
+                reduction='mean',
+                class_weight=[1.0] * num_classes + [0.1]),
+            loss_mask=dict(
+                type='FocalLoss',
+                use_sigmoid=True,
+                gamma=2.0,
+                alpha=0.25,
+                reduction='mean',
+                loss_weight=20.0),
+            loss_dice=dict(
+                type='DiceLoss',
+                use_sigmoid=True,
+                activate=True,
+                reduction='mean',
+                naive_dice=True,
+                eps=1.0,
+                loss_weight=1.0),
+            train_cfg=dict(
+                assigner=dict(
+                    type='MaskHungarianAssigner',
+                    cls_cost=dict(type='ClassificationCost', weight=1.0),
+                    mask_cost=dict(
+                        type='FocalLossCost', weight=20.0, binary_input=True),
+                    dice_cost=dict(
+                        type='DiceCost', weight=1.0, pred_act=True, eps=1.0)),
+                sampler=dict(type='MaskPseudoSampler')),
+            test_cfg=dict(object_mask_thr=0.8, iou_thr=0.8)))
+    self = MaskFormerHead(**config)
+    self.init_weights()
+    all_cls_scores, all_mask_preds = self.forward(feats, img_metas)
+    # Test that empty ground truth encourages the network to predict background
+    gt_labels_list = [torch.LongTensor([]), torch.LongTensor([])]
+    gt_masks_list = [
+        torch.zeros((0, 128, 160)).long(),
+        torch.zeros((0, 128, 160)).long()
+    ]
+
+    empty_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list,
+                                gt_masks_list, img_metas)
+    # When there is no truth, the cls loss should be nonzero but the mask and
+    # dice losses should both be exactly zero.
+    for key, loss in empty_gt_losses.items():
+        if 'cls' in key:
+            assert loss.item() > 0, 'cls loss should be non-zero'
+        elif 'mask' in key:
+            assert loss.item(
+            ) == 0, 'there should be no mask loss when there are no true mask'
+        elif 'dice' in key:
+            assert loss.item(
+            ) == 0, 'there should be no dice loss when there are no true mask'
+
+    # When truth is non-empty, the cls, mask and dice losses should all be
+    # nonzero for random inputs.
+    gt_labels_list = [
+        torch.tensor([10, 100]).long(),
+        torch.tensor([100, 10]).long()
+    ]
+    mask1 = torch.zeros((2, 128, 160)).long()
+    mask1[0, :50] = 1
+    mask1[1, 50:] = 1
+    mask2 = torch.zeros((2, 128, 160)).long()
+    mask2[0, :, :50] = 1
+    mask2[1, :, 50:] = 1
+    gt_masks_list = [mask1, mask2]  # mask1 splits rows, mask2 splits columns
+    two_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list,
+                              gt_masks_list, img_metas)
+    for loss in two_gt_losses.values():
+        assert loss.item() > 0, 'all loss should be non-zero'
+
+    # Test forward_train with gt_masks and a gt_semantic_seg map per image.
+    gt_bboxes = None
+    gt_labels = [
+        torch.tensor([10]).long(),
+        torch.tensor([10]).long(),
+    ]
+    thing_mask1 = np.zeros((1, 128, 160), dtype=np.int32)
+    thing_mask1[0, :50] = 1
+    thing_mask2 = np.zeros((1, 128, 160), dtype=np.int32)
+    thing_mask2[0, :, 50:] = 1
+    gt_masks = [
+        BitmapMasks(thing_mask1, 128, 160),
+        BitmapMasks(thing_mask2, 128, 160),
+    ]
+    stuff_mask1 = torch.zeros((1, 128, 160)).long()
+    stuff_mask1[0, :50] = 10
+    stuff_mask1[0, 50:] = 100
+    stuff_mask2 = torch.zeros((1, 128, 160)).long()
+    stuff_mask2[0, :, 50:] = 10
+    stuff_mask2[0, :, :50] = 100
+    gt_semantic_seg = [stuff_mask1, stuff_mask2]
+
+    self.forward_train(feats, img_metas, gt_bboxes, gt_labels, gt_masks,
+                       gt_semantic_seg)
+
+    # Test inference mode; only checks that simple_test runs.
+    self.simple_test(feats, img_metas)
diff --git a/tests/test_models/test_dense_heads/test_paa_head.py b/tests/test_models/test_dense_heads/test_paa_head.py
new file mode 100755
index 0000000..1aa7c6a
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_paa_head.py
@@ -0,0 +1,135 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import torch
+
+from mmdet.models.dense_heads import PAAHead, paa_head
+from mmdet.models.dense_heads.paa_head import levels_to_images
+
+
+def test_paa_head_loss():
+    """Tests paa head loss when truth is empty and non-empty."""
+
+    class mock_skm:
+
+        def GaussianMixture(self, *args, **kwargs):
+            return self
+
+        def fit(self, loss):
+            pass
+
+        def predict(self, loss):
+            components = np.zeros_like(loss, dtype=np.long)
+            return components.reshape(-1)
+
+        def score_samples(self, loss):
+            scores = np.random.random(len(loss))
+            return scores
+
+    paa_head.skm = mock_skm()
+
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+    train_cfg = mmcv.Config(
+        dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.1,
+                neg_iou_thr=0.1,
+                min_pos_iou=0,
+                ignore_iof_thr=-1),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False))
+    # since Focal Loss is not supported on CPU
+    self = PAAHead(
+        num_classes=4,
+        in_channels=1,
+        train_cfg=train_cfg,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5))
+    feat = [
+        torch.rand(1, 1, s // feat_size, s // feat_size)
+        for feat_size in [4, 8, 16, 32, 64]
+    ]
+    self.init_weights()
+    cls_scores, bbox_preds, iou_preds = self(feat)
+    # Test that empty ground truth encourages the network to predict background
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    gt_bboxes_ignore = None
+    empty_gt_losses = self.loss(cls_scores, bbox_preds, iou_preds, gt_bboxes,
+                                gt_labels, img_metas, gt_bboxes_ignore)
+    # When there is no truth, the cls loss should be nonzero but there should
+    # be no box loss.
+    empty_cls_loss = empty_gt_losses['loss_cls']
+    empty_box_loss = empty_gt_losses['loss_bbox']
+    empty_iou_loss = empty_gt_losses['loss_iou']
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+    assert empty_iou_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+
+    # When truth is non-empty then both cls and box loss should be nonzero for
+    # random inputs
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = self.loss(cls_scores, bbox_preds, iou_preds, gt_bboxes,
+                              gt_labels, img_metas, gt_bboxes_ignore)
+    onegt_cls_loss = one_gt_losses['loss_cls']
+    onegt_box_loss = one_gt_losses['loss_bbox']
+    onegt_iou_loss = one_gt_losses['loss_iou']
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
+    assert onegt_iou_loss.item() > 0, 'box loss should be non-zero'
+    n, c, h, w = 10, 4, 20, 20
+    mlvl_tensor = [torch.ones(n, c, h, w) for i in range(5)]
+    results = levels_to_images(mlvl_tensor)
+    assert len(results) == n
+    assert results[0].size() == (h * w * 5, c)
+    assert self.with_score_voting
+
+    self = PAAHead(
+        num_classes=4,
+        in_channels=1,
+        train_cfg=train_cfg,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5))
+    cls_scores = [torch.ones(2, 4, 5, 5)]
+    bbox_preds = [torch.ones(2, 4, 5, 5)]
+    iou_preds = [torch.ones(2, 1, 5, 5)]
+    cfg = mmcv.Config(
+        dict(
+            nms_pre=1000,
+            min_bbox_size=0,
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.6),
+            max_per_img=100))
+    rescale = False
+    self.get_bboxes(
+        cls_scores, bbox_preds, iou_preds, img_metas, cfg, rescale=rescale)
diff --git a/tests/test_models/test_dense_heads/test_pisa_head.py b/tests/test_models/test_dense_heads/test_pisa_head.py
new file mode 100755
index 0000000..996320a
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_pisa_head.py
@@ -0,0 +1,245 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads import PISARetinaHead, PISASSDHead
+from mmdet.models.roi_heads import PISARoIHead
+
+
+def test_pisa_retinanet_head_loss():
+    """Tests pisa retinanet head loss when truth is empty and non-empty."""
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+
+    cfg = mmcv.Config(
+        dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            isr=dict(k=2., bias=0.),
+            carl=dict(k=1., bias=0.2),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False))
+    self = PISARetinaHead(num_classes=4, in_channels=1, train_cfg=cfg)
+
+    # Anchor head expects multiple levels of features per image; one level is
+    # built per anchor-generator stride, halving in size from s // 4.
+    feat = [
+        torch.rand(1, 1, s // (2**(i + 2)), s // (2**(i + 2)))
+        for i in range(len(self.anchor_generator.strides))
+    ]
+    cls_scores, bbox_preds = self.forward(feat)
+
+    # Test that empty ground truth encourages the network to predict background
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+
+    gt_bboxes_ignore = None
+    empty_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                                img_metas, gt_bboxes_ignore)
+    # When there is no truth, the cls loss should be nonzero but there should
+    # be no box loss.
+    empty_cls_loss = empty_gt_losses['loss_cls'].sum()
+    empty_box_loss = empty_gt_losses['loss_bbox'].sum()
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+
+    # When truth is non-empty then both cls and box loss should be nonzero for
+    # random inputs
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                              img_metas, gt_bboxes_ignore)
+    onegt_cls_loss = one_gt_losses['loss_cls'].sum()
+    onegt_box_loss = one_gt_losses['loss_bbox'].sum()
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
+
+
+def test_pisa_ssd_head_loss():
+    """Tests pisa ssd head loss when truth is empty and non-empty."""
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+
+    cfg = mmcv.Config(
+        dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.,
+                ignore_iof_thr=-1,
+                gt_max_assign_all=False),
+            isr=dict(k=2., bias=0.),
+            carl=dict(k=1., bias=0.2),
+            smoothl1_beta=1.,
+            allowed_border=-1,
+            pos_weight=-1,
+            neg_pos_ratio=3,
+            debug=False))
+    ssd_anchor_generator = dict(
+        type='SSDAnchorGenerator',
+        scale_major=False,
+        input_size=300,
+        strides=[1],
+        ratios=([2], ),
+        basesize_ratio_range=(0.15, 0.9))
+    self = PISASSDHead(
+        num_classes=4,
+        in_channels=(1, ),
+        train_cfg=cfg,
+        anchor_generator=ssd_anchor_generator)
+
+    # Anchor head expects a multiple levels of features per image
+    feat = [
+        torch.rand(1, 1, s // (2**(i + 2)), s // (2**(i + 2)))
+        for i in range(len(self.anchor_generator.strides))
+    ]
+    cls_scores, bbox_preds = self.forward(feat)
+
+    # Test that empty ground truth encourages the network to predict background
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+
+    gt_bboxes_ignore = None
+    empty_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                                img_metas, gt_bboxes_ignore)
+    # When there is no truth, the cls loss should be nonzero but there should
+    # be no box loss.
+    empty_cls_loss = sum(empty_gt_losses['loss_cls'])
+    empty_box_loss = sum(empty_gt_losses['loss_bbox'])
+    # SSD is special, #pos:#neg = 1: 3, so empth gt will also lead loss cls = 0
+    assert empty_cls_loss.item() == 0, 'cls loss should be non-zero'
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+
+    # When truth is non-empty then both cls and box loss should be nonzero for
+    # random inputs
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                              img_metas, gt_bboxes_ignore)
+    onegt_cls_loss = sum(one_gt_losses['loss_cls'])
+    onegt_box_loss = sum(one_gt_losses['loss_bbox'])
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
+
+
+def test_pisa_roi_head_loss():
+    """Tests pisa roi head loss when truth is empty and non-empty."""
+    train_cfg = mmcv.Config(
+        dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='ScoreHLRSampler',
+                num=4,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True,
+                k=0.5,
+                bias=0.),
+            isr=dict(k=2., bias=0.),
+            carl=dict(k=1., bias=0.2),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False))
+
+    bbox_roi_extractor = dict(
+        type='SingleRoIExtractor',
+        roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+        out_channels=1,
+        featmap_strides=[1])
+
+    bbox_head = dict(
+        type='Shared2FCBBoxHead',
+        in_channels=1,
+        fc_out_channels=2,
+        roi_feat_size=7,
+        num_classes=4,
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[0., 0., 0., 0.],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        reg_class_agnostic=False,
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0))
+
+    self = PISARoIHead(bbox_roi_extractor, bbox_head, train_cfg=train_cfg)
+
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+
+    # Only one feature level is built here (range(1)), s // 4 pixels per side
+    feat = [
+        torch.rand(1, 1, s // (2**(i + 2)), s // (2**(i + 2)))
+        for i in range(1)
+    ]
+
+    proposal_list = [
+        torch.Tensor([[22.6667, 22.8757, 238.6326, 151.8874], [0, 3, 5, 7]])
+    ]
+
+    # Test that empty ground truth encourages the network to predict background
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    gt_bboxes_ignore = None
+
+    empty_gt_losses = self.forward_train(feat, img_metas, proposal_list,
+                                         gt_bboxes, gt_labels,
+                                         gt_bboxes_ignore)
+
+    # When there is no truth, the cls loss should be nonzero but there should
+    # be no box loss.
+    empty_cls_loss = empty_gt_losses['loss_cls'].sum()
+    empty_box_loss = empty_gt_losses['loss_bbox'].sum()
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+
+    # When truth is non-empty then both cls and box loss should be nonzero for
+    # random inputs
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+
+    one_gt_losses = self.forward_train(feat, img_metas, proposal_list,
+                                       gt_bboxes, gt_labels, gt_bboxes_ignore)
+    onegt_cls_loss = one_gt_losses['loss_cls'].sum()
+    onegt_box_loss = one_gt_losses['loss_bbox'].sum()
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
diff --git a/tests/test_models/test_dense_heads/test_sabl_retina_head.py b/tests/test_models/test_dense_heads/test_sabl_retina_head.py
new file mode 100755
index 0000000..4e89d9a
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_sabl_retina_head.py
@@ -0,0 +1,76 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads import SABLRetinaHead
+
+
+def test_sabl_retina_head_loss():
+    """Tests SABL retina head loss when truth is empty and non-empty."""
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+
+    cfg = mmcv.Config(
+        dict(
+            assigner=dict(
+                type='ApproxMaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.4,
+                min_pos_iou=0.0,
+                ignore_iof_thr=-1),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False))
+    head = SABLRetinaHead(
+        num_classes=4,
+        in_channels=3,
+        feat_channels=10,
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        train_cfg=cfg)
+    if torch.cuda.is_available():  # nothing below runs without CUDA
+        head.cuda()
+        # Anchor head expects multiple levels of features per image
+        feat = [
+            torch.rand(1, 3, s // (2**(i + 2)), s // (2**(i + 2))).cuda()
+            for i in range(len(head.approx_anchor_generator.base_anchors))
+        ]
+        cls_scores, bbox_preds = head.forward(feat)
+
+        # Test that empty ground truth encourages the network
+        # to predict background
+        gt_bboxes = [torch.empty((0, 4)).cuda()]
+        gt_labels = [torch.LongTensor([]).cuda()]
+
+        gt_bboxes_ignore = None
+        empty_gt_losses = head.loss(cls_scores, bbox_preds, gt_bboxes,
+                                    gt_labels, img_metas, gt_bboxes_ignore)
+        # When there is no truth, the cls loss should be nonzero but there
+        # should be no box loss.
+        empty_cls_loss = sum(empty_gt_losses['loss_cls'])
+        empty_box_cls_loss = sum(empty_gt_losses['loss_bbox_cls'])
+        empty_box_reg_loss = sum(empty_gt_losses['loss_bbox_reg'])
+        assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+        assert empty_box_cls_loss.item() == 0, (
+            'there should be no box cls loss when there are no true boxes')
+        assert empty_box_reg_loss.item() == 0, (
+            'there should be no box reg loss when there are no true boxes')
+
+        # When truth is non-empty then both cls and box loss should
+        # be nonzero for random inputs
+        gt_bboxes = [
+            torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]).cuda(),
+        ]
+        gt_labels = [torch.LongTensor([2]).cuda()]
+        one_gt_losses = head.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                                  img_metas, gt_bboxes_ignore)
+        onegt_cls_loss = sum(one_gt_losses['loss_cls'])
+        onegt_box_cls_loss = sum(one_gt_losses['loss_bbox_cls'])
+        onegt_box_reg_loss = sum(one_gt_losses['loss_bbox_reg'])
+        assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+        assert onegt_box_cls_loss.item() > 0, 'box loss cls should be non-zero'
+        assert onegt_box_reg_loss.item() > 0, 'box loss reg should be non-zero'
diff --git a/tests/test_models/test_dense_heads/test_solo_head.py b/tests/test_models/test_dense_heads/test_solo_head.py
new file mode 100755
index 0000000..16cb4f7
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_solo_head.py
@@ -0,0 +1,284 @@
+import pytest
+import torch
+
+from mmdet.models.dense_heads import (DecoupledSOLOHead,
+                                      DecoupledSOLOLightHead, SOLOHead)
+
+
+def test_solo_head_loss():
+    """Tests solo head loss when truth is empty and non-empty."""
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+    self = SOLOHead(
+        num_classes=4,
+        in_channels=1,
+        num_grids=[40, 36, 24, 16, 12],
+        loss_mask=dict(type='DiceLoss', use_sigmoid=True, loss_weight=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0))
+    feat = [
+        torch.rand(1, 1, s // feat_size, s // feat_size)
+        for feat_size in [4, 8, 16, 32, 64]
+    ]
+    mask_preds, cls_preds = self.forward(feat)
+    # Test that empty ground truth encourages the network to
+    # predict background.
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    gt_masks = [torch.empty((0, 550, 550))]
+    gt_bboxes_ignore = None
+    empty_gt_losses = self.loss(
+        mask_preds,
+        cls_preds,
+        gt_labels,
+        gt_masks,
+        img_metas,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_bboxes_ignore)
+    # When there is no truth, the cls loss should be nonzero but there should
+    # be no mask loss.
+    empty_mask_loss = empty_gt_losses['loss_mask']
+    empty_cls_loss = empty_gt_losses['loss_cls']
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_mask_loss.item() == 0, (
+        'there should be no mask loss when there are no true masks')
+
+    # When truth is non-empty then both cls and mask loss should be nonzero for
+    # random inputs.
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+    gt_masks = [(torch.rand((1, 256, 256)) > 0.5).float()]
+    one_gt_losses = self.loss(
+        mask_preds,
+        cls_preds,
+        gt_labels,
+        gt_masks,
+        img_metas,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_bboxes_ignore)
+    onegt_mask_loss = one_gt_losses['loss_mask']
+    onegt_cls_loss = one_gt_losses['loss_cls']
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_mask_loss.item() > 0, 'mask loss should be non-zero'
+
+    # When the length of num_grids, scale_ranges, and num_levels are not equal.
+    with pytest.raises(AssertionError):
+        SOLOHead(
+            num_classes=4,
+            in_channels=1,
+            num_grids=[36, 24, 16, 12],
+            loss_mask=dict(type='DiceLoss', use_sigmoid=True, loss_weight=3.0),
+            loss_cls=dict(
+                type='FocalLoss',
+                use_sigmoid=True,
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0))
+
+    # When input feature length is not equal to num_levels.
+    with pytest.raises(AssertionError):
+        feat = [
+            torch.rand(1, 1, s // feat_size, s // feat_size)
+            for feat_size in [4, 8, 16, 32]
+        ]
+        self.forward(feat)
+
+
+def test_desolo_head_loss():
+    """Tests decoupled solo head loss when truth is empty and non-empty."""
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+    self = DecoupledSOLOHead(
+        num_classes=4,
+        in_channels=1,
+        num_grids=[40, 36, 24, 16, 12],
+        loss_mask=dict(
+            type='DiceLoss', use_sigmoid=True, activate=False,
+            loss_weight=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0))
+    feat = [
+        torch.rand(1, 1, s // feat_size, s // feat_size)
+        for feat_size in [4, 8, 16, 32, 64]
+    ]
+    mask_preds_x, mask_preds_y, cls_preds = self.forward(feat)
+    # Test that empty ground truth encourages the network to
+    # predict background.
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    gt_masks = [torch.empty((0, 550, 550))]
+    gt_bboxes_ignore = None
+    empty_gt_losses = self.loss(
+        mask_preds_x,
+        mask_preds_y,
+        cls_preds,
+        gt_labels,
+        gt_masks,
+        img_metas,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_bboxes_ignore)
+    # When there is no truth, the cls loss should be nonzero but there should
+    # be no mask loss.
+    empty_mask_loss = empty_gt_losses['loss_mask']
+    empty_cls_loss = empty_gt_losses['loss_cls']
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_mask_loss.item() == 0, (
+        'there should be no mask loss when there are no true masks')
+
+    # When truth is non-empty then both cls and mask loss should be nonzero for
+    # random inputs.
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+    gt_masks = [(torch.rand((1, 256, 256)) > 0.5).float()]
+    one_gt_losses = self.loss(
+        mask_preds_x,
+        mask_preds_y,
+        cls_preds,
+        gt_labels,
+        gt_masks,
+        img_metas,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_bboxes_ignore)
+    onegt_mask_loss = one_gt_losses['loss_mask']
+    onegt_cls_loss = one_gt_losses['loss_cls']
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_mask_loss.item() > 0, 'mask loss should be non-zero'
+
+    # When the length of num_grids, scale_ranges, and num_levels are not equal.
+    with pytest.raises(AssertionError):
+        DecoupledSOLOHead(
+            num_classes=4,
+            in_channels=1,
+            num_grids=[36, 24, 16, 12],
+            loss_mask=dict(
+                type='DiceLoss',
+                use_sigmoid=True,
+                activate=False,
+                loss_weight=3.0),
+            loss_cls=dict(
+                type='FocalLoss',
+                use_sigmoid=True,
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0))
+
+    # When input feature length is not equal to num_levels.
+    with pytest.raises(AssertionError):
+        feat = [
+            torch.rand(1, 1, s // feat_size, s // feat_size)
+            for feat_size in [4, 8, 16, 32]
+        ]
+        self.forward(feat)
+
+
+def test_desolo_light_head_loss():
+    """Tests decoupled solo light head loss with empty and non-empty truth."""
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+    self = DecoupledSOLOLightHead(
+        num_classes=4,
+        in_channels=1,
+        num_grids=[40, 36, 24, 16, 12],
+        loss_mask=dict(
+            type='DiceLoss', use_sigmoid=True, activate=False,
+            loss_weight=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0))
+    feat = [
+        torch.rand(1, 1, s // feat_size, s // feat_size)
+        for feat_size in [4, 8, 16, 32, 64]
+    ]
+    mask_preds_x, mask_preds_y, cls_preds = self.forward(feat)
+    # Test that empty ground truth encourages the network to
+    # predict background.
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    gt_masks = [torch.empty((0, 550, 550))]
+    gt_bboxes_ignore = None
+    empty_gt_losses = self.loss(
+        mask_preds_x,
+        mask_preds_y,
+        cls_preds,
+        gt_labels,
+        gt_masks,
+        img_metas,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_bboxes_ignore)
+    # When there is no truth, the cls loss should be nonzero but there should
+    # be no mask loss.
+    empty_mask_loss = empty_gt_losses['loss_mask']
+    empty_cls_loss = empty_gt_losses['loss_cls']
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_mask_loss.item() == 0, (
+        'there should be no mask loss when there are no true masks')
+
+    # When truth is non-empty then both cls and mask loss should be nonzero for
+    # random inputs.
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+    gt_masks = [(torch.rand((1, 256, 256)) > 0.5).float()]
+    one_gt_losses = self.loss(
+        mask_preds_x,
+        mask_preds_y,
+        cls_preds,
+        gt_labels,
+        gt_masks,
+        img_metas,
+        gt_bboxes,
+        gt_bboxes_ignore=gt_bboxes_ignore)
+    onegt_mask_loss = one_gt_losses['loss_mask']
+    onegt_cls_loss = one_gt_losses['loss_cls']
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_mask_loss.item() > 0, 'mask loss should be non-zero'
+
+    # When the length of num_grids, scale_ranges, and num_levels are not equal.
+    with pytest.raises(AssertionError):
+        DecoupledSOLOLightHead(
+            num_classes=4,
+            in_channels=1,
+            num_grids=[36, 24, 16, 12],
+            loss_mask=dict(type='DiceLoss', use_sigmoid=True, loss_weight=3.0),
+            loss_cls=dict(
+                type='FocalLoss',
+                use_sigmoid=True,
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0))
+
+    # When input feature length is not equal to num_levels.
+    with pytest.raises(AssertionError):
+        feat = [
+            torch.rand(1, 1, s // feat_size, s // feat_size)
+            for feat_size in [4, 8, 16, 32]
+        ]
+        self.forward(feat)
diff --git a/tests/test_models/test_dense_heads/test_tood_head.py b/tests/test_models/test_dense_heads/test_tood_head.py
new file mode 100755
index 0000000..f96364d
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_tood_head.py
@@ -0,0 +1,128 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads import TOODHead
+
+
+def test_tood_head_loss():
+    """Tests tood head loss when truth is empty and non-empty."""
+
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+    train_cfg = mmcv.Config(
+        dict(
+            initial_epoch=4,
+            initial_assigner=dict(type='ATSSAssigner', topk=9),
+            assigner=dict(type='TaskAlignedAssigner', topk=13),
+            alpha=1,
+            beta=6,
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False))
+    test_cfg = mmcv.Config(
+        dict(
+            nms_pre=1000,
+            min_bbox_size=0,
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.6),
+            max_per_img=100))
+    # NOTE(review): test never calls .cuda(); CPU focal-loss caveat looks stale
+    self = TOODHead(
+        num_classes=80,
+        in_channels=1,
+        stacked_convs=6,
+        feat_channels=256,
+        anchor_type='anchor_free',
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        initial_loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            activated=True,  # use probability instead of logit as input
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_cls=dict(
+            type='QualityFocalLoss',
+            use_sigmoid=True,
+            activated=True,  # use probability instead of logit as input
+            beta=2.0,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        train_cfg=train_cfg,
+        test_cfg=test_cfg)
+    self.init_weights()
+    feat = [
+        torch.rand(1, 1, s // feat_size, s // feat_size)
+        for feat_size in [8, 16, 32, 64, 128]
+    ]
+    cls_scores, bbox_preds = self(feat)
+
+    # test initial assigner and losses
+    self.epoch = 0
+    # Test that empty ground truth encourages the network to predict background
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    gt_bboxes_ignore = None
+    empty_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                                img_metas, gt_bboxes_ignore)
+    # When there is no truth, the cls loss should be nonzero but there should
+    # be no box loss.
+    empty_cls_loss = empty_gt_losses['loss_cls']
+    empty_box_loss = empty_gt_losses['loss_bbox']
+    assert sum(empty_cls_loss).item() > 0, 'cls loss should be non-zero'
+    assert sum(empty_box_loss).item() == 0, (
+        'there should be no box loss when there are no true boxes')
+    # When truth is non-empty then both cls and box loss should be nonzero for
+    # random inputs
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                              img_metas, gt_bboxes_ignore)
+    onegt_cls_loss = one_gt_losses['loss_cls']
+    onegt_box_loss = one_gt_losses['loss_bbox']
+    assert sum(onegt_cls_loss).item() > 0, 'cls loss should be non-zero'
+    assert sum(onegt_box_loss).item() > 0, 'box loss should be non-zero'
+
+    # test task alignment assigner and losses
+    self.epoch = 10
+    # Test that empty ground truth encourages the network to predict background
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    gt_bboxes_ignore = None
+    empty_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                                img_metas, gt_bboxes_ignore)
+    # When there is no truth, the cls loss should be nonzero but there should
+    # be no box loss.
+    empty_cls_loss = empty_gt_losses['loss_cls']
+    empty_box_loss = empty_gt_losses['loss_bbox']
+    assert sum(empty_cls_loss).item() > 0, 'cls loss should be non-zero'
+    assert sum(empty_box_loss).item() == 0, (
+        'there should be no box loss when there are no true boxes')
+    # When truth is non-empty then both cls and box loss should be nonzero for
+    # random inputs
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                              img_metas, gt_bboxes_ignore)
+    onegt_cls_loss = one_gt_losses['loss_cls']
+    onegt_box_loss = one_gt_losses['loss_bbox']
+    assert sum(onegt_cls_loss).item() > 0, 'cls loss should be non-zero'
+    assert sum(onegt_box_loss).item() > 0, 'box loss should be non-zero'
diff --git a/tests/test_models/test_dense_heads/test_vfnet_head.py b/tests/test_models/test_dense_heads/test_vfnet_head.py
new file mode 100755
index 0000000..7fec4e5
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_vfnet_head.py
@@ -0,0 +1,63 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads import VFNetHead
+
+
+def test_vfnet_head_loss():
+    """Tests vfnet head loss when truth is empty and non-empty."""
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+    train_cfg = mmcv.Config(
+        dict(
+            assigner=dict(type='ATSSAssigner', topk=9),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False))
+    # Runs only with CUDA, since Focal Loss is not supported on CPU
+    self = VFNetHead(
+        num_classes=4,
+        in_channels=1,
+        train_cfg=train_cfg,
+        loss_cls=dict(type='VarifocalLoss', use_sigmoid=True, loss_weight=1.0))
+    if torch.cuda.is_available():
+        self.cuda()
+        feat = [
+            torch.rand(1, 1, s // feat_size, s // feat_size).cuda()
+            for feat_size in [4, 8, 16, 32, 64]
+        ]
+        cls_scores, bbox_preds, bbox_preds_refine = self.forward(feat)
+        # Test that empty ground truth encourages the network to predict
+        # background
+        gt_bboxes = [torch.empty((0, 4)).cuda()]
+        gt_labels = [torch.LongTensor([]).cuda()]
+        gt_bboxes_ignore = None
+        empty_gt_losses = self.loss(cls_scores, bbox_preds, bbox_preds_refine,
+                                    gt_bboxes, gt_labels, img_metas,
+                                    gt_bboxes_ignore)
+        # When there is no truth, the cls loss should be nonzero but there
+        # should be no box loss.
+        empty_cls_loss = empty_gt_losses['loss_cls']
+        empty_box_loss = empty_gt_losses['loss_bbox']
+        assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+        assert empty_box_loss.item() == 0, (
+            'there should be no box loss when there are no true boxes')
+
+        # When truth is non-empty then both cls and box loss should be nonzero
+        # for random inputs
+        gt_bboxes = [
+            torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]).cuda(),
+        ]
+        gt_labels = [torch.LongTensor([2]).cuda()]
+        one_gt_losses = self.loss(cls_scores, bbox_preds, bbox_preds_refine,
+                                  gt_bboxes, gt_labels, img_metas,
+                                  gt_bboxes_ignore)
+        onegt_cls_loss = one_gt_losses['loss_cls']
+        onegt_box_loss = one_gt_losses['loss_bbox']
+        assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+        assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
diff --git a/tests/test_models/test_dense_heads/test_yolact_head.py b/tests/test_models/test_dense_heads/test_yolact_head.py
new file mode 100755
index 0000000..e82e0d7
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_yolact_head.py
@@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads import YOLACTHead, YOLACTProtonet, YOLACTSegmHead
+
+
+def test_yolact_head_loss():
+    """Tests yolact head losses when truth is empty and non-empty."""
+    s = 550
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+    train_cfg = mmcv.Config(
+        dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.4,
+                min_pos_iou=0.,
+                ignore_iof_thr=-1,
+                gt_max_assign_all=False),
+            smoothl1_beta=1.,
+            allowed_border=-1,
+            pos_weight=-1,
+            neg_pos_ratio=3,
+            debug=False,
+            min_gt_box_wh=[4.0, 4.0]))
+    bbox_head = YOLACTHead(
+        num_classes=80,
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=3,
+            scales_per_octave=1,
+            base_sizes=[8, 16, 32, 64, 128],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[550.0 / x for x in [69, 35, 18, 9, 5]],
+            centers=[(550 * 0.5 / x, 550 * 0.5 / x)
+                     for x in [69, 35, 18, 9, 5]]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            reduction='none',
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.5),
+        num_head_convs=1,
+        num_protos=32,
+        use_ohem=True,
+        train_cfg=train_cfg)
+    segm_head = YOLACTSegmHead(
+        in_channels=256,
+        num_classes=80,
+        loss_segm=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0))
+    mask_head = YOLACTProtonet(
+        num_classes=80,
+        in_channels=256,
+        num_protos=32,
+        max_masks_to_train=100,
+        loss_mask_weight=6.125)
+    feat = [
+        torch.rand(1, 256, feat_size, feat_size)
+        for feat_size in [69, 35, 18, 9, 5]
+    ]
+    cls_score, bbox_pred, coeff_pred = bbox_head.forward(feat)
+    # Test that empty ground truth encourages the network to predict background
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    gt_masks = [torch.empty((0, 550, 550))]
+    gt_bboxes_ignore = None
+    empty_gt_losses, sampling_results = bbox_head.loss(
+        cls_score,
+        bbox_pred,
+        gt_bboxes,
+        gt_labels,
+        img_metas,
+        gt_bboxes_ignore=gt_bboxes_ignore)
+    # When there is no truth, the cls loss should be nonzero but there should
+    # be no box loss.
+    empty_cls_loss = sum(empty_gt_losses['loss_cls'])
+    empty_box_loss = sum(empty_gt_losses['loss_bbox'])
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+
+    # Test segm head and mask head with the same empty ground truth
+    segm_head_outs = segm_head(feat[0])
+    empty_segm_loss = segm_head.loss(segm_head_outs, gt_masks, gt_labels)
+    mask_pred = mask_head(feat[0], coeff_pred, gt_bboxes, img_metas,
+                          sampling_results)
+    empty_mask_loss = mask_head.loss(mask_pred, gt_masks, gt_bboxes, img_metas,
+                                     sampling_results)
+    # When there is no truth, the segm and mask loss should be zero.
+    empty_segm_loss = sum(empty_segm_loss['loss_segm'])
+    empty_mask_loss = sum(empty_mask_loss['loss_mask'])
+    assert empty_segm_loss.item() == 0, (
+        'there should be no segm loss when there are no true boxes')
+    assert empty_mask_loss == 0, (
+        'there should be no mask loss when there are no true boxes')
+
+    # When truth is non-empty then cls, box, mask, segm loss should be
+    # nonzero for random inputs.
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+    gt_masks = [(torch.rand((1, 550, 550)) > 0.5).float()]
+
+    one_gt_losses, sampling_results = bbox_head.loss(
+        cls_score,
+        bbox_pred,
+        gt_bboxes,
+        gt_labels,
+        img_metas,
+        gt_bboxes_ignore=gt_bboxes_ignore)
+    one_gt_cls_loss = sum(one_gt_losses['loss_cls'])
+    one_gt_box_loss = sum(one_gt_losses['loss_bbox'])
+    assert one_gt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert one_gt_box_loss.item() > 0, 'box loss should be non-zero'
+
+    one_gt_segm_loss = segm_head.loss(segm_head_outs, gt_masks, gt_labels)
+    mask_pred = mask_head(feat[0], coeff_pred, gt_bboxes, img_metas,
+                          sampling_results)
+    one_gt_mask_loss = mask_head.loss(mask_pred, gt_masks, gt_bboxes,
+                                      img_metas, sampling_results)
+    one_gt_segm_loss = sum(one_gt_segm_loss['loss_segm'])
+    one_gt_mask_loss = sum(one_gt_mask_loss['loss_mask'])
+    assert one_gt_segm_loss.item() > 0, 'segm loss should be non-zero'
+    assert one_gt_mask_loss.item() > 0, 'mask loss should be non-zero'
diff --git a/tests/test_models/test_dense_heads/test_yolof_head.py b/tests/test_models/test_dense_heads/test_yolof_head.py
new file mode 100755
index 0000000..9810374
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_yolof_head.py
@@ -0,0 +1,76 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.dense_heads import YOLOFHead
+
+
+def test_yolof_head_loss():
+    """Tests yolof head loss when truth is empty and non-empty."""
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+    train_cfg = mmcv.Config(
+        dict(
+            assigner=dict(
+                type='UniformAssigner',
+                pos_ignore_thr=0.15,
+                neg_ignore_thr=0.7),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False))
+    self = YOLOFHead(
+        num_classes=4,
+        in_channels=1,
+        reg_decoded_bbox=True,
+        train_cfg=train_cfg,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[1, 2, 4, 8, 16],
+            strides=[32]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1., 1., 1., 1.],
+            add_ctr_clamp=True,
+            ctr_clamp=32),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.0))
+    feat = [torch.rand(1, 1, s // 32, s // 32)]  # single level at stride 32
+    cls_scores, bbox_preds = self.forward(feat)
+
+    # Test that empty ground truth encourages the network to predict background
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    gt_bboxes_ignore = None
+    empty_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                                img_metas, gt_bboxes_ignore)
+    # When there is no truth, the cls loss should be nonzero but there should
+    # be no box loss.
+    empty_cls_loss = empty_gt_losses['loss_cls']
+    empty_box_loss = empty_gt_losses['loss_bbox']
+    assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+
+    # When truth is non-empty then both cls and box loss should be nonzero for
+    # random inputs
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                              img_metas, gt_bboxes_ignore)
+    onegt_cls_loss = one_gt_losses['loss_cls']
+    onegt_box_loss = one_gt_losses['loss_bbox']
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
diff --git a/tests/test_models/test_dense_heads/test_yolox_head.py b/tests/test_models/test_dense_heads/test_yolox_head.py
new file mode 100755
index 0000000..f82c8a0
--- /dev/null
+++ b/tests/test_models/test_dense_heads/test_yolox_head.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+
+from mmdet.models.dense_heads import YOLOXHead
+
+
+def test_yolox_head_loss():
+    """Tests yolox head loss for empty, non-empty and out-of-bound truth."""
+    s = 256
+    img_metas = [{
+        'img_shape': (s, s, 3),
+        'scale_factor': 1,
+        'pad_shape': (s, s, 3)
+    }]
+    train_cfg = mmcv.Config(
+        dict(
+            assigner=dict(
+                type='SimOTAAssigner',
+                center_radius=2.5,
+                candidate_topk=10,
+                iou_weight=3.0,
+                cls_weight=1.0)))
+    self = YOLOXHead(
+        num_classes=4, in_channels=1, use_depthwise=False, train_cfg=train_cfg)
+    assert not self.use_l1
+    assert isinstance(self.multi_level_cls_convs[0][0], ConvModule)
+
+    feat = [
+        torch.rand(1, 1, s // feat_size, s // feat_size)
+        for feat_size in [4, 8, 16]
+    ]
+    cls_scores, bbox_preds, objectnesses = self.forward(feat)
+
+    # Test that empty ground truth encourages the network to predict background
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+    empty_gt_losses = self.loss(cls_scores, bbox_preds, objectnesses,
+                                gt_bboxes, gt_labels, img_metas)
+    # When there is no truth, the cls and box losses should be zero; only
+    # the objectness loss should be nonzero.
+    empty_cls_loss = empty_gt_losses['loss_cls'].sum()
+    empty_box_loss = empty_gt_losses['loss_bbox'].sum()
+    empty_obj_loss = empty_gt_losses['loss_obj'].sum()
+    assert empty_cls_loss.item() == 0, (
+        'there should be no cls loss when there are no true boxes')
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when there are no true boxes')
+    assert empty_obj_loss.item() > 0, 'objectness loss should be non-zero'
+
+    # When truth is non-empty then cls, box, obj and l1 loss should all be
+    # nonzero; the predictions of the first head serve as random inputs
+    self = YOLOXHead(
+        num_classes=4, in_channels=1, use_depthwise=True, train_cfg=train_cfg)
+    assert isinstance(self.multi_level_cls_convs[0][0],
+                      DepthwiseSeparableConvModule)
+    self.use_l1 = True
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+    one_gt_losses = self.loss(cls_scores, bbox_preds, objectnesses, gt_bboxes,
+                              gt_labels, img_metas)
+    onegt_cls_loss = one_gt_losses['loss_cls'].sum()
+    onegt_box_loss = one_gt_losses['loss_bbox'].sum()
+    onegt_obj_loss = one_gt_losses['loss_obj'].sum()
+    onegt_l1_loss = one_gt_losses['loss_l1'].sum()
+    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
+    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'
+    assert onegt_obj_loss.item() > 0, 'obj loss should be non-zero'
+    assert onegt_l1_loss.item() > 0, 'l1 loss should be non-zero'
+
+    # Test ground truth out of bound
+    gt_bboxes = [torch.Tensor([[s * 4, s * 4, s * 4 + 10, s * 4 + 10]])]
+    gt_labels = [torch.LongTensor([2])]
+    empty_gt_losses = self.loss(cls_scores, bbox_preds, objectnesses,
+                                gt_bboxes, gt_labels, img_metas)
+    # When gt_bboxes out of bound, the assign results should be empty,
+    # so the cls and bbox loss should be zero.
+    empty_cls_loss = empty_gt_losses['loss_cls'].sum()
+    empty_box_loss = empty_gt_losses['loss_bbox'].sum()
+    empty_obj_loss = empty_gt_losses['loss_obj'].sum()
+    assert empty_cls_loss.item() == 0, (
+        'there should be no cls loss when gt_bboxes out of bound')
+    assert empty_box_loss.item() == 0, (
+        'there should be no box loss when gt_bboxes out of bound')
+    assert empty_obj_loss.item() > 0, 'objectness loss should be non-zero'
diff --git a/tests/test_models/test_forward.py b/tests/test_models/test_forward.py
new file mode 100755
index 0000000..98f75b8
--- /dev/null
+++ b/tests/test_models/test_forward.py
@@ -0,0 +1,935 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""pytest tests/test_forward.py."""
+import copy
+from os.path import dirname, exists, join
+
+import numpy as np
+import pytest
+import torch
+
+
+def _get_config_directory():
+    """Find the predefined detector config directory."""
+    try:
+        # Assume we are running in the source mmdetection repo
+        repo_dpath = dirname(dirname(dirname(__file__)))
+    except NameError:
+        # For IPython development when this __file__ is not defined
+        import mmdet
+        repo_dpath = dirname(dirname(mmdet.__file__))
+    config_dpath = join(repo_dpath, 'configs')
+    if not exists(config_dpath):
+        raise Exception('Cannot find config path')
+    return config_dpath
+
+
+def _get_config_module(fname):
+    """Load a configuration as a python module."""
+    from mmcv import Config
+    config_dpath = _get_config_directory()
+    config_fpath = join(config_dpath, fname)
+    config_mod = Config.fromfile(config_fpath)
+    return config_mod
+
+
+def _get_detector_cfg(fname):
+    """Grab configs necessary to create a detector.
+
+    These are deep copied to allow for safe modification of parameters without
+    influencing other tests.
+    """
+    config = _get_config_module(fname)
+    model = copy.deepcopy(config.model)
+    return model
+
+
+def _replace_r50_with_r18(model):
+    """Replace ResNet50 with ResNet18 in config."""
+    model = copy.deepcopy(model)
+    if model.backbone.type == 'ResNet':
+        model.backbone.depth = 18
+        model.backbone.base_channels = 2
+        model.neck.in_channels = [2, 4, 8, 16]
+    return model
+
+
+def test_sparse_rcnn_forward():
+    config_path = 'sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco.py'
+    model = _get_detector_cfg(config_path)
+    model = _replace_r50_with_r18(model)
+    model.backbone.init_cfg = None
+    from mmdet.models import build_detector
+    detector = build_detector(model)
+    detector.init_weights()
+    input_shape = (1, 3, 100, 100)
+    mm_inputs = _demo_mm_inputs(input_shape, num_items=[5])
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+    # Test forward train with non-empty truth batch
+    detector.train()
+    gt_bboxes = mm_inputs['gt_bboxes']
+    gt_bboxes = [item for item in gt_bboxes]
+    gt_labels = mm_inputs['gt_labels']
+    gt_labels = [item for item in gt_labels]
+    losses = detector.forward(
+        imgs,
+        img_metas,
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels,
+        return_loss=True)
+    assert isinstance(losses, dict)
+    loss, _ = detector._parse_losses(losses)
+    assert float(loss.item()) > 0
+    detector.forward_dummy(imgs)
+
+    # Test forward train with an empty truth batch
+    mm_inputs = _demo_mm_inputs(input_shape, num_items=[0])
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+    gt_bboxes = mm_inputs['gt_bboxes']
+    gt_bboxes = [item for item in gt_bboxes]
+    gt_labels = mm_inputs['gt_labels']
+    gt_labels = [item for item in gt_labels]
+    losses = detector.forward(
+        imgs,
+        img_metas,
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels,
+        return_loss=True)
+    assert isinstance(losses, dict)
+    loss, _ = detector._parse_losses(losses)
+    assert float(loss.item()) > 0
+
+    # Test forward test
+    detector.eval()
+    with torch.no_grad():
+        img_list = [g[None, :] for g in imgs]
+        batch_results = []
+        for one_img, one_meta in zip(img_list, img_metas):
+            result = detector.forward([one_img], [[one_meta]],
+                                      rescale=True,
+                                      return_loss=False)
+            batch_results.append(result)
+
+    # test empty proposal in roi_head
+    with torch.no_grad():
+        # test no proposal in the whole batch
+        detector.roi_head.simple_test([imgs[0][None, :]], torch.empty(
+            (1, 0, 4)), torch.empty((1, 100, 4)), [img_metas[0]],
+                                      torch.ones((1, 4)))
+
+
+def test_rpn_forward():
+    model = _get_detector_cfg('rpn/rpn_r50_fpn_1x_coco.py')
+    model = _replace_r50_with_r18(model)
+    model.backbone.init_cfg = None
+
+    from mmdet.models import build_detector
+    detector = build_detector(model)
+
+    input_shape = (1, 3, 100, 100)
+    mm_inputs = _demo_mm_inputs(input_shape)
+
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+
+    # Test forward train
+    gt_bboxes = mm_inputs['gt_bboxes']
+    losses = detector.forward(
+        imgs, img_metas, gt_bboxes=gt_bboxes, return_loss=True)
+    assert isinstance(losses, dict)
+
+    # Test forward test
+    with torch.no_grad():
+        img_list = [g[None, :] for g in imgs]
+        batch_results = []
+        for one_img, one_meta in zip(img_list, img_metas):
+            result = detector.forward([one_img], [[one_meta]],
+                                      return_loss=False)
+            batch_results.append(result)
+
+
+@pytest.mark.parametrize(
+    'cfg_file',
+    [
+        'reppoints/reppoints_moment_r50_fpn_1x_coco.py',
+        'retinanet/retinanet_r50_fpn_1x_coco.py',
+        'guided_anchoring/ga_retinanet_r50_fpn_1x_coco.py',
+        'ghm/retinanet_ghm_r50_fpn_1x_coco.py',
+        'fcos/fcos_center_r50_caffe_fpn_gn-head_1x_coco.py',
+        'foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco.py',
+        # 'free_anchor/retinanet_free_anchor_r50_fpn_1x_coco.py',
+        # 'atss/atss_r50_fpn_1x_coco.py',  # not ready for topk
+        'yolo/yolov3_mobilenetv2_320_300e_coco.py',
+        'yolox/yolox_tiny_8x8_300e_coco.py'
+    ])
+def test_single_stage_forward_gpu(cfg_file):
+    if not torch.cuda.is_available():
+        import pytest
+        pytest.skip('test requires GPU and torch+cuda')
+
+    model = _get_detector_cfg(cfg_file)
+    model = _replace_r50_with_r18(model)
+    model.backbone.init_cfg = None
+
+    from mmdet.models import build_detector
+    detector = build_detector(model)
+
+    input_shape = (2, 3, 128, 128)
+    mm_inputs = _demo_mm_inputs(input_shape)
+
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+
+    detector = detector.cuda()
+    imgs = imgs.cuda()
+    # Test forward train
+    gt_bboxes = [b.cuda() for b in mm_inputs['gt_bboxes']]
+    gt_labels = [g.cuda() for g in mm_inputs['gt_labels']]
+    losses = detector.forward(
+        imgs,
+        img_metas,
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels,
+        return_loss=True)
+    assert isinstance(losses, dict)
+
+    # Test forward test
+    detector.eval()
+    with torch.no_grad():
+        img_list = [g[None, :] for g in imgs]
+        batch_results = []
+        for one_img, one_meta in zip(img_list, img_metas):
+            result = detector.forward([one_img], [[one_meta]],
+                                      return_loss=False)
+            batch_results.append(result)
+
+
+def test_faster_rcnn_ohem_forward():
+    model = _get_detector_cfg(
+        'faster_rcnn/faster_rcnn_r50_fpn_ohem_1x_coco.py')
+    model = _replace_r50_with_r18(model)
+    model.backbone.init_cfg = None
+
+    from mmdet.models import build_detector
+    detector = build_detector(model)
+
+    input_shape = (1, 3, 100, 100)
+
+    # Test forward train with a non-empty truth batch
+    mm_inputs = _demo_mm_inputs(input_shape, num_items=[10])
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+    gt_bboxes = mm_inputs['gt_bboxes']
+    gt_labels = mm_inputs['gt_labels']
+    losses = detector.forward(
+        imgs,
+        img_metas,
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels,
+        return_loss=True)
+    assert isinstance(losses, dict)
+    loss, _ = detector._parse_losses(losses)
+    assert float(loss.item()) > 0
+
+    # Test forward train with an empty truth batch
+    mm_inputs = _demo_mm_inputs(input_shape, num_items=[0])
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+    gt_bboxes = mm_inputs['gt_bboxes']
+    gt_labels = mm_inputs['gt_labels']
+    losses = detector.forward(
+        imgs,
+        img_metas,
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels,
+        return_loss=True)
+    assert isinstance(losses, dict)
+    loss, _ = detector._parse_losses(losses)
+    assert float(loss.item()) > 0
+
+    # Test RoI forward train with an empty proposals
+    feature = detector.extract_feat(imgs[0][None, :])
+    losses = detector.roi_head.forward_train(
+        feature,
+        img_metas, [torch.empty((0, 5))],
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels)
+    assert isinstance(losses, dict)
+
+
+@pytest.mark.parametrize(
+    'cfg_file',
+    [
+        # 'cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py',
+        'mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py',
+        # 'grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py',
+        # 'ms_rcnn/ms_rcnn_r50_fpn_1x_coco.py',
+        # 'htc/htc_r50_fpn_1x_coco.py',
+        # 'panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py',
+        # 'scnet/scnet_r50_fpn_20e_coco.py',
+        # 'seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py'  # noqa: E501
+    ])
+def test_two_stage_forward(cfg_file):
+    models_with_semantic = [
+        'htc/htc_r50_fpn_1x_coco.py',
+        'panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py',
+        'scnet/scnet_r50_fpn_20e_coco.py',
+    ]
+    if cfg_file in models_with_semantic:
+        with_semantic = True
+    else:
+        with_semantic = False
+
+    model = _get_detector_cfg(cfg_file)
+    model = _replace_r50_with_r18(model)
+    model.backbone.init_cfg = None
+
+    # Save cost
+    if cfg_file in [
+            'seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.py'  # noqa: E501
+    ]:
+        model.roi_head.bbox_head.num_classes = 80
+        model.roi_head.bbox_head.loss_cls.num_classes = 80
+        model.roi_head.mask_head.num_classes = 80
+        model.test_cfg.rcnn.score_thr = 0.05
+        model.test_cfg.rcnn.max_per_img = 100
+
+    from mmdet.models import build_detector
+    detector = build_detector(model)
+
+    input_shape = (1, 3, 128, 128)
+
+    # Test forward train with a non-empty truth batch
+    mm_inputs = _demo_mm_inputs(
+        input_shape, num_items=[10], with_semantic=with_semantic)
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+    losses = detector.forward(imgs, img_metas, return_loss=True, **mm_inputs)
+    assert isinstance(losses, dict)
+    loss, _ = detector._parse_losses(losses)
+    loss.requires_grad_(True)
+    assert float(loss.item()) > 0
+    loss.backward()
+
+    # Test forward train with an empty truth batch
+    mm_inputs = _demo_mm_inputs(
+        input_shape, num_items=[0], with_semantic=with_semantic)
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+    losses = detector.forward(imgs, img_metas, return_loss=True, **mm_inputs)
+    assert isinstance(losses, dict)
+    loss, _ = detector._parse_losses(losses)
+    loss.requires_grad_(True)
+    assert float(loss.item()) > 0
+    loss.backward()
+
+    # Test RoI forward train with an empty proposals
+    if cfg_file in [
+            'panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py'  # noqa: E501
+    ]:
+        mm_inputs.pop('gt_semantic_seg')
+
+    feature = detector.extract_feat(imgs[0][None, :])
+    losses = detector.roi_head.forward_train(feature, img_metas,
+                                             [torch.empty(
+                                                 (0, 5))], **mm_inputs)
+    assert isinstance(losses, dict)
+
+    # Test forward test
+    with torch.no_grad():
+        img_list = [g[None, :] for g in imgs]
+        batch_results = []
+        for one_img, one_meta in zip(img_list, img_metas):
+            result = detector.forward([one_img], [[one_meta]],
+                                      return_loss=False)
+            batch_results.append(result)
+    cascade_models = [
+        'cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py',
+        'htc/htc_r50_fpn_1x_coco.py',
+        'scnet/scnet_r50_fpn_20e_coco.py',
+    ]
+    # test empty proposal in roi_head
+    with torch.no_grad():
+        # test no proposal in the whole batch
+        detector.simple_test(
+            imgs[0][None, :], [img_metas[0]], proposals=[torch.empty((0, 4))])
+
+        # test no proposal of aug
+        features = detector.extract_feats([imgs[0][None, :]] * 2)
+        detector.roi_head.aug_test(features, [torch.empty((0, 4))] * 2,
+                                   [[img_metas[0]]] * 2)
+
+        # test rcnn_test_cfg is None
+        if cfg_file not in cascade_models:
+            feature = detector.extract_feat(imgs[0][None, :])
+            bboxes, scores = detector.roi_head.simple_test_bboxes(
+                feature, [img_metas[0]], [torch.empty((0, 4))], None)
+            assert all([bbox.shape == torch.Size((0, 4)) for bbox in bboxes])
+            assert all([
+                score.shape == torch.Size(
+                    (0, detector.roi_head.bbox_head.fc_cls.out_features))
+                for score in scores
+            ])
+
+        # test no proposal in the some image
+        x1y1 = torch.randint(1, 100, (10, 2)).float()
+        # x2y2 must be greater than x1y1
+        x2y2 = x1y1 + torch.randint(1, 100, (10, 2))
+        detector.simple_test(
+            imgs[0][None, :].repeat(2, 1, 1, 1), [img_metas[0]] * 2,
+            proposals=[torch.empty((0, 4)),
+                       torch.cat([x1y1, x2y2], dim=-1)])
+
+        # test no proposal of aug
+        detector.roi_head.aug_test(
+            features, [torch.cat([x1y1, x2y2], dim=-1),
+                       torch.empty((0, 4))], [[img_metas[0]]] * 2)
+
+        # test rcnn_test_cfg is None
+        if cfg_file not in cascade_models:
+            feature = detector.extract_feat(imgs[0][None, :].repeat(
+                2, 1, 1, 1))
+            bboxes, scores = detector.roi_head.simple_test_bboxes(
+                feature, [img_metas[0]] * 2,
+                [torch.empty((0, 4)),
+                 torch.cat([x1y1, x2y2], dim=-1)], None)
+            assert bboxes[0].shape == torch.Size((0, 4))
+            assert scores[0].shape == torch.Size(
+                (0, detector.roi_head.bbox_head.fc_cls.out_features))
+
+
+@pytest.mark.parametrize(
+    'cfg_file', ['ghm/retinanet_ghm_r50_fpn_1x_coco.py', 'ssd/ssd300_coco.py'])
+def test_single_stage_forward_cpu(cfg_file):
+    model = _get_detector_cfg(cfg_file)
+    model = _replace_r50_with_r18(model)
+    model.backbone.init_cfg = None
+
+    from mmdet.models import build_detector
+    detector = build_detector(model)
+
+    input_shape = (1, 3, 300, 300)
+    mm_inputs = _demo_mm_inputs(input_shape)
+
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+
+    # Test forward train
+    gt_bboxes = mm_inputs['gt_bboxes']
+    gt_labels = mm_inputs['gt_labels']
+    losses = detector.forward(
+        imgs,
+        img_metas,
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels,
+        return_loss=True)
+    assert isinstance(losses, dict)
+
+    # Test forward test
+    detector.eval()
+    with torch.no_grad():
+        img_list = [g[None, :] for g in imgs]
+        batch_results = []
+        for one_img, one_meta in zip(img_list, img_metas):
+            result = detector.forward([one_img], [[one_meta]],
+                                      return_loss=False)
+            batch_results.append(result)
+
+
+def _demo_mm_inputs(input_shape=(1, 3, 300, 300),
+                    num_items=None, num_classes=10,
+                    with_semantic=False):  # yapf: disable
+    """Create a superset of inputs needed to run test or train batches.
+
+    Args:
+        input_shape (tuple):
+            input batch dimensions
+
+        num_items (None | List[int]):
+            specifies the number of boxes in each batch item
+
+        num_classes (int):
+            number of different labels a box might have
+    """
+    from mmdet.core import BitmapMasks
+
+    (N, C, H, W) = input_shape
+
+    rng = np.random.RandomState(0)
+
+    imgs = rng.rand(*input_shape)
+
+    img_metas = [{
+        'img_shape': (H, W, C),
+        'ori_shape': (H, W, C),
+        'pad_shape': (H, W, C),
+        'filename': '<demo>.png',
+        'scale_factor': np.array([1.1, 1.2, 1.1, 1.2]),
+        'flip': False,
+        'flip_direction': None,
+    } for _ in range(N)]
+
+    gt_bboxes = []
+    gt_labels = []
+    gt_masks = []
+
+    for batch_idx in range(N):
+        if num_items is None:
+            num_boxes = rng.randint(1, 10)
+        else:
+            num_boxes = num_items[batch_idx]
+
+        cx, cy, bw, bh = rng.rand(num_boxes, 4).T
+
+        tl_x = ((cx * W) - (W * bw / 2)).clip(0, W)
+        tl_y = ((cy * H) - (H * bh / 2)).clip(0, H)
+        br_x = ((cx * W) + (W * bw / 2)).clip(0, W)
+        br_y = ((cy * H) + (H * bh / 2)).clip(0, H)
+
+        boxes = np.vstack([tl_x, tl_y, br_x, br_y]).T
+        class_idxs = rng.randint(1, num_classes, size=num_boxes)
+
+        gt_bboxes.append(torch.FloatTensor(boxes))
+        gt_labels.append(torch.LongTensor(class_idxs))
+
+    mask = np.random.randint(0, 2, (len(boxes), H, W), dtype=np.uint8)
+    gt_masks.append(BitmapMasks(mask, H, W))
+
+    mm_inputs = {
+        'imgs': torch.FloatTensor(imgs).requires_grad_(True),
+        'img_metas': img_metas,
+        'gt_bboxes': gt_bboxes,
+        'gt_labels': gt_labels,
+        'gt_bboxes_ignore': None,
+        'gt_masks': gt_masks,
+    }
+
+    if with_semantic:
+        # assume gt_semantic_seg using scale 1/8 of the img
+        gt_semantic_seg = np.random.randint(
+            0, num_classes, (1, 1, H // 8, W // 8), dtype=np.uint8)
+        mm_inputs.update(
+            {'gt_semantic_seg': torch.ByteTensor(gt_semantic_seg)})
+
+    return mm_inputs
+
+
+def test_yolact_forward():
+    model = _get_detector_cfg('yolact/yolact_r50_1x8_coco.py')
+    model = _replace_r50_with_r18(model)
+    model.backbone.init_cfg = None
+
+    from mmdet.models import build_detector
+    detector = build_detector(model)
+
+    input_shape = (1, 3, 100, 100)
+    mm_inputs = _demo_mm_inputs(input_shape)
+
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+
+    # Test forward train
+    detector.train()
+    gt_bboxes = mm_inputs['gt_bboxes']
+    gt_labels = mm_inputs['gt_labels']
+    gt_masks = mm_inputs['gt_masks']
+    losses = detector.forward(
+        imgs,
+        img_metas,
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels,
+        gt_masks=gt_masks,
+        return_loss=True)
+    assert isinstance(losses, dict)
+
+    # Test forward dummy for get_flops
+    detector.forward_dummy(imgs)
+
+    # Test forward test
+    detector.eval()
+    with torch.no_grad():
+        img_list = [g[None, :] for g in imgs]
+        batch_results = []
+        for one_img, one_meta in zip(img_list, img_metas):
+            result = detector.forward([one_img], [[one_meta]],
+                                      rescale=True,
+                                      return_loss=False)
+            batch_results.append(result)
+
+
+def test_detr_forward():
+    model = _get_detector_cfg('detr/detr_r50_8x2_150e_coco.py')
+    model.backbone.depth = 18
+    model.bbox_head.in_channels = 512
+    model.backbone.init_cfg = None
+
+    from mmdet.models import build_detector
+    detector = build_detector(model)
+
+    input_shape = (1, 3, 100, 100)
+    mm_inputs = _demo_mm_inputs(input_shape)
+
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+
+    # Test forward train with non-empty truth batch
+    detector.train()
+    gt_bboxes = mm_inputs['gt_bboxes']
+    gt_labels = mm_inputs['gt_labels']
+    losses = detector.forward(
+        imgs,
+        img_metas,
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels,
+        return_loss=True)
+    assert isinstance(losses, dict)
+    loss, _ = detector._parse_losses(losses)
+    assert float(loss.item()) > 0
+
+    # Test forward train with an empty truth batch
+    mm_inputs = _demo_mm_inputs(input_shape, num_items=[0])
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+    gt_bboxes = mm_inputs['gt_bboxes']
+    gt_labels = mm_inputs['gt_labels']
+    losses = detector.forward(
+        imgs,
+        img_metas,
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels,
+        return_loss=True)
+    assert isinstance(losses, dict)
+    loss, _ = detector._parse_losses(losses)
+    assert float(loss.item()) > 0
+
+    # Test forward test
+    detector.eval()
+    with torch.no_grad():
+        img_list = [g[None, :] for g in imgs]
+        batch_results = []
+        for one_img, one_meta in zip(img_list, img_metas):
+            result = detector.forward([one_img], [[one_meta]],
+                                      rescale=True,
+                                      return_loss=False)
+            batch_results.append(result)
+
+
+def test_inference_detector():
+    from mmcv import ConfigDict
+
+    from mmdet.apis import inference_detector
+    from mmdet.models import build_detector
+
+    # small RetinaNet
+    num_class = 3
+    model_dict = dict(
+        type='RetinaNet',
+        backbone=dict(
+            type='ResNet',
+            depth=18,
+            num_stages=4,
+            out_indices=(3, ),
+            norm_cfg=dict(type='BN', requires_grad=False),
+            norm_eval=True,
+            style='pytorch'),
+        neck=None,
+        bbox_head=dict(
+            type='RetinaHead',
+            num_classes=num_class,
+            in_channels=512,
+            stacked_convs=1,
+            feat_channels=256,
+            anchor_generator=dict(
+                type='AnchorGenerator',
+                octave_base_scale=4,
+                scales_per_octave=3,
+                ratios=[0.5],
+                strides=[32]),
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[.0, .0, .0, .0],
+                target_stds=[1.0, 1.0, 1.0, 1.0]),
+        ),
+        test_cfg=dict(
+            nms_pre=1000,
+            min_bbox_size=0,
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100))
+
+    rng = np.random.RandomState(0)
+    img1 = rng.rand(100, 100, 3)
+    img2 = rng.rand(100, 100, 3)
+
+    model = build_detector(ConfigDict(model_dict))
+    config = _get_config_module('retinanet/retinanet_r50_fpn_1x_coco.py')
+    model.cfg = config
+    # test single image
+    result = inference_detector(model, img1)
+    assert len(result) == num_class
+    # test multiple image
+    result = inference_detector(model, [img1, img2])
+    assert len(result) == 2 and len(result[0]) == num_class
+
+
+def test_yolox_random_size():
+    from mmdet.models import build_detector
+    model = _get_detector_cfg('yolox/yolox_tiny_8x8_300e_coco.py')
+    model.random_size_range = (2, 2)
+    model.input_size = (64, 96)
+    model.random_size_interval = 1
+
+    detector = build_detector(model)
+    input_shape = (1, 3, 64, 64)
+    mm_inputs = _demo_mm_inputs(input_shape)
+
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+
+    # Test forward train with non-empty truth batch
+    detector.train()
+    gt_bboxes = mm_inputs['gt_bboxes']
+    gt_labels = mm_inputs['gt_labels']
+    detector.forward(
+        imgs,
+        img_metas,
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels,
+        return_loss=True)
+    assert detector._input_size == (64, 96)
+
+
+def test_maskformer_forward():
+    model_cfg = _get_detector_cfg(
+        'maskformer/maskformer_r50_mstrain_16x1_75e_coco.py')
+    base_channels = 32
+    model_cfg.backbone.depth = 18
+    model_cfg.backbone.init_cfg = None
+    model_cfg.backbone.base_channels = base_channels
+    model_cfg.panoptic_head.in_channels = [
+        base_channels * 2**i for i in range(4)
+    ]
+    model_cfg.panoptic_head.feat_channels = base_channels
+    model_cfg.panoptic_head.out_channels = base_channels
+    model_cfg.panoptic_head.pixel_decoder.encoder.\
+        transformerlayers.attn_cfgs.embed_dims = base_channels
+    model_cfg.panoptic_head.pixel_decoder.encoder.\
+        transformerlayers.ffn_cfgs.embed_dims = base_channels
+    model_cfg.panoptic_head.pixel_decoder.encoder.\
+        transformerlayers.ffn_cfgs.feedforward_channels = base_channels * 8
+    model_cfg.panoptic_head.pixel_decoder.\
+        positional_encoding.num_feats = base_channels // 2
+    model_cfg.panoptic_head.positional_encoding.\
+        num_feats = base_channels // 2
+    model_cfg.panoptic_head.transformer_decoder.\
+        transformerlayers.attn_cfgs.embed_dims = base_channels
+    model_cfg.panoptic_head.transformer_decoder.\
+        transformerlayers.ffn_cfgs.embed_dims = base_channels
+    model_cfg.panoptic_head.transformer_decoder.\
+        transformerlayers.ffn_cfgs.feedforward_channels = base_channels * 8
+    model_cfg.panoptic_head.transformer_decoder.\
+        transformerlayers.feedforward_channels = base_channels * 8
+
+    from mmdet.core import BitmapMasks
+    from mmdet.models import build_detector
+    detector = build_detector(model_cfg)
+
+    # Test forward train with non-empty truth batch
+    detector.train()
+    img_metas = [
+        {
+            'batch_input_shape': (128, 160),
+            'img_shape': (126, 160, 3),
+            'ori_shape': (63, 80, 3),
+            'pad_shape': (128, 160, 3)
+        },
+    ]
+    img = torch.rand((1, 3, 128, 160))
+    gt_bboxes = None
+    gt_labels = [
+        torch.tensor([10]).long(),
+    ]
+    thing_mask1 = np.zeros((1, 128, 160), dtype=np.int32)
+    thing_mask1[0, :50] = 1
+    gt_masks = [
+        BitmapMasks(thing_mask1, 128, 160),
+    ]
+    stuff_mask1 = torch.zeros((1, 128, 160)).long()
+    stuff_mask1[0, :50] = 10
+    stuff_mask1[0, 50:] = 100
+    gt_semantic_seg = [
+        stuff_mask1,
+    ]
+    losses = detector.forward(
+        img=img,
+        img_metas=img_metas,
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels,
+        gt_masks=gt_masks,
+        gt_semantic_seg=gt_semantic_seg,
+        return_loss=True)
+    assert isinstance(losses, dict)
+    loss, _ = detector._parse_losses(losses)
+    assert float(loss.item()) > 0
+
+    # Test forward train with an empty truth batch
+    gt_bboxes = [
+        torch.empty((0, 4)).float(),
+    ]
+    gt_labels = [
+        torch.empty((0, )).long(),
+    ]
+    mask = np.zeros((0, 128, 160), dtype=np.uint8)
+    gt_masks = [
+        BitmapMasks(mask, 128, 160),
+    ]
+    gt_semantic_seg = [
+        torch.randint(0, 133, (0, 128, 160)),
+    ]
+    losses = detector.forward(
+        img,
+        img_metas,
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels,
+        gt_masks=gt_masks,
+        gt_semantic_seg=gt_semantic_seg,
+        return_loss=True)
+    assert isinstance(losses, dict)
+    loss, _ = detector._parse_losses(losses)
+    assert float(loss.item()) > 0
+
+    # Test forward test
+    detector.eval()
+    with torch.no_grad():
+        img_list = [g[None, :] for g in img]
+        batch_results = []
+        for one_img, one_meta in zip(img_list, img_metas):
+            result = detector.forward([one_img], [[one_meta]],
+                                      rescale=True,
+                                      return_loss=False)
+        batch_results.append(result)
+
+
+@pytest.mark.parametrize('cfg_file', [
+    'mask2former/mask2former_r50_lsj_8x2_50e_coco.py',
+    'mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic.py'
+])
+def test_mask2former_forward(cfg_file):
+    """Test Mask2Former train/test forward (panoptic and instance configs)."""
+    model_cfg = _get_detector_cfg(cfg_file)
+    base_channels = 32
+    model_cfg.backbone.depth = 18
+    model_cfg.backbone.init_cfg = None
+    model_cfg.backbone.base_channels = base_channels
+    model_cfg.panoptic_head.in_channels = [
+        base_channels * 2**i for i in range(4)
+    ]
+    model_cfg.panoptic_head.feat_channels = base_channels
+    model_cfg.panoptic_head.out_channels = base_channels
+    model_cfg.panoptic_head.pixel_decoder.encoder.\
+        transformerlayers.attn_cfgs.embed_dims = base_channels
+    model_cfg.panoptic_head.pixel_decoder.encoder.\
+        transformerlayers.ffn_cfgs.embed_dims = base_channels
+    model_cfg.panoptic_head.pixel_decoder.encoder.\
+        transformerlayers.ffn_cfgs.feedforward_channels = base_channels * 4
+    model_cfg.panoptic_head.pixel_decoder.\
+        positional_encoding.num_feats = base_channels // 2
+    model_cfg.panoptic_head.positional_encoding.\
+        num_feats = base_channels // 2
+    model_cfg.panoptic_head.transformer_decoder.\
+        transformerlayers.attn_cfgs.embed_dims = base_channels
+    model_cfg.panoptic_head.transformer_decoder.\
+        transformerlayers.ffn_cfgs.embed_dims = base_channels
+    model_cfg.panoptic_head.transformer_decoder.\
+        transformerlayers.ffn_cfgs.feedforward_channels = base_channels * 8
+    model_cfg.panoptic_head.transformer_decoder.\
+        transformerlayers.feedforward_channels = base_channels * 8
+
+    num_stuff_classes = model_cfg.panoptic_head.num_stuff_classes
+
+    from mmdet.core import BitmapMasks
+    from mmdet.models import build_detector
+    detector = build_detector(model_cfg)
+
+    def _forward_train():
+        losses = detector.forward(
+            img,
+            img_metas,
+            gt_bboxes=gt_bboxes,
+            gt_labels=gt_labels,
+            gt_masks=gt_masks,
+            gt_semantic_seg=gt_semantic_seg,
+            return_loss=True)
+        assert isinstance(losses, dict)
+        loss, _ = detector._parse_losses(losses)
+        assert float(loss.item()) > 0
+
+    # Test forward train with non-empty truth batch
+    detector.train()
+    img_metas = [
+        {
+            'batch_input_shape': (128, 160),
+            'img_shape': (126, 160, 3),
+            'ori_shape': (63, 80, 3),
+            'pad_shape': (128, 160, 3)
+        },
+    ]
+    img = torch.rand((1, 3, 128, 160))
+    gt_bboxes = None
+    gt_labels = [
+        torch.tensor([10]).long(),
+    ]
+    thing_mask1 = np.zeros((1, 128, 160), dtype=np.int32)
+    thing_mask1[0, :50] = 1
+    gt_masks = [
+        BitmapMasks(thing_mask1, 128, 160),
+    ]
+    stuff_mask1 = torch.zeros((1, 128, 160)).long()
+    stuff_mask1[0, :50] = 10
+    stuff_mask1[0, 50:] = 100
+    gt_semantic_seg = [
+        stuff_mask1,
+    ]
+    _forward_train()
+
+    # Test forward train with non-empty truth batch and gt_semantic_seg=None
+    gt_semantic_seg = None
+    _forward_train()
+
+    # Test forward train with an empty truth batch
+    gt_bboxes = [
+        torch.empty((0, 4)).float(),
+    ]
+    gt_labels = [
+        torch.empty((0, )).long(),
+    ]
+    mask = np.zeros((0, 128, 160), dtype=np.uint8)
+    gt_masks = [
+        BitmapMasks(mask, 128, 160),
+    ]
+    gt_semantic_seg = [
+        torch.randint(0, 133, (0, 128, 160)),
+    ]
+    _forward_train()
+
+    # Test forward train with an empty truth batch and gt_semantic_seg=None
+    gt_semantic_seg = None
+    _forward_train()
+
+    # Test forward test
+    detector.eval()
+    with torch.no_grad():
+        img_list = [g[None, :] for g in img]
+        batch_results = []
+        for one_img, one_meta in zip(img_list, img_metas):
+            result = detector.forward([one_img], [[one_meta]],
+                                      rescale=True,
+                                      return_loss=False)
+
+            if num_stuff_classes > 0:
+                assert isinstance(result[0], dict)
+            else:
+                assert isinstance(result[0], tuple)
+
+            batch_results.append(result)
diff --git a/tests/test_models/test_loss.py b/tests/test_models/test_loss.py
new file mode 100755
index 0000000..280f3f6
--- /dev/null
+++ b/tests/test_models/test_loss.py
@@ -0,0 +1,232 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+from mmcv.utils import digit_version
+
+from mmdet.models.losses import (BalancedL1Loss, CrossEntropyLoss, DiceLoss,
+                                 DistributionFocalLoss, FocalLoss,
+                                 GaussianFocalLoss,
+                                 KnowledgeDistillationKLDivLoss, L1Loss,
+                                 MSELoss, QualityFocalLoss, SeesawLoss,
+                                 SmoothL1Loss, VarifocalLoss)
+from mmdet.models.losses.ghm_loss import GHMC, GHMR
+from mmdet.models.losses.iou_loss import (BoundedIoULoss, CIoULoss, DIoULoss,
+                                          GIoULoss, IoULoss)
+
+
+@pytest.mark.parametrize(
+    'loss_class', [IoULoss, BoundedIoULoss, GIoULoss, DIoULoss, CIoULoss])
+def test_iou_type_loss_zeros_weight(loss_class):
+    """An all-zero weight must reduce every IoU-type loss to zero."""
+    boxes_pred = torch.rand((10, 4))
+    boxes_gt = torch.rand((10, 4))
+    zero_weight = torch.zeros(10)
+    criterion = loss_class()
+    assert criterion(boxes_pred, boxes_gt, zero_weight) == 0.
+
+
+@pytest.mark.parametrize('loss_class', [
+    BalancedL1Loss, BoundedIoULoss, CIoULoss, CrossEntropyLoss, DIoULoss,
+    FocalLoss, DistributionFocalLoss, MSELoss, SeesawLoss, GaussianFocalLoss,
+    GIoULoss, IoULoss, L1Loss, QualityFocalLoss, VarifocalLoss, GHMR, GHMC,
+    SmoothL1Loss, KnowledgeDistillationKLDivLoss, DiceLoss
+])
+def test_loss_with_reduction_override(loss_class):
+    """Every loss must reject an unsupported ``reduction_override``."""
+    pred = torch.rand((10, 4))
+    target = torch.rand((10, 4))
+    weight = None
+    with pytest.raises(AssertionError):
+        # only a reduction_override from [None, 'none', 'mean', 'sum']
+        # is allowed; any other value (here ``True``) must be rejected
+        reduction_override = True
+        loss_class()(
+            pred, target, weight, reduction_override=reduction_override)
+
+
+@pytest.mark.parametrize('loss_class', [
+    IoULoss, BoundedIoULoss, GIoULoss, DIoULoss, CIoULoss, MSELoss, L1Loss,
+    SmoothL1Loss, BalancedL1Loss
+])
+@pytest.mark.parametrize('input_shape', [(10, 4), (0, 4)])
+def test_regression_losses(loss_class, input_shape):
+    """Check the call API of regression losses on regular and empty input."""
+    pred = torch.rand(input_shape)
+    target = torch.rand(input_shape)
+    weight = torch.rand(input_shape)
+
+    # Test loss forward
+    loss = loss_class()(pred, target)
+    assert isinstance(loss, torch.Tensor)
+
+    # Test loss forward with weight
+    loss = loss_class()(pred, target, weight)
+    assert isinstance(loss, torch.Tensor)
+
+    # Test loss forward with reduction_override
+    loss = loss_class()(pred, target, reduction_override='mean')
+    assert isinstance(loss, torch.Tensor)
+
+    # Test loss forward with avg_factor
+    loss = loss_class()(pred, target, avg_factor=10)
+    assert isinstance(loss, torch.Tensor)
+
+    with pytest.raises(ValueError):
+        # avg_factor is only accepted with reduction None, 'none' or 'mean'
+        reduction_override = 'sum'
+        loss_class()(
+            pred, target, avg_factor=10, reduction_override=reduction_override)
+
+    # Test loss forward with avg_factor and reduction
+    for reduction_override in [None, 'none', 'mean']:
+        loss = loss_class()(
+            pred, target, avg_factor=10, reduction_override=reduction_override)
+        assert isinstance(loss, torch.Tensor)
+
+
+@pytest.mark.parametrize('loss_class', [FocalLoss, CrossEntropyLoss])
+@pytest.mark.parametrize('input_shape', [(10, 5), (0, 5)])
+def test_classification_losses(loss_class, input_shape):
+    """Check the call API of classification losses, incl. empty input."""
+    if input_shape[0] == 0 and digit_version(
+            torch.__version__) < digit_version('1.5.0'):
+        pytest.skip(
+            f'CELoss in PyTorch {torch.__version__} does not support empty '
+            f'tensor.')
+
+    pred = torch.rand(input_shape)
+    target = torch.randint(0, 5, (input_shape[0], ))
+
+    # Test loss forward
+    loss = loss_class()(pred, target)
+    assert isinstance(loss, torch.Tensor)
+
+    # Test loss forward with reduction_override
+    loss = loss_class()(pred, target, reduction_override='mean')
+    assert isinstance(loss, torch.Tensor)
+
+    # Test loss forward with avg_factor
+    loss = loss_class()(pred, target, avg_factor=10)
+    assert isinstance(loss, torch.Tensor)
+
+    with pytest.raises(ValueError):
+        # avg_factor is only accepted with reduction None, 'none' or 'mean'
+        reduction_override = 'sum'
+        loss_class()(
+            pred, target, avg_factor=10, reduction_override=reduction_override)
+
+    # Test loss forward with avg_factor and reduction
+    for reduction_override in [None, 'none', 'mean']:
+        loss = loss_class()(
+            pred, target, avg_factor=10, reduction_override=reduction_override)
+        assert isinstance(loss, torch.Tensor)
+
+
+@pytest.mark.parametrize('loss_class', [GHMR])
+@pytest.mark.parametrize('input_shape', [(10, 4), (0, 4)])
+def test_GHMR_loss(loss_class, input_shape):
+    """GHMR must return a tensor for regular and for empty inputs."""
+    pred, target, weight = (torch.rand(input_shape) for _ in range(3))
+
+    # weighted forward pass
+    criterion = loss_class()
+    result = criterion(pred, target, weight)
+    assert isinstance(result, torch.Tensor)
+
+
+@pytest.mark.parametrize('use_sigmoid', [True, False])
+@pytest.mark.parametrize('reduction', ['sum', 'mean', None])
+@pytest.mark.parametrize('avg_non_ignore', [True, False])
+def test_loss_with_ignore_index(use_sigmoid, reduction, avg_non_ignore):
+    """CrossEntropyLoss must skip targets equal to ``ignore_index``."""
+    ignore_label = 255
+    criterion = CrossEntropyLoss(
+        use_sigmoid=use_sigmoid,
+        use_mask=False,
+        ignore_index=ignore_label,
+        avg_non_ignore=avg_non_ignore)
+    pred = torch.rand((10, 5))
+    target = torch.randint(0, 5, (10, ))
+
+    # mark two randomly drawn positions (possibly the same one) as ignored
+    ignored_indices = torch.randint(0, 10, (2, ), dtype=torch.long)
+    target[ignored_indices] = ignore_label
+
+    # ignore_index taken from the constructor
+    loss_with_ignore = criterion(pred, target, reduction_override=reduction)
+    assert isinstance(loss_with_ignore, torch.Tensor)
+
+    # ignore_index passed explicitly to forward
+    target[ignored_indices] = ignore_label
+    loss_with_forward_ignore = criterion(
+        pred, target, ignore_index=ignore_label, reduction_override=reduction)
+    assert isinstance(loss_with_forward_ignore, torch.Tensor)
+
+    # reference loss; with avg_non_ignore the ignored elements are removed
+    # by hand before the loss is evaluated again
+    if avg_non_ignore:
+        keep = target.ne(ignore_label)
+        pred, target = pred[keep], target[keep]
+    expected = criterion(pred, target, reduction_override=reduction)
+    for observed in (loss_with_ignore, loss_with_forward_ignore):
+        assert torch.allclose(expected, observed)
+
+    # a fully ignored target must give a zero loss
+    pred = torch.rand((10, 5))
+    target = torch.full((10, ), ignore_label, dtype=torch.long)
+    loss = criterion(pred, target, reduction_override=reduction)
+    assert loss == 0
+
+
+@pytest.mark.parametrize('naive_dice', [True, False])
+def test_dice_loss(naive_dice):
+    """Check the DiceLoss call API and its argument validation."""
+    loss_class = DiceLoss
+    pred = torch.rand((10, 4, 4))
+    target = torch.rand((10, 4, 4))
+    weight = torch.rand((10))
+
+    # Test loss forward
+    loss = loss_class(naive_dice=naive_dice)(pred, target)
+    assert isinstance(loss, torch.Tensor)
+
+    # Test loss forward with weight
+    loss = loss_class(naive_dice=naive_dice)(pred, target, weight)
+    assert isinstance(loss, torch.Tensor)
+
+    # Test loss forward with reduction_override
+    loss = loss_class(naive_dice=naive_dice)(
+        pred, target, reduction_override='mean')
+    assert isinstance(loss, torch.Tensor)
+
+    # Test loss forward with avg_factor
+    loss = loss_class(naive_dice=naive_dice)(pred, target, avg_factor=10)
+    assert isinstance(loss, torch.Tensor)
+
+    with pytest.raises(ValueError):
+        # avg_factor is only accepted with reduction None, 'none' or 'mean'
+        reduction_override = 'sum'
+        loss_class(naive_dice=naive_dice)(
+            pred, target, avg_factor=10, reduction_override=reduction_override)
+
+    # Test loss forward with avg_factor and reduction
+    for reduction_override in [None, 'none', 'mean']:
+        loss = loss_class(naive_dice=naive_dice)(
+            pred, target, avg_factor=10, reduction_override=reduction_override)
+        assert isinstance(loss, torch.Tensor)
+
+    # Test loss forward with activate=True and use_sigmoid=False
+    with pytest.raises(NotImplementedError):
+        loss_class(
+            use_sigmoid=False, activate=True, naive_dice=naive_dice)(pred,
+                                                                     target)
+
+    # Test loss forward with weight.ndim != loss.ndim
+    with pytest.raises(AssertionError):
+        weight = torch.rand((2, 8))
+        loss_class(naive_dice=naive_dice)(pred, target, weight)
+
+    # Test loss forward with len(weight) != len(pred)
+    with pytest.raises(AssertionError):
+        weight = torch.rand((8))
+        loss_class(naive_dice=naive_dice)(pred, target, weight)
diff --git a/tests/test_models/test_loss_compatibility.py b/tests/test_models/test_loss_compatibility.py
new file mode 100755
index 0000000..97759b8
--- /dev/null
+++ b/tests/test_models/test_loss_compatibility.py
@@ -0,0 +1,201 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""pytest tests/test_models/test_loss_compatibility.py."""
+import copy
+from os.path import dirname, exists, join
+
+import numpy as np
+import pytest
+import torch
+
+
+def _get_config_directory():
+    """Return the path of the repository's ``configs`` directory."""
+    try:
+        # climb from this test file up to the repository root
+        root = dirname(dirname(dirname(__file__)))
+    except NameError:
+        # no ``__file__`` (e.g. IPython): locate the repo via the mmdet package
+        import mmdet
+        root = dirname(dirname(mmdet.__file__))
+    configs = join(root, 'configs')
+    if exists(configs):
+        return configs
+    raise Exception('Cannot find config path')
+
+
+def _get_config_module(fname):
+    """Parse the config file ``fname`` found in the config directory.
+
+    Returns the object produced by ``mmcv.Config.fromfile``.
+    """
+    from mmcv import Config
+    return Config.fromfile(join(_get_config_directory(), fname))
+
+
+def _get_detector_cfg(fname):
+    """Return a private copy of the ``model`` section of config ``fname``.
+
+    The section is deep copied, so a test may modify the returned object
+    without touching the config it was loaded from.
+    """
+    loaded = _get_config_module(fname)
+    model_section = loaded.model
+    return copy.deepcopy(model_section)
+
+
+@pytest.mark.parametrize('loss_bbox', [
+    dict(type='L1Loss', loss_weight=1.0),
+    dict(type='GHMR', mu=0.02, bins=10, momentum=0.7, loss_weight=10.0),
+    dict(type='IoULoss', loss_weight=1.0),
+    dict(type='BoundedIoULoss', loss_weight=1.0),
+    dict(type='GIoULoss', loss_weight=1.0),
+    dict(type='DIoULoss', loss_weight=1.0),
+    dict(type='CIoULoss', loss_weight=1.0),
+    dict(type='MSELoss', loss_weight=1.0),
+    dict(type='SmoothL1Loss', loss_weight=1.0),
+    dict(type='BalancedL1Loss', loss_weight=1.0)
+])
+def test_bbox_loss_compatibility(loss_bbox):
+    """Check that every bbox loss can be plugged into Faster R-CNN.
+
+    The ``loss_bbox`` entry of the Faster R-CNN RoI bbox head is replaced by
+    the parametrized config and one training forward pass is executed.
+    """
+    # Faster R-CNN base model config
+    model_cfg = _get_detector_cfg('_base_/models/faster_rcnn_r50_fpn.py')
+
+    # one 3-channel 256x256 image carrying ten ground-truth boxes
+    mm_inputs = _demo_mm_inputs((1, 3, 256, 256), num_items=[10])
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+
+    # losses whose type name contains 'IoULoss' get reg_decoded_bbox enabled
+    if 'IoULoss' in loss_bbox['type']:
+        model_cfg.roi_head.bbox_head.reg_decoded_bbox = True
+
+    model_cfg.roi_head.bbox_head.loss_bbox = loss_bbox
+
+    from mmdet.models import build_detector
+    detector = build_detector(model_cfg)
+
+    losses = detector.forward(imgs, img_metas, return_loss=True, **mm_inputs)
+    assert isinstance(losses, dict)
+    total_loss, _ = detector._parse_losses(losses)
+    assert float(total_loss.item()) > 0
+
+
+@pytest.mark.parametrize('loss_cls', [
+    dict(type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+    dict(
+        type='FocalLoss',
+        use_sigmoid=True,
+        gamma=2.0,
+        alpha=0.25,
+        loss_weight=1.0),
+    dict(
+        type='GHMC', bins=30, momentum=0.75, use_sigmoid=True, loss_weight=1.0)
+])
+def test_cls_loss_compatibility(loss_cls):
+    """Check that every classification loss can be plugged into Faster R-CNN.
+
+    The ``loss_cls`` entry of the Faster R-CNN RoI bbox head is replaced by
+    the parametrized config and one training forward pass is executed.
+    """
+    # Faster R-CNN base model config
+    config_path = '_base_/models/faster_rcnn_r50_fpn.py'
+    model_cfg = _get_detector_cfg(config_path)
+
+    # one 3-channel 256x256 image carrying ten ground-truth boxes
+    input_shape = (1, 3, 256, 256)
+    mm_inputs = _demo_mm_inputs(input_shape, num_items=[10])
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+
+    # swap in the classification loss under test
+    model_cfg.roi_head.bbox_head.loss_cls = loss_cls
+
+    from mmdet.models import build_detector
+    detector = build_detector(model_cfg)
+
+    losses = detector.forward(imgs, img_metas, return_loss=True, **mm_inputs)
+    assert isinstance(losses, dict)
+    total_loss, _ = detector._parse_losses(losses)
+    assert float(total_loss.item()) > 0
+
+
+def _demo_mm_inputs(input_shape=(1, 3, 300, 300),
+                    num_items=None, num_classes=10,
+                    with_semantic=False):  # yapf: disable
+    """Create a superset of inputs needed to run test or train batches.
+
+    Args:
+        input_shape (tuple): input batch dimensions ``(N, C, H, W)``.
+        num_items (None | List[int]): number of boxes in each batch item;
+            when None a random count in ``[1, 10)`` is drawn per item.
+        num_classes (int): number of different labels a box might have.
+        with_semantic (bool): whether to add a random ``gt_semantic_seg``.
+
+    Returns:
+        dict: ``imgs``, ``img_metas`` and the generated ground truth.
+    """
+    from mmdet.core import BitmapMasks
+
+    (N, C, H, W) = input_shape
+
+    rng = np.random.RandomState(0)
+
+    imgs = rng.rand(*input_shape)
+
+    img_metas = [{
+        'img_shape': (H, W, C),
+        'ori_shape': (H, W, C),
+        'pad_shape': (H, W, C),
+        'filename': '<demo>.png',
+        'scale_factor': np.array([1.1, 1.2, 1.1, 1.2]),
+        'flip': False,
+        'flip_direction': None,
+    } for _ in range(N)]
+
+    gt_bboxes = []
+    gt_labels = []
+    gt_masks = []
+
+    for batch_idx in range(N):
+        if num_items is None:
+            num_boxes = rng.randint(1, 10)
+        else:
+            num_boxes = num_items[batch_idx]
+
+        cx, cy, bw, bh = rng.rand(num_boxes, 4).T
+
+        tl_x = ((cx * W) - (W * bw / 2)).clip(0, W)
+        tl_y = ((cy * H) - (H * bh / 2)).clip(0, H)
+        br_x = ((cx * W) + (W * bw / 2)).clip(0, W)
+        br_y = ((cy * H) + (H * bh / 2)).clip(0, H)
+
+        boxes = np.vstack([tl_x, tl_y, br_x, br_y]).T
+        class_idxs = rng.randint(1, num_classes, size=num_boxes)
+
+        gt_bboxes.append(torch.FloatTensor(boxes))
+        gt_labels.append(torch.LongTensor(class_idxs))
+        # one random binary mask per box, generated for every batch item
+        mask = np.random.randint(0, 2, (len(boxes), H, W), dtype=np.uint8)
+        gt_masks.append(BitmapMasks(mask, H, W))
+
+    mm_inputs = {
+        'imgs': torch.FloatTensor(imgs).requires_grad_(True),
+        'img_metas': img_metas,
+        'gt_bboxes': gt_bboxes,
+        'gt_labels': gt_labels,
+        'gt_bboxes_ignore': None,
+        'gt_masks': gt_masks,
+    }
+
+    if with_semantic:
+        # assume gt_semantic_seg using scale 1/8 of the img
+        gt_semantic_seg = np.random.randint(
+            0, num_classes, (1, 1, H // 8, W // 8), dtype=np.uint8)
+        mm_inputs.update(
+            {'gt_semantic_seg': torch.ByteTensor(gt_semantic_seg)})
+
+    return mm_inputs
diff --git a/tests/test_models/test_necks.py b/tests/test_models/test_necks.py
new file mode 100755
index 0000000..ff8c78d
--- /dev/null
+++ b/tests/test_models/test_necks.py
@@ -0,0 +1,673 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet.models.necks import (FPG, FPN, FPN_CARAFE, NASFCOS_FPN, NASFPN,
+                                YOLOXPAFPN, ChannelMapper, CTResNetNeck,
+                                DilatedEncoder, DyHead, SSDNeck, YOLOV3Neck)
+
+
+def test_fpn():
+    """Tests fpn."""
+    s = 64
+    in_channels = [8, 16, 32, 64]
+    feat_sizes = [s // 2**i for i in range(4)]  # [64, 32, 16, 8]
+    out_channels = 8
+
+    # end_level=-1 is equal to end_level=3
+    FPN(in_channels=in_channels,
+        out_channels=out_channels,
+        start_level=0,
+        end_level=-1,
+        num_outs=5)
+    FPN(in_channels=in_channels,
+        out_channels=out_channels,
+        start_level=0,
+        end_level=3,
+        num_outs=5)
+
+    # `num_outs` is not equal to end_level - start_level + 1
+    with pytest.raises(AssertionError):
+        FPN(in_channels=in_channels,
+            out_channels=out_channels,
+            start_level=1,
+            end_level=2,
+            num_outs=3)
+
+    # `num_outs` is not equal to len(in_channels) - start_level
+    with pytest.raises(AssertionError):
+        FPN(in_channels=in_channels,
+            out_channels=out_channels,
+            start_level=1,
+            num_outs=2)
+
+    # `end_level` is larger than len(in_channels) - 1
+    with pytest.raises(AssertionError):
+        FPN(in_channels=in_channels,
+            out_channels=out_channels,
+            start_level=1,
+            end_level=4,
+            num_outs=2)
+
+    # `num_outs` is not equal to end_level - start_level
+    with pytest.raises(AssertionError):
+        FPN(in_channels=in_channels,
+            out_channels=out_channels,
+            start_level=1,
+            end_level=3,
+            num_outs=1)
+
+    # Invalid `add_extra_convs` option
+    with pytest.raises(AssertionError):
+        FPN(in_channels=in_channels,
+            out_channels=out_channels,
+            start_level=1,
+            add_extra_convs='on_xxx',
+            num_outs=5)
+
+    fpn_model = FPN(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        start_level=1,
+        add_extra_convs=True,
+        num_outs=5)
+
+    # FPN expects multiple levels of features per image
+    feats = [
+        torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i])
+        for i in range(len(in_channels))
+    ]
+    outs = fpn_model(feats)
+    assert fpn_model.add_extra_convs == 'on_input'
+    assert len(outs) == fpn_model.num_outs
+    # start_level=1 skips feats[0], so output i is s // 2**(i + 1) wide
+    for i in range(fpn_model.num_outs):
+        assert outs[i].shape[1] == out_channels
+        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))
+
+    # Tests for fpn with no extra convs (pooling is used instead)
+    fpn_model = FPN(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        start_level=1,
+        add_extra_convs=False,
+        num_outs=5)
+    outs = fpn_model(feats)
+    assert len(outs) == fpn_model.num_outs
+    assert not fpn_model.add_extra_convs
+    for i in range(fpn_model.num_outs):
+        assert outs[i].shape[1] == out_channels
+        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))
+
+    # Tests for fpn with lateral bns
+    fpn_model = FPN(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        start_level=1,
+        add_extra_convs=True,
+        no_norm_on_lateral=False,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        num_outs=5)
+    outs = fpn_model(feats)
+    assert len(outs) == fpn_model.num_outs
+    assert fpn_model.add_extra_convs == 'on_input'
+    for i in range(fpn_model.num_outs):
+        assert outs[i].shape[1] == out_channels
+        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))
+    bn_exist = False
+    for m in fpn_model.modules():
+        if isinstance(m, _BatchNorm):
+            bn_exist = True
+    assert bn_exist
+
+    # Bilinear upsample
+    fpn_model = FPN(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        start_level=1,
+        add_extra_convs=True,
+        upsample_cfg=dict(mode='bilinear', align_corners=True),
+        num_outs=5)
+    outs = fpn_model(feats)
+    assert len(outs) == fpn_model.num_outs
+    assert fpn_model.add_extra_convs == 'on_input'
+    for i in range(fpn_model.num_outs):
+        assert outs[i].shape[1] == out_channels
+        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))
+
+    # Scale factor instead of fixed upsample size upsample
+    fpn_model = FPN(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        start_level=1,
+        add_extra_convs=True,
+        upsample_cfg=dict(scale_factor=2),
+        num_outs=5)
+    outs = fpn_model(feats)
+    assert len(outs) == fpn_model.num_outs
+    for i in range(fpn_model.num_outs):
+        assert outs[i].shape[1] == out_channels
+        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))
+
+    # Extra convs source is 'inputs'
+    fpn_model = FPN(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        add_extra_convs='on_input',
+        start_level=1,
+        num_outs=5)
+    assert fpn_model.add_extra_convs == 'on_input'
+    outs = fpn_model(feats)
+    assert len(outs) == fpn_model.num_outs
+    for i in range(fpn_model.num_outs):
+        assert outs[i].shape[1] == out_channels
+        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))
+
+    # Extra convs source is 'laterals'
+    fpn_model = FPN(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        add_extra_convs='on_lateral',
+        start_level=1,
+        num_outs=5)
+    assert fpn_model.add_extra_convs == 'on_lateral'
+    outs = fpn_model(feats)
+    assert len(outs) == fpn_model.num_outs
+    for i in range(fpn_model.num_outs):
+        assert outs[i].shape[1] == out_channels
+        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))
+
+    # Extra convs source is 'outputs'
+    fpn_model = FPN(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        add_extra_convs='on_output',
+        start_level=1,
+        num_outs=5)
+    assert fpn_model.add_extra_convs == 'on_output'
+    outs = fpn_model(feats)
+    assert len(outs) == fpn_model.num_outs
+    for i in range(fpn_model.num_outs):
+        assert outs[i].shape[1] == out_channels
+        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))
+
+
+def test_channel_mapper():
+    """Tests ChannelMapper."""
+    s = 64
+    in_channels = [8, 16, 32, 64]
+    feat_sizes = [s // 2**i for i in range(4)]  # [64, 32, 16, 8]
+    out_channels = 8
+    kernel_size = 3
+    feats = [
+        torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i])
+        for i in range(len(in_channels))
+    ]
+
+    # in_channels must be a list
+    with pytest.raises(AssertionError):
+        channel_mapper = ChannelMapper(
+            in_channels=10, out_channels=out_channels, kernel_size=kernel_size)
+    # the length of channel_mapper's inputs must be equal to the length of
+    # in_channels
+    with pytest.raises(AssertionError):
+        channel_mapper = ChannelMapper(
+            in_channels=in_channels[:-1],
+            out_channels=out_channels,
+            kernel_size=kernel_size)
+        channel_mapper(feats)
+
+    channel_mapper = ChannelMapper(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        kernel_size=kernel_size)
+
+    outs = channel_mapper(feats)
+    assert len(outs) == len(feats)
+    for i in range(len(feats)):
+        assert outs[i].shape[1] == out_channels
+        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i)
+
+
+def test_dilated_encoder():
+    """DilatedEncoder changes the channel count but keeps the spatial size."""
+    in_channels, out_channels, size = 16, 32, 34
+    neck = DilatedEncoder(in_channels, out_channels, 16, 2, [2, 4, 6, 8])
+    inputs = [torch.rand(1, in_channels, size, size)]
+
+    # the neck is fed a one-element feature list; its first output is checked
+    outputs = neck(inputs)
+    assert outputs[0].shape == (1, out_channels, size, size)
+
+
+def test_ct_resnet_neck():
+    """Check CTResNetNeck argument validation and its output shape."""
+    # deconv filters/kernels given as plain ints are rejected
+    with pytest.raises(TypeError):
+        CTResNetNeck(
+            in_channel=10, num_deconv_filters=10, num_deconv_kernels=4)
+
+    # deconv filters/kernels of different lengths are rejected
+    with pytest.raises(AssertionError):
+        CTResNetNeck(
+            in_channel=10,
+            num_deconv_filters=(10, 10),
+            num_deconv_kernels=(4, ))
+
+    in_channels, num_filters, num_kernels = 16, (8, 8), (4, 4)
+    expected_shape = (1, num_filters[-1], 16, 16)
+    feat = torch.rand(1, 16, 4, 4)
+    neck = CTResNetNeck(
+        in_channel=in_channels,
+        num_deconv_filters=num_filters,
+        num_deconv_kernels=num_kernels,
+        use_dcn=False)
+
+    # a bare tensor (not wrapped in a list or tuple) is rejected
+    with pytest.raises(AssertionError):
+        neck(feat)
+
+    out_feat = neck([feat])[0]
+    assert out_feat.shape == expected_shape
+
+    if torch.cuda.is_available():
+        # test dcn (``use_dcn`` left at its default), only when CUDA exists
+        neck = CTResNetNeck(
+            in_channel=in_channels,
+            num_deconv_filters=num_filters,
+            num_deconv_kernels=num_kernels)
+        neck = neck.cuda()
+        feat = feat.cuda()
+        out_feat = neck([feat])[0]
+        assert out_feat.shape == expected_shape
+
+
+def test_yolov3_neck():
+    """Check YOLOV3Neck argument validation and per-level output shapes."""
+    # num_scales, in_channels, out_channels must be same length
+    with pytest.raises(AssertionError):
+        YOLOV3Neck(num_scales=3, in_channels=[16, 8, 4], out_channels=[8, 4])
+
+    # len(feats) must equal to num_scales
+    with pytest.raises(AssertionError):
+        neck = YOLOV3Neck(
+            num_scales=3, in_channels=[16, 8, 4], out_channels=[8, 4, 2])
+        feats = (torch.rand(1, 4, 16, 16), torch.rand(1, 8, 16, 16))
+        neck(feats)
+
+    # test normal channels
+    s = 32
+    in_channels = [16, 8, 4]
+    out_channels = [8, 4, 2]
+    reversed_levels = range(len(in_channels) - 1, -1, -1)
+    feat_sizes = [s // 2**i for i in reversed_levels]
+    feats = [
+        torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i])
+        for i in reversed_levels
+    ]
+    neck = YOLOV3Neck(
+        num_scales=3, in_channels=in_channels, out_channels=out_channels)
+    outs = neck(feats)
+
+    assert len(outs) == len(feats)
+    for i, out in enumerate(outs):
+        assert out.shape == (1, out_channels[i], feat_sizes[i], feat_sizes[i])
+
+    # test more flexible setting (``s`` is still 32)
+    in_channels = [32, 8, 16]
+    out_channels = [19, 21, 5]
+    reversed_levels = range(len(in_channels) - 1, -1, -1)
+    feat_sizes = [s // 2**i for i in reversed_levels]
+    feats = [
+        torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i])
+        for i in reversed_levels
+    ]
+    neck = YOLOV3Neck(
+        num_scales=3, in_channels=in_channels, out_channels=out_channels)
+    outs = neck(feats)
+
+    assert len(outs) == len(feats)
+    for i, out in enumerate(outs):
+        assert out.shape == (1, out_channels[i], feat_sizes[i], feat_sizes[i])
+
+
+def test_ssd_neck():
+    """Check SSDNeck argument validation and its output shapes."""
+    # level_strides/level_paddings must be same length
+    with pytest.raises(AssertionError):
+        SSDNeck(
+            in_channels=[8, 16],
+            out_channels=[8, 16, 32],
+            level_strides=[2],
+            level_paddings=[2, 1])
+
+    # length of out_channels must larger than in_channels
+    with pytest.raises(AssertionError):
+        SSDNeck(
+            in_channels=[8, 16],
+            out_channels=[8],
+            level_strides=[2],
+            level_paddings=[2])
+
+    # len(out_channels) - len(in_channels) must equal to len(level_strides)
+    with pytest.raises(AssertionError):
+        SSDNeck(
+            in_channels=[8, 16],
+            out_channels=[4, 16, 64],
+            level_strides=[2, 2],
+            level_paddings=[2, 2])
+
+    # in_channels must be same with out_channels[:len(in_channels)]
+    with pytest.raises(AssertionError):
+        SSDNeck(
+            in_channels=[8, 16],
+            out_channels=[4, 16, 64],
+            level_strides=[2],
+            level_paddings=[2])
+
+    neck = SSDNeck(
+        in_channels=[4],
+        out_channels=[4, 8, 16],
+        level_strides=[2, 1],
+        level_paddings=[1, 0])
+    inputs = (torch.rand(1, 4, 16, 16), )
+    outputs = neck(inputs)
+    expected = [(1, 4, 16, 16), (1, 8, 8, 8), (1, 16, 6, 6)]
+    assert [tuple(outputs[i].shape) for i in range(3)] == expected
+
+    # test SSD-Lite Neck
+    neck = SSDNeck(
+        in_channels=[4, 8],
+        out_channels=[4, 8, 16],
+        level_strides=[1],
+        level_paddings=[1],
+        l2_norm_scale=None,
+        use_depthwise=True,
+        norm_cfg=dict(type='BN'),
+        act_cfg=dict(type='ReLU6'))
+    assert not hasattr(neck, 'l2_norm')
+
+    # the last module of the first extra layer must be depthwise-separable
+    from mmcv.cnn.bricks import DepthwiseSeparableConvModule
+    last_extra = neck.extra_layers[0][-1]
+    assert isinstance(last_extra, DepthwiseSeparableConvModule)
+
+    inputs = (torch.rand(1, 4, 8, 8), torch.rand(1, 8, 8, 8))
+    outputs = neck(inputs)
+    expected = [(1, 4, 8, 8), (1, 8, 8, 8), (1, 16, 8, 8)]
+    assert [tuple(outputs[i].shape) for i in range(3)] == expected
+
+
+def test_yolox_pafpn():  # YOLOXPAFPN keeps level sizes, unifies channels
+    s = 64
+    in_channels = [8, 16, 32, 64]
+    feat_sizes = [s // 2**i for i in range(4)]  # [64, 32, 16, 8]
+    out_channels = 24
+    feats = [
+        torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i])
+        for i in range(len(in_channels))
+    ]
+    neck = YOLOXPAFPN(in_channels=in_channels, out_channels=out_channels)
+    outs = neck(feats)
+    assert len(outs) == len(feats)
+    for i in range(len(feats)):
+        assert outs[i].shape[1] == out_channels
+        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i)
+
+    # test the depth-wise variant (use_depthwise=True)
+    neck = YOLOXPAFPN(
+        in_channels=in_channels, out_channels=out_channels, use_depthwise=True)
+
+    from mmcv.cnn.bricks import DepthwiseSeparableConvModule
+    assert isinstance(neck.downsamples[0], DepthwiseSeparableConvModule)
+
+    outs = neck(feats)  # same shape expectations as the default variant
+    assert len(outs) == len(feats)
+    for i in range(len(feats)):
+        assert outs[i].shape[1] == out_channels
+        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i)
+
+
+def test_dyhead():  # DyHead output shapes and input-type check
+    s = 64
+    in_channels = 8
+    out_channels = 16
+    feat_sizes = [s // 2**i for i in range(4)]  # [64, 32, 16, 8]
+    feats = [
+        torch.rand(1, in_channels, feat_sizes[i], feat_sizes[i])
+        for i in range(len(feat_sizes))
+    ]
+    neck = DyHead(
+        in_channels=in_channels, out_channels=out_channels, num_blocks=3)
+    outs = neck(feats)
+    assert len(outs) == len(feats)
+    for i in range(len(outs)):
+        assert outs[i].shape[1] == out_channels
+        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i)
+
+    feat = torch.rand(1, 8, 4, 4)
+    # input feats must be a tuple or list, not a single tensor
+    with pytest.raises(AssertionError):
+        neck(feat)
+
+
+def test_fpg():  # FPG accepts end_level=-1 and validates level arguments
+    # end_level=-1 is equivalent to end_level=3 (last of the 4 input levels)
+    norm_cfg = dict(type='BN', requires_grad=True)
+    FPG(in_channels=[8, 16, 32, 64],
+        out_channels=8,
+        inter_channels=8,
+        num_outs=5,
+        add_extra_convs=True,
+        start_level=1,
+        end_level=-1,
+        stack_times=9,
+        paths=['bu'] * 9,
+        same_down_trans=None,
+        same_up_trans=dict(
+            type='conv',
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_lateral_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_down_trans=dict(
+            type='interpolation_conv',
+            mode='nearest',
+            kernel_size=3,
+            norm_cfg=norm_cfg,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        across_up_trans=None,
+        across_skip_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        output_trans=dict(
+            type='last_conv',
+            kernel_size=3,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        norm_cfg=norm_cfg,
+        skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()])
+    FPG(in_channels=[8, 16, 32, 64],
+        out_channels=8,
+        inter_channels=8,
+        num_outs=5,
+        add_extra_convs=True,
+        start_level=1,
+        end_level=3,
+        stack_times=9,
+        paths=['bu'] * 9,
+        same_down_trans=None,
+        same_up_trans=dict(
+            type='conv',
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_lateral_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_down_trans=dict(
+            type='interpolation_conv',
+            mode='nearest',
+            kernel_size=3,
+            norm_cfg=norm_cfg,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        across_up_trans=None,
+        across_skip_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        output_trans=dict(
+            type='last_conv',
+            kernel_size=3,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        norm_cfg=norm_cfg,
+        skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()])
+
+    # `end_level` is larger than len(in_channels) - 1
+    with pytest.raises(AssertionError):
+        FPG(in_channels=[8, 16, 32, 64],
+            out_channels=8,
+            stack_times=9,
+            paths=['bu'] * 9,
+            start_level=1,
+            end_level=4,
+            num_outs=2,
+            skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()])
+
+    # `num_outs` is not equal to end_level - start_level + 1
+    with pytest.raises(AssertionError):
+        FPG(in_channels=[8, 16, 32, 64],
+            out_channels=8,
+            stack_times=9,
+            paths=['bu'] * 9,
+            start_level=1,
+            end_level=2,
+            num_outs=3,
+            skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()])
+
+
+def test_fpn_carafe():  # FPN_CARAFE level-argument handling
+    # end_level=-1 is equivalent to end_level=3; both must construct cleanly
+    FPN_CARAFE(
+        in_channels=[8, 16, 32, 64],
+        out_channels=8,
+        start_level=0,
+        end_level=3,
+        num_outs=4)
+    FPN_CARAFE(
+        in_channels=[8, 16, 32, 64],
+        out_channels=8,
+        start_level=0,
+        end_level=-1,
+        num_outs=4)
+    # `end_level` is larger than len(in_channels) - 1
+    with pytest.raises(AssertionError):
+        FPN_CARAFE(
+            in_channels=[8, 16, 32, 64],
+            out_channels=8,
+            start_level=1,
+            end_level=4,
+            num_outs=2)
+
+    # `num_outs` is not equal to end_level - start_level + 1
+    with pytest.raises(AssertionError):
+        FPN_CARAFE(
+            in_channels=[8, 16, 32, 64],
+            out_channels=8,
+            start_level=1,
+            end_level=2,
+            num_outs=3)
+
+
+def test_nas_fpn():  # NASFPN level-argument handling
+    # end_level=-1 is equivalent to end_level=3; both must construct cleanly
+    NASFPN(
+        in_channels=[8, 16, 32, 64],
+        out_channels=8,
+        stack_times=9,
+        start_level=0,
+        end_level=3,
+        num_outs=4)
+    NASFPN(
+        in_channels=[8, 16, 32, 64],
+        out_channels=8,
+        stack_times=9,
+        start_level=0,
+        end_level=-1,
+        num_outs=4)
+    # `end_level` is larger than len(in_channels) - 1
+    with pytest.raises(AssertionError):
+        NASFPN(
+            in_channels=[8, 16, 32, 64],
+            out_channels=8,
+            stack_times=9,
+            start_level=1,
+            end_level=4,
+            num_outs=2)
+
+    # `num_outs` is not equal to end_level - start_level + 1
+    with pytest.raises(AssertionError):
+        NASFPN(
+            in_channels=[8, 16, 32, 64],
+            out_channels=8,
+            stack_times=9,
+            start_level=1,
+            end_level=2,
+            num_outs=3)
+
+
+def test_nasfcos_fpn():  # NASFCOS_FPN level-argument handling
+    # end_level=-1 is equivalent to end_level=3; both must construct cleanly
+    NASFCOS_FPN(
+        in_channels=[8, 16, 32, 64],
+        out_channels=8,
+        start_level=0,
+        end_level=3,
+        num_outs=4)
+    NASFCOS_FPN(
+        in_channels=[8, 16, 32, 64],
+        out_channels=8,
+        start_level=0,
+        end_level=-1,
+        num_outs=4)
+
+    # `end_level` is larger than len(in_channels) - 1
+    with pytest.raises(AssertionError):
+        NASFCOS_FPN(
+            in_channels=[8, 16, 32, 64],
+            out_channels=8,
+            start_level=1,
+            end_level=4,
+            num_outs=2)
+
+    # `num_outs` is not equal to end_level - start_level + 1
+    with pytest.raises(AssertionError):
+        NASFCOS_FPN(
+            in_channels=[8, 16, 32, 64],
+            out_channels=8,
+            start_level=1,
+            end_level=2,
+            num_outs=3)
diff --git a/tests/test_models/test_plugins.py b/tests/test_models/test_plugins.py
new file mode 100755
index 0000000..8afd1f9
--- /dev/null
+++ b/tests/test_models/test_plugins.py
@@ -0,0 +1,167 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+from mmcv import ConfigDict
+from mmcv.cnn import build_plugin_layer
+
+from mmdet.models.plugins import DropBlock
+
+
+def test_dropblock():  # DropBlock forward shapes and argument validation
+    feat = torch.rand(1, 1, 11, 11)
+    drop_prob = 1.0  # with a full-size block the whole map is expected zero
+    dropblock = DropBlock(drop_prob, block_size=11, warmup_iters=0)
+    out_feat = dropblock(feat)
+    assert (out_feat == 0).all() and out_feat.shape == feat.shape
+    drop_prob = 0.5  # random drop: only the output shape is checked
+    dropblock = DropBlock(drop_prob, block_size=5, warmup_iters=0)
+    out_feat = dropblock(feat)
+    assert out_feat.shape == feat.shape
+
+    # drop_prob must be in (0, 1]
+    with pytest.raises(AssertionError):
+        DropBlock(1.5, 3)
+
+    # block_size cannot be an even number
+    with pytest.raises(AssertionError):
+        DropBlock(0.5, 2)
+
+    # warmup_iters cannot be less than 0
+    with pytest.raises(AssertionError):
+        DropBlock(0.5, 3, -1)
+
+
+def test_pixel_decoder():  # PixelDecoder output checks
+    base_channels = 64
+    pixel_decoder_cfg = ConfigDict(
+        dict(
+            type='PixelDecoder',
+            in_channels=[base_channels * 2**i for i in range(4)],
+            feat_channels=base_channels,
+            out_channels=base_channels,
+            norm_cfg=dict(type='GN', num_groups=32),
+            act_cfg=dict(type='ReLU')))
+    self = build_plugin_layer(pixel_decoder_cfg)[1]  # element 1 is the module
+    img_metas = [{}, {}]
+    feats = [
+        torch.rand((2, base_channels * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i)))
+        for i in range(4)
+    ]
+    mask_feature, memory = self(feats, img_metas)
+
+    assert (memory == feats[-1]).all()  # last input level is passed through
+    assert mask_feature.shape == feats[0].shape
+
+
+def test_transformer_encoder_pixel_decoder():  # output-shape checks
+    base_channels = 64
+    pixel_decoder_cfg = ConfigDict(
+        dict(
+            type='TransformerEncoderPixelDecoder',
+            in_channels=[base_channels * 2**i for i in range(4)],
+            feat_channels=base_channels,
+            out_channels=base_channels,
+            norm_cfg=dict(type='GN', num_groups=32),
+            act_cfg=dict(type='ReLU'),
+            encoder=dict(
+                type='DetrTransformerEncoder',
+                num_layers=6,
+                transformerlayers=dict(
+                    type='BaseTransformerLayer',
+                    attn_cfgs=dict(
+                        type='MultiheadAttention',
+                        embed_dims=base_channels,
+                        num_heads=8,
+                        attn_drop=0.1,
+                        proj_drop=0.1,
+                        dropout_layer=None,
+                        batch_first=False),
+                    ffn_cfgs=dict(
+                        embed_dims=base_channels,
+                        feedforward_channels=base_channels * 8,
+                        num_fcs=2,
+                        act_cfg=dict(type='ReLU', inplace=True),
+                        ffn_drop=0.1,
+                        dropout_layer=None,
+                        add_identity=True),
+                    operation_order=('self_attn', 'norm', 'ffn', 'norm'),
+                    norm_cfg=dict(type='LN'),
+                    init_cfg=None,
+                    batch_first=False),
+                init_cfg=None),
+            positional_encoding=dict(
+                type='SinePositionalEncoding',
+                num_feats=base_channels // 2,
+                normalize=True)))
+    self = build_plugin_layer(pixel_decoder_cfg)[1]  # element 1 is the module
+    img_metas = [{
+        'batch_input_shape': (128, 160),
+        'img_shape': (120, 160, 3),
+    }, {
+        'batch_input_shape': (128, 160),
+        'img_shape': (125, 160, 3),
+    }]
+    feats = [
+        torch.rand((2, base_channels * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i)))
+        for i in range(4)
+    ]
+    mask_feature, memory = self(feats, img_metas)
+
+    assert memory.shape[-2:] == feats[-1].shape[-2:]  # spatial size only
+    assert mask_feature.shape == feats[0].shape
+
+
+def test_msdeformattn_pixel_decoder():  # output-shape checks
+    base_channels = 64
+    pixel_decoder_cfg = ConfigDict(
+        dict(
+            type='MSDeformAttnPixelDecoder',
+            in_channels=[base_channels * 2**i for i in range(4)],
+            strides=[4, 8, 16, 32],
+            feat_channels=base_channels,
+            out_channels=base_channels,
+            num_outs=3,
+            norm_cfg=dict(type='GN', num_groups=32),
+            act_cfg=dict(type='ReLU'),
+            encoder=dict(
+                type='DetrTransformerEncoder',
+                num_layers=6,
+                transformerlayers=dict(
+                    type='BaseTransformerLayer',
+                    attn_cfgs=dict(
+                        type='MultiScaleDeformableAttention',
+                        embed_dims=base_channels,
+                        num_heads=8,
+                        num_levels=3,
+                        num_points=4,
+                        im2col_step=64,
+                        dropout=0.0,
+                        batch_first=False,
+                        norm_cfg=None,
+                        init_cfg=None),
+                    ffn_cfgs=dict(
+                        type='FFN',
+                        embed_dims=base_channels,
+                        feedforward_channels=base_channels * 4,
+                        num_fcs=2,
+                        ffn_drop=0.0,
+                        act_cfg=dict(type='ReLU', inplace=True)),
+                    operation_order=('self_attn', 'norm', 'ffn', 'norm')),
+                init_cfg=None),
+            positional_encoding=dict(
+                type='SinePositionalEncoding',
+                num_feats=base_channels // 2,
+                normalize=True),
+            init_cfg=None), )
+    self = build_plugin_layer(pixel_decoder_cfg)[1]  # element 1 is the module
+    feats = [
+        torch.rand((2, base_channels * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i)))
+        for i in range(4)
+    ]
+    mask_feature, multi_scale_features = self(feats)
+
+    assert mask_feature.shape == feats[0].shape
+    assert len(multi_scale_features) == 3  # matches num_outs in the config
+    multi_scale_features = multi_scale_features[::-1]  # align with feats[1:]
+    for i in range(3):
+        assert multi_scale_features[i].shape[-2:] == feats[i + 1].shape[-2:]
diff --git a/tests/test_models/test_roi_heads/__init__.py b/tests/test_models/test_roi_heads/__init__.py
new file mode 100755
index 0000000..83cfd58
--- /dev/null
+++ b/tests/test_models/test_roi_heads/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .utils import _dummy_bbox_sampling
+
+__all__ = ['_dummy_bbox_sampling']
diff --git a/tests/test_models/test_roi_heads/test_bbox_head.py b/tests/test_models/test_roi_heads/test_bbox_head.py
new file mode 100755
index 0000000..e839d06
--- /dev/null
+++ b/tests/test_models/test_roi_heads/test_bbox_head.py
@@ -0,0 +1,251 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import pytest
+import torch
+
+from mmdet.core import bbox2roi
+from mmdet.models.roi_heads.bbox_heads import BBoxHead
+from .utils import _dummy_bbox_sampling
+
+
+def test_bbox_head_loss():
+    """Test BBoxHead losses with empty and with non-empty ground truth."""
+    self = BBoxHead(in_channels=8, roi_feat_size=3)
+
+    # Dummy proposals (one image, one box)
+    proposal_list = [
+        torch.Tensor([[23.6667, 23.8757, 228.6326, 153.8874]]),
+    ]
+
+    target_cfg = mmcv.Config(dict(pos_weight=1))
+
+    # Test bbox loss when truth is empty
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+
+    sampling_results = _dummy_bbox_sampling(proposal_list, gt_bboxes,
+                                            gt_labels)
+
+    bbox_targets = self.get_targets(sampling_results, gt_bboxes, gt_labels,
+                                    target_cfg)
+    labels, label_weights, bbox_targets, bbox_weights = bbox_targets
+
+    # Create dummy features "extracted" for each sampled bbox
+    num_sampled = sum(len(res.bboxes) for res in sampling_results)
+    rois = bbox2roi([res.bboxes for res in sampling_results])
+    dummy_feats = torch.rand(num_sampled, 8 * 3 * 3)  # in_channels * 3 * 3
+    cls_scores, bbox_preds = self.forward(dummy_feats)
+
+    losses = self.loss(cls_scores, bbox_preds, rois, labels, label_weights,
+                       bbox_targets, bbox_weights)
+    assert losses.get('loss_cls', 0) > 0, 'cls-loss should be non-zero'
+    assert losses.get('loss_bbox', 0) == 0, 'empty gt loss should be zero'
+
+    # Test bbox loss when truth is non-empty
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+
+    sampling_results = _dummy_bbox_sampling(proposal_list, gt_bboxes,
+                                            gt_labels)
+    rois = bbox2roi([res.bboxes for res in sampling_results])
+
+    bbox_targets = self.get_targets(sampling_results, gt_bboxes, gt_labels,
+                                    target_cfg)
+    labels, label_weights, bbox_targets, bbox_weights = bbox_targets
+
+    # Create dummy features "extracted" for each sampled bbox
+    num_sampled = sum(len(res.bboxes) for res in sampling_results)
+    dummy_feats = torch.rand(num_sampled, 8 * 3 * 3)  # in_channels * 3 * 3
+    cls_scores, bbox_preds = self.forward(dummy_feats)
+
+    losses = self.loss(cls_scores, bbox_preds, rois, labels, label_weights,
+                       bbox_targets, bbox_weights)
+    assert losses.get('loss_cls', 0) > 0, 'cls-loss should be non-zero'
+    assert losses.get('loss_bbox', 0) > 0, 'box-loss should be non-zero'
+
+
+@pytest.mark.parametrize('num_sample', [0, 1, 2])
+def test_bbox_head_get_bboxes(num_sample):  # includes the zero-roi case
+    self = BBoxHead(reg_class_agnostic=True)
+
+    num_class = 6
+    rois = torch.rand((num_sample, 5))
+    cls_score = torch.rand((num_sample, num_class))
+    bbox_pred = torch.rand((num_sample, 4))
+
+    scale_factor = np.array([2.0, 2.0, 2.0, 2.0])
+    det_bboxes, det_labels = self.get_bboxes(
+        rois, cls_score, bbox_pred, None, scale_factor, rescale=True)
+    if num_sample == 0:  # empty input must give empty outputs
+        assert len(det_bboxes) == 0 and len(det_labels) == 0
+    else:
+        assert det_bboxes.shape == bbox_pred.shape
+        assert det_labels.shape == cls_score.shape
+
+
+def test_refine_boxes():
+    """Mirrors the doctest in
+    ``mmdet.models.roi_heads.bbox_heads.BBoxHead.refine_bboxes`` but checks
+    for multiple values of n_roi / n_img."""
+    self = BBoxHead(reg_class_agnostic=True)
+
+    test_settings = [
+
+        # Corner case: fewer rois than images
+        {
+            'n_roi': 2,
+            'n_img': 4,
+            'rng': 34285940
+        },
+
+        # Corner case: no images
+        {
+            'n_roi': 0,
+            'n_img': 0,
+            'rng': 52925222
+        },
+
+        # Corner cases: few images / rois
+        {
+            'n_roi': 1,
+            'n_img': 1,
+            'rng': 1200281
+        },
+        {
+            'n_roi': 2,
+            'n_img': 1,
+            'rng': 1200282
+        },
+        {
+            'n_roi': 2,
+            'n_img': 2,
+            'rng': 1200283
+        },
+        {
+            'n_roi': 1,
+            'n_img': 2,
+            'rng': 1200284
+        },
+
+        # Corner case: no rois, few images
+        {
+            'n_roi': 0,
+            'n_img': 1,
+            'rng': 23955860
+        },
+        {
+            'n_roi': 0,
+            'n_img': 2,
+            'rng': 25830516
+        },
+
+        # Corner case: no rois, many images
+        {
+            'n_roi': 0,
+            'n_img': 10,
+            'rng': 671346
+        },
+        {
+            'n_roi': 0,
+            'n_img': 20,
+            'rng': 699807
+        },
+
+        # Corner case: similar number of rois and images
+        {
+            'n_roi': 20,
+            'n_img': 20,
+            'rng': 1200238
+        },
+        {
+            'n_roi': 10,
+            'n_img': 20,
+            'rng': 1200238
+        },
+        {
+            'n_roi': 5,
+            'n_img': 5,
+            'rng': 1200238
+        },
+
+        # ----------------------------------
+        # Common case: more rois than images
+        {
+            'n_roi': 100,
+            'n_img': 1,
+            'rng': 337156
+        },
+        {
+            'n_roi': 150,
+            'n_img': 2,
+            'rng': 275898
+        },
+        {
+            'n_roi': 500,
+            'n_img': 5,
+            'rng': 4903221
+        },
+    ]
+
+    for demokw in test_settings:
+        try:
+            n_roi = demokw['n_roi']
+            n_img = demokw['n_img']
+            rng = demokw['rng']
+
+            print(f'Test refine_boxes case: {demokw!r}')
+            tup = _demodata_refine_boxes(n_roi, n_img, rng=rng)
+            rois, labels, bbox_preds, pos_is_gts, img_metas = tup
+            bboxes_list = self.refine_bboxes(rois, labels, bbox_preds,
+                                             pos_is_gts, img_metas)
+            assert len(bboxes_list) == n_img  # one entry per image
+            assert sum(map(len, bboxes_list)) <= n_roi
+            assert all(b.shape[1] == 4 for b in bboxes_list)
+        except Exception:  # report the failing setting, then re-raise
+            print(f'Test failed with demokw={demokw!r}')
+            raise
+
+
+def _demodata_refine_boxes(n_roi, n_img, rng=0):
+    """Create random (rois, labels, bbox_preds, pos_is_gts, img_metas) for
+    ``BBoxHead.refine_bboxes``; skips the test if kwarray is missing."""
+    import numpy as np
+
+    from mmdet.core.bbox.demodata import ensure_rng, random_boxes
+    try:
+        import kwarray
+    except ImportError:
+        import pytest
+        pytest.skip('kwarray is required for this test')
+    scale = 512
+    rng = ensure_rng(rng)
+    img_metas = [{'img_shape': (scale, scale)} for _ in range(n_img)]
+    # Create rois as rows of (img_id, 4 box values)
+    roi_boxes = random_boxes(n_roi, scale=scale, rng=rng)
+    if n_img == 0:
+        assert n_roi == 0, 'cannot have any rois if there are no images'
+        img_ids = torch.empty((0, ), dtype=torch.long)
+        roi_boxes = torch.empty((0, 4), dtype=torch.float32)
+    else:
+        img_ids = rng.randint(0, n_img, (n_roi, ))
+        img_ids = torch.from_numpy(img_ids)
+    rois = torch.cat([img_ids[:, None].float(), roi_boxes], dim=1)
+    # Create other args (labels are drawn from {0, 1})
+    labels = rng.randint(0, 2, (n_roi, ))
+    labels = torch.from_numpy(labels).long()
+    bbox_preds = random_boxes(n_roi, scale=scale, rng=rng)
+    # For each image, pretend random positive boxes are gts
+    is_label_pos = (labels.numpy() > 0).astype(int)  # np.int is removed
+    lbl_per_img = kwarray.group_items(is_label_pos, img_ids.numpy())
+    pos_per_img = [sum(lbl_per_img.get(gid, [])) for gid in range(n_img)]
+    # randomly generate with numpy then sort with torch
+    _pos_is_gts = [
+        rng.randint(0, 2, (npos, )).astype(np.uint8) for npos in pos_per_img
+    ]
+    pos_is_gts = [
+        torch.from_numpy(p).sort(descending=True)[0] for p in _pos_is_gts
+    ]
+    return rois, labels, bbox_preds, pos_is_gts, img_metas
diff --git a/tests/test_models/test_roi_heads/test_mask_head.py b/tests/test_models/test_roi_heads/test_mask_head.py
new file mode 100755
index 0000000..89a476d
--- /dev/null
+++ b/tests/test_models/test_roi_heads/test_mask_head.py
@@ -0,0 +1,97 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.models.roi_heads.mask_heads import (DynamicMaskHead, FCNMaskHead,
+                                               MaskIoUHead)
+from .utils import _dummy_bbox_sampling
+
+
+def test_mask_head_loss():
+    """Test FCN, MaskIoU and dynamic mask head losses with a single gt."""
+    self = FCNMaskHead(
+        num_convs=1,
+        roi_feat_size=6,
+        in_channels=8,
+        conv_out_channels=8,
+        num_classes=8)
+
+    # Dummy proposals (one image, one box)
+    proposal_list = [
+        torch.Tensor([[23.6667, 23.8757, 228.6326, 153.8874]]),
+    ]
+
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+    sampling_results = _dummy_bbox_sampling(proposal_list, gt_bboxes,
+                                            gt_labels)
+
+    # create dummy mask
+    import numpy as np
+
+    from mmdet.core import BitmapMasks
+    dummy_mask = np.random.randint(0, 2, (1, 160, 240), dtype=np.uint8)
+    gt_masks = [BitmapMasks(dummy_mask, 160, 240)]
+
+    # create dummy train_cfg
+    train_cfg = mmcv.Config(dict(mask_size=12, mask_thr_binary=0.5))
+
+    # Create dummy features "extracted" for each sampled bbox
+    num_sampled = sum(len(res.bboxes) for res in sampling_results)
+    dummy_feats = torch.rand(num_sampled, 8, 6, 6)
+
+    mask_pred = self.forward(dummy_feats)
+    mask_targets = self.get_targets(sampling_results, gt_masks, train_cfg)
+    pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
+    loss_mask = self.loss(mask_pred, mask_targets, pos_labels)
+
+    onegt_mask_loss = sum(loss_mask['loss_mask'])
+    assert onegt_mask_loss.item() > 0, 'mask loss should be non-zero'
+
+    # test mask_iou_head
+    mask_iou_head = MaskIoUHead(
+        num_convs=1,
+        num_fcs=1,
+        roi_feat_size=6,
+        in_channels=8,
+        conv_out_channels=8,
+        fc_out_channels=8,
+        num_classes=8)
+
+    pos_mask_pred = mask_pred[range(mask_pred.size(0)), pos_labels]
+    mask_iou_pred = mask_iou_head(dummy_feats, pos_mask_pred)
+    pos_mask_iou_pred = mask_iou_pred[range(mask_iou_pred.size(0)), pos_labels]
+
+    mask_iou_targets = mask_iou_head.get_targets(sampling_results, gt_masks,
+                                                 pos_mask_pred, mask_targets,
+                                                 train_cfg)
+    loss_mask_iou = mask_iou_head.loss(pos_mask_iou_pred, mask_iou_targets)
+    onegt_mask_iou_loss = loss_mask_iou['loss_mask_iou'].sum()
+    assert onegt_mask_iou_loss.item() >= 0
+
+    # test dynamic_mask_head
+    dummy_proposal_feats = torch.rand(num_sampled, 8)
+    dynamic_mask_head = DynamicMaskHead(
+        dynamic_conv_cfg=dict(
+            type='DynamicConv',
+            in_channels=8,
+            feat_channels=8,
+            out_channels=8,
+            input_feat_shape=6,
+            with_proj=False,
+            act_cfg=dict(type='ReLU', inplace=True),
+            norm_cfg=dict(type='LN')),
+        num_convs=1,
+        num_classes=8,
+        in_channels=8,
+        roi_feat_size=6)
+
+    mask_pred = dynamic_mask_head(dummy_feats, dummy_proposal_feats)
+
+    mask_target = dynamic_mask_head.get_targets(sampling_results, gt_masks,
+                                                train_cfg)
+    loss_mask = dynamic_mask_head.loss(mask_pred, mask_target, pos_labels)
+    loss_mask = loss_mask['loss_mask'].sum()
+    assert loss_mask.item() >= 0
diff --git a/tests/test_models/test_roi_heads/test_roi_extractor.py b/tests/test_models/test_roi_heads/test_roi_extractor.py
new file mode 100755
index 0000000..b79dff9
--- /dev/null
+++ b/tests/test_models/test_roi_heads/test_roi_extractor.py
@@ -0,0 +1,114 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet.models.roi_heads.roi_extractors import GenericRoIExtractor
+
+
+def test_groie():  # GenericRoIExtractor output shapes and config checks
+    # test with pre/post
+    cfg = dict(
+        roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+        out_channels=256,
+        featmap_strides=[4, 8, 16, 32],
+        pre_cfg=dict(
+            type='ConvModule',
+            in_channels=256,
+            out_channels=256,
+            kernel_size=5,
+            padding=2,
+            inplace=False,
+        ),
+        post_cfg=dict(
+            type='ConvModule',
+            in_channels=256,
+            out_channels=256,
+            kernel_size=5,
+            padding=2,
+            inplace=False))
+
+    groie = GenericRoIExtractor(**cfg)
+
+    feats = (
+        torch.rand((1, 256, 200, 336)),
+        torch.rand((1, 256, 100, 168)),
+        torch.rand((1, 256, 50, 84)),
+        torch.rand((1, 256, 25, 42)),
+    )
+
+    rois = torch.tensor([[0.0000, 587.8285, 52.1405, 886.2484, 341.5644]])
+
+    res = groie(feats, rois)
+    assert res.shape == torch.Size([1, 256, 7, 7])
+
+    # test without pre/post
+    cfg = dict(
+        roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+        out_channels=256,
+        featmap_strides=[4, 8, 16, 32])
+
+    groie = GenericRoIExtractor(**cfg)
+
+    feats = (
+        torch.rand((1, 256, 200, 336)),
+        torch.rand((1, 256, 100, 168)),
+        torch.rand((1, 256, 50, 84)),
+        torch.rand((1, 256, 25, 42)),
+    )
+
+    rois = torch.tensor([[0.0000, 587.8285, 52.1405, 886.2484, 341.5644]])
+
+    res = groie(feats, rois)
+    assert res.shape == torch.Size([1, 256, 7, 7])
+
+    # test without pre/post, using concat aggregation
+    cfg = dict(
+        aggregation='concat',
+        roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+        out_channels=256 * 4,
+        featmap_strides=[4, 8, 16, 32])
+
+    groie = GenericRoIExtractor(**cfg)
+
+    feats = (
+        torch.rand((1, 256, 200, 336)),
+        torch.rand((1, 256, 100, 168)),
+        torch.rand((1, 256, 50, 84)),
+        torch.rand((1, 256, 25, 42)),
+    )
+
+    rois = torch.tensor([[0.0000, 587.8285, 52.1405, 886.2484, 341.5644]])
+
+    res = groie(feats, rois)
+    assert res.shape == torch.Size([1, 1024, 7, 7])
+
+    # test unsupported aggregation method
+    with pytest.raises(AssertionError):
+        cfg = dict(
+            aggregation='not support',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+            out_channels=1024,
+            featmap_strides=[4, 8, 16, 32])
+        _ = GenericRoIExtractor(**cfg)
+
+    # test concat with a mismatched channel count
+    cfg = dict(
+        aggregation='concat',
+        roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+        out_channels=256 * 5,  # 256*5 != 256*4
+        featmap_strides=[4, 8, 16, 32])
+
+    groie = GenericRoIExtractor(**cfg)
+
+    feats = (
+        torch.rand((1, 256, 200, 336)),
+        torch.rand((1, 256, 100, 168)),
+        torch.rand((1, 256, 50, 84)),
+        torch.rand((1, 256, 25, 42)),
+    )
+
+    rois = torch.tensor([[0.0000, 587.8285, 52.1405, 886.2484, 341.5644]])
+
+    # out_channels does not equal the sum of the feat channels
+    with pytest.raises(AssertionError):
+        _ = groie(feats, rois)
diff --git a/tests/test_models/test_roi_heads/test_sabl_bbox_head.py b/tests/test_models/test_roi_heads/test_sabl_bbox_head.py
new file mode 100755
index 0000000..d412e3a
--- /dev/null
+++ b/tests/test_models/test_roi_heads/test_sabl_bbox_head.py
@@ -0,0 +1,77 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+
+from mmdet.core import bbox2roi
+from mmdet.models.roi_heads.bbox_heads import SABLHead
+from .utils import _dummy_bbox_sampling
+
+
+def test_sabl_bbox_head_loss():
+    """Tests bbox head loss when truth is empty and non-empty."""
+    self = SABLHead(
+        num_classes=4,
+        cls_in_channels=3,
+        reg_in_channels=3,
+        cls_out_channels=3,
+        reg_offset_out_channels=3,
+        reg_cls_out_channels=3,
+        roi_feat_size=7)
+
+    # Dummy proposals
+    proposal_list = [
+        torch.Tensor([[23.6667, 23.8757, 228.6326, 153.8874]]),
+    ]
+
+    target_cfg = mmcv.Config(dict(pos_weight=1))
+
+    # Test bbox loss when truth is empty
+    gt_bboxes = [torch.empty((0, 4))]
+    gt_labels = [torch.LongTensor([])]
+
+    sampling_results = _dummy_bbox_sampling(proposal_list, gt_bboxes,
+                                            gt_labels)
+
+    bbox_targets = self.get_targets(sampling_results, gt_bboxes, gt_labels,
+                                    target_cfg)
+    labels, label_weights, bbox_targets, bbox_weights = bbox_targets
+
+    # Create dummy features "extracted" for each sampled bbox
+    num_sampled = sum(len(res.bboxes) for res in sampling_results)
+    rois = bbox2roi([res.bboxes for res in sampling_results])
+    dummy_feats = torch.rand(num_sampled, 3, 7, 7)
+    cls_scores, bbox_preds = self.forward(dummy_feats)
+
+    losses = self.loss(cls_scores, bbox_preds, rois, labels, label_weights,
+                       bbox_targets, bbox_weights)
+    assert losses.get('loss_cls', 0) > 0, 'cls-loss should be non-zero'
+    assert losses.get('loss_bbox_cls',
+                      0) == 0, 'empty gt bbox-cls-loss should be zero'
+    assert losses.get('loss_bbox_reg',
+                      0) == 0, 'empty gt bbox-reg-loss should be zero'
+
+    # Test bbox loss when truth is non-empty
+    gt_bboxes = [
+        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
+    ]
+    gt_labels = [torch.LongTensor([2])]
+
+    sampling_results = _dummy_bbox_sampling(proposal_list, gt_bboxes,
+                                            gt_labels)
+    rois = bbox2roi([res.bboxes for res in sampling_results])
+
+    bbox_targets = self.get_targets(sampling_results, gt_bboxes, gt_labels,
+                                    target_cfg)
+    labels, label_weights, bbox_targets, bbox_weights = bbox_targets
+
+    # Create dummy features "extracted" for each sampled bbox
+    num_sampled = sum(len(res.bboxes) for res in sampling_results)
+    dummy_feats = torch.rand(num_sampled, 3, 7, 7)
+    cls_scores, bbox_preds = self.forward(dummy_feats)
+
+    losses = self.loss(cls_scores, bbox_preds, rois, labels, label_weights,
+                       bbox_targets, bbox_weights)
+    assert losses.get('loss_bbox_cls',
+                      0) > 0, 'non-empty gt bbox-cls-loss should be non-zero'
+    assert losses.get('loss_bbox_reg',
+                      0) > 0, 'non-empty gt bbox-reg-loss should be non-zero'
diff --git a/tests/test_models/test_roi_heads/utils.py b/tests/test_models/test_roi_heads/utils.py
new file mode 100755
index 0000000..748cb0e
--- /dev/null
+++ b/tests/test_models/test_roi_heads/utils.py
@@ -0,0 +1,38 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet.core import build_assigner, build_sampler
+
+
+def _dummy_bbox_sampling(proposal_list, gt_bboxes, gt_labels):
+    """Build sampling results that can be passed to BBoxHead.get_targets.
+
+    Only the first ``num_imgs`` (fixed to 1) entry of each input is used.
+    """
+    num_imgs = 1
+    feat = torch.rand(1, 1, 3, 3)
+    bbox_assigner = build_assigner(
+        dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.5,
+            min_pos_iou=0.5,
+            ignore_iof_thr=-1))
+    bbox_sampler = build_sampler(
+        dict(
+            type='RandomSampler',
+            num=512,
+            pos_fraction=0.25,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=True))
+    sampling_results = []
+    for img_id in range(num_imgs):
+        proposals = proposal_list[img_id]
+        bboxes, labels = gt_bboxes[img_id], gt_labels[img_id]
+        # No ignored gt boxes are supplied for the dummy assignment.
+        assign_result = bbox_assigner.assign(proposals, bboxes, None, labels)
+        sampling_results.append(
+            bbox_sampler.sample(
+                assign_result, proposals, bboxes, labels, feats=feat))
+
+    return sampling_results
diff --git a/tests/test_models/test_seg_heads/test_maskformer_fusion_head.py b/tests/test_models/test_seg_heads/test_maskformer_fusion_head.py
new file mode 100755
index 0000000..8d5131f
--- /dev/null
+++ b/tests/test_models/test_seg_heads/test_maskformer_fusion_head.py
@@ -0,0 +1,53 @@
+import pytest
+import torch
+from mmcv import ConfigDict
+
+from mmdet.models.seg_heads.panoptic_fusion_heads import MaskFormerFusionHead
+
+
+def test_maskformer_fusion_head():
+    img_metas = [
+        {
+            'batch_input_shape': (128, 160),
+            'img_shape': (126, 160, 3),
+            'ori_shape': (63, 80, 3),
+            'pad_shape': (128, 160, 3)
+        },
+    ]
+    num_things_classes = 80
+    num_stuff_classes = 53
+    num_classes = num_things_classes + num_stuff_classes
+    config = ConfigDict(
+        type='MaskFormerFusionHead',
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
+        loss_panoptic=None,
+        test_cfg=dict(
+            panoptic_on=True,
+            semantic_on=False,
+            instance_on=True,
+            max_per_image=100,
+            object_mask_thr=0.8,
+            iou_thr=0.8,
+            filter_low_score=False),
+        init_cfg=None)
+
+    self = MaskFormerFusionHead(**config)
+
+    # forward_train is expected to return an empty dict
+    assert self.forward_train() == dict()
+
+    mask_cls_results = torch.rand((1, 100, num_classes + 1))
+    mask_pred_results = torch.rand((1, 100, 128, 160))
+
+    # panoptic_on and instance_on are set: expect both kinds of results
+    results = self.simple_test(mask_cls_results, mask_pred_results, img_metas)
+    assert 'ins_results' in results[0] and 'pan_results' in results[0]
+
+    # semantic_on=True is unsupported: simple_test is expected to assert
+    config.test_cfg.semantic_on = True
+    with pytest.raises(AssertionError):
+        self.simple_test(mask_cls_results, mask_pred_results, img_metas)
+
+    with pytest.raises(NotImplementedError):
+        self.semantic_postprocess(mask_cls_results, mask_pred_results)
diff --git a/tests/test_models/test_utils/test_brick_wrappers.py b/tests/test_models/test_utils/test_brick_wrappers.py
new file mode 100755
index 0000000..9aa5bd0
--- /dev/null
+++ b/tests/test_models/test_utils/test_brick_wrappers.py
@@ -0,0 +1,93 @@
+from unittest.mock import patch
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmdet.models.utils import AdaptiveAvgPool2d, adaptive_avg_pool2d
+
+# Version string patched into torch.__version__ by the tests below:
+# 'parrots' builds keep their marker, every other build reports '1.7'
+# (presumably to reach the wrappers' old-torch code path -- TODO confirm).
+torch_version = 'parrots' if torch.__version__ == 'parrots' else '1.7'
+
+
+@patch('torch.__version__', torch_version)
+def test_adaptive_avg_pool2d():
+    # Test the empty batch dimension
+    # Test the two input conditions
+    x_empty = torch.randn(0, 3, 4, 5)
+    # 1. tuple[int, int]
+    wrapper_out = adaptive_avg_pool2d(x_empty, (2, 2))
+    assert wrapper_out.shape == (0, 3, 2, 2)
+    # 2. int
+    wrapper_out = adaptive_avg_pool2d(x_empty, 2)
+    assert wrapper_out.shape == (0, 3, 2, 2)
+
+    # wrapper op with a non-empty 4-dim input must match F.adaptive_avg_pool2d
+    x_normal = torch.randn(3, 3, 4, 5)
+    wrapper_out = adaptive_avg_pool2d(x_normal, (2, 2))
+    ref_out = F.adaptive_avg_pool2d(x_normal, (2, 2))
+    assert wrapper_out.shape == (3, 3, 2, 2)
+    assert torch.equal(wrapper_out, ref_out)
+
+    wrapper_out = adaptive_avg_pool2d(x_normal, 2)
+    ref_out = F.adaptive_avg_pool2d(x_normal, 2)
+    assert wrapper_out.shape == (3, 3, 2, 2)
+    assert torch.equal(wrapper_out, ref_out)
+
+
+@patch('torch.__version__', torch_version)
+def test_AdaptiveAvgPool2d():
+    # Test the empty batch dimension
+    x_empty = torch.randn(0, 3, 4, 5)
+    # Test the four input conditions
+    # 1. tuple[int, int]
+    wrapper = AdaptiveAvgPool2d((2, 2))
+    wrapper_out = wrapper(x_empty)
+    assert wrapper_out.shape == (0, 3, 2, 2)
+
+    # 2. int
+    wrapper = AdaptiveAvgPool2d(2)
+    wrapper_out = wrapper(x_empty)
+    assert wrapper_out.shape == (0, 3, 2, 2)
+
+    # 3. tuple[None, int]
+    wrapper = AdaptiveAvgPool2d((None, 2))
+    wrapper_out = wrapper(x_empty)
+    assert wrapper_out.shape == (0, 3, 4, 2)
+
+    # 4. tuple[int, None]
+    wrapper = AdaptiveAvgPool2d((2, None))
+    wrapper_out = wrapper(x_empty)
+    assert wrapper_out.shape == (0, 3, 2, 5)
+
+    # Test the normal batch dimension
+    x_normal = torch.randn(3, 3, 4, 5)
+    wrapper = AdaptiveAvgPool2d((2, 2))
+    ref = nn.AdaptiveAvgPool2d((2, 2))
+    wrapper_out = wrapper(x_normal)
+    ref_out = ref(x_normal)
+    assert wrapper_out.shape == (3, 3, 2, 2)
+    assert torch.equal(wrapper_out, ref_out)
+
+    wrapper = AdaptiveAvgPool2d(2)
+    ref = nn.AdaptiveAvgPool2d(2)
+    wrapper_out = wrapper(x_normal)
+    ref_out = ref(x_normal)
+    assert wrapper_out.shape == (3, 3, 2, 2)
+    assert torch.equal(wrapper_out, ref_out)
+
+    wrapper = AdaptiveAvgPool2d((None, 2))
+    ref = nn.AdaptiveAvgPool2d((None, 2))
+    wrapper_out = wrapper(x_normal)
+    ref_out = ref(x_normal)
+    assert wrapper_out.shape == (3, 3, 4, 2)
+    assert torch.equal(wrapper_out, ref_out)
+
+    wrapper = AdaptiveAvgPool2d((2, None))
+    ref = nn.AdaptiveAvgPool2d((2, None))
+    wrapper_out = wrapper(x_normal)
+    ref_out = ref(x_normal)
+    assert wrapper_out.shape == (3, 3, 2, 5)
+    assert torch.equal(wrapper_out, ref_out)
diff --git a/tests/test_models/test_utils/test_conv_upsample.py b/tests/test_models/test_utils/test_conv_upsample.py
new file mode 100755
index 0000000..95a0ccc
--- /dev/null
+++ b/tests/test_models/test_utils/test_conv_upsample.py
@@ -0,0 +1,24 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet.models.utils import ConvUpsample
+
+
+@pytest.mark.parametrize('num_layers', [0, 1, 2])
+def test_conv_upsample(num_layers):
+    """Check that the output is 2**num_upsample times the input size."""
+    # num_layers == 0 is mapped to a single layer without upsampling.
+    num_upsample = max(num_layers, 0)
+    num_layers = max(num_layers, 1)
+    layer = ConvUpsample(
+        10,
+        5,
+        num_layers=num_layers,
+        num_upsample=num_upsample,
+        conv_cfg=None,
+        norm_cfg=None)
+
+    in_size = 5
+    out = layer(torch.randn((1, 10, in_size, in_size)))
+    assert out.shape[-2:] == (in_size * 2**num_upsample, ) * 2
diff --git a/tests/test_models/test_utils/test_inverted_residual.py b/tests/test_models/test_utils/test_inverted_residual.py
new file mode 100755
index 0000000..14a331a
--- /dev/null
+++ b/tests/test_models/test_utils/test_inverted_residual.py
@@ -0,0 +1,76 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+from mmcv.cnn import is_norm
+from torch.nn.modules import GroupNorm
+
+from mmdet.models.utils import InvertedResidual, SELayer
+
+
+def test_inverted_residual():
+
+    with pytest.raises(AssertionError):
+        # stride must be in [1, 2]
+        InvertedResidual(16, 16, 32, stride=3)
+
+    with pytest.raises(AssertionError):
+        # se_cfg must be None or dict
+        InvertedResidual(16, 16, 32, se_cfg=list())
+
+    with pytest.raises(AssertionError):
+        # in_channels and mid_channels must be the same if
+        # with_expand_conv is False
+        InvertedResidual(16, 16, 32, with_expand_conv=False)
+
+    # Test InvertedResidual forward, stride=1
+    block = InvertedResidual(16, 16, 32, stride=1)
+    x = torch.randn(1, 16, 56, 56)
+    x_out = block(x)
+    assert getattr(block, 'se', None) is None
+    assert block.with_res_shortcut
+    assert x_out.shape == torch.Size((1, 16, 56, 56))
+
+    # Test InvertedResidual forward, stride=2
+    block = InvertedResidual(16, 16, 32, stride=2)
+    x = torch.randn(1, 16, 56, 56)
+    x_out = block(x)
+    assert not block.with_res_shortcut
+    assert x_out.shape == torch.Size((1, 16, 28, 28))
+
+    # Test InvertedResidual forward with se layer
+    se_cfg = dict(channels=32)
+    block = InvertedResidual(16, 16, 32, stride=1, se_cfg=se_cfg)
+    x = torch.randn(1, 16, 56, 56)
+    x_out = block(x)
+    assert isinstance(block.se, SELayer)
+    assert x_out.shape == torch.Size((1, 16, 56, 56))
+
+    # Test InvertedResidual forward, with_expand_conv=False
+    block = InvertedResidual(32, 16, 32, with_expand_conv=False)
+    x = torch.randn(1, 32, 56, 56)
+    x_out = block(x)
+    assert getattr(block, 'expand_conv', None) is None
+    assert x_out.shape == torch.Size((1, 16, 56, 56))
+
+    # Test InvertedResidual forward with GroupNorm
+    block = InvertedResidual(
+        16, 16, 32, norm_cfg=dict(type='GN', num_groups=2))
+    x = torch.randn(1, 16, 56, 56)
+    x_out = block(x)
+    for m in block.modules():
+        if is_norm(m):
+            assert isinstance(m, GroupNorm)
+    assert x_out.shape == torch.Size((1, 16, 56, 56))
+
+    # Test InvertedResidual forward with HSigmoid
+    block = InvertedResidual(16, 16, 32, act_cfg=dict(type='HSigmoid'))
+    x = torch.randn(1, 16, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size((1, 16, 56, 56))
+
+    # Test InvertedResidual forward with checkpoint
+    block = InvertedResidual(16, 16, 32, with_cp=True)
+    x = torch.randn(1, 16, 56, 56)
+    x_out = block(x)
+    assert block.with_cp
+    assert x_out.shape == torch.Size((1, 16, 56, 56))
diff --git a/tests/test_models/test_utils/test_model_misc.py b/tests/test_models/test_utils/test_model_misc.py
new file mode 100755
index 0000000..93de336
--- /dev/null
+++ b/tests/test_models/test_utils/test_model_misc.py
@@ -0,0 +1,36 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from torch.autograd import gradcheck
+
+from mmdet.models.utils import interpolate_as, sigmoid_geometric_mean
+
+
+def test_interpolate_as():
+    """Check the output shape of interpolate_as for each input layout.
+
+    Covers 4D/3D sources combined with 4D/3D tensor targets and a 2D
+    np.ndarray target; the output always takes the target's 16x16 size.
+    """
+    src_4d = torch.rand((1, 5, 4, 4))
+    tgt_4d = torch.rand((1, 1, 16, 16))
+    src_3d, tgt_3d = src_4d.squeeze(0), tgt_4d.squeeze(0)
+
+    # A 4D source keeps its batch dimension for 4D and 3D targets.
+    for tgt in (tgt_4d, tgt_3d):
+        out = interpolate_as(src_4d, tgt)
+        assert out.shape == torch.Size((1, 5, 16, 16))
+
+    # A 3D source stays 3D, both for a tensor target and for a 2D
+    # np.ndarray target.
+    for tgt in (tgt_4d, np.random.rand(16, 16)):
+        out = interpolate_as(src_3d, tgt)
+        assert out.shape == torch.Size((5, 16, 16))
+
+
+def test_sigmoid_geometric_mean():
+    """Numerically verify the gradients of sigmoid_geometric_mean."""
+    # gradcheck compares analytical with numerical gradients; use doubles.
+    shape, opts = (20, 20), dict(dtype=torch.double, requires_grad=True)
+    inputs = (torch.randn(*shape, **opts), torch.randn(*shape, **opts))
+    assert gradcheck(sigmoid_geometric_mean, inputs, eps=1e-6, atol=1e-4)
diff --git a/tests/test_models/test_utils/test_position_encoding.py b/tests/test_models/test_utils/test_position_encoding.py
new file mode 100755
index 0000000..1119410
--- /dev/null
+++ b/tests/test_models/test_utils/test_position_encoding.py
@@ -0,0 +1,39 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet.models.utils import (LearnedPositionalEncoding,
+                                SinePositionalEncoding)
+
+
+def test_sine_positional_encoding(num_feats=16, batch_size=2):
+    # a tuple `scale` should be rejected with an AssertionError
+    with pytest.raises(AssertionError):
+        module = SinePositionalEncoding(
+            num_feats, scale=(3., ), normalize=True)
+
+    module = SinePositionalEncoding(num_feats)
+    h, w = 10, 6
+    mask = (torch.rand(batch_size, h, w) > 0.5).to(torch.int)
+    assert not module.normalize
+    out = module(mask)
+    assert out.shape == (batch_size, num_feats * 2, h, w)
+
+    # normalize=True should leave the output shape unchanged
+    module = SinePositionalEncoding(num_feats, normalize=True)
+    assert module.normalize
+    out = module(mask)
+    assert out.shape == (batch_size, num_feats * 2, h, w)
+
+
+def test_learned_positional_encoding(num_feats=16,
+                                     row_num_embed=10,
+                                     col_num_embed=10,
+                                     batch_size=2):
+    """Check the embedding table shapes and the encoded output shape."""
+    encoder = LearnedPositionalEncoding(num_feats, row_num_embed,
+                                        col_num_embed)
+    assert encoder.row_embed.weight.shape == (row_num_embed, num_feats)
+    assert encoder.col_embed.weight.shape == (col_num_embed, num_feats)
+    pos = encoder(torch.rand(batch_size, 10, 6) > 0.5)
+    assert pos.shape == (batch_size, num_feats * 2, 10, 6)
diff --git a/tests/test_models/test_utils/test_se_layer.py b/tests/test_models/test_utils/test_se_layer.py
new file mode 100755
index 0000000..b525b91
--- /dev/null
+++ b/tests/test_models/test_utils/test_se_layer.py
@@ -0,0 +1,54 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+import torch.nn.functional as F
+from mmcv.cnn import constant_init
+
+from mmdet.models.utils import DyReLU, SELayer
+
+
+def test_se_layer():
+    with pytest.raises(AssertionError):
+        # act_cfg sequence length must be equal to 2
+        SELayer(channels=32, act_cfg=(dict(type='ReLU'), ))
+
+    with pytest.raises(AssertionError):
+        # act_cfg sequence must be a tuple of dicts
+        SELayer(channels=32, act_cfg=[dict(type='ReLU'), dict(type='ReLU')])
+
+    # Test SELayer forward
+    layer = SELayer(channels=32)
+    layer.init_weights()
+    layer.train()
+
+    x = torch.randn((1, 32, 10, 10))
+    x_out = layer(x)
+    assert x_out.shape == torch.Size((1, 32, 10, 10))
+
+
+def test_dyrelu():
+    with pytest.raises(AssertionError):
+        # act_cfg sequence length must be equal to 2
+        DyReLU(channels=32, act_cfg=(dict(type='ReLU'), ))
+
+    with pytest.raises(AssertionError):
+        # act_cfg sequence must be a tuple of dicts
+        DyReLU(channels=32, act_cfg=[dict(type='ReLU'), dict(type='ReLU')])
+
+    # Test DyReLU forward
+    layer = DyReLU(channels=32)
+    layer.init_weights()
+    layer.train()
+    x = torch.randn((1, 32, 10, 10))
+    x_out = layer(x)
+    assert x_out.shape == torch.Size((1, 32, 10, 10))
+
+    # DyReLU should act as standard (static) ReLU
+    # when eliminating the effect of SE-like module
+    layer = DyReLU(channels=32)
+    constant_init(layer.conv2.conv, 0)
+    layer.train()
+    x = torch.randn((1, 32, 10, 10))
+    x_out = layer(x)
+    relu_out = F.relu(x)
+    assert torch.equal(x_out, relu_out)
diff --git a/tests/test_models/test_utils/test_transformer.py b/tests/test_models/test_utils/test_transformer.py
new file mode 100755
index 0000000..9c6efb4
--- /dev/null
+++ b/tests/test_models/test_utils/test_transformer.py
@@ -0,0 +1,569 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+from mmcv.utils import ConfigDict
+
+from mmdet.models.utils.transformer import (AdaptivePadding,
+                                            DetrTransformerDecoder,
+                                            DetrTransformerEncoder, PatchEmbed,
+                                            PatchMerging, Transformer)
+
+
+def test_adaptive_padding():
+
+    for padding in ('same', 'corner'):
+        kernel_size = 16
+        stride = 16
+        dilation = 1
+        input = torch.rand(1, 1, 15, 17)
+        pool = AdaptivePadding(
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding)
+        out = pool(input)
+        # padded up to a multiple of 16
+        assert (out.shape[2], out.shape[3]) == (16, 32)
+        input = torch.rand(1, 1, 16, 17)
+        out = pool(input)
+        # padded up to a multiple of 16
+        assert (out.shape[2], out.shape[3]) == (16, 32)
+
+        kernel_size = (2, 2)
+        stride = (2, 2)
+        dilation = (1, 1)
+
+        adap_pad = AdaptivePadding(
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding)
+        input = torch.rand(1, 1, 11, 13)
+        out = adap_pad(input)
+        # padded up to a multiple of 2
+        assert (out.shape[2], out.shape[3]) == (12, 14)
+
+        kernel_size = (2, 2)
+        stride = (10, 10)
+        dilation = (1, 1)
+
+        adap_pad = AdaptivePadding(
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding)
+        input = torch.rand(1, 1, 10, 13)
+        out = adap_pad(input)
+        # no padding expected: the output keeps the input size
+        assert (out.shape[2], out.shape[3]) == (10, 13)
+
+        kernel_size = (11, 11)
+        adap_pad = AdaptivePadding(
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding)
+        input = torch.rand(1, 1, 11, 13)
+        out = adap_pad(input)
+        # both dimensions are expected to be padded (to 21)
+        assert (out.shape[2], out.shape[3]) == (21, 21)
+
+        # a dilated (4, 5) kernel should pad like a plain (7, 9) kernel
+        input = torch.rand(1, 1, 11, 13)
+        stride = (3, 4)
+        kernel_size = (4, 5)
+        dilation = (2, 2)
+        # effective kernel size is (7, 9): dilation * (k - 1) + 1
+        adap_pad = AdaptivePadding(
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding)
+        dilation_out = adap_pad(input)
+        assert (dilation_out.shape[2], dilation_out.shape[3]) == (16, 21)
+        kernel_size = (7, 9)
+        dilation = (1, 1)
+        adap_pad = AdaptivePadding(
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding)
+        kernel79_out = adap_pad(input)
+        assert (kernel79_out.shape[2], kernel79_out.shape[3]) == (16, 21)
+        assert kernel79_out.shape == dilation_out.shape
+
+    # only the "same" and "corner" padding modes are supported
+    with pytest.raises(AssertionError):
+        AdaptivePadding(
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=1)
+
+
+def test_patch_embed():
+    B = 2
+    H = 3
+    W = 4
+    C = 3
+    embed_dims = 10
+    kernel_size = 3
+    stride = 1
+    dummy_input = torch.rand(B, C, H, W)
+    patch_merge_1 = PatchEmbed(
+        in_channels=C,
+        embed_dims=embed_dims,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=0,
+        dilation=1,
+        norm_cfg=None)
+
+    x1, shape = patch_merge_1(dummy_input)
+    # test out shape
+    assert x1.shape == (2, 2, 10)
+    # test outsize is correct
+    assert shape == (1, 2)
+    # test L = out_h * out_w
+    assert shape[0] * shape[1] == x1.shape[1]
+
+    B = 2
+    H = 10
+    W = 10
+    C = 3
+    embed_dims = 10
+    kernel_size = 5
+    stride = 2
+    dummy_input = torch.rand(B, C, H, W)
+    # test dilation
+    patch_merge_2 = PatchEmbed(
+        in_channels=C,
+        embed_dims=embed_dims,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=0,
+        dilation=2,
+        norm_cfg=None,
+    )
+
+    x2, shape = patch_merge_2(dummy_input)
+    # test out shape
+    assert x2.shape == (2, 1, 10)
+    # test outsize is correct
+    assert shape == (1, 1)
+    # test L = out_h * out_w
+    assert shape[0] * shape[1] == x2.shape[1]
+
+    stride = 2
+    input_size = (10, 10)
+
+    dummy_input = torch.rand(B, C, H, W)
+    # test stride and norm
+    patch_merge_3 = PatchEmbed(
+        in_channels=C,
+        embed_dims=embed_dims,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=0,
+        dilation=2,
+        norm_cfg=dict(type='LN'),
+        input_size=input_size)
+
+    x3, shape = patch_merge_3(dummy_input)
+    # test out shape
+    assert x3.shape == (2, 1, 10)
+    # test outsize is correct
+    assert shape == (1, 1)
+    # test L = out_h * out_w
+    assert shape[0] * shape[1] == x3.shape[1]
+
+    # init_out_size should be (in - dilation * (k - 1) - 1) // stride + 1
+    assert patch_merge_3.init_out_size[1] == (input_size[1] - 2 * 4 -
+                                              1) // 2 + 1
+    assert patch_merge_3.init_out_size[0] == (input_size[0] - 2 * 4 -
+                                              1) // 2 + 1
+    H = 11
+    W = 12
+    input_size = (H, W)
+    dummy_input = torch.rand(B, C, H, W)
+    # test stride and norm with a non-square input
+    patch_merge_3 = PatchEmbed(
+        in_channels=C,
+        embed_dims=embed_dims,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=0,
+        dilation=2,
+        norm_cfg=dict(type='LN'),
+        input_size=input_size)
+
+    _, shape = patch_merge_3(dummy_input)
+    # when input_size equal to real input
+    # the out_size should be equal to `init_out_size`
+    assert shape == patch_merge_3.init_out_size
+
+    input_size = (H, W)
+    dummy_input = torch.rand(B, C, H, W)
+    # same configuration as above, built and checked a second time
+    patch_merge_3 = PatchEmbed(
+        in_channels=C,
+        embed_dims=embed_dims,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=0,
+        dilation=2,
+        norm_cfg=dict(type='LN'),
+        input_size=input_size)
+
+    _, shape = patch_merge_3(dummy_input)
+    # when input_size equal to real input
+    # the out_size should be equal to `init_out_size`
+    assert shape == patch_merge_3.init_out_size
+
+    # test adap padding
+    for padding in ('same', 'corner'):
+        in_c = 2
+        embed_dims = 3
+        B = 2
+
+        # test stride is 1
+        input_size = (5, 5)
+        kernel_size = (5, 5)
+        stride = (1, 1)
+        dilation = 1
+        bias = False
+
+        x = torch.rand(B, in_c, *input_size)
+        patch_embed = PatchEmbed(
+            in_channels=in_c,
+            embed_dims=embed_dims,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_embed(x)
+        assert x_out.size() == (B, 25, 3)
+        assert out_size == (5, 5)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+        # test kernel_size == stride
+        input_size = (5, 5)
+        kernel_size = (5, 5)
+        stride = (5, 5)
+        dilation = 1
+        bias = False
+
+        x = torch.rand(B, in_c, *input_size)
+        patch_embed = PatchEmbed(
+            in_channels=in_c,
+            embed_dims=embed_dims,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_embed(x)
+        assert x_out.size() == (B, 1, 3)
+        assert out_size == (1, 1)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+        # test kernel_size == stride, input height not divisible by stride
+        input_size = (6, 5)
+        kernel_size = (5, 5)
+        stride = (5, 5)
+        dilation = 1
+        bias = False
+
+        x = torch.rand(B, in_c, *input_size)
+        patch_embed = PatchEmbed(
+            in_channels=in_c,
+            embed_dims=embed_dims,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_embed(x)
+        assert x_out.size() == (B, 2, 3)
+        assert out_size == (2, 1)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+        # test different kernel_size with different stride
+        input_size = (6, 5)
+        kernel_size = (6, 2)
+        stride = (6, 2)
+        dilation = 1
+        bias = False
+
+        x = torch.rand(B, in_c, *input_size)
+        patch_embed = PatchEmbed(
+            in_channels=in_c,
+            embed_dims=embed_dims,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_embed(x)
+        assert x_out.size() == (B, 3, 3)
+        assert out_size == (1, 3)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+
+def test_patch_merging():
+
+    # Test the model with int padding
+    in_c = 3
+    out_c = 4
+    kernel_size = 3
+    stride = 3
+    padding = 1
+    dilation = 1
+    bias = False
+    # test the case `pad_to_stride` is False
+    patch_merge = PatchMerging(
+        in_channels=in_c,
+        out_channels=out_c,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+        dilation=dilation,
+        bias=bias)
+    B, L, C = 1, 100, 3
+    input_size = (10, 10)
+    x = torch.rand(B, L, C)
+    x_out, out_size = patch_merge(x, input_size)
+    assert x_out.size() == (1, 16, 4)
+    assert out_size == (4, 4)
+    # assert out size is consistent with real output
+    assert x_out.size(1) == out_size[0] * out_size[1]
+    in_c = 4
+    out_c = 5
+    kernel_size = 6
+    stride = 3
+    padding = 2
+    dilation = 2
+    bias = False
+    patch_merge = PatchMerging(
+        in_channels=in_c,
+        out_channels=out_c,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+        dilation=dilation,
+        bias=bias)
+    B, L, C = 1, 100, 4
+    input_size = (10, 10)
+    x = torch.rand(B, L, C)
+    x_out, out_size = patch_merge(x, input_size)
+    assert x_out.size() == (1, 4, 5)
+    assert out_size == (2, 2)
+    # assert out size is consistent with real output
+    assert x_out.size(1) == out_size[0] * out_size[1]
+
+    # Test with adaptive padding
+    for padding in ('same', 'corner'):
+        in_c = 2
+        out_c = 3
+        B = 2
+
+        # test stride is 1
+        input_size = (5, 5)
+        kernel_size = (5, 5)
+        stride = (1, 1)
+        dilation = 1
+        bias = False
+        L = input_size[0] * input_size[1]
+
+        x = torch.rand(B, L, in_c)
+        patch_merge = PatchMerging(
+            in_channels=in_c,
+            out_channels=out_c,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_merge(x, input_size)
+        assert x_out.size() == (B, 25, 3)
+        assert out_size == (5, 5)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+        # test kernel_size == stride
+        input_size = (5, 5)
+        kernel_size = (5, 5)
+        stride = (5, 5)
+        dilation = 1
+        bias = False
+        L = input_size[0] * input_size[1]
+
+        x = torch.rand(B, L, in_c)
+        patch_merge = PatchMerging(
+            in_channels=in_c,
+            out_channels=out_c,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_merge(x, input_size)
+        assert x_out.size() == (B, 1, 3)
+        assert out_size == (1, 1)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+        # test kernel_size == stride, input height not divisible by stride
+        input_size = (6, 5)
+        kernel_size = (5, 5)
+        stride = (5, 5)
+        dilation = 1
+        bias = False
+        L = input_size[0] * input_size[1]
+
+        x = torch.rand(B, L, in_c)
+        patch_merge = PatchMerging(
+            in_channels=in_c,
+            out_channels=out_c,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_merge(x, input_size)
+        assert x_out.size() == (B, 2, 3)
+        assert out_size == (2, 1)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+        # test different kernel_size with different stride
+        input_size = (6, 5)
+        kernel_size = (6, 2)
+        stride = (6, 2)
+        dilation = 1
+        bias = False
+        L = input_size[0] * input_size[1]
+
+        x = torch.rand(B, L, in_c)
+        patch_merge = PatchMerging(
+            in_channels=in_c,
+            out_channels=out_c,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_merge(x, input_size)
+        assert x_out.size() == (B, 3, 3)
+        assert out_size == (1, 3)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+
+def test_detr_transformer_dencoder_encoder_layer():
+    """Test DETR decoder construction and rejection of invalid configs."""
+    config = ConfigDict(
+        dict(
+            return_intermediate=True,
+            num_layers=6,
+            transformerlayers=dict(
+                type='DetrTransformerDecoderLayer',
+                attn_cfgs=dict(
+                    type='MultiheadAttention',
+                    embed_dims=256,
+                    num_heads=8,
+                    dropout=0.1),
+                feedforward_channels=2048,
+                ffn_dropout=0.1,
+                operation_order=(
+                    'norm',
+                    'self_attn',
+                    'norm',
+                    'cross_attn',
+                    'norm',
+                    'ffn',
+                ))))
+    assert DetrTransformerDecoder(**config).layers[0].pre_norm
+    assert len(DetrTransformerDecoder(**config).layers) == 6
+    DetrTransformerDecoder(**config)
+    with pytest.raises(AssertionError):
+        config = ConfigDict(
+            dict(
+                return_intermediate=True,
+                num_layers=6,
+                transformerlayers=[
+                    dict(
+                        type='DetrTransformerDecoderLayer',
+                        attn_cfgs=dict(
+                            type='MultiheadAttention',
+                            embed_dims=256,
+                            num_heads=8,
+                            dropout=0.1),
+                        feedforward_channels=2048,
+                        ffn_dropout=0.1,
+                        operation_order=('self_attn', 'norm', 'cross_attn',
+                                         'norm', 'ffn', 'norm'))
+                ] * 5))  # 5 layer configs while num_layers=6
+        DetrTransformerDecoder(**config)
+
+    config = ConfigDict(
+        dict(
+            num_layers=6,
+            transformerlayers=dict(
+                type='DetrTransformerDecoderLayer',
+                attn_cfgs=dict(
+                    type='MultiheadAttention',
+                    embed_dims=256,
+                    num_heads=8,
+                    dropout=0.1),
+                feedforward_channels=2048,
+                ffn_dropout=0.1,
+                operation_order=('norm', 'self_attn', 'norm', 'cross_attn',
+                                 'norm', 'ffn', 'norm'))))
+
+    with pytest.raises(AssertionError):
+        # operation_order above has 7 entries (presumably 6 are required)
+        DetrTransformerEncoder(**config)
+
+
+def test_transformer():
+    """Smoke test: build a Transformer from config and init its weights."""
+
+    def attn_cfg():
+        return dict(
+            type='MultiheadAttention',
+            embed_dims=256,
+            num_heads=8,
+            dropout=0.1)
+
+    encoder_layer = dict(
+        type='BaseTransformerLayer',
+        attn_cfgs=[attn_cfg()],
+        feedforward_channels=2048,
+        ffn_dropout=0.1,
+        operation_order=('self_attn', 'norm', 'ffn', 'norm'))
+    decoder_layer = dict(
+        type='DetrTransformerDecoderLayer',
+        attn_cfgs=attn_cfg(),
+        feedforward_channels=2048,
+        ffn_dropout=0.1,
+        operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn',
+                         'norm'))
+    config = ConfigDict(
+        dict(
+            encoder=dict(
+                type='DetrTransformerEncoder',
+                num_layers=6,
+                transformerlayers=encoder_layer),
+            decoder=dict(
+                type='DetrTransformerDecoder',
+                return_intermediate=True,
+                num_layers=6,
+                transformerlayers=decoder_layer)))
+    transformer = Transformer(**config)
+    transformer.init_weights()
diff --git a/tests/test_onnx/__init__.py b/tests/test_onnx/__init__.py
new file mode 100755
index 0000000..76d466f
--- /dev/null
+++ b/tests/test_onnx/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .utils import ort_validate
+
+__all__ = ['ort_validate']
diff --git a/tests/test_onnx/data/fsaf_head_get_bboxes.pkl b/tests/test_onnx/data/fsaf_head_get_bboxes.pkl
new file mode 100755
index 0000000..a68a531
Binary files /dev/null and b/tests/test_onnx/data/fsaf_head_get_bboxes.pkl differ
diff --git a/tests/test_onnx/data/retina_head_get_bboxes.pkl b/tests/test_onnx/data/retina_head_get_bboxes.pkl
new file mode 100755
index 0000000..b6dbfe3
Binary files /dev/null and b/tests/test_onnx/data/retina_head_get_bboxes.pkl differ
diff --git a/tests/test_onnx/data/ssd_head_get_bboxes.pkl b/tests/test_onnx/data/ssd_head_get_bboxes.pkl
new file mode 100755
index 0000000..f2b2686
Binary files /dev/null and b/tests/test_onnx/data/ssd_head_get_bboxes.pkl differ
diff --git a/tests/test_onnx/data/yolov3_head_get_bboxes.pkl b/tests/test_onnx/data/yolov3_head_get_bboxes.pkl
new file mode 100755
index 0000000..9860492
Binary files /dev/null and b/tests/test_onnx/data/yolov3_head_get_bboxes.pkl differ
diff --git a/tests/test_onnx/data/yolov3_neck.pkl b/tests/test_onnx/data/yolov3_neck.pkl
new file mode 100755
index 0000000..f39346d
Binary files /dev/null and b/tests/test_onnx/data/yolov3_neck.pkl differ
diff --git a/tests/test_onnx/test_head.py b/tests/test_onnx/test_head.py
new file mode 100755
index 0000000..978c46a
--- /dev/null
+++ b/tests/test_onnx/test_head.py
@@ -0,0 +1,453 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from functools import partial
+
+import mmcv
+import numpy as np
+import pytest
+import torch
+from mmcv.cnn import Scale
+
+from mmdet import digit_version
+from mmdet.models import build_detector
+from mmdet.models.dense_heads import (FCOSHead, FSAFHead, RetinaHead, SSDHead,
+                                      YOLOV3Head)
+from .utils import ort_validate
+
+data_path = osp.join(osp.dirname(__file__), 'data')
+
+if digit_version(torch.__version__) <= digit_version('1.5.0'):
+    pytest.skip(
+        'ort backend does not support version below 1.5.0',
+        allow_module_level=True)
+
+
+def test_cascade_onnx_export():
+    """Export Cascade R-CNN to ONNX (opset 11); passes if nothing raises.
+
+    The config path is relative to the current working directory.
+    """
+    import tempfile
+
+    config_path = './configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py'
+    cfg = mmcv.Config.fromfile(config_path)
+    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+    # The image input is NCHW: axis 2 is the height, axis 3 the width.
+    dynamic_axes = {
+        'input_img': {
+            0: 'batch',
+            2: 'height',
+            3: 'width'
+        },
+        'dets': {0: 'batch', 1: 'num_dets'},
+        'labels': {0: 'batch', 1: 'num_dets'},
+    }
+    # Export into a scratch directory so no tmp.onnx is left in the cwd.
+    with torch.no_grad(), tempfile.TemporaryDirectory() as tmp_dir:
+        model.forward = partial(model.forward, img_metas=[[dict()]])
+        torch.onnx.export(
+            model, [torch.rand(1, 3, 400, 500)],
+            osp.join(tmp_dir, 'tmp.onnx'),
+            output_names=['dets', 'labels'],
+            input_names=['input_img'],
+            keep_initializers_as_inputs=True,
+            do_constant_folding=True,
+            verbose=False,
+            opset_version=11,
+            dynamic_axes=dynamic_axes)
+
+
+def test_faster_onnx_export():
+    """Export Faster R-CNN to ONNX (opset 11); passes if nothing raises.
+
+    The config path is relative to the current working directory.
+    """
+    import tempfile
+
+    config_path = './configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
+    cfg = mmcv.Config.fromfile(config_path)
+    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+    # The image input is NCHW: axis 2 is the height, axis 3 the width.
+    dynamic_axes = {
+        'input_img': {
+            0: 'batch',
+            2: 'height',
+            3: 'width'
+        },
+        'dets': {0: 'batch', 1: 'num_dets'},
+        'labels': {0: 'batch', 1: 'num_dets'},
+    }
+    # Export into a scratch directory so no tmp.onnx is left in the cwd.
+    with torch.no_grad(), tempfile.TemporaryDirectory() as tmp_dir:
+        model.forward = partial(model.forward, img_metas=[[dict()]])
+        torch.onnx.export(
+            model, [torch.rand(1, 3, 400, 500)],
+            osp.join(tmp_dir, 'tmp.onnx'),
+            output_names=['dets', 'labels'],
+            input_names=['input_img'],
+            keep_initializers_as_inputs=True,
+            do_constant_folding=True,
+            verbose=False,
+            opset_version=11,
+            dynamic_axes=dynamic_axes)
+
+
+def retinanet_config():
+    """Build a RetinaHead (4 classes, 1 input channel) with frozen params."""
+    head_cfg = dict(
+        stacked_convs=6,
+        feat_channels=2,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]))
+
+    test_cfg = mmcv.Config(
+        dict(
+            deploy_nms_pre=0,
+            min_bbox_size=0,
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100))
+
+    model = RetinaHead(
+        num_classes=4, in_channels=1, test_cfg=test_cfg, **head_cfg)
+    model.requires_grad_(False)  # freeze all parameters
+
+    return model
+
+
+def test_retina_head_forward_single():
+    """Compare RetinaHead.forward_single between torch and onnxruntime."""
+    head = retinanet_config()
+
+    # a single 32x32 feature map is the only input
+    single_feat = torch.rand(1, head.in_channels, 32, 32)
+    ort_validate(head.forward_single, single_feat)
+
+
+def test_retina_head_forward():
+    """Compare RetinaHead.forward between torch and onnxruntime."""
+    head = retinanet_config()
+    s = 128
+    num_levels = len(head.prior_generator.strides)
+    # one square feature map per level, the side halving at every level
+    sides = [s // 2**(level + 2) for level in range(num_levels)]
+    feats = [
+        torch.rand(1, head.in_channels, side, side) for side in sides
+    ]
+    ort_validate(head.forward, feats)
+
+
+def test_retinanet_head_onnx_export():
+    """Test RetinaNet Head onnx_export() in torch and onnxruntime env."""
+    retina_model = retinanet_config()
+    s = 128
+    img_metas = [{
+        'img_shape_for_onnx': torch.Tensor([s, s]),
+        'scale_factor': np.ones(4),
+        'pad_shape': (s, s, 3),
+        'img_shape': (s, s, 2)  # NOTE(review): 2 vs 3 in pad_shape; confirm
+    }]
+
+    # The data of retina_head_get_bboxes.pkl contains two parts:
+    # cls_score(list(Tensor)) and bboxes(list(Tensor)),
+    # where each torch.Tensor is generated by torch.rand().
+    # the cls_score's size: (1, 36, 32, 32), (1, 36, 16, 16),
+    # (1, 36, 8, 8), (1, 36, 4, 4), (1, 36, 2, 2).
+    # the bboxes's size: (1, 36, 32, 32), (1, 36, 16, 16),
+    # (1, 36, 8, 8), (1, 36, 4, 4), (1, 36, 2, 2)
+    retina_head_data = 'retina_head_get_bboxes.pkl'
+    feats = mmcv.load(osp.join(data_path, retina_head_data))
+    cls_score = feats[:5]  # first five entries: classification maps
+    bboxes = feats[5:]  # remaining entries: box regression maps
+
+    retina_model.onnx_export = partial(
+        retina_model.onnx_export, img_metas=img_metas, with_nms=False)
+    ort_validate(retina_model.onnx_export, (cls_score, bboxes))
+
+
+def yolo_config():
+    """Build a YOLOV3Head (4 classes) in eval mode with frozen params."""
+    head_cfg = dict(
+        anchor_generator=dict(
+            type='YOLOAnchorGenerator',
+            base_sizes=[[(116, 90), (156, 198), (373, 326)],
+                        [(30, 61), (62, 45), (59, 119)],
+                        [(10, 13), (16, 30), (33, 23)]],
+            strides=[32, 16, 8]),
+        bbox_coder=dict(type='YOLOBBoxCoder'))
+
+    test_cfg = mmcv.Config(
+        dict(
+            deploy_nms_pre=0,
+            min_bbox_size=0,
+            score_thr=0.05,
+            conf_thr=0.005,
+            nms=dict(type='nms', iou_threshold=0.45),
+            max_per_img=100))
+
+    model = YOLOV3Head(
+        num_classes=4,
+        in_channels=[1, 1, 1],
+        out_channels=[16, 8, 4],
+        test_cfg=test_cfg,
+        **head_cfg)
+    model.requires_grad_(False)  # freeze all parameters
+    # yolov3 need eval()
+    model.cpu().eval()
+    return model
+
+
+def test_yolov3_head_forward():
+    """Compare YOLOV3Head.forward between torch and onnxruntime."""
+    head = yolo_config()
+
+    # one single-channel square map per input level, side halving each time
+    feats = []
+    for level in range(len(head.in_channels)):
+        side = 64 // 2**(level + 2)
+        feats.append(torch.rand(1, 1, side, side))
+    ort_validate(head.forward, feats)
+
+
+def test_yolov3_head_onnx_export():
+    """Test yolov3 head onnx_export() in torch and ort env."""
+    yolo_model = yolo_config()
+    s = 128
+    img_metas = [{
+        'img_shape_for_onnx': torch.Tensor([s, s]),
+        'img_shape': (s, s, 3),
+        'scale_factor': np.ones(4),
+        'pad_shape': (s, s, 3)
+    }]
+
+    # The data of yolov3_head_get_bboxes.pkl contains
+    # a list of torch.Tensor, where each torch.Tensor
+    # is generated by torch.rand and each tensor size is:
+    # (1, 27, 32, 32), (1, 27, 16, 16), (1, 27, 8, 8).
+    yolo_head_data = 'yolov3_head_get_bboxes.pkl'
+    pred_maps = mmcv.load(osp.join(data_path, yolo_head_data))
+
+    yolo_model.onnx_export = partial(
+        yolo_model.onnx_export, img_metas=img_metas, with_nms=False)
+    ort_validate(yolo_model.onnx_export, pred_maps)
+
+
+def fcos_config():
+    """Build an FCOSHead (4 classes, 1 input channel) with frozen params."""
+    test_cfg = mmcv.Config(
+        dict(
+            deploy_nms_pre=0,
+            min_bbox_size=0,
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100))
+
+    model = FCOSHead(num_classes=4, in_channels=1, test_cfg=test_cfg)
+
+    model.requires_grad_(False)  # freeze all parameters
+    return model
+
+
+def test_fcos_head_forward_single():
+    """Compare FCOSHead.forward_single between torch and onnxruntime."""
+    head = fcos_config()
+    feat = torch.rand(1, head.in_channels, 32, 32)
+
+    # bind scale and stride so that the feature map is the only input
+    frozen_scale = Scale(1.0).requires_grad_(False)
+    head.forward_single = partial(
+        head.forward_single, scale=frozen_scale, stride=(4, ))
+    ort_validate(head.forward_single, feat)
+
+
+def test_fcos_head_forward():
+    """Compare FCOSHead.forward on multi-level features (torch vs ort)."""
+    head = fcos_config()
+    s = 128
+    feats = []
+    for stride in (4, 8, 16, 32, 64):
+        side = s // stride
+        feats.append(torch.rand(1, 1, side, side))
+    ort_validate(head.forward, feats)
+
+
+def test_fcos_head_onnx_export():
+    """Compare FCOSHead.onnx_export (NMS disabled) between torch and ort."""
+    head = fcos_config()
+    s = 128
+    strides = (4, 8, 16, 32, 64)
+
+    img_metas = [{
+        'img_shape_for_onnx': torch.Tensor([s, s]),
+        'img_shape': (s, s, 3),
+        'scale_factor': np.ones(4),
+        'pad_shape': (s, s, 3)
+    }]
+
+    def rand_maps(channels):
+        # one random square map per stride with the given channel count
+        return [
+            torch.rand(1, channels, s // stride, s // stride)
+            for stride in strides
+        ]
+
+    cls_scores = rand_maps(head.num_classes)
+    bboxes = rand_maps(4)
+    centerness = rand_maps(1)
+
+    head.onnx_export = partial(
+        head.onnx_export, img_metas=img_metas, with_nms=False)
+    ort_validate(head.onnx_export, (cls_scores, bboxes, centerness))
+
+
+def fsaf_config():
+    """Build an FSAFHead (4 classes, 1 input channel) with frozen params."""
+    cfg = dict(
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=1,
+            scales_per_octave=1,
+            ratios=[1.0],
+            strides=[8, 16, 32, 64, 128]))
+
+    test_cfg = mmcv.Config(
+        dict(
+            deploy_nms_pre=0,
+            min_bbox_size=0,
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100))
+
+    model = FSAFHead(num_classes=4, in_channels=1, test_cfg=test_cfg, **cfg)
+    model.requires_grad_(False)  # freeze all parameters
+    return model
+
+
+def test_fsaf_head_forward_single():
+    """Compare FSAFHead.forward_single between torch and onnxruntime."""
+    head = fsaf_config()
+
+    single_feat = torch.rand(1, head.in_channels, 32, 32)
+    ort_validate(head.forward_single, single_feat)
+
+
+def test_fsaf_head_forward():
+    """Test FSAF Head forward in torch and onnxruntime env."""
+    fsaf_model = fsaf_config()
+    s = 128
+    # read the strides from `prior_generator`, the attribute that
+    # test_retina_head_forward uses, instead of `anchor_generator`
+    num_levels = len(fsaf_model.prior_generator.strides)
+    sides = [s // (2**(i + 2)) for i in range(num_levels)]
+    feats = [torch.rand(1, fsaf_model.in_channels, d, d) for d in sides]
+    ort_validate(fsaf_model.forward, feats)
+
+
+def test_fsaf_head_onnx_export():
+    """Test FSAF Head onnx_export in torch and onnxruntime env."""
+    fsaf_model = fsaf_config()
+    s = 256
+    img_metas = [{
+        'img_shape_for_onnx': torch.Tensor([s, s]),
+        'scale_factor': np.ones(4),
+        'pad_shape': (s, s, 3),
+        'img_shape': (s, s, 2)  # NOTE(review): 2 vs 3 in pad_shape; confirm
+    }]
+
+    # The data of fsaf_head_get_bboxes.pkl contains two parts:
+    # cls_score(list(Tensor)) and bboxes(list(Tensor)),
+    # where each torch.Tensor is generated by torch.rand().
+    # the cls_score's size: (1, 4, 64, 64), (1, 4, 32, 32),
+    # (1, 4, 16, 16), (1, 4, 8, 8), (1, 4, 4, 4).
+    # the bboxes's size: (1, 4, 64, 64), (1, 4, 32, 32),
+    # (1, 4, 16, 16), (1, 4, 8, 8), (1, 4, 4, 4).
+    fsaf_head_data = 'fsaf_head_get_bboxes.pkl'
+    feats = mmcv.load(osp.join(data_path, fsaf_head_data))
+    cls_score = feats[:5]  # first five entries: classification maps
+    bboxes = feats[5:]  # remaining entries: box regression maps
+
+    fsaf_model.onnx_export = partial(
+        fsaf_model.onnx_export, img_metas=img_metas, with_nms=False)
+    ort_validate(fsaf_model.onnx_export, (cls_score, bboxes))
+
+
+def ssd_config():
+    """Build an SSDHead (4 classes, 6 input levels) with frozen params."""
+    cfg = dict(
+        anchor_generator=dict(
+            type='SSDAnchorGenerator',
+            scale_major=False,
+            input_size=300,
+            basesize_ratio_range=(0.15, 0.9),
+            strides=[8, 16, 32, 64, 100, 300],
+            ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]))
+
+    test_cfg = mmcv.Config(
+        dict(
+            deploy_nms_pre=0,
+            nms=dict(type='nms', iou_threshold=0.45),
+            min_bbox_size=0,
+            score_thr=0.02,
+            max_per_img=200))
+
+    model = SSDHead(
+        num_classes=4,
+        in_channels=(4, 8, 4, 2, 2, 2),
+        test_cfg=test_cfg,
+        **cfg)
+
+    model.requires_grad_(False)  # freeze all parameters
+    return model
+
+
+def test_ssd_head_forward():
+    """Compare SSDHead.forward between torch and onnxruntime."""
+    head = ssd_config()
+
+    # zip stops at the shorter sequence, so the sizes beyond the number
+    # of input levels are unused
+    featmap_size = [38, 19, 10, 6, 5, 3, 1]
+    feats = []
+    for channels, side in zip(head.in_channels, featmap_size):
+        feats.append(torch.rand(1, channels, side, side))
+    ort_validate(head.forward, feats)
+
+
+def test_ssd_head_onnx_export():
+    """Test SSD Head onnx_export in torch and onnxruntime env."""
+    ssd_model = ssd_config()
+    s = 300
+    img_metas = [{
+        'img_shape_for_onnx': torch.Tensor([s, s]),
+        'scale_factor': np.ones(4),
+        'pad_shape': (s, s, 3),
+        'img_shape': (s, s, 2)  # NOTE(review): 2 vs 3 in pad_shape; confirm
+    }]
+
+    # The data of ssd_head_get_bboxes.pkl contains two parts:
+    # cls_score(list(Tensor)) and bboxes(list(Tensor)),
+    # where each torch.Tensor is generated by torch.rand().
+    # the cls_score's size: (1, 20, 38, 38), (1, 30, 19, 19),
+    # (1, 30, 10, 10), (1, 30, 5, 5), (1, 20, 3, 3), (1, 20, 1, 1).
+    # the bboxes's size: (1, 16, 38, 38), (1, 24, 19, 19),
+    # (1, 24, 10, 10), (1, 24, 5, 5), (1, 16, 3, 3), (1, 16, 1, 1).
+    ssd_head_data = 'ssd_head_get_bboxes.pkl'
+    feats = mmcv.load(osp.join(data_path, ssd_head_data))
+    cls_score = feats[:6]  # first six entries: classification maps
+    bboxes = feats[6:]  # remaining entries: box regression maps
+
+    ssd_model.onnx_export = partial(
+        ssd_model.onnx_export, img_metas=img_metas, with_nms=False)
+    ort_validate(ssd_model.onnx_export, (cls_score, bboxes))
diff --git a/tests/test_onnx/test_neck.py b/tests/test_onnx/test_neck.py
new file mode 100755
index 0000000..a1a5cc8
--- /dev/null
+++ b/tests/test_onnx/test_neck.py
@@ -0,0 +1,163 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+
+import mmcv
+import pytest
+import torch
+
+from mmdet import digit_version
+from mmdet.models.necks import FPN, YOLOV3Neck
+from .utils import ort_validate
+
+if digit_version(torch.__version__) <= digit_version('1.5.0'):
+    pytest.skip(
+        'ort backend does not support version below 1.5.0',
+        allow_module_level=True)
+
+# Control the returned model of fpn_neck_config()
+fpn_test_step_names = {
+    'fpn_normal': 0,
+    'fpn_wo_extra_convs': 1,
+    'fpn_lateral_bns': 2,
+    'fpn_bilinear_upsample': 3,
+    'fpn_scale_factor': 4,
+    'fpn_extra_convs_inputs': 5,
+    'fpn_extra_convs_laterals': 6,
+    'fpn_extra_convs_outputs': 7,
+}
+
+# Control the returned model of yolo_neck_config()
+yolo_test_step_names = {'yolo_normal': 0}
+
+data_path = osp.join(osp.dirname(__file__), 'data')
+
+
+def fpn_neck_config(test_step_name):
+    """Return ``(fpn_model, feats)``: the FPN variant picked by
+    ``test_step_name`` and four random feature maps to feed it."""
+    s = 64  # side of the largest feature map
+    in_channels = [8, 16, 32, 64]
+    feat_sizes = [s // 2**i for i in range(4)]  # [64, 32, 16, 8]
+    out_channels = 8
+
+    feats = [
+        torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i])
+        for i in range(len(in_channels))
+    ]
+
+    if (fpn_test_step_names[test_step_name] == 0):
+        fpn_model = FPN(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_extra_convs=True,
+            num_outs=5)
+    elif (fpn_test_step_names[test_step_name] == 1):
+        fpn_model = FPN(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_extra_convs=False,
+            num_outs=5)
+    elif (fpn_test_step_names[test_step_name] == 2):
+        fpn_model = FPN(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_extra_convs=True,
+            no_norm_on_lateral=False,
+            norm_cfg=dict(type='BN', requires_grad=True),
+            num_outs=5)
+    elif (fpn_test_step_names[test_step_name] == 3):
+        fpn_model = FPN(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_extra_convs=True,
+            upsample_cfg=dict(mode='bilinear', align_corners=True),
+            num_outs=5)
+    elif (fpn_test_step_names[test_step_name] == 4):
+        fpn_model = FPN(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_extra_convs=True,
+            upsample_cfg=dict(scale_factor=2),
+            num_outs=5)
+    elif (fpn_test_step_names[test_step_name] == 5):
+        fpn_model = FPN(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_extra_convs='on_input',
+            num_outs=5)
+    elif (fpn_test_step_names[test_step_name] == 6):
+        fpn_model = FPN(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_extra_convs='on_lateral',
+            num_outs=5)
+    elif (fpn_test_step_names[test_step_name] == 7):
+        fpn_model = FPN(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_extra_convs='on_output',
+            num_outs=5)
+    return fpn_model, feats  # unpacked by callers into ort_validate(...)
+
+
+def yolo_neck_config(test_step_name):
+    """Build the YOLOV3Neck and load its inputs from yolov3_neck.pkl."""
+    in_channels = [16, 8, 4]
+    out_channels = [8, 4, 2]
+
+    # The data of yolov3_neck.pkl contains a list of
+    # torch.Tensor, where each torch.Tensor is generated by
+    # torch.rand and each tensor size is:
+    # (1, 4, 64, 64), (1, 8, 32, 32), (1, 16, 16, 16).
+    yolov3_neck_data = 'yolov3_neck.pkl'
+    feats = mmcv.load(osp.join(data_path, yolov3_neck_data))
+
+    if (yolo_test_step_names[test_step_name] == 0):
+        yolo_model = YOLOV3Neck(
+            in_channels=in_channels, out_channels=out_channels, num_scales=3)
+    return yolo_model, feats  # unpacked by callers into ort_validate(...)
+
+
+def test_fpn_normal():
+    """Check the 'fpn_normal' FPN against onnxruntime."""
+    ort_validate(*fpn_neck_config('fpn_normal'))
+
+
+def test_fpn_wo_extra_convs():
+    """Check the 'fpn_wo_extra_convs' FPN against onnxruntime."""
+    ort_validate(*fpn_neck_config('fpn_wo_extra_convs'))
+
+
+def test_fpn_lateral_bns():
+    """Check the 'fpn_lateral_bns' FPN against onnxruntime."""
+    ort_validate(*fpn_neck_config('fpn_lateral_bns'))
+
+
+def test_fpn_bilinear_upsample():
+    """Check the 'fpn_bilinear_upsample' FPN against onnxruntime."""
+    ort_validate(*fpn_neck_config('fpn_bilinear_upsample'))
+
+
+def test_fpn_scale_factor():
+    """Check the 'fpn_scale_factor' FPN against onnxruntime."""
+    ort_validate(*fpn_neck_config('fpn_scale_factor'))
+
+
+def test_fpn_extra_convs_inputs():
+    """Check the 'fpn_extra_convs_inputs' FPN against onnxruntime."""
+    ort_validate(*fpn_neck_config('fpn_extra_convs_inputs'))
+
+
+def test_fpn_extra_convs_laterals():
+    """Check the 'fpn_extra_convs_laterals' FPN against onnxruntime."""
+    ort_validate(*fpn_neck_config('fpn_extra_convs_laterals'))
+
+
+def test_fpn_extra_convs_outputs():
+    """Check the 'fpn_extra_convs_outputs' FPN against onnxruntime."""
+    ort_validate(*fpn_neck_config('fpn_extra_convs_outputs'))
+
+
+def test_yolo_normal():
+    """Check the 'yolo_normal' YOLOV3Neck against onnxruntime."""
+    ort_validate(*yolo_neck_config('yolo_normal'))
diff --git a/tests/test_onnx/utils.py b/tests/test_onnx/utils.py
new file mode 100755
index 0000000..ad95e9e
--- /dev/null
+++ b/tests/test_onnx/utils.py
@@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+import warnings
+
+import numpy as np
+import onnx
+import onnxruntime as ort
+import torch
+import torch.nn as nn
+
+ort_custom_op_path = ''  # stays empty when mmcv ops cannot be imported
+try:
+    from mmcv.ops import get_onnxruntime_op_path
+    ort_custom_op_path = get_onnxruntime_op_path()
+except (ImportError, ModuleNotFoundError):
+    warnings.warn('If input model has custom op from mmcv, '
+                  'you may have to build mmcv with ONNXRuntime from source.')
+
+
+class WrapFunction(nn.Module):
+    """Adapt a plain callable to nn.Module for torch.onnx.export."""
+
+    def __init__(self, wrapped_function):
+        super().__init__()
+        self.wrapped_function = wrapped_function
+
+    def forward(self, *args, **kwargs):
+        return self.wrapped_function(*args, **kwargs)
+
+
+def ort_validate(model, feats, onnx_io='tmp.onnx'):
+    """Validate the output of the onnxruntime backend is the same as the output
+    generated by torch.
+
+    The model is exported to ``onnx_io``, run with onnxruntime, and every
+    onnxruntime output is compared with the matching torch output.
+
+    Args:
+        model (nn.Module | function): the function of model or model
+            to be verified.
+        feats (tuple(list(torch.Tensor)) | list(torch.Tensor) | torch.Tensor):
+            the input of model.
+        onnx_io (str): path of the temporary onnx file; it is removed
+            again even when the export or the inference fails.
+    """
+    # A plain function is wrapped in a module before it is exported.
+    is_module = isinstance(model, nn.Module)
+    wrap_model = model if is_module else WrapFunction(model)
+    wrap_model.cpu().eval()
+    try:
+        with torch.no_grad():
+            torch.onnx.export(
+                wrap_model,
+                feats,
+                onnx_io,
+                export_params=True,
+                keep_initializers_as_inputs=True,
+                do_constant_folding=True,
+                verbose=False,
+                opset_version=11)
+
+        # a tuple of tensor lists is flattened into one list of inputs
+        if isinstance(feats, tuple):
+            ort_feats = []
+            for feat in feats:
+                ort_feats += feat
+        else:
+            ort_feats = feats
+        # run the file exported above, not the default 'tmp.onnx'
+        onnx_outputs = get_ort_model_output(ort_feats, onnx_io)
+    finally:
+        if osp.exists(onnx_io):
+            os.remove(onnx_io)
+
+    if isinstance(feats, tuple):
+        torch_outputs = convert_result_list(wrap_model.forward(*feats))
+    else:
+        torch_outputs = convert_result_list(wrap_model.forward(feats))
+    torch_outputs = [out.detach().numpy() for out in torch_outputs]
+
+    # match torch_outputs and onnx_outputs
+    for i in range(len(onnx_outputs)):
+        np.testing.assert_allclose(
+            torch_outputs[i], onnx_outputs[i], rtol=1e-03, atol=1e-05)
+
+
+def get_ort_model_output(feat, onnx_io='tmp.onnx'):
+    """Run an exported onnx model with onnxruntime.
+
+    Args:
+        feat (Tensor | list[Tensor]): a single input tensor, or a list of
+            tensors that are fed to the session inputs in order.
+        onnx_io (str): path of the onnx file to check and run.
+
+    Returns:
+        list[np.array]: onnxruntime infer result, each is a np.array
+    """
+    onnx.checker.check_model(onnx.load(onnx_io))
+
+    options = ort.SessionOptions()
+    # register custom op for onnxruntime
+    if osp.exists(ort_custom_op_path):
+        options.register_custom_ops_library(ort_custom_op_path)
+    sess = ort.InferenceSession(onnx_io, options)
+
+    # pair each input tensor with the session input at the same position
+    if isinstance(feat, torch.Tensor):
+        ort_inputs = {sess.get_inputs()[0].name: feat.numpy()}
+    else:
+        ort_inputs = {
+            sess.get_inputs()[i].name: tensor.numpy()
+            for i, tensor in enumerate(feat)
+        }
+    return sess.run(None, ort_inputs)
+
+
+def convert_result_list(outputs):
+    """Flatten nested torch forward outputs into a flat list of tensors.
+
+    Args:
+        outputs (Tensor | list | tuple): the outputs in torch env, maybe
+            containing nested structures such as list or tuple.
+
+    Returns:
+        list(Tensor): a list only containing torch.Tensor, in the order
+            the tensors are met when walking ``outputs``.
+    """
+    # recursive end condition
+    if isinstance(outputs, torch.Tensor):
+        return [outputs]
+
+    # flatten every element recursively and chain the partial results
+    flat = []
+    for item in outputs:
+        flat.extend(convert_result_list(item))
+    return flat
diff --git a/tests/test_runtime/async_benchmark.py b/tests/test_runtime/async_benchmark.py
new file mode 100755
index 0000000..aa692c4
--- /dev/null
+++ b/tests/test_runtime/async_benchmark.py
@@ -0,0 +1,102 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import asyncio
+import os
+import shutil
+import urllib
+
+import mmcv
+import torch
+
+from mmdet.apis import (async_inference_detector, inference_detector,
+                        init_detector)
+from mmdet.utils.contextmanagers import concurrent
+from mmdet.utils.profiling import profile_time
+
+
+async def main():
+    """Benchmark between async and synchronous inference interfaces.
+
+    Sample runs for 20 demo images on K80 GPU, model - mask_rcnn_r50_fpn_1x:
+
+    async       sync
+
+    7981.79 ms  9660.82 ms
+    8074.52 ms  9660.94 ms
+    7976.44 ms  9406.83 ms
+
+    Async variant takes about 0.83-0.85 of the time of the synchronous
+    interface.
+    """
+    project_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
+    project_dir = os.path.join(project_dir, '..')
+
+    config_file = os.path.join(
+        project_dir, 'configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py')
+    checkpoint_file = os.path.join(
+        project_dir,
+        'checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth')
+
+    if not os.path.exists(checkpoint_file):
+        import urllib.request  # a bare `import urllib` does not load it
+        url = ('https://download.openmmlab.com/mmdetection/v2.0'
+               '/mask_rcnn/mask_rcnn_r50_fpn_1x_coco'
+               '/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth')
+        print(f'Downloading {url} ...')
+        # download to a temporary file first, then move it into place
+        local_filename, _ = urllib.request.urlretrieve(url)
+        os.makedirs(os.path.dirname(checkpoint_file), exist_ok=True)
+        shutil.move(local_filename, checkpoint_file)
+        print(f'Saved as {checkpoint_file}')
+    else:
+        print(f'Using existing checkpoint {checkpoint_file}')
+    device = 'cuda:0'
+    model = init_detector(
+        config_file, checkpoint=checkpoint_file, device=device)
+
+    # queue is used for concurrent inference of multiple images
+    streamqueue = asyncio.Queue()
+    # queue size defines concurrency level
+    streamqueue_size = 4
+
+    for _ in range(streamqueue_size):
+        streamqueue.put_nowait(torch.cuda.Stream(device=device))
+
+    # test a single image and show the results
+    img = mmcv.imread(os.path.join(project_dir, 'demo/demo.jpg'))
+
+    # warmup
+    await async_inference_detector(model, img)
+
+    async def detect(img):
+        async with concurrent(streamqueue):
+            return await async_inference_detector(model, img)
+
+    num_of_images = 20
+    with profile_time('benchmark', 'async'):
+        tasks = [
+            asyncio.create_task(detect(img)) for _ in range(num_of_images)
+        ]
+        async_results = await asyncio.gather(*tasks)
+
+    with torch.cuda.stream(torch.cuda.default_stream()):
+        with profile_time('benchmark', 'sync'):
+            sync_results = [
+                inference_detector(model, img) for _ in range(num_of_images)
+            ]
+
+    result_dir = os.path.join(project_dir, 'demo')
+    model.show_result(
+        img,
+        async_results[0],
+        score_thr=0.5,
+        show=False,
+        out_file=os.path.join(result_dir, 'result_async.jpg'))
+    model.show_result(
+        img,
+        sync_results[0],
+        score_thr=0.5,
+        show=False,
+        out_file=os.path.join(result_dir, 'result_sync.jpg'))
+
+
+if __name__ == '__main__':
+    asyncio.run(main())  # run the benchmark coroutine to completion
diff --git a/tests/test_runtime/test_apis.py b/tests/test_runtime/test_apis.py
new file mode 100755
index 0000000..2394d12
--- /dev/null
+++ b/tests/test_runtime/test_apis.py
@@ -0,0 +1,32 @@
+import os
+from pathlib import Path
+
+import pytest
+
+from mmdet.apis import init_detector
+
+
+def test_init_detector():
+    """Check the config types accepted by ``init_detector``.
+
+    A ``str`` path and a :obj:`Path` must both work, while another type
+    (here a list) must raise ``TypeError``.
+    """
+    tests_dir = os.path.dirname(os.path.dirname(__file__))
+    project_dir = os.path.join(os.path.abspath(tests_dir), '..')
+
+    config_file = os.path.join(
+        project_dir, 'configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py')
+
+    # test init_detector with config_file: str and cfg_options
+    pretrained = dict(type='Pretrained', checkpoint='torchvision://resnet18')
+    backbone_override = dict(depth=18, init_cfg=pretrained)
+    cfg_options = dict(model=dict(backbone=backbone_override))
+    model = init_detector(config_file, device='cpu', cfg_options=cfg_options)
+
+    # test init_detector with :obj:`Path`
+    model = init_detector(Path(config_file), device='cpu')
+
+    # test init_detector with undesirable type
+    with pytest.raises(TypeError):
+        model = init_detector([config_file])  # noqa: F841
diff --git a/tests/test_runtime/test_async.py b/tests/test_runtime/test_async.py
new file mode 100755
index 0000000..1af1501
--- /dev/null
+++ b/tests/test_runtime/test_async.py
@@ -0,0 +1,83 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Tests for async interface."""
+
+import asyncio
+import os
+import sys
+
+import asynctest
+import mmcv
+import torch
+
+from mmdet.apis import async_inference_detector, init_detector
+
+if sys.version_info >= (3, 7):
+    from mmdet.utils.contextmanagers import concurrent
+
+
+class AsyncTestCase(asynctest.TestCase):
+    """Base test case that bounds every coroutine test with a timeout."""
+    use_default_loop = False
+    forbid_get_event_loop = True
+    TEST_TIMEOUT = int(os.getenv('ASYNCIO_TEST_TIMEOUT', '30'))  # seconds
+
+    def _run_test_method(self, method):
+        outcome = method()
+        if asyncio.iscoroutine(outcome):
+            bounded = asyncio.wait_for(outcome, timeout=self.TEST_TIMEOUT)
+            self.loop.run_until_complete(bounded)
+
+
+class MaskRCNNDetector:
+    """Async wrapper around a detector built by ``init_detector``."""
+
+    def __init__(self, model_config, checkpoint=None, streamqueue_size=3,
+                 device='cuda:0'):
+        self.streamqueue_size = streamqueue_size
+        self.device = device
+        # build the model; forward the caller's checkpoint (default None)
+        self.model = init_detector(
+            model_config, checkpoint=checkpoint, device=self.device)
+        # filled in by ``init``
+        self.streamqueue = None
+
+    async def init(self):
+        """Create the queue holding ``streamqueue_size`` CUDA streams."""
+        self.streamqueue = asyncio.Queue()
+        for _ in range(self.streamqueue_size):
+            stream = torch.cuda.Stream(device=self.device)
+            self.streamqueue.put_nowait(stream)
+
+    if sys.version_info >= (3, 7):
+
+        async def apredict(self, img):
+            """Run async inference on an image, or on the image at a path."""
+            if isinstance(img, str):
+                img = mmcv.imread(img)
+            async with concurrent(self.streamqueue):
+                result = await async_inference_detector(self.model, img)
+            return result
+
+
+class AsyncInferenceTestCase(AsyncTestCase):
+
+    if sys.version_info >= (3, 7):
+
+        async def test_simple_inference(self):
+            if not torch.cuda.is_available():
+                import pytest
+                pytest.skip('test requires GPU and torch+cuda')
+
+            ori_grad_enabled = torch.is_grad_enabled()
+            try:
+                root_dir = os.path.dirname(os.path.dirname(__name__))
+                model_config = os.path.join(
+                    root_dir, 'configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py')
+                detector = MaskRCNNDetector(model_config)
+                await detector.init()
+                img_path = os.path.join(root_dir, 'demo/demo.jpg')
+                bboxes, _ = await detector.apredict(img_path)
+                self.assertTrue(bboxes)
+            finally:
+                # async inference hacks grad_enabled; always restore it
+                torch.set_grad_enabled(ori_grad_enabled)
diff --git a/tests/test_runtime/test_config.py b/tests/test_runtime/test_config.py
new file mode 100755
index 0000000..dce88f4
--- /dev/null
+++ b/tests/test_runtime/test_config.py
@@ -0,0 +1,373 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from os.path import dirname, exists, join
+from unittest.mock import Mock
+
+import pytest
+
+from mmdet.core import BitmapMasks, PolygonMasks
+from mmdet.datasets.builder import DATASETS
+from mmdet.datasets.utils import NumClassCheckHook
+
+
+def _get_config_directory():
+    """Find the predefined detector config directory."""
+    try:
+        # source checkout: derive the repo root from this file's location
+        repo_dpath = join(dirname(dirname(__file__)), '..')
+    except NameError:
+        # __file__ is undefined (e.g. IPython): use the mmdet package path
+        import mmdet
+        repo_dpath = dirname(dirname(mmdet.__file__))
+
+    config_dpath = join(repo_dpath, 'configs')
+    if exists(config_dpath):
+        return config_dpath
+    raise Exception('Cannot find config path')
+
+
+def _check_numclasscheckhook(detector, config_mod):
+    """Run ``NumClassCheckHook`` for the train and val datasets of a config.
+
+    Each split is checked twice: with its class names, then with ``None``.
+    """
+    dummy_runner = Mock()
+    dummy_runner.model = detector
+
+    def get_dataset_name_classes(dataset):
+        # deal with `RepeatDataset`,`ConcatDataset`,`ClassBalancedDataset`..
+        if isinstance(dataset, (list, tuple)):
+            dataset = dataset[0]
+        while 'dataset' in dataset:
+            dataset = dataset['dataset']
+            # ConcatDataset
+            if isinstance(dataset, (list, tuple)):
+                dataset = dataset[0]
+        return dataset['type'], dataset.get('classes', None)
+
+    compatible_check = NumClassCheckHook()
+    # (config split, hook entry point) pairs, processed in this order
+    stages = (('train', compatible_check.before_train_epoch),
+              ('val', compatible_check.before_val_epoch))
+    for split, run_check in stages:
+        dataset_name, classes = get_dataset_name_classes(
+            config_mod['data'][split])
+        if classes is None:
+            # fall back to the CLASSES of the registered dataset type
+            classes = DATASETS.get(dataset_name).CLASSES
+        # first with the class names, then with CLASSES unset
+        for value in (classes, None):
+            dummy_runner.data_loader.dataset.CLASSES = value
+            run_check(dummy_runner)
+
+
+def _check_roi_head(config, head):
+    """Check that a built RoI head is consistent with its config."""
+    assert config['type'] == head.__class__.__name__
+
+    # check roi_align
+    bbox_roi_cfg = config.bbox_roi_extractor
+    bbox_roi_extractor = head.bbox_roi_extractor
+    _check_roi_extractor(bbox_roi_cfg, bbox_roi_extractor)
+
+    # check bbox head infos
+    bbox_cfg = config.bbox_head
+    bbox_head = head.bbox_head
+    _check_bbox_head(bbox_cfg, bbox_head)
+
+    if head.with_mask:
+        # check roi_align; skipped when no mask extractor is configured
+        if config.mask_roi_extractor:
+            mask_roi_cfg = config.mask_roi_extractor
+            mask_roi_extractor = head.mask_roi_extractor
+            _check_roi_extractor(mask_roi_cfg, mask_roi_extractor,
+                                 bbox_roi_extractor)
+
+        # check mask head infos
+        mask_head = head.mask_head
+        mask_cfg = config.mask_head
+        _check_mask_head(mask_cfg, mask_head)
+
+    # check arch specific settings, e.g., cascade/htc
+    if config['type'] in ['CascadeRoIHead', 'HybridTaskCascadeRoIHead']:
+        assert config.num_stages == len(head.bbox_head)
+        assert config.num_stages == len(head.bbox_roi_extractor)
+
+        if head.with_mask:
+            assert config.num_stages == len(head.mask_head)
+            assert config.num_stages == len(head.mask_roi_extractor)
+
+    elif config['type'] in ['MaskScoringRoIHead']:
+        assert (hasattr(head, 'mask_iou_head')
+                and head.mask_iou_head is not None)
+        mask_iou_cfg = config.mask_iou_head
+        mask_iou_head = head.mask_iou_head
+        assert (mask_iou_cfg.fc_out_channels ==
+                mask_iou_head.fc_mask_iou.in_features)
+
+    elif config['type'] in ['GridRoIHead']:
+        grid_roi_cfg = config.grid_roi_extractor
+        grid_roi_extractor = head.grid_roi_extractor
+        _check_roi_extractor(grid_roi_cfg, grid_roi_extractor,
+                             bbox_roi_extractor)
+        # compare (not assign) the number of grid points
+        assert config.grid_head.grid_points == head.grid_head.grid_points
+
+
+def _check_roi_extractor(config, roi_extractor, prev_roi_extractor=None):
+    """Check a built RoI extractor against its config and a sibling one."""
+    import torch.nn as nn
+    from torch.nn.modules.utils import _pair
+
+    # a ModuleList of extractors is represented by its first element
+    if isinstance(roi_extractor, nn.ModuleList):
+        roi_extractor = roi_extractor[0]
+    if prev_roi_extractor and isinstance(prev_roi_extractor, nn.ModuleList):
+        prev_roi_extractor = prev_roi_extractor[0]
+
+    assert len(config.featmap_strides) == len(roi_extractor.roi_layers)
+    assert config.out_channels == roi_extractor.out_channels
+    roi_layer_cfg = config.roi_layer
+    first_layer = roi_extractor.roi_layers[0]
+    assert _pair(roi_layer_cfg.output_size) == first_layer.output_size
+
+    if 'use_torchvision' in roi_layer_cfg:
+        assert roi_layer_cfg.use_torchvision == first_layer.use_torchvision
+    elif 'aligned' in roi_layer_cfg:
+        assert roi_layer_cfg.aligned == first_layer.aligned
+
+    # when given, the other extractor must use the same RoI layer settings
+    if prev_roi_extractor:
+        prev_layer = prev_roi_extractor.roi_layers[0]
+        assert first_layer.aligned == prev_layer.aligned
+        assert first_layer.use_torchvision == prev_layer.use_torchvision
+
+
+def _check_mask_head(mask_cfg, mask_head):
+    """Check that a built mask head (or list of heads) matches its config."""
+    import torch.nn as nn
+    if isinstance(mask_cfg, list):
+        for cfg_i, head_i in zip(mask_cfg, mask_head):
+            _check_mask_head(cfg_i, head_i)
+        return
+    if isinstance(mask_head, nn.ModuleList):
+        for head_i in mask_head:
+            _check_mask_head(mask_cfg, head_i)
+        return
+    assert mask_cfg['type'] == mask_head.__class__.__name__
+    assert mask_cfg.in_channels == mask_head.in_channels
+    agnostic = mask_cfg.get('class_agnostic', False)
+    nout = 1 if agnostic else mask_cfg.num_classes
+    if hasattr(mask_head, 'conv_logits'):
+        assert mask_cfg.conv_out_channels == mask_head.conv_logits.in_channels
+        assert mask_head.conv_logits.out_channels == nout
+    else:
+        assert mask_cfg.fc_out_channels == mask_head.fc_logits.in_features
+        assert mask_head.fc_logits.out_features == nout * mask_head.output_area
+
+
+def _check_bbox_head(bbox_cfg, bbox_head):
+    """Check that a built bbox head (or list of heads) matches its config."""
+    import torch.nn as nn
+    if isinstance(bbox_cfg, list):
+        # one config per head: compare them pairwise
+        for cfg_i, head_i in zip(bbox_cfg, bbox_head):
+            _check_bbox_head(cfg_i, head_i)
+        return
+    if isinstance(bbox_head, nn.ModuleList):
+        for head_i in bbox_head:
+            _check_bbox_head(bbox_cfg, head_i)
+        return
+
+    head_type = bbox_cfg['type']
+    assert head_type == bbox_head.__class__.__name__
+    if head_type == 'SABLHead':
+        assert bbox_cfg.cls_in_channels == bbox_head.cls_in_channels
+        assert bbox_cfg.reg_in_channels == bbox_head.reg_in_channels
+        cls_out_channels = bbox_cfg.get('cls_out_channels', 1024)
+        assert cls_out_channels == bbox_head.fc_cls.in_features
+        assert bbox_cfg.num_classes + 1 == bbox_head.fc_cls.out_features
+        return
+
+    if head_type == 'DIIHead':
+        assert bbox_cfg['num_ffn_fcs'] == bbox_head.ffn.num_fcs
+        # 3 means FC and LN and Relu
+        assert bbox_cfg['num_cls_fcs'] == len(bbox_head.cls_fcs) // 3
+        assert bbox_cfg['num_reg_fcs'] == len(bbox_head.reg_fcs) // 3
+        assert bbox_cfg['in_channels'] == bbox_head.in_channels
+        assert bbox_cfg['in_channels'] == bbox_head.fc_cls.in_features
+        assert bbox_cfg['in_channels'] == bbox_head.fc_reg.in_features
+        assert bbox_cfg['in_channels'] == bbox_head.attention.embed_dims
+        ffn_channels = bbox_cfg['feedforward_channels']
+        assert ffn_channels == bbox_head.ffn.feedforward_channels
+        return
+
+    assert bbox_cfg.in_channels == bbox_head.in_channels
+    if bbox_cfg.get('with_cls', True):
+        fc_out_channels = bbox_cfg.get('fc_out_channels', 2048)
+        assert fc_out_channels == bbox_head.fc_cls.in_features
+        if bbox_head.custom_cls_channels:
+            expected = bbox_head.loss_cls.get_cls_channels(
+                bbox_head.num_classes)
+        else:
+            expected = bbox_cfg.num_classes + 1
+        assert expected == bbox_head.fc_cls.out_features
+    if bbox_cfg.get('with_reg', True):
+        num_reg = 1 if bbox_cfg.reg_class_agnostic else bbox_cfg.num_classes
+        assert bbox_head.fc_reg.out_features == 4 * num_reg
+
+
+def _check_anchorhead(config, head):
+    """Check that a built anchor head is consistent with its config."""
+    assert config['type'] == head.__class__.__name__
+    assert config.in_channels == head.in_channels
+
+    num_classes = (
+        config.num_classes -
+        1 if config.loss_cls.get('use_sigmoid', False) else config.num_classes)
+    if config['type'] == 'ATSSHead':
+        assert (config.feat_channels == head.atss_cls.in_channels)
+        assert (config.feat_channels == head.atss_reg.in_channels)
+        assert (config.feat_channels == head.atss_centerness.in_channels)
+    elif config['type'] == 'SABLRetinaHead':
+        assert (config.feat_channels == head.retina_cls.in_channels)
+        assert (config.feat_channels == head.retina_bbox_reg.in_channels)
+        assert (config.feat_channels == head.retina_bbox_cls.in_channels)
+    else:
+        assert (config.in_channels == head.conv_cls.in_channels)
+        assert (config.in_channels == head.conv_reg.in_channels)
+        assert (head.conv_cls.out_channels == num_classes * head.num_anchors)
+        assert head.conv_reg.out_channels == 4 * head.num_anchors
+
+
+# Only tests a representative subset of configurations
+# TODO: test pipelines using Albu, current Albu throw None given empty GT
+@pytest.mark.parametrize(
+    'config_rpath',
+    [
+        'wider_face/ssd300_wider_face.py',
+        'pascal_voc/ssd300_voc0712.py',
+        'pascal_voc/ssd512_voc0712.py',
+        # 'albu_example/mask_rcnn_r50_fpn_1x.py',
+        'foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco.py',
+        'mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py',
+        'mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain_1x_coco.py',
+        'mask_rcnn/mask_rcnn_r50_fpn_fp16_1x_coco.py'
+    ])
+def test_config_data_pipeline(config_rpath):
+    """Test whether the data pipeline is valid and can process corner cases.
+
+    CommandLine:
+        xdoctest -m tests/test_runtime/
+            test_config.py test_config_data_pipeline
+    """
+    import numpy as np
+    from mmcv import Config
+
+    from mmdet.datasets.pipelines import Compose
+
+    config_dpath = _get_config_directory()
+    print(f'Found config_dpath = {config_dpath}')
+
+    def dummy_masks(h, w, num_obj=3, mode='bitmap'):
+        assert mode in ('polygon', 'bitmap')
+        if mode == 'bitmap':
+            masks = np.random.randint(0, 2, (num_obj, h, w), dtype=np.uint8)
+            masks = BitmapMasks(masks, h, w)
+        else:
+            masks = []
+            for i in range(num_obj):
+                masks.append([])
+                masks[-1].append(
+                    np.random.uniform(0, min(h - 1, w - 1), (8 + 4 * i, )))
+                masks[-1].append(
+                    np.random.uniform(0, min(h - 1, w - 1), (10 + 4 * i, )))
+            masks = PolygonMasks(masks, h, w)
+        return masks
+
+    config_fpath = join(config_dpath, config_rpath)
+    cfg = Config.fromfile(config_fpath)
+
+    # drop the loading steps (two for train, one for test); the inputs
+    # are built in memory below instead of being read from disk
+    loading_pipeline = cfg.train_pipeline.pop(0)
+    loading_ann_pipeline = cfg.train_pipeline.pop(0)
+    cfg.test_pipeline.pop(0)
+    train_pipeline = Compose(cfg.train_pipeline)
+    test_pipeline = Compose(cfg.test_pipeline)
+
+    print(f'Building data pipeline, config_fpath = {config_fpath}')
+
+    print(f'Test training data pipeline: \n{train_pipeline!r}')
+    img = np.random.randint(0, 255, size=(888, 666, 3), dtype=np.uint8)
+    if loading_pipeline.get('to_float32', False):
+        img = img.astype(np.float32)
+    mode = 'bitmap' if loading_ann_pipeline.get('poly2mask',
+                                                True) else 'polygon'
+    results = dict(
+        filename='test_img.png',
+        ori_filename='test_img.png',
+        img=img,
+        img_shape=img.shape,
+        ori_shape=img.shape,
+        gt_bboxes=np.array([[35.2, 11.7, 39.7, 15.7]], dtype=np.float32),
+        gt_labels=np.array([1], dtype=np.int64),
+        gt_masks=dummy_masks(img.shape[0], img.shape[1], mode=mode),
+    )
+    results['img_fields'] = ['img']
+    results['bbox_fields'] = ['gt_bboxes']
+    results['mask_fields'] = ['gt_masks']
+    output_results = train_pipeline(results)
+    assert output_results is not None
+
+    print(f'Test testing data pipeline: \n{test_pipeline!r}')
+    results = dict(
+        filename='test_img.png',
+        ori_filename='test_img.png',
+        img=img,
+        img_shape=img.shape,
+        ori_shape=img.shape,
+        gt_bboxes=np.array([[35.2, 11.7, 39.7, 15.7]], dtype=np.float32),
+        gt_labels=np.array([1], dtype=np.int64),
+        gt_masks=dummy_masks(img.shape[0], img.shape[1], mode=mode),
+    )
+    results['img_fields'] = ['img']
+    results['bbox_fields'] = ['gt_bboxes']
+    results['mask_fields'] = ['gt_masks']
+    output_results = test_pipeline(results)
+    assert output_results is not None
+
+    # test empty GT: zero boxes, labels and masks
+    print('Test empty GT with training data pipeline: '
+          f'\n{train_pipeline!r}')
+    results = dict(
+        filename='test_img.png',
+        ori_filename='test_img.png',
+        img=img,
+        img_shape=img.shape,
+        ori_shape=img.shape,
+        gt_bboxes=np.zeros((0, 4), dtype=np.float32),
+        gt_labels=np.array([], dtype=np.int64),
+        gt_masks=dummy_masks(img.shape[0], img.shape[1], num_obj=0, mode=mode),
+    )
+    results['img_fields'] = ['img']
+    results['bbox_fields'] = ['gt_bboxes']
+    results['mask_fields'] = ['gt_masks']
+    output_results = train_pipeline(results)
+    assert output_results is not None
+
+    print(f'Test empty GT with testing data pipeline: \n{test_pipeline!r}')
+    results = dict(
+        filename='test_img.png',
+        ori_filename='test_img.png',
+        img=img,
+        img_shape=img.shape,
+        ori_shape=img.shape,
+        gt_bboxes=np.zeros((0, 4), dtype=np.float32),
+        gt_labels=np.array([], dtype=np.int64),
+        gt_masks=dummy_masks(img.shape[0], img.shape[1], num_obj=0, mode=mode),
+    )
+    results['img_fields'] = ['img']
+    results['bbox_fields'] = ['gt_bboxes']
+    results['mask_fields'] = ['gt_masks']
+    output_results = test_pipeline(results)
+    assert output_results is not None
diff --git a/tests/test_runtime/test_eval_hook.py b/tests/test_runtime/test_eval_hook.py
new file mode 100755
index 0000000..ac0f5e9
--- /dev/null
+++ b/tests/test_runtime/test_eval_hook.py
@@ -0,0 +1,252 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import tempfile
+import unittest.mock as mock
+from collections import OrderedDict
+from unittest.mock import MagicMock, patch
+
+import pytest
+import torch
+import torch.nn as nn
+from mmcv.runner import EpochBasedRunner, build_optimizer
+from mmcv.utils import get_logger
+from torch.utils.data import DataLoader, Dataset
+
+from mmdet.core import DistEvalHook, EvalHook
+
+
+class ExampleDataset(Dataset):
+    """One-sample dataset whose ``evaluate`` is an autospec mock."""
+
+    def __init__(self):
+        self.index = 0
+        self.eval_result = [0.1, 0.4, 0.3, 0.7, 0.2, 0.05, 0.4, 0.6]
+
+    def __getitem__(self, idx):
+        return dict(imgs=torch.tensor([1]))
+
+    def __len__(self):
+        return 1
+
+    @mock.create_autospec
+    def evaluate(self, results, logger=None):
+        pass
+
+
+class EvalDataset(ExampleDataset):
+    """Dataset whose ``evaluate`` replays ``eval_result`` call by call."""
+
+    def evaluate(self, results, logger=None):
+        step, mean_ap = self.index, self.eval_result[self.index]
+        self.index = step + 1
+        return OrderedDict(mAP=mean_ap, index=step, score=mean_ap)
+
+
+class ExampleModel(nn.Module):
+    """Tiny module with a pass-through forward and a fixed train step."""
+
+    def __init__(self):
+        super().__init__()
+        self.conv = nn.Linear(1, 1)
+        self.test_cfg = None
+
+    def forward(self, imgs, rescale=False, return_loss=False):
+        """Return ``imgs`` unchanged; the flags are ignored."""
+        return imgs
+
+    def train_step(self, data_batch, optimizer, **kwargs):
+        """Return fixed outputs; the arguments are ignored."""
+        return dict(
+            loss=0.5,
+            log_vars=dict(accuracy=0.98),
+            num_samples=1,
+        )
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available(), reason='requires CUDA support')
+@patch('mmdet.apis.single_gpu_test', MagicMock)
+@patch('mmdet.apis.multi_gpu_test', MagicMock)
+@pytest.mark.parametrize('EvalHookCls', (EvalHook, DistEvalHook))
+def test_eval_hook(EvalHookCls):
+    with pytest.raises(TypeError):
+        # dataloader must be a pytorch DataLoader (a list is passed here)
+        test_dataset = ExampleDataset()
+        data_loader = [
+            DataLoader(
+                test_dataset,
+                batch_size=1,
+                sampler=None,
+                num_workers=0,
+                shuffle=False)
+        ]
+        EvalHookCls(data_loader)
+
+    with pytest.raises(KeyError):
+        # rule must be in keys of rule_map
+        test_dataset = ExampleDataset()
+        data_loader = DataLoader(
+            test_dataset,
+            batch_size=1,
+            sampler=None,
+            num_workers=0,
+            shuffle=False)
+        EvalHookCls(data_loader, save_best='auto', rule='unsupport')
+
+    with pytest.raises(ValueError):
+        # key_indicator must be valid when rule_map is None
+        test_dataset = ExampleDataset()
+        data_loader = DataLoader(
+            test_dataset,
+            batch_size=1,
+            sampler=None,
+            num_workers=0,
+            shuffle=False)
+        EvalHookCls(data_loader, save_best='unsupport')
+
+    optimizer_cfg = dict(
+        type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+
+    test_dataset = ExampleDataset()
+    loader = DataLoader(test_dataset, batch_size=1)
+    model = ExampleModel()
+    optimizer = build_optimizer(model, optimizer_cfg)
+
+    # with `save_best=None` no best score/checkpoint may be recorded
+    data_loader = DataLoader(test_dataset, batch_size=1)
+    eval_hook = EvalHookCls(data_loader, save_best=None)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        logger = get_logger('test_eval')
+        runner = EpochBasedRunner(
+            model=model,
+            batch_processor=None,
+            optimizer=optimizer,
+            work_dir=tmpdir,
+            logger=logger)
+        runner.register_hook(eval_hook)
+        runner.run([loader], [('train', 1)], 1)
+        assert runner.meta is None or 'best_score' not in runner.meta[
+            'hook_msgs']
+        assert runner.meta is None or 'best_ckpt' not in runner.meta[
+            'hook_msgs']
+
+    # when `save_best` is set to 'auto', first metric will be used.
+    loader = DataLoader(EvalDataset(), batch_size=1)
+    model = ExampleModel()
+    data_loader = DataLoader(EvalDataset(), batch_size=1)
+    eval_hook = EvalHookCls(data_loader, interval=1, save_best='auto')
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        logger = get_logger('test_eval')
+        runner = EpochBasedRunner(
+            model=model,
+            batch_processor=None,
+            optimizer=optimizer,
+            work_dir=tmpdir,
+            logger=logger)
+        runner.register_checkpoint_hook(dict(interval=1))
+        runner.register_hook(eval_hook)
+        runner.run([loader], [('train', 1)], 8)
+
+        real_path = osp.join(tmpdir, 'best_mAP_epoch_4.pth')
+
+        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
+        assert runner.meta['hook_msgs']['best_score'] == 0.7
+    # `save_best='mAP'`: the epoch with the highest mAP (0.7) is kept
+    loader = DataLoader(EvalDataset(), batch_size=1)
+    model = ExampleModel()
+    data_loader = DataLoader(EvalDataset(), batch_size=1)
+    eval_hook = EvalHookCls(data_loader, interval=1, save_best='mAP')
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        logger = get_logger('test_eval')
+        runner = EpochBasedRunner(
+            model=model,
+            batch_processor=None,
+            optimizer=optimizer,
+            work_dir=tmpdir,
+            logger=logger)
+        runner.register_checkpoint_hook(dict(interval=1))
+        runner.register_hook(eval_hook)
+        runner.run([loader], [('train', 1)], 8)
+
+        real_path = osp.join(tmpdir, 'best_mAP_epoch_4.pth')
+
+        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
+        assert runner.meta['hook_msgs']['best_score'] == 0.7
+    # `save_best='score'` with rule 'greater': the highest score is kept
+    data_loader = DataLoader(EvalDataset(), batch_size=1)
+    eval_hook = EvalHookCls(
+        data_loader, interval=1, save_best='score', rule='greater')
+    with tempfile.TemporaryDirectory() as tmpdir:
+        logger = get_logger('test_eval')
+        runner = EpochBasedRunner(
+            model=model,
+            batch_processor=None,
+            optimizer=optimizer,
+            work_dir=tmpdir,
+            logger=logger)
+        runner.register_checkpoint_hook(dict(interval=1))
+        runner.register_hook(eval_hook)
+        runner.run([loader], [('train', 1)], 8)
+
+        real_path = osp.join(tmpdir, 'best_score_epoch_4.pth')
+
+        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
+        assert runner.meta['hook_msgs']['best_score'] == 0.7
+    # rule 'less': the epoch with the lowest mAP (0.05) is kept
+    data_loader = DataLoader(EvalDataset(), batch_size=1)
+    eval_hook = EvalHookCls(data_loader, save_best='mAP', rule='less')
+    with tempfile.TemporaryDirectory() as tmpdir:
+        logger = get_logger('test_eval')
+        runner = EpochBasedRunner(
+            model=model,
+            batch_processor=None,
+            optimizer=optimizer,
+            work_dir=tmpdir,
+            logger=logger)
+        runner.register_checkpoint_hook(dict(interval=1))
+        runner.register_hook(eval_hook)
+        runner.run([loader], [('train', 1)], 8)
+
+        real_path = osp.join(tmpdir, 'best_mAP_epoch_6.pth')
+
+        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
+        assert runner.meta['hook_msgs']['best_score'] == 0.05
+    # best checkpoint after 2 epochs, then again after resuming to epoch 8
+    data_loader = DataLoader(EvalDataset(), batch_size=1)
+    eval_hook = EvalHookCls(data_loader, save_best='mAP')
+    with tempfile.TemporaryDirectory() as tmpdir:
+        logger = get_logger('test_eval')
+        runner = EpochBasedRunner(
+            model=model,
+            batch_processor=None,
+            optimizer=optimizer,
+            work_dir=tmpdir,
+            logger=logger)
+        runner.register_checkpoint_hook(dict(interval=1))
+        runner.register_hook(eval_hook)
+        runner.run([loader], [('train', 1)], 2)
+
+        real_path = osp.join(tmpdir, 'best_mAP_epoch_2.pth')
+
+        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
+        assert runner.meta['hook_msgs']['best_score'] == 0.4
+
+        resume_from = osp.join(tmpdir, 'latest.pth')
+        loader = DataLoader(ExampleDataset(), batch_size=1)
+        eval_hook = EvalHookCls(data_loader, save_best='mAP')
+        runner = EpochBasedRunner(
+            model=model,
+            batch_processor=None,
+            optimizer=optimizer,
+            work_dir=tmpdir,
+            logger=logger)
+        runner.register_checkpoint_hook(dict(interval=1))
+        runner.register_hook(eval_hook)
+        runner.resume(resume_from)
+        runner.run([loader], [('train', 1)], 8)
+
+        real_path = osp.join(tmpdir, 'best_mAP_epoch_4.pth')
+
+        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
+        assert runner.meta['hook_msgs']['best_score'] == 0.7
diff --git a/tests/test_runtime/test_fp16.py b/tests/test_runtime/test_fp16.py
new file mode 100755
index 0000000..e3dd432
--- /dev/null
+++ b/tests/test_runtime/test_fp16.py
@@ -0,0 +1,301 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+import torch.nn as nn
+from mmcv.runner import auto_fp16, force_fp32
+from mmcv.runner.fp16_utils import cast_tensor_type
+
+
+def test_cast_tensor_type():  # cast_tensor_type across several input kinds
+    inputs = torch.FloatTensor([5.])  # single tensor: float32 -> int32
+    src_type = torch.float32
+    dst_type = torch.int32
+    outputs = cast_tensor_type(inputs, src_type, dst_type)
+    assert isinstance(outputs, torch.Tensor)
+    assert outputs.dtype == dst_type
+
+    inputs = 'tensor'  # a str input is expected back as a str
+    src_type = str
+    dst_type = str
+    outputs = cast_tensor_type(inputs, src_type, dst_type)
+    assert isinstance(outputs, str)
+
+    inputs = np.array([5.])  # an ndarray input is expected back as an ndarray
+    src_type = np.ndarray
+    dst_type = np.ndarray
+    outputs = cast_tensor_type(inputs, src_type, dst_type)
+    assert isinstance(outputs, np.ndarray)
+
+    inputs = dict(  # dict container: every tensor value must be cast
+        tensor_a=torch.FloatTensor([1.]), tensor_b=torch.FloatTensor([2.]))
+    src_type = torch.float32
+    dst_type = torch.int32
+    outputs = cast_tensor_type(inputs, src_type, dst_type)
+    assert isinstance(outputs, dict)
+    assert outputs['tensor_a'].dtype == dst_type
+    assert outputs['tensor_b'].dtype == dst_type
+
+    inputs = [torch.FloatTensor([1.]), torch.FloatTensor([2.])]  # list case
+    src_type = torch.float32
+    dst_type = torch.int32
+    outputs = cast_tensor_type(inputs, src_type, dst_type)
+    assert isinstance(outputs, list)
+    assert outputs[0].dtype == dst_type
+    assert outputs[1].dtype == dst_type
+
+    inputs = 5  # plain int with src/dst of None is expected back as an int
+    outputs = cast_tensor_type(inputs, None, None)
+    assert isinstance(outputs, int)
+
+
+def test_auto_fp16():
+    """Check auto_fp16 casting for all, named, optional args and out_fp32."""
+    with pytest.raises(TypeError):
+        # ExampleObject is not a subclass of nn.Module
+
+        class ExampleObject:
+
+            @auto_fp16()
+            def __call__(self, x):
+                return x
+
+        model = ExampleObject()
+        input_x = torch.ones(1, dtype=torch.float32)
+        model(input_x)
+
+    # apply to all input args
+    class ExampleModule(nn.Module):
+
+        @auto_fp16()
+        def forward(self, x, y):
+            return x, y
+
+    model = ExampleModule()
+    input_x = torch.ones(1, dtype=torch.float32)
+    input_y = torch.ones(1, dtype=torch.float32)
+    output_x, output_y = model(input_x, input_y)  # fp16_enabled not set yet
+    assert output_x.dtype == torch.float32
+    assert output_y.dtype == torch.float32
+
+    model.fp16_enabled = True  # decorated args are now expected as half
+    output_x, output_y = model(input_x, input_y)
+    assert output_x.dtype == torch.half
+    assert output_y.dtype == torch.half
+
+    if torch.cuda.is_available():  # same checks with module and inputs on GPU
+        model.cuda()
+        output_x, output_y = model(input_x.cuda(), input_y.cuda())
+        assert output_x.dtype == torch.half
+        assert output_y.dtype == torch.half
+
+    # apply to specified input args: only x is cast, y keeps its dtype
+    class ExampleModule(nn.Module):
+
+        @auto_fp16(apply_to=('x', ))
+        def forward(self, x, y):
+            return x, y
+
+    model = ExampleModule()
+    input_x = torch.ones(1, dtype=torch.float32)
+    input_y = torch.ones(1, dtype=torch.float32)
+    output_x, output_y = model(input_x, input_y)  # fp16_enabled not set yet
+    assert output_x.dtype == torch.float32
+    assert output_y.dtype == torch.float32
+
+    model.fp16_enabled = True
+    output_x, output_y = model(input_x, input_y)
+    assert output_x.dtype == torch.half
+    assert output_y.dtype == torch.float32
+
+    if torch.cuda.is_available():  # GPU variant
+        model.cuda()
+        output_x, output_y = model(input_x.cuda(), input_y.cuda())
+        assert output_x.dtype == torch.half
+        assert output_y.dtype == torch.float32
+
+    # apply to optional input args: x and y are cast, z is left alone
+    class ExampleModule(nn.Module):
+
+        @auto_fp16(apply_to=('x', 'y'))
+        def forward(self, x, y=None, z=None):
+            return x, y, z
+
+    model = ExampleModule()
+    input_x = torch.ones(1, dtype=torch.float32)
+    input_y = torch.ones(1, dtype=torch.float32)
+    input_z = torch.ones(1, dtype=torch.float32)
+    output_x, output_y, output_z = model(input_x, y=input_y, z=input_z)
+    assert output_x.dtype == torch.float32
+    assert output_y.dtype == torch.float32
+    assert output_z.dtype == torch.float32
+
+    model.fp16_enabled = True
+    output_x, output_y, output_z = model(input_x, y=input_y, z=input_z)
+    assert output_x.dtype == torch.half
+    assert output_y.dtype == torch.half
+    assert output_z.dtype == torch.float32
+
+    if torch.cuda.is_available():  # GPU variant
+        model.cuda()
+        output_x, output_y, output_z = model(
+            input_x.cuda(), y=input_y.cuda(), z=input_z.cuda())
+        assert output_x.dtype == torch.half
+        assert output_y.dtype == torch.half
+        assert output_z.dtype == torch.float32
+
+    # out_fp32=True: once enabled, every output is asserted to be float32
+    class ExampleModule(nn.Module):
+
+        @auto_fp16(apply_to=('x', 'y'), out_fp32=True)
+        def forward(self, x, y=None, z=None):
+            return x, y, z
+
+    model = ExampleModule()
+    input_x = torch.ones(1, dtype=torch.half)  # x starts as half here
+    input_y = torch.ones(1, dtype=torch.float32)
+    input_z = torch.ones(1, dtype=torch.float32)
+    output_x, output_y, output_z = model(input_x, y=input_y, z=input_z)
+    assert output_x.dtype == torch.half
+    assert output_y.dtype == torch.float32
+    assert output_z.dtype == torch.float32
+
+    model.fp16_enabled = True
+    output_x, output_y, output_z = model(input_x, y=input_y, z=input_z)
+    assert output_x.dtype == torch.float32
+    assert output_y.dtype == torch.float32
+    assert output_z.dtype == torch.float32
+
+    if torch.cuda.is_available():  # GPU variant
+        model.cuda()
+        output_x, output_y, output_z = model(
+            input_x.cuda(), y=input_y.cuda(), z=input_z.cuda())
+        assert output_x.dtype == torch.float32
+        assert output_y.dtype == torch.float32
+        assert output_z.dtype == torch.float32
+
+
+def test_force_fp32():
+    """Check force_fp32 casting for all, named, optional args and out_fp16."""
+    with pytest.raises(TypeError):
+        # ExampleObject is not a subclass of nn.Module
+
+        class ExampleObject:
+
+            @force_fp32()
+            def __call__(self, x):
+                return x
+
+        model = ExampleObject()
+        input_x = torch.ones(1, dtype=torch.float32)
+        model(input_x)
+
+    # apply to all input args
+    class ExampleModule(nn.Module):
+
+        @force_fp32()
+        def forward(self, x, y):
+            return x, y
+
+    model = ExampleModule()
+    input_x = torch.ones(1, dtype=torch.half)
+    input_y = torch.ones(1, dtype=torch.half)
+    output_x, output_y = model(input_x, input_y)  # fp16_enabled not set yet
+    assert output_x.dtype == torch.half
+    assert output_y.dtype == torch.half
+
+    model.fp16_enabled = True  # decorated args are now expected as float32
+    output_x, output_y = model(input_x, input_y)
+    assert output_x.dtype == torch.float32
+    assert output_y.dtype == torch.float32
+
+    if torch.cuda.is_available():  # same checks with module and inputs on GPU
+        model.cuda()
+        output_x, output_y = model(input_x.cuda(), input_y.cuda())
+        assert output_x.dtype == torch.float32
+        assert output_y.dtype == torch.float32
+
+    # apply to specified input args: only x is cast, y keeps its dtype
+    class ExampleModule(nn.Module):
+
+        @force_fp32(apply_to=('x', ))
+        def forward(self, x, y):
+            return x, y
+
+    model = ExampleModule()
+    input_x = torch.ones(1, dtype=torch.half)
+    input_y = torch.ones(1, dtype=torch.half)
+    output_x, output_y = model(input_x, input_y)  # fp16_enabled not set yet
+    assert output_x.dtype == torch.half
+    assert output_y.dtype == torch.half
+
+    model.fp16_enabled = True
+    output_x, output_y = model(input_x, input_y)
+    assert output_x.dtype == torch.float32
+    assert output_y.dtype == torch.half
+
+    if torch.cuda.is_available():  # GPU variant
+        model.cuda()
+        output_x, output_y = model(input_x.cuda(), input_y.cuda())
+        assert output_x.dtype == torch.float32
+        assert output_y.dtype == torch.half
+
+    # apply to optional input args: x and y are cast, z is left alone
+    class ExampleModule(nn.Module):
+
+        @force_fp32(apply_to=('x', 'y'))
+        def forward(self, x, y=None, z=None):
+            return x, y, z
+
+    model = ExampleModule()
+    input_x = torch.ones(1, dtype=torch.half)
+    input_y = torch.ones(1, dtype=torch.half)
+    input_z = torch.ones(1, dtype=torch.half)
+    output_x, output_y, output_z = model(input_x, y=input_y, z=input_z)
+    assert output_x.dtype == torch.half
+    assert output_y.dtype == torch.half
+    assert output_z.dtype == torch.half
+
+    model.fp16_enabled = True
+    output_x, output_y, output_z = model(input_x, y=input_y, z=input_z)
+    assert output_x.dtype == torch.float32
+    assert output_y.dtype == torch.float32
+    assert output_z.dtype == torch.half
+
+    if torch.cuda.is_available():  # GPU variant
+        model.cuda()
+        output_x, output_y, output_z = model(
+            input_x.cuda(), y=input_y.cuda(), z=input_z.cuda())
+        assert output_x.dtype == torch.float32
+        assert output_y.dtype == torch.float32
+        assert output_z.dtype == torch.half
+
+    # out_fp16=True: once enabled, every output is asserted to be half
+    class ExampleModule(nn.Module):
+
+        @force_fp32(apply_to=('x', 'y'), out_fp16=True)
+        def forward(self, x, y=None, z=None):
+            return x, y, z
+
+    model = ExampleModule()
+    input_x = torch.ones(1, dtype=torch.float32)  # x starts as float32 here
+    input_y = torch.ones(1, dtype=torch.half)
+    input_z = torch.ones(1, dtype=torch.half)
+    output_x, output_y, output_z = model(input_x, y=input_y, z=input_z)
+    assert output_x.dtype == torch.float32
+    assert output_y.dtype == torch.half
+    assert output_z.dtype == torch.half
+
+    model.fp16_enabled = True
+    output_x, output_y, output_z = model(input_x, y=input_y, z=input_z)
+    assert output_x.dtype == torch.half
+    assert output_y.dtype == torch.half
+    assert output_z.dtype == torch.half
+
+    if torch.cuda.is_available():  # GPU variant
+        model.cuda()
+        output_x, output_y, output_z = model(
+            input_x.cuda(), y=input_y.cuda(), z=input_z.cuda())
+        assert output_x.dtype == torch.half
+        assert output_y.dtype == torch.half
+        assert output_z.dtype == torch.half
diff --git a/tests/test_utils/test_anchor.py b/tests/test_utils/test_anchor.py
new file mode 100755
index 0000000..a9aef72
--- /dev/null
+++ b/tests/test_utils/test_anchor.py
@@ -0,0 +1,769 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""
+CommandLine:
+    pytest tests/test_utils/test_anchor.py
+    xdoctest tests/test_utils/test_anchor.py zero
+
+"""
+import pytest
+import torch
+
+
+def test_standard_points_generator():  # MlvlPointGenerator on CPU, then CUDA
+    from mmdet.core.anchor import build_prior_generator
+
+    # test init
+    anchor_generator_cfg = dict(
+        type='MlvlPointGenerator', strides=[4, 8], offset=0)
+    anchor_generator = build_prior_generator(anchor_generator_cfg)
+    assert anchor_generator is not None
+    assert anchor_generator.num_base_priors == [1, 1]
+    # test strides
+    from mmdet.core.anchor import MlvlPointGenerator
+
+    # Square strides
+    mlvl_points = MlvlPointGenerator(strides=[4, 10], offset=0)
+    mlvl_points_half_stride_generator = MlvlPointGenerator(
+        strides=[4, 10], offset=0.5)
+    assert mlvl_points.num_levels == 2
+
+    # fewer featmap sizes than levels must raise AssertionError
+    with pytest.raises(AssertionError):
+        mlvl_points.grid_priors(featmap_sizes=[(2, 2)], device='cpu')
+    priors = mlvl_points.grid_priors(
+        featmap_sizes=[(2, 2), (4, 8)], device='cpu')
+    priors_with_stride = mlvl_points.grid_priors(
+        featmap_sizes=[(2, 2), (4, 8)], with_stride=True, device='cpu')
+    assert len(priors) == 2
+
+    # assert last dimension is (coord_x, coord_y, stride_w, stride_h).
+    assert priors_with_stride[0].size(1) == 4
+    assert priors_with_stride[0][0][2] == 4
+    assert priors_with_stride[0][0][3] == 4
+    assert priors_with_stride[1][0][2] == 10
+    assert priors_with_stride[1][0][3] == 10
+
+    stride_4_feat_2_2 = priors[0]  # level 0: stride 4, 2x2 feature map
+    assert (stride_4_feat_2_2[1] - stride_4_feat_2_2[0]).sum() == 4
+    assert stride_4_feat_2_2.size(0) == 4
+    assert stride_4_feat_2_2.size(1) == 2
+
+    stride_10_feat_4_8 = priors[1]  # level 1: stride 10, 4x8 feature map
+    assert (stride_10_feat_4_8[1] - stride_10_feat_4_8[0]).sum() == 10
+    assert stride_10_feat_4_8.size(0) == 4 * 8
+    assert stride_10_feat_4_8.size(1) == 2
+
+    # offset=0.5 shifts x and y by half a stride each (hence the * 2)
+    priors_half_offset = mlvl_points_half_stride_generator.grid_priors(
+        featmap_sizes=[(2, 2), (4, 8)], device='cpu')
+
+    assert (priors_half_offset[0][0] - priors[0][0]).sum() == 4 * 0.5 * 2
+    assert (priors_half_offset[1][0] - priors[1][0]).sum() == 10 * 0.5 * 2
+    if torch.cuda.is_available():  # repeat the same checks on CUDA
+        anchor_generator_cfg = dict(
+            type='MlvlPointGenerator', strides=[4, 8], offset=0)
+        anchor_generator = build_prior_generator(anchor_generator_cfg)
+        assert anchor_generator is not None
+        # Square strides
+        mlvl_points = MlvlPointGenerator(strides=[4, 10], offset=0)
+        mlvl_points_half_stride_generator = MlvlPointGenerator(
+            strides=[4, 10], offset=0.5)
+        assert mlvl_points.num_levels == 2
+
+        # fewer featmap sizes than levels must raise AssertionError
+        with pytest.raises(AssertionError):
+            mlvl_points.grid_priors(featmap_sizes=[(2, 2)], device='cuda')
+        priors = mlvl_points.grid_priors(
+            featmap_sizes=[(2, 2), (4, 8)], device='cuda')
+        priors_with_stride = mlvl_points.grid_priors(
+            featmap_sizes=[(2, 2), (4, 8)], with_stride=True, device='cuda')
+        assert len(priors) == 2
+
+        # assert last dimension is (coord_x, coord_y, stride_w, stride_h).
+        assert priors_with_stride[0].size(1) == 4
+        assert priors_with_stride[0][0][2] == 4
+        assert priors_with_stride[0][0][3] == 4
+        assert priors_with_stride[1][0][2] == 10
+        assert priors_with_stride[1][0][3] == 10
+
+        stride_4_feat_2_2 = priors[0]
+        assert (stride_4_feat_2_2[1] - stride_4_feat_2_2[0]).sum() == 4
+        assert stride_4_feat_2_2.size(0) == 4
+        assert stride_4_feat_2_2.size(1) == 2
+
+        stride_10_feat_4_8 = priors[1]
+        assert (stride_10_feat_4_8[1] - stride_10_feat_4_8[0]).sum() == 10
+        assert stride_10_feat_4_8.size(0) == 4 * 8
+        assert stride_10_feat_4_8.size(1) == 2
+
+        # offset=0.5 shifts x and y by half a stride each (hence the * 2)
+        priors_half_offset = mlvl_points_half_stride_generator.grid_priors(
+            featmap_sizes=[(2, 2), (4, 8)], device='cuda')
+
+        assert (priors_half_offset[0][0] - priors[0][0]).sum() == 4 * 0.5 * 2
+        assert (priors_half_offset[1][0] - priors[1][0]).sum() == 10 * 0.5 * 2
+
+
+def test_sparse_prior():  # sparse_priors must match indexed dense priors
+    from mmdet.core.anchor import MlvlPointGenerator
+    mlvl_points = MlvlPointGenerator(strides=[4, 10], offset=0)
+    prior_indexs = torch.Tensor([0, 2, 4, 5, 6, 9]).long()
+
+    featmap_sizes = [(3, 5), (6, 4)]
+    grid_anchors = mlvl_points.grid_priors(
+        featmap_sizes=featmap_sizes, with_stride=False, device='cpu')
+    sparse_prior = mlvl_points.sparse_priors(
+        prior_idxs=prior_indexs,
+        featmap_size=featmap_sizes[0],
+        level_idx=0,
+        device='cpu')
+
+    assert not sparse_prior.is_cuda
+    assert (sparse_prior == grid_anchors[0][prior_indexs]).all()
+    sparse_prior = mlvl_points.sparse_priors(
+        prior_idxs=prior_indexs,
+        featmap_size=featmap_sizes[1],
+        level_idx=1,
+        device='cpu')
+    assert (sparse_prior == grid_anchors[1][prior_indexs]).all()
+
+    from mmdet.core.anchor import AnchorGenerator
+    mlvl_anchors = AnchorGenerator(
+        strides=[16, 32], ratios=[1.], scales=[1.], base_sizes=[4, 8])
+    prior_indexs = torch.Tensor([0, 2, 4, 5, 6, 9]).long()
+
+    featmap_sizes = [(3, 5), (6, 4)]
+    grid_anchors = mlvl_anchors.grid_priors(
+        featmap_sizes=featmap_sizes, device='cpu')
+    sparse_prior = mlvl_anchors.sparse_priors(
+        prior_idxs=prior_indexs,
+        featmap_size=featmap_sizes[0],
+        level_idx=0,
+        device='cpu')
+    assert (sparse_prior == grid_anchors[0][prior_indexs]).all()
+    sparse_prior = mlvl_anchors.sparse_priors(
+        prior_idxs=prior_indexs,
+        featmap_size=featmap_sizes[1],
+        level_idx=1,
+        device='cpu')
+    assert (sparse_prior == grid_anchors[1][prior_indexs]).all()
+
+    # for ssd
+    from mmdet.core.anchor.anchor_generator import SSDAnchorGenerator
+    featmap_sizes = [(38, 38), (19, 19), (10, 10)]
+    anchor_generator = SSDAnchorGenerator(
+        scale_major=False,
+        input_size=300,
+        basesize_ratio_range=(0.15, 0.9),
+        strides=[8, 16, 32],
+        ratios=[[2], [2, 3], [2, 3]])
+    ssd_anchors = anchor_generator.grid_anchors(featmap_sizes, device='cpu')
+    for i in range(len(featmap_sizes)):
+        sparse_ssd_anchors = anchor_generator.sparse_priors(
+            prior_idxs=prior_indexs,  # same indices as defined above
+            level_idx=i,
+            featmap_size=featmap_sizes[i],
+            device='cpu')
+        assert (sparse_ssd_anchors == ssd_anchors[i][prior_indexs]).all()
+
+    # for yolo
+    from mmdet.core.anchor.anchor_generator import YOLOAnchorGenerator
+    featmap_sizes = [(38, 38), (19, 19), (10, 10)]
+    anchor_generator = YOLOAnchorGenerator(
+        strides=[32, 16, 8],
+        base_sizes=[
+            [(116, 90), (156, 198), (373, 326)],
+            [(30, 61), (62, 45), (59, 119)],
+            [(10, 13), (16, 30), (33, 23)],
+        ])
+    yolo_anchors = anchor_generator.grid_anchors(featmap_sizes, device='cpu')
+    for i in range(len(featmap_sizes)):
+        sparse_yolo_anchors = anchor_generator.sparse_priors(
+            prior_idxs=prior_indexs,
+            level_idx=i,
+            featmap_size=featmap_sizes[i],
+            device='cpu')
+        assert (sparse_yolo_anchors == yolo_anchors[i][prior_indexs]).all()
+
+    if torch.cuda.is_available():  # CUDA variant with other sizes/indices
+        mlvl_points = MlvlPointGenerator(strides=[4, 10], offset=0)
+        prior_indexs = torch.Tensor([0, 3, 4, 5, 6, 7, 1, 2, 4, 5, 6,
+                                     9]).long().cuda()
+
+        featmap_sizes = [(6, 8), (6, 4)]
+        grid_anchors = mlvl_points.grid_priors(
+            featmap_sizes=featmap_sizes, with_stride=False, device='cuda')
+        sparse_prior = mlvl_points.sparse_priors(
+            prior_idxs=prior_indexs,
+            featmap_size=featmap_sizes[0],
+            level_idx=0,
+            device='cuda')
+        assert (sparse_prior == grid_anchors[0][prior_indexs]).all()
+        sparse_prior = mlvl_points.sparse_priors(
+            prior_idxs=prior_indexs,
+            featmap_size=featmap_sizes[1],
+            level_idx=1,
+            device='cuda')
+        assert (sparse_prior == grid_anchors[1][prior_indexs]).all()
+        assert sparse_prior.is_cuda
+        mlvl_anchors = AnchorGenerator(
+            strides=[16, 32],
+            ratios=[1., 2.5],
+            scales=[1., 5.],
+            base_sizes=[4, 8])
+        prior_indexs = torch.Tensor([4, 5, 6, 7, 0, 2, 50, 4, 5, 6,
+                                     9]).long().cuda()
+
+        featmap_sizes = [(13, 5), (16, 4)]
+        grid_anchors = mlvl_anchors.grid_priors(
+            featmap_sizes=featmap_sizes, device='cuda')
+        sparse_prior = mlvl_anchors.sparse_priors(
+            prior_idxs=prior_indexs,
+            featmap_size=featmap_sizes[0],
+            level_idx=0,
+            device='cuda')
+        assert (sparse_prior == grid_anchors[0][prior_indexs]).all()
+        sparse_prior = mlvl_anchors.sparse_priors(
+            prior_idxs=prior_indexs,
+            featmap_size=featmap_sizes[1],
+            level_idx=1,
+            device='cuda')
+        assert (sparse_prior == grid_anchors[1][prior_indexs]).all()
+
+        # for ssd
+        from mmdet.core.anchor.anchor_generator import SSDAnchorGenerator
+        featmap_sizes = [(38, 38), (19, 19), (10, 10)]
+        anchor_generator = SSDAnchorGenerator(
+            scale_major=False,
+            input_size=300,
+            basesize_ratio_range=(0.15, 0.9),
+            strides=[8, 16, 32],
+            ratios=[[2], [2, 3], [2, 3]])
+        ssd_anchors = anchor_generator.grid_anchors(
+            featmap_sizes, device='cuda')
+        for i in range(len(featmap_sizes)):
+            sparse_ssd_anchors = anchor_generator.sparse_priors(
+                prior_idxs=prior_indexs,  # CUDA indices defined just above
+                level_idx=i,
+                featmap_size=featmap_sizes[i],
+                device='cuda')
+            assert (sparse_ssd_anchors == ssd_anchors[i][prior_indexs]).all()
+
+        # for yolo
+        from mmdet.core.anchor.anchor_generator import YOLOAnchorGenerator
+        featmap_sizes = [(38, 38), (19, 19), (10, 10)]
+        anchor_generator = YOLOAnchorGenerator(
+            strides=[32, 16, 8],
+            base_sizes=[
+                [(116, 90), (156, 198), (373, 326)],
+                [(30, 61), (62, 45), (59, 119)],
+                [(10, 13), (16, 30), (33, 23)],
+            ])
+        yolo_anchors = anchor_generator.grid_anchors(
+            featmap_sizes, device='cuda')
+        for i in range(len(featmap_sizes)):
+            sparse_yolo_anchors = anchor_generator.sparse_priors(
+                prior_idxs=prior_indexs,
+                level_idx=i,
+                featmap_size=featmap_sizes[i],
+                device='cuda')
+            assert (sparse_yolo_anchors == yolo_anchors[i][prior_indexs]).all()
+
+
+def test_standard_anchor_generator():  # build AnchorGenerator from a config
+    from mmdet.core.anchor import build_anchor_generator
+    anchor_generator_cfg = dict(
+        type='AnchorGenerator',
+        scales=[8],
+        ratios=[0.5, 1.0, 2.0],
+        strides=[4, 8])
+
+    anchor_generator = build_anchor_generator(anchor_generator_cfg)
+    assert anchor_generator.num_base_priors == \
+           anchor_generator.num_base_anchors  # both attributes must agree
+    assert anchor_generator.num_base_priors == [3, 3]  # one entry per stride
+    assert anchor_generator is not None
+
+
+def test_strides():  # grid_anchors with a square and with an (x, y) stride
+    from mmdet.core import AnchorGenerator
+
+    # Square strides
+    self = AnchorGenerator([10], [1.], [1.], [10])  # `self`: plain local
+    anchors = self.grid_anchors([(2, 2)], device='cpu')
+
+    expected_anchors = torch.tensor([[-5., -5., 5., 5.], [5., -5., 15., 5.],
+                                     [-5., 5., 5., 15.], [5., 5., 15., 15.]])
+
+    assert torch.equal(anchors[0], expected_anchors)
+
+    # Different strides in x and y direction
+    self = AnchorGenerator([(10, 20)], [1.], [1.], [10])
+    anchors = self.grid_anchors([(2, 2)], device='cpu')
+
+    # x still steps by 10 while y now steps by 20 between grid rows
+    expected_anchors = torch.tensor([[-5., -5., 5., 5.], [5., -5., 15., 5.],
+                                     [-5., 15., 5., 25.], [5., 15., 15., 25.]])
+
+    assert torch.equal(anchors[0], expected_anchors)
+
+
+def test_ssd_anchor_generator():  # SSDAnchorGenerator: config checks, anchors
+    from mmdet.core.anchor import build_anchor_generator
+    if torch.cuda.is_available():  # use the GPU when one is present
+        device = 'cuda'
+    else:
+        device = 'cpu'
+
+    # min_sizes max_sizes must set at the same time
+    with pytest.raises(AssertionError):
+        anchor_generator_cfg = dict(
+            type='SSDAnchorGenerator',
+            scale_major=False,
+            min_sizes=[48, 100, 150, 202, 253, 300],
+            max_sizes=None,
+            strides=[8, 16, 32, 64, 100, 300],
+            ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]])
+        build_anchor_generator(anchor_generator_cfg)
+
+    # length of min_sizes max_sizes must be the same
+    with pytest.raises(AssertionError):
+        anchor_generator_cfg = dict(
+            type='SSDAnchorGenerator',
+            scale_major=False,
+            min_sizes=[48, 100, 150, 202, 253, 300],
+            max_sizes=[100, 150, 202, 253],
+            strides=[8, 16, 32, 64, 100, 300],
+            ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]])
+        build_anchor_generator(anchor_generator_cfg)
+
+    # test setting anchor size manually
+    anchor_generator_cfg = dict(
+        type='SSDAnchorGenerator',
+        scale_major=False,
+        min_sizes=[48, 100, 150, 202, 253, 304],
+        max_sizes=[100, 150, 202, 253, 304, 320],
+        strides=[16, 32, 64, 107, 160, 320],
+        ratios=[[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]])
+
+    featmap_sizes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
+    anchor_generator = build_anchor_generator(anchor_generator_cfg)
+
+    expected_base_anchors = [  # one tensor per level, one row per base anchor
+        torch.Tensor([[-16.0000, -16.0000, 32.0000, 32.0000],
+                      [-26.6410, -26.6410, 42.6410, 42.6410],
+                      [-25.9411, -8.9706, 41.9411, 24.9706],
+                      [-8.9706, -25.9411, 24.9706, 41.9411],
+                      [-33.5692, -5.8564, 49.5692, 21.8564],
+                      [-5.8564, -33.5692, 21.8564, 49.5692]]),
+        torch.Tensor([[-34.0000, -34.0000, 66.0000, 66.0000],
+                      [-45.2372, -45.2372, 77.2372, 77.2372],
+                      [-54.7107, -19.3553, 86.7107, 51.3553],
+                      [-19.3553, -54.7107, 51.3553, 86.7107],
+                      [-70.6025, -12.8675, 102.6025, 44.8675],
+                      [-12.8675, -70.6025, 44.8675, 102.6025]]),
+        torch.Tensor([[-43.0000, -43.0000, 107.0000, 107.0000],
+                      [-55.0345, -55.0345, 119.0345, 119.0345],
+                      [-74.0660, -21.0330, 138.0660, 85.0330],
+                      [-21.0330, -74.0660, 85.0330, 138.0660],
+                      [-97.9038, -11.3013, 161.9038, 75.3013],
+                      [-11.3013, -97.9038, 75.3013, 161.9038]]),
+        torch.Tensor([[-47.5000, -47.5000, 154.5000, 154.5000],
+                      [-59.5332, -59.5332, 166.5332, 166.5332],
+                      [-89.3356, -17.9178, 196.3356, 124.9178],
+                      [-17.9178, -89.3356, 124.9178, 196.3356],
+                      [-121.4371, -4.8124, 228.4371, 111.8124],
+                      [-4.8124, -121.4371, 111.8124, 228.4371]]),
+        torch.Tensor([[-46.5000, -46.5000, 206.5000, 206.5000],
+                      [-58.6651, -58.6651, 218.6651, 218.6651],
+                      [-98.8980, -9.4490, 258.8980, 169.4490],
+                      [-9.4490, -98.8980, 169.4490, 258.8980],
+                      [-139.1044, 6.9652, 299.1044, 153.0348],
+                      [6.9652, -139.1044, 153.0348, 299.1044]]),
+        torch.Tensor([[8.0000, 8.0000, 312.0000, 312.0000],
+                      [4.0513, 4.0513, 315.9487, 315.9487],
+                      [-54.9605, 52.5198, 374.9604, 267.4802],
+                      [52.5198, -54.9605, 267.4802, 374.9604],
+                      [-103.2717, 72.2428, 423.2717, 247.7572],
+                      [72.2428, -103.2717, 247.7572, 423.2717]])
+    ]
+
+    base_anchors = anchor_generator.base_anchors
+    for i, base_anchor in enumerate(base_anchors):
+        assert base_anchor.allclose(expected_base_anchors[i])
+
+    # check valid flags
+    expected_valid_pixels = [2400, 600, 150, 54, 24, 6]
+    multi_level_valid_flags = anchor_generator.valid_flags(
+        featmap_sizes, (320, 320), device)
+    for i, single_level_valid_flag in enumerate(multi_level_valid_flags):
+        assert single_level_valid_flag.sum() == expected_valid_pixels[i]
+
+    # check number of base anchors for each level
+    assert anchor_generator.num_base_anchors == [6, 6, 6, 6, 6, 6]
+
+    # check anchor generation
+    anchors = anchor_generator.grid_anchors(featmap_sizes, device)
+    assert len(anchors) == 6
+
+    # test vgg ssd anchor setting
+    anchor_generator_cfg = dict(
+        type='SSDAnchorGenerator',
+        scale_major=False,
+        input_size=300,
+        basesize_ratio_range=(0.15, 0.9),
+        strides=[8, 16, 32, 64, 100, 300],
+        ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]])
+
+    featmap_sizes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
+    anchor_generator = build_anchor_generator(anchor_generator_cfg)
+
+    # check base anchors
+    expected_base_anchors = [
+        torch.Tensor([[-6.5000, -6.5000, 14.5000, 14.5000],
+                      [-11.3704, -11.3704, 19.3704, 19.3704],
+                      [-10.8492, -3.4246, 18.8492, 11.4246],
+                      [-3.4246, -10.8492, 11.4246, 18.8492]]),
+        torch.Tensor([[-14.5000, -14.5000, 30.5000, 30.5000],
+                      [-25.3729, -25.3729, 41.3729, 41.3729],
+                      [-23.8198, -7.9099, 39.8198, 23.9099],
+                      [-7.9099, -23.8198, 23.9099, 39.8198],
+                      [-30.9711, -4.9904, 46.9711, 20.9904],
+                      [-4.9904, -30.9711, 20.9904, 46.9711]]),
+        torch.Tensor([[-33.5000, -33.5000, 65.5000, 65.5000],
+                      [-45.5366, -45.5366, 77.5366, 77.5366],
+                      [-54.0036, -19.0018, 86.0036, 51.0018],
+                      [-19.0018, -54.0036, 51.0018, 86.0036],
+                      [-69.7365, -12.5788, 101.7365, 44.5788],
+                      [-12.5788, -69.7365, 44.5788, 101.7365]]),
+        torch.Tensor([[-44.5000, -44.5000, 108.5000, 108.5000],
+                      [-56.9817, -56.9817, 120.9817, 120.9817],
+                      [-76.1873, -22.0937, 140.1873, 86.0937],
+                      [-22.0937, -76.1873, 86.0937, 140.1873],
+                      [-100.5019, -12.1673, 164.5019, 76.1673],
+                      [-12.1673, -100.5019, 76.1673, 164.5019]]),
+        torch.Tensor([[-53.5000, -53.5000, 153.5000, 153.5000],
+                      [-66.2185, -66.2185, 166.2185, 166.2185],
+                      [-96.3711, -23.1855, 196.3711, 123.1855],
+                      [-23.1855, -96.3711, 123.1855, 196.3711]]),
+        torch.Tensor([[19.5000, 19.5000, 280.5000, 280.5000],
+                      [6.6342, 6.6342, 293.3658, 293.3658],
+                      [-34.5549, 57.7226, 334.5549, 242.2774],
+                      [57.7226, -34.5549, 242.2774, 334.5549]]),
+    ]
+    base_anchors = anchor_generator.base_anchors
+    for i, base_anchor in enumerate(base_anchors):
+        assert base_anchor.allclose(expected_base_anchors[i])
+
+    # check valid flags
+    expected_valid_pixels = [5776, 2166, 600, 150, 36, 4]
+    multi_level_valid_flags = anchor_generator.valid_flags(
+        featmap_sizes, (300, 300), device)
+    for i, single_level_valid_flag in enumerate(multi_level_valid_flags):
+        assert single_level_valid_flag.sum() == expected_valid_pixels[i]
+
+    # check number of base anchors for each level
+    assert anchor_generator.num_base_anchors == [4, 6, 6, 6, 4, 4]
+
+    # check anchor generation
+    anchors = anchor_generator.grid_anchors(featmap_sizes, device)
+    assert len(anchors) == 6
+
+
+def test_anchor_generator_with_tuples():  # int vs tuple strides
+    from mmdet.core.anchor import build_anchor_generator
+    if torch.cuda.is_available():  # use the GPU when one is present
+        device = 'cuda'
+    else:
+        device = 'cpu'
+
+    anchor_generator_cfg = dict(  # reference config with integer strides
+        type='SSDAnchorGenerator',
+        scale_major=False,
+        input_size=300,
+        basesize_ratio_range=(0.15, 0.9),
+        strides=[8, 16, 32, 64, 100, 300],
+        ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]])
+
+    featmap_sizes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
+    anchor_generator = build_anchor_generator(anchor_generator_cfg)
+    anchors = anchor_generator.grid_anchors(featmap_sizes, device)
+
+    anchor_generator_cfg_tuples = dict(  # same config, strides as (s, s)
+        type='SSDAnchorGenerator',
+        scale_major=False,
+        input_size=300,
+        basesize_ratio_range=(0.15, 0.9),
+        strides=[(8, 8), (16, 16), (32, 32), (64, 64), (100, 100), (300, 300)],
+        ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]])
+
+    anchor_generator_tuples = build_anchor_generator(
+        anchor_generator_cfg_tuples)
+    anchors_tuples = anchor_generator_tuples.grid_anchors(
+        featmap_sizes, device)
+    for anchor, anchor_tuples in zip(anchors, anchors_tuples):
+        assert torch.equal(anchor, anchor_tuples)  # must match exactly
+
+
+def test_yolo_anchor_generator():  # YOLOAnchorGenerator base anchors
+    from mmdet.core.anchor import build_anchor_generator
+    if torch.cuda.is_available():  # use the GPU when one is present
+        device = 'cuda'
+    else:
+        device = 'cpu'
+
+    anchor_generator_cfg = dict(
+        type='YOLOAnchorGenerator',
+        strides=[32, 16, 8],
+        base_sizes=[
+            [(116, 90), (156, 198), (373, 326)],
+            [(30, 61), (62, 45), (59, 119)],
+            [(10, 13), (16, 30), (33, 23)],
+        ])
+
+    featmap_sizes = [(14, 18), (28, 36), (56, 72)]
+    anchor_generator = build_anchor_generator(anchor_generator_cfg)
+
+    # check base anchors (compared with allclose, one tensor per level)
+    expected_base_anchors = [
+        torch.Tensor([[-42.0000, -29.0000, 74.0000, 61.0000],
+                      [-62.0000, -83.0000, 94.0000, 115.0000],
+                      [-170.5000, -147.0000, 202.5000, 179.0000]]),
+        torch.Tensor([[-7.0000, -22.5000, 23.0000, 38.5000],
+                      [-23.0000, -14.5000, 39.0000, 30.5000],
+                      [-21.5000, -51.5000, 37.5000, 67.5000]]),
+        torch.Tensor([[-1.0000, -2.5000, 9.0000, 10.5000],
+                      [-4.0000, -11.0000, 12.0000, 19.0000],
+                      [-12.5000, -7.5000, 20.5000, 15.5000]])
+    ]
+    base_anchors = anchor_generator.base_anchors
+    for i, base_anchor in enumerate(base_anchors):
+        assert base_anchor.allclose(expected_base_anchors[i])
+
+    # check number of base anchors for each level
+    assert anchor_generator.num_base_anchors == [3, 3, 3]
+
+    # check anchor generation
+    anchors = anchor_generator.grid_anchors(featmap_sizes, device)
+    assert len(anchors) == 3
+
+
+def test_retina_anchor():
+    """Check the anchor generator built inside a RetinaSepBNHead.
+
+    Covers base anchors, valid flags, anchor counts and grid generation.
+    """
+    from mmdet.models import build_head
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    # head configs modified from
+    # configs/nas_fpn/retinanet_r50_fpn_crop640_50e.py
+    head_cfg = dict(
+        type='RetinaSepBNHead',
+        num_classes=4,
+        num_ins=5,
+        in_channels=4,
+        stacked_convs=1,
+        feat_channels=4,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]))
+
+    retina_head = build_head(head_cfg)
+    generator = retina_head.anchor_generator
+    assert generator is not None
+
+    # use the featmap sizes in NASFPN setting to test retina head
+    featmap_sizes = [(80, 80), (40, 40), (20, 20), (10, 10), (5, 5)]
+    # base anchors of every level must match the reference values
+    expected_base_anchors = [
+        torch.Tensor([[-22.6274, -11.3137, 22.6274, 11.3137],
+                      [-28.5088, -14.2544, 28.5088, 14.2544],
+                      [-35.9188, -17.9594, 35.9188, 17.9594],
+                      [-16.0000, -16.0000, 16.0000, 16.0000],
+                      [-20.1587, -20.1587, 20.1587, 20.1587],
+                      [-25.3984, -25.3984, 25.3984, 25.3984],
+                      [-11.3137, -22.6274, 11.3137, 22.6274],
+                      [-14.2544, -28.5088, 14.2544, 28.5088],
+                      [-17.9594, -35.9188, 17.9594, 35.9188]]),
+        torch.Tensor([[-45.2548, -22.6274, 45.2548, 22.6274],
+                      [-57.0175, -28.5088, 57.0175, 28.5088],
+                      [-71.8376, -35.9188, 71.8376, 35.9188],
+                      [-32.0000, -32.0000, 32.0000, 32.0000],
+                      [-40.3175, -40.3175, 40.3175, 40.3175],
+                      [-50.7968, -50.7968, 50.7968, 50.7968],
+                      [-22.6274, -45.2548, 22.6274, 45.2548],
+                      [-28.5088, -57.0175, 28.5088, 57.0175],
+                      [-35.9188, -71.8376, 35.9188, 71.8376]]),
+        torch.Tensor([[-90.5097, -45.2548, 90.5097, 45.2548],
+                      [-114.0350, -57.0175, 114.0350, 57.0175],
+                      [-143.6751, -71.8376, 143.6751, 71.8376],
+                      [-64.0000, -64.0000, 64.0000, 64.0000],
+                      [-80.6349, -80.6349, 80.6349, 80.6349],
+                      [-101.5937, -101.5937, 101.5937, 101.5937],
+                      [-45.2548, -90.5097, 45.2548, 90.5097],
+                      [-57.0175, -114.0350, 57.0175, 114.0350],
+                      [-71.8376, -143.6751, 71.8376, 143.6751]]),
+        torch.Tensor([[-181.0193, -90.5097, 181.0193, 90.5097],
+                      [-228.0701, -114.0350, 228.0701, 114.0350],
+                      [-287.3503, -143.6751, 287.3503, 143.6751],
+                      [-128.0000, -128.0000, 128.0000, 128.0000],
+                      [-161.2699, -161.2699, 161.2699, 161.2699],
+                      [-203.1873, -203.1873, 203.1873, 203.1873],
+                      [-90.5097, -181.0193, 90.5097, 181.0193],
+                      [-114.0350, -228.0701, 114.0350, 228.0701],
+                      [-143.6751, -287.3503, 143.6751, 287.3503]]),
+        torch.Tensor([[-362.0387, -181.0193, 362.0387, 181.0193],
+                      [-456.1401, -228.0701, 456.1401, 228.0701],
+                      [-574.7006, -287.3503, 574.7006, 287.3503],
+                      [-256.0000, -256.0000, 256.0000, 256.0000],
+                      [-322.5398, -322.5398, 322.5398, 322.5398],
+                      [-406.3747, -406.3747, 406.3747, 406.3747],
+                      [-181.0193, -362.0387, 181.0193, 362.0387],
+                      [-228.0701, -456.1401, 228.0701, 456.1401],
+                      [-287.3503, -574.7006, 287.3503, 574.7006]])
+    ]
+    for level, level_anchors in enumerate(generator.base_anchors):
+        assert level_anchors.allclose(expected_base_anchors[level])
+
+    # the number of valid positions per level for a (640, 640) input
+    expected_valid_pixels = [57600, 14400, 3600, 900, 225]
+    valid_flags = generator.valid_flags(featmap_sizes, (640, 640), device)
+    for level, level_flags in enumerate(valid_flags):
+        assert level_flags.sum() == expected_valid_pixels[level]
+
+    # every level carries nine base anchors
+    assert generator.num_base_anchors == [9, 9, 9, 9, 9]
+
+    # one entry is produced per feature map
+    anchors = generator.grid_anchors(featmap_sizes, device)
+    assert len(anchors) == 5
+
+
+def test_guided_anchor():
+    """Check the approx and square anchor generators of a GARetinaHead."""
+    from mmdet.models import build_head
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    # head configs modified from
+    # configs/guided_anchoring/ga_retinanet_r50_fpn_1x_coco.py
+    bbox_head = dict(
+        type='GARetinaHead',
+        num_classes=8,
+        in_channels=4,
+        stacked_convs=1,
+        feat_channels=4,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]))
+
+    ga_retina_head = build_head(bbox_head)
+    assert ga_retina_head.approx_anchor_generator is not None
+
+    # featmap sizes of an (800, 1216) input at strides 8 to 128, rounded up
+    featmap_sizes = [(100, 152), (50, 76), (25, 38), (13, 19), (7, 10)]
+    # check base anchors of the approx generator
+    expected_approxs = [
+        torch.Tensor([[-22.6274, -11.3137, 22.6274, 11.3137],
+                      [-28.5088, -14.2544, 28.5088, 14.2544],
+                      [-35.9188, -17.9594, 35.9188, 17.9594],
+                      [-16.0000, -16.0000, 16.0000, 16.0000],
+                      [-20.1587, -20.1587, 20.1587, 20.1587],
+                      [-25.3984, -25.3984, 25.3984, 25.3984],
+                      [-11.3137, -22.6274, 11.3137, 22.6274],
+                      [-14.2544, -28.5088, 14.2544, 28.5088],
+                      [-17.9594, -35.9188, 17.9594, 35.9188]]),
+        torch.Tensor([[-45.2548, -22.6274, 45.2548, 22.6274],
+                      [-57.0175, -28.5088, 57.0175, 28.5088],
+                      [-71.8376, -35.9188, 71.8376, 35.9188],
+                      [-32.0000, -32.0000, 32.0000, 32.0000],
+                      [-40.3175, -40.3175, 40.3175, 40.3175],
+                      [-50.7968, -50.7968, 50.7968, 50.7968],
+                      [-22.6274, -45.2548, 22.6274, 45.2548],
+                      [-28.5088, -57.0175, 28.5088, 57.0175],
+                      [-35.9188, -71.8376, 35.9188, 71.8376]]),
+        torch.Tensor([[-90.5097, -45.2548, 90.5097, 45.2548],
+                      [-114.0350, -57.0175, 114.0350, 57.0175],
+                      [-143.6751, -71.8376, 143.6751, 71.8376],
+                      [-64.0000, -64.0000, 64.0000, 64.0000],
+                      [-80.6349, -80.6349, 80.6349, 80.6349],
+                      [-101.5937, -101.5937, 101.5937, 101.5937],
+                      [-45.2548, -90.5097, 45.2548, 90.5097],
+                      [-57.0175, -114.0350, 57.0175, 114.0350],
+                      [-71.8376, -143.6751, 71.8376, 143.6751]]),
+        torch.Tensor([[-181.0193, -90.5097, 181.0193, 90.5097],
+                      [-228.0701, -114.0350, 228.0701, 114.0350],
+                      [-287.3503, -143.6751, 287.3503, 143.6751],
+                      [-128.0000, -128.0000, 128.0000, 128.0000],
+                      [-161.2699, -161.2699, 161.2699, 161.2699],
+                      [-203.1873, -203.1873, 203.1873, 203.1873],
+                      [-90.5097, -181.0193, 90.5097, 181.0193],
+                      [-114.0350, -228.0701, 114.0350, 228.0701],
+                      [-143.6751, -287.3503, 143.6751, 287.3503]]),
+        torch.Tensor([[-362.0387, -181.0193, 362.0387, 181.0193],
+                      [-456.1401, -228.0701, 456.1401, 228.0701],
+                      [-574.7006, -287.3503, 574.7006, 287.3503],
+                      [-256.0000, -256.0000, 256.0000, 256.0000],
+                      [-322.5398, -322.5398, 322.5398, 322.5398],
+                      [-406.3747, -406.3747, 406.3747, 406.3747],
+                      [-181.0193, -362.0387, 181.0193, 362.0387],
+                      [-228.0701, -456.1401, 228.0701, 456.1401],
+                      [-287.3503, -574.7006, 287.3503, 574.7006]])
+    ]
+    approxs = ga_retina_head.approx_anchor_generator.base_anchors
+    for i, base_anchor in enumerate(approxs):
+        assert base_anchor.allclose(expected_approxs[i])
+
+    # check valid flags
+    expected_valid_pixels = [136800, 34200, 8550, 2223, 630]
+    multi_level_valid_flags = ga_retina_head.approx_anchor_generator \
+        .valid_flags(featmap_sizes, (800, 1216), device)
+    for i, single_level_valid_flag in enumerate(multi_level_valid_flags):
+        assert single_level_valid_flag.sum() == expected_valid_pixels[i]
+
+    # check number of base anchors for each level
+    assert ga_retina_head.approx_anchor_generator.num_base_anchors == [
+        9, 9, 9, 9, 9
+    ]
+
+    # check approx generation (uses the approx generator, not the square one)
+    approx_anchors = ga_retina_head.approx_anchor_generator.grid_anchors(
+        featmap_sizes, device)
+    assert len(approx_anchors) == 5
+
+    # check base anchors of the square generator
+    expected_squares = [
+        torch.Tensor([[-16., -16., 16., 16.]]),
+        torch.Tensor([[-32., -32., 32., 32.]]),
+        torch.Tensor([[-64., -64., 64., 64.]]),
+        torch.Tensor([[-128., -128., 128., 128.]]),
+        torch.Tensor([[-256., -256., 256., 256.]])
+    ]
+    squares = ga_retina_head.square_anchor_generator.base_anchors
+    for i, base_anchor in enumerate(squares):
+        assert base_anchor.allclose(expected_squares[i])
+
+    # square_anchor_generator does not check valid flags
+    # check number of base anchors for each level
+    assert (ga_retina_head.square_anchor_generator.num_base_anchors == [
+        1, 1, 1, 1, 1
+    ])
+
+    # check square generation
+    anchors = ga_retina_head.square_anchor_generator.grid_anchors(
+        featmap_sizes, device)
+    assert len(anchors) == 5
diff --git a/tests/test_utils/test_assigner.py b/tests/test_utils/test_assigner.py
new file mode 100755
index 0000000..7cdb08b
--- /dev/null
+++ b/tests/test_utils/test_assigner.py
@@ -0,0 +1,700 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Tests the Assigner objects.
+
+CommandLine:
+    pytest tests/test_utils/test_assigner.py
+    xdoctest tests/test_utils/test_assigner.py zero
+"""
+import pytest
+import torch
+
+from mmdet.core.bbox.assigners import (ApproxMaxIoUAssigner,
+                                       AscendMaxIoUAssigner,
+                                       CenterRegionAssigner, HungarianAssigner,
+                                       MaskHungarianAssigner, MaxIoUAssigner,
+                                       PointAssigner, SimOTAAssigner,
+                                       TaskAlignedAssigner, UniformAssigner)
+
+
+def test_max_iou_assigner():
+    """Assign four boxes to two labelled gts with MaxIoUAssigner."""
+    assigner = MaxIoUAssigner(pos_iou_thr=0.5, neg_iou_thr=0.5)
+    gt_labels = torch.LongTensor([2, 3])
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 9],
+        [0, 10, 10, 19],
+    ])
+    bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+
+    result = assigner.assign(bboxes, gt_bboxes, gt_labels=gt_labels)
+
+    # one gt index and one label per input box
+    assert len(result.gt_inds) == 4
+    assert len(result.labels) == 4
+    # expected gt index for each of the four boxes
+    assert torch.all(result.gt_inds == torch.LongTensor([1, 0, 2, 0]))
+
+
+def test_max_iou_assigner_with_ignore():
+    """Check MaxIoUAssigner when an ignore region is supplied."""
+    assigner = MaxIoUAssigner(
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        ignore_iof_thr=0.5,
+        ignore_wrt_candidates=False)
+    gt_bboxes_ignore = torch.Tensor([[30, 30, 40, 40]])
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 9],
+        [0, 10, 10, 19],
+    ])
+    bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [30, 32, 40, 42],
+    ])
+
+    result = assigner.assign(
+        bboxes, gt_bboxes, gt_bboxes_ignore=gt_bboxes_ignore)
+
+    # the last box is expected to be flagged with -1
+    wanted = torch.LongTensor([1, 0, 2, -1])
+    assert torch.all(result.gt_inds == wanted)
+
+
+def test_max_iou_assigner_with_empty_gt():
+    """MaxIoUAssigner corner case: the image has no ground-truth boxes."""
+    assigner = MaxIoUAssigner(pos_iou_thr=0.5, neg_iou_thr=0.5)
+    # an empty (0, 4) tensor stands in for the missing annotations
+    no_gt = torch.empty(0, 4)
+    bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+
+    result = assigner.assign(bboxes, no_gt)
+
+    # every box is expected to receive gt index 0
+    wanted = torch.LongTensor([0, 0, 0, 0])
+    assert torch.all(result.gt_inds == wanted)
+
+
+def test_max_iou_assigner_with_empty_boxes():
+    """MaxIoUAssigner corner case: the network predicted no boxes."""
+    assigner = MaxIoUAssigner(
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5)
+    # zero candidate boxes
+    no_boxes = torch.empty((0, 4))
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 9],
+        [0, 10, 10, 19],
+    ])
+    gt_labels = torch.LongTensor([2, 3])
+
+    # with labels: index and label outputs are both empty
+    labelled = assigner.assign(no_boxes, gt_bboxes, gt_labels=gt_labels)
+    assert len(labelled.gt_inds) == 0
+    assert tuple(labelled.labels.shape) == (0, )
+
+    # without labels: no label output is produced
+    unlabelled = assigner.assign(no_boxes, gt_bboxes, gt_labels=None)
+    assert len(unlabelled.gt_inds) == 0
+    assert unlabelled.labels is None
+
+
+def test_max_iou_assigner_with_empty_boxes_and_ignore():
+    """MaxIoUAssigner corner case: no predicted boxes, ignore region given.
+
+    ``ignore_iof_thr`` is set and an ignore box is passed to ``assign``.
+    """
+    assigner = MaxIoUAssigner(
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        ignore_iof_thr=0.5)
+    no_boxes = torch.empty((0, 4))
+    gt_labels = torch.LongTensor([2, 3])
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 9],
+        [0, 10, 10, 19],
+    ])
+    gt_bboxes_ignore = torch.Tensor([
+        [30, 30, 40, 40],
+    ])
+
+    # with labels: index and label outputs are both empty
+    labelled = assigner.assign(
+        no_boxes, gt_bboxes, gt_labels=gt_labels,
+        gt_bboxes_ignore=gt_bboxes_ignore)
+    assert len(labelled.gt_inds) == 0
+    assert tuple(labelled.labels.shape) == (0, )
+
+    # without labels: no label output is produced
+    unlabelled = assigner.assign(
+        no_boxes, gt_bboxes, gt_labels=None,
+        gt_bboxes_ignore=gt_bboxes_ignore)
+    assert len(unlabelled.gt_inds) == 0
+    assert unlabelled.labels is None
+
+
+def test_max_iou_assigner_with_empty_boxes_and_gt():
+    """MaxIoUAssigner corner case: neither predictions nor gts exist."""
+    assigner = MaxIoUAssigner(pos_iou_thr=0.5, neg_iou_thr=0.5)
+    # both inputs are empty (0, 4) tensors
+    no_boxes = torch.empty((0, 4))
+    no_gt = torch.empty((0, 4))
+
+    result = assigner.assign(no_boxes, no_gt)
+    # the returned index tensor is expected to be empty as well
+    assert len(result.gt_inds) == 0
+
+
+def test_point_assigner():
+    """Assign four points to two gts with a default PointAssigner."""
+    assigner = PointAssigner()
+    gt_bboxes = torch.FloatTensor([[0, 0, 10, 9], [0, 10, 10, 19]])
+    # each row is [x, y, stride]
+    points = torch.FloatTensor([
+        [0, 0, 1],
+        [10, 10, 1],
+        [5, 5, 1],
+        [32, 32, 1],
+    ])
+
+    result = assigner.assign(points, gt_bboxes)
+    wanted = torch.LongTensor([1, 2, 1, 0])
+    assert torch.all(result.gt_inds == wanted)
+
+
+def test_point_assigner_with_empty_gt():
+    """PointAssigner corner case: the image has no ground-truth boxes."""
+    assigner = PointAssigner()
+    no_gt = torch.FloatTensor([])
+    # each row is [x, y, stride]
+    points = torch.FloatTensor([
+        [0, 0, 1],
+        [10, 10, 1],
+        [5, 5, 1],
+        [32, 32, 1],
+    ])
+    result = assigner.assign(points, no_gt)
+    # every point is expected to receive gt index 0
+    assert torch.all(result.gt_inds == torch.LongTensor([0, 0, 0, 0]))
+
+
+def test_point_assigner_with_empty_boxes_and_gt():
+    """PointAssigner corner case: neither points nor gts are given."""
+    assigner = PointAssigner()
+    no_points = torch.FloatTensor([])
+    no_gt = torch.FloatTensor([])
+    result = assigner.assign(no_points, no_gt)
+    assert len(result.gt_inds) == 0
+
+
+def test_approx_iou_assigner():
+    """Assign four boxes to two gts with ApproxMaxIoUAssigner."""
+    assigner = ApproxMaxIoUAssigner(pos_iou_thr=0.5, neg_iou_thr=0.5)
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 9],
+        [0, 10, 10, 19],
+    ])
+    bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+
+    # the same boxes act as approxs and as squares, one approx per octave
+    approxs = squares = bboxes
+    approxs_per_octave = 1
+
+    result = assigner.assign(approxs, squares, approxs_per_octave, gt_bboxes)
+
+    # expected gt index for each of the four boxes
+    wanted = torch.LongTensor([1, 0, 2, 0])
+    assert torch.all(result.gt_inds == wanted)
+
+
+def test_approx_iou_assigner_with_empty_gt():
+    """ApproxMaxIoUAssigner corner case: the image has no gt boxes."""
+    assigner = ApproxMaxIoUAssigner(pos_iou_thr=0.5, neg_iou_thr=0.5)
+    # an empty tensor stands in for the missing annotations
+    no_gt = torch.FloatTensor([])
+    bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+
+    # the same boxes act as approxs and as squares, one approx per octave
+    approxs = squares = bboxes
+    approxs_per_octave = 1
+
+    result = assigner.assign(approxs, squares, approxs_per_octave, no_gt)
+
+    # every box is expected to receive gt index 0
+    wanted = torch.LongTensor([0, 0, 0, 0])
+    assert torch.all(result.gt_inds == wanted)
+
+
+def test_approx_iou_assigner_with_empty_boxes():
+    """ApproxMaxIoUAssigner corner case: the network predicted no boxes."""
+    assigner = ApproxMaxIoUAssigner(pos_iou_thr=0.5, neg_iou_thr=0.5)
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 9],
+        [0, 10, 10, 19],
+    ])
+
+    # zero boxes are used both as approxs and as squares
+    no_boxes = torch.empty((0, 4))
+    approxs = squares = no_boxes
+    approxs_per_octave = 1
+
+    result = assigner.assign(approxs, squares, approxs_per_octave, gt_bboxes)
+
+    # the returned index tensor is expected to be empty
+    assert len(result.gt_inds) == 0
+
+
+def test_approx_iou_assigner_with_empty_boxes_and_gt():
+    """ApproxMaxIoUAssigner corner case: no predicted boxes and no gts."""
+    assigner = ApproxMaxIoUAssigner(pos_iou_thr=0.5, neg_iou_thr=0.5)
+    no_gt = torch.empty((0, 4))
+
+    # zero boxes are used both as approxs and as squares
+    no_boxes = torch.empty((0, 4))
+    approxs = squares = no_boxes
+    approxs_per_octave = 1
+
+    result = assigner.assign(approxs, squares, approxs_per_octave, no_gt)
+
+    # the returned index tensor is expected to be empty
+    assert len(result.gt_inds) == 0
+
+
+def test_random_assign_result():
+    """Instantiate random AssignResults of several sizes to catch crashes."""
+    from mmdet.core.bbox.assigners.assign_result import AssignResult
+
+    # default sizes first
+    AssignResult.random()
+
+    # then explicit (num_gts, num_preds) pairs; (0, 3) is exercised twice
+    size_pairs = [(0, 0), (0, 3), (3, 3), (0, 3), (7, 7), (7, 64),
+                  (24, 3)]
+    for num_gts, num_preds in size_pairs:
+        AssignResult.random(num_gts=num_gts, num_preds=num_preds)
+
+
+def test_center_region_assigner():
+    """Check CenterRegionAssigner gt indices and shadowed labels."""
+    assigner = CenterRegionAssigner(pos_scale=0.3, neg_scale=1)
+    boxes = torch.FloatTensor([[0, 0, 10, 10], [10, 10, 20, 20], [8, 8, 9, 9]])
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 11, 11],  # match boxes[0]
+        [10, 10, 20, 20],  # match boxes[1]
+        [4.5, 4.5, 5.5, 5.5],  # match boxes[0] but area is too small
+        [0, 0, 10, 10],  # match boxes[0] and has a smaller area than gt[0]
+    ])
+    gt_labels = torch.LongTensor([2, 3, 4, 5])
+    assign_result = assigner.assign(boxes, gt_bboxes, gt_labels=gt_labels)
+    assert len(assign_result.gt_inds) == 3
+    assert len(assign_result.labels) == 3
+    expected_gt_inds = torch.LongTensor([4, 2, 0])
+    assert torch.all(assign_result.gt_inds == expected_gt_inds)
+    shadowed_labels = assign_result.get_extra_property('shadowed_labels')
+    # [8, 8, 9, 9] in the shadowed region of [0, 0, 11, 11] (label: 2)
+    assert torch.any(shadowed_labels == torch.LongTensor([[2, 2]]))
+    # [8, 8, 9, 9] in the shadowed region of [0, 0, 10, 10] (label: 5)
+    assert torch.any(shadowed_labels == torch.LongTensor([[2, 5]]))
+    # boxes[0] is expected to go to gt [0, 0, 10, 10] (gt index 4 above).
+    #   Therefore, [0, 0, 11, 11] (label: 2) is shadowed
+    assert torch.any(shadowed_labels == torch.LongTensor([[0, 2]]))
+
+
+def test_center_region_assigner_with_ignore():
+    """Check CenterRegionAssigner when an ignore region is supplied."""
+    assigner = CenterRegionAssigner(pos_scale=0.5, neg_scale=1)
+    gt_labels = torch.LongTensor([1, 2])
+    bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+    ])
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],  # match bboxes[0]
+        [10, 10, 20, 20],  # match bboxes[1]
+    ])
+    # the ignore region coincides with bboxes[0]
+    gt_bboxes_ignore = torch.FloatTensor([[0, 0, 10, 10]])
+
+    result = assigner.assign(
+        bboxes, gt_bboxes,
+        gt_bboxes_ignore=gt_bboxes_ignore,
+        gt_labels=gt_labels)
+
+    # one gt index and one label per input box
+    assert len(result.gt_inds) == 2
+    assert len(result.labels) == 2
+
+    # bboxes[0] is expected to get -1 while bboxes[1] gets index 2
+    wanted = torch.LongTensor([-1, 2])
+    assert torch.all(result.gt_inds == wanted)
+
+
+def test_center_region_assigner_with_empty_bboxes():
+    """CenterRegionAssigner corner case: there are no boxes to assign."""
+    assigner = CenterRegionAssigner(pos_scale=0.5, neg_scale=1)
+    no_boxes = torch.empty((0, 4)).float()
+    gt_labels = torch.LongTensor([1, 2])
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+    ])
+
+    result = assigner.assign(no_boxes, gt_bboxes, gt_labels=gt_labels)
+    # each output may be either missing or an empty tensor
+    assert result.gt_inds is None or result.gt_inds.numel() == 0
+    assert result.labels is None or result.labels.numel() == 0
+
+
+def test_center_region_assigner_with_empty_gts():
+    """CenterRegionAssigner corner case: the image has no gt boxes."""
+    assigner = CenterRegionAssigner(pos_scale=0.5, neg_scale=1)
+    bboxes = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+    ])
+    # empty gts come with an equally empty label tensor
+    no_gt = torch.empty((0, 4)).float()
+    no_labels = torch.empty((0, )).long()
+
+    result = assigner.assign(bboxes, no_gt, gt_labels=no_labels)
+    assert len(result.gt_inds) == 2
+    wanted = torch.LongTensor([0, 0])
+    assert torch.all(result.gt_inds == wanted)
+
+
+def test_hungarian_match_assigner():
+    """Check HungarianAssigner with empty gts and three cost settings."""
+    def check_all_gts_matched(result, num_gts):
+        # no index is negative; num_gts entries are positive and labelled
+        assert torch.all(result.gt_inds > -1)
+        assert (result.gt_inds > 0).sum() == num_gts
+        assert (result.labels > -1).sum() == num_gts
+
+    assigner = HungarianAssigner()
+    assert assigner.iou_cost.iou_mode == 'giou'
+
+    # random predictions shared by every scenario below
+    bbox_pred = torch.rand((10, 4))
+    cls_pred = torch.rand((10, 81))
+    img_meta = dict(img_shape=(10, 8, 3))
+
+    # test no gt bboxes
+    no_gt = torch.empty((0, 4)).float()
+    no_labels = torch.empty((0, )).long()
+    result = assigner.assign(bbox_pred, cls_pred, no_gt, no_labels, img_meta)
+    assert torch.all(result.gt_inds == 0)
+    assert torch.all(result.labels == -1)
+
+    # test with gt bboxes
+    gt_bboxes = torch.FloatTensor([[0, 0, 5, 7], [3, 5, 7, 8]])
+    gt_labels = torch.LongTensor([1, 20])
+    num_gts = gt_bboxes.size(0)
+    inputs = (bbox_pred, cls_pred, gt_bboxes, gt_labels, img_meta)
+    result = assigner.assign(*inputs)
+    check_all_gts_matched(result, num_gts)
+
+    # test iou mode
+    assigner = HungarianAssigner(
+        iou_cost=dict(type='IoUCost', iou_mode='iou', weight=1.0))
+    assert assigner.iou_cost.iou_mode == 'iou'
+    result = assigner.assign(*inputs)
+    check_all_gts_matched(result, num_gts)
+
+    # test focal loss mode
+    assigner = HungarianAssigner(
+        iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0),
+        cls_cost=dict(type='FocalLossCost', weight=1.))
+    assert assigner.iou_cost.iou_mode == 'giou'
+    result = assigner.assign(*inputs)
+    check_all_gts_matched(result, num_gts)
+
+
+def test_uniform_assigner():
+    """Assign four anchor/prediction pairs to two gts with UniformAssigner."""
+    assigner = UniformAssigner(0.15, 0.7, 1)
+    gt_labels = torch.LongTensor([2, 3])
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 9],
+        [0, 10, 10, 19],
+    ])
+    anchor = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+    pred_bbox = torch.FloatTensor([
+        [1, 1, 12, 8],
+        [4, 4, 20, 20],
+        [1, 5, 15, 15],
+        [30, 5, 32, 42],
+    ])
+
+    result = assigner.assign(pred_bbox, anchor, gt_bboxes, gt_labels=gt_labels)
+    assert len(result.gt_inds) == 4
+    assert len(result.labels) == 4
+    wanted = torch.LongTensor([-1, 0, 2, 0])
+    assert torch.all(result.gt_inds == wanted)
+
+
+def test_uniform_assigner_with_empty_gt():
+    """UniformAssigner corner case: the image has no ground-truth boxes."""
+    assigner = UniformAssigner(0.15, 0.7, 1)
+    no_gt = torch.empty(0, 4)
+    anchor = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+    pred_bbox = torch.FloatTensor([
+        [1, 1, 12, 8],
+        [4, 4, 20, 20],
+        [1, 5, 15, 15],
+        [30, 5, 32, 42],
+    ])
+
+    result = assigner.assign(pred_bbox, anchor, no_gt)
+    # every pair is expected to receive gt index 0
+    assert torch.all(result.gt_inds == torch.LongTensor([0, 0, 0, 0]))
+
+
+def test_uniform_assigner_with_empty_boxes():
+    """UniformAssigner corner case: the network predicted no boxes."""
+    assigner = UniformAssigner(0.15, 0.7, 1)
+    no_pred = torch.empty((0, 4))
+    no_anchor = torch.empty((0, 4))
+    gt_labels = torch.LongTensor([2, 3])
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 9],
+        [0, 10, 10, 19],
+    ])
+
+    # with labels: index and label outputs are both empty
+    labelled = assigner.assign(
+        no_pred, no_anchor, gt_bboxes, gt_labels=gt_labels)
+    assert len(labelled.gt_inds) == 0
+    assert tuple(labelled.labels.shape) == (0, )
+
+    # without labels: only the index output is checked
+    unlabelled = assigner.assign(no_pred, no_anchor, gt_bboxes, gt_labels=None)
+    assert len(unlabelled.gt_inds) == 0
+
+
+def test_sim_ota_assigner():
+    """Run SimOTAAssigner on two priors and a single gt box."""
+    assigner = SimOTAAssigner(
+        center_radius=2.5, candidate_topk=1, iou_weight=3.0, cls_weight=1.0)
+    gt_labels = torch.LongTensor([2])
+    gt_bboxes = torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])
+    priors = torch.Tensor([[0, 12, 23, 34], [4, 5, 6, 7]])
+    decoded_bboxes = torch.Tensor([[[30, 40, 50, 60]], [[4, 5, 6, 7]]])
+    pred_scores = torch.FloatTensor([[0.2], [0.8]])
+    result = assigner.assign(pred_scores, priors, decoded_bboxes, gt_bboxes,
+                             gt_labels)
+    # both entries are expected to be 0
+    assert torch.all(result.gt_inds == torch.LongTensor([0, 0]))
+
+
+def test_task_aligned_assigner():
+    """Check TaskAlignedAssigner construction checks and assignment sizes."""
+    with pytest.raises(AssertionError):
+        TaskAlignedAssigner(topk=0)
+
+    self = TaskAlignedAssigner(topk=13)
+    pred_score = torch.FloatTensor([[0.1, 0.2], [0.2, 0.3], [0.3, 0.4],
+                                    [0.4, 0.5]])
+    pred_bbox = torch.FloatTensor([
+        [1, 1, 12, 8],
+        [4, 4, 20, 20],
+        [1, 5, 15, 15],
+        [30, 5, 32, 42],
+    ])
+    anchor = torch.FloatTensor([
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ])
+    gt_bboxes = torch.FloatTensor([
+        [0, 0, 10, 9],
+        [0, 10, 10, 19],
+    ])
+    gt_labels = torch.LongTensor([0, 1])
+    assign_result = self.assign(
+        pred_score,
+        pred_bbox,
+        anchor,
+        gt_bboxes=gt_bboxes,
+        gt_labels=gt_labels)
+    assert len(assign_result.gt_inds) == 4
+    assert len(assign_result.labels) == 4
+
+    # test empty gt (this call passes no labels, so none are built)
+    gt_bboxes = torch.empty(0, 4)
+    assign_result = self.assign(
+        pred_score, pred_bbox, anchor, gt_bboxes=gt_bboxes)
+    expected_gt_inds = torch.LongTensor([0, 0, 0, 0])
+    assert torch.all(assign_result.gt_inds == expected_gt_inds)
+
+
+def test_mask_hungarian_match_assigner():
+    """Check MaskHungarianAssigner under several cost configurations."""
+    assigner_cfg = dict(
+        cls_cost=dict(type='ClassificationCost', weight=1.0),
+        mask_cost=dict(type='FocalLossCost', weight=20.0, binary_input=True),
+        dice_cost=dict(type='DiceCost', weight=1.0, pred_act=True, eps=1.0))
+    self = MaskHungarianAssigner(**assigner_cfg)
+    cls_pred = torch.rand((10, 133))
+    mask_pred = torch.rand((10, 50, 50))
+    # test no gt masks: expect gt_inds all 0 and labels all -1
+    gt_labels = torch.empty((0, )).long()
+    gt_masks = torch.empty((0, 50, 50)).float()
+    img_meta = None
+    assign_result = self.assign(cls_pred, mask_pred, gt_labels, gt_masks,
+                                img_meta)
+    assert torch.all(assign_result.gt_inds == 0)
+    assert torch.all(assign_result.labels == -1)
+
+    # test with two gt masks; naive_dice is left at its default here
+    gt_labels = torch.LongTensor([10, 100])
+    gt_masks = torch.zeros((2, 50, 50)).long()
+    gt_masks[0, :25] = 1  # gt 0 covers rows 0-24
+    gt_masks[1, 25:] = 1  # gt 1 covers rows 25-49 (was written to gt 0)
+    assign_result = self.assign(cls_pred, mask_pred, gt_labels, gt_masks,
+                                img_meta)
+    assert torch.all(assign_result.gt_inds > -1)
+    assert (assign_result.gt_inds > 0).sum() == gt_labels.size(0)
+    assert (assign_result.labels > -1).sum() == gt_labels.size(0)
+
+    # test with cls mode (only the classification cost has non-zero weight)
+    assigner_cfg = dict(
+        cls_cost=dict(type='ClassificationCost', weight=1.0),
+        mask_cost=dict(type='FocalLossCost', weight=0.0, binary_input=True),
+        dice_cost=dict(type='DiceCost', weight=0.0, pred_act=True, eps=1.0))
+    self = MaskHungarianAssigner(**assigner_cfg)
+    assign_result = self.assign(cls_pred, mask_pred, gt_labels, gt_masks,
+                                img_meta)
+    assert torch.all(assign_result.gt_inds > -1)
+    assert (assign_result.gt_inds > 0).sum() == gt_labels.size(0)
+    assert (assign_result.labels > -1).sum() == gt_labels.size(0)
+
+    # test with mask focal mode (only the focal mask cost is weighted)
+    assigner_cfg = dict(
+        cls_cost=dict(type='ClassificationCost', weight=0.0),
+        mask_cost=dict(type='FocalLossCost', weight=1.0, binary_input=True),
+        dice_cost=dict(type='DiceCost', weight=0.0, pred_act=True, eps=1.0))
+    self = MaskHungarianAssigner(**assigner_cfg)
+    assign_result = self.assign(cls_pred, mask_pred, gt_labels, gt_masks,
+                                img_meta)
+    assert torch.all(assign_result.gt_inds > -1)
+    assert (assign_result.gt_inds > 0).sum() == gt_labels.size(0)
+    assert (assign_result.labels > -1).sum() == gt_labels.size(0)
+
+    # test with mask dice mode (only the dice cost is weighted)
+    assigner_cfg = dict(
+        cls_cost=dict(type='ClassificationCost', weight=0.0),
+        mask_cost=dict(type='FocalLossCost', weight=0.0, binary_input=True),
+        dice_cost=dict(type='DiceCost', weight=1.0, pred_act=True, eps=1.0))
+    self = MaskHungarianAssigner(**assigner_cfg)
+    assign_result = self.assign(cls_pred, mask_pred, gt_labels, gt_masks,
+                                img_meta)
+    assert torch.all(assign_result.gt_inds > -1)
+    assert (assign_result.gt_inds > 0).sum() == gt_labels.size(0)
+    assert (assign_result.labels > -1).sum() == gt_labels.size(0)
+
+    # test with mask dice mode where naive_dice is explicitly False
+    assigner_cfg = dict(
+        cls_cost=dict(type='ClassificationCost', weight=0.0),
+        mask_cost=dict(type='FocalLossCost', weight=0.0, binary_input=True),
+        dice_cost=dict(
+            type='DiceCost',
+            weight=1.0,
+            pred_act=True,
+            eps=1.0,
+            naive_dice=False))
+    self = MaskHungarianAssigner(**assigner_cfg)
+    assign_result = self.assign(cls_pred, mask_pred, gt_labels, gt_masks,
+                                img_meta)
+    assert torch.all(assign_result.gt_inds > -1)
+    assert (assign_result.gt_inds > 0).sum() == gt_labels.size(0)
+    assert (assign_result.labels > -1).sum() == gt_labels.size(0)
+
+    # test with mask bce mode (CrossEntropyLossCost with use_sigmoid=True)
+    assigner_cfg = dict(
+        cls_cost=dict(type='ClassificationCost', weight=0.0),
+        mask_cost=dict(
+            type='CrossEntropyLossCost', weight=1.0, use_sigmoid=True),
+        dice_cost=dict(type='DiceCost', weight=0.0, pred_act=True, eps=1.0))
+    self = MaskHungarianAssigner(**assigner_cfg)
+    assign_result = self.assign(cls_pred, mask_pred, gt_labels, gt_masks,
+                                img_meta)
+    assert torch.all(assign_result.gt_inds > -1)
+    assert (assign_result.gt_inds > 0).sum() == gt_labels.size(0)
+    assert (assign_result.labels > -1).sum() == gt_labels.size(0)
+
+    # test with ce mode of CrossEntropyLossCost which is not supported yet
+    assigner_cfg = dict(
+        cls_cost=dict(type='ClassificationCost', weight=0.0),
+        mask_cost=dict(
+            type='CrossEntropyLossCost', weight=1.0, use_sigmoid=False),
+        dice_cost=dict(type='DiceCost', weight=0.0, pred_act=True, eps=1.0))
+    with pytest.raises(AssertionError):
+        MaskHungarianAssigner(**assigner_cfg)
+
+
+def test_ascend_max_iou_assigner():
+    """Run AscendMaxIoUAssigner on a batch holding one image."""
+    assigner = AscendMaxIoUAssigner(pos_iou_thr=0.5, neg_iou_thr=0.5)
+
+    batch_gt_labels = torch.LongTensor([[2, 3]])
+    batch_gt_bboxes = torch.FloatTensor([[
+        [0, 0, 10, 9],
+        [0, 10, 10, 19],
+    ]])
+    batch_bboxes = torch.FloatTensor([[
+        [0, 0, 10, 10],
+        [10, 10, 20, 20],
+        [5, 5, 15, 15],
+        [32, 32, 38, 42],
+    ]])
+    batch_bboxes_ignore_mask = torch.IntTensor([[1, 1, 1, 1]])
+    result = assigner.assign(
+        batch_bboxes,
+        batch_gt_bboxes,
+        batch_gt_labels=batch_gt_labels,
+        batch_bboxes_ignore_mask=batch_bboxes_ignore_mask)
+
+    pos_mask = result.batch_pos_mask
+    want_pos_mask = torch.IntTensor([1, 0, 1, 0])
+    want_gt_inds = torch.IntTensor([0, 0, 1, 0])
+    want_gt_labels = torch.IntTensor([2, 0, 3, 0])
+
+    assert torch.all(pos_mask == want_pos_mask)
+    # Indices and labels are compared only after zeroing the entries that
+    # the positive mask excludes.
+    masked_gt_inds = result.batch_anchor_gt_indes * pos_mask
+    assert torch.all(masked_gt_inds == want_gt_inds)
+    masked_gt_labels = result.batch_anchor_gt_labels * pos_mask
+    assert torch.all(masked_gt_labels == want_gt_labels)
diff --git a/tests/test_utils/test_coder.py b/tests/test_utils/test_coder.py
new file mode 100755
index 0000000..f23649d
--- /dev/null
+++ b/tests/test_utils/test_coder.py
@@ -0,0 +1,127 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet.core.bbox.coder import (DeltaXYWHBBoxCoder, DistancePointBBoxCoder,
+                                   TBLRBBoxCoder, YOLOBBoxCoder)
+
+
+def test_yolo_bbox_coder():
+    """Decode four YOLO predictions and compare with reference boxes."""
+    grid_size = 32
+    anchors = torch.Tensor([[-42., -29., 74., 61.], [-10., -29., 106., 61.],
+                            [22., -29., 138., 61.], [54., -29., 170., 61.]])
+    preds = torch.Tensor([[0.4709, 0.6152, 0.1690, -0.4056],
+                          [0.5399, 0.6653, 0.1162, -0.4162],
+                          [0.4654, 0.6618, 0.1548, -0.4301],
+                          [0.4786, 0.6197, 0.1896, -0.4479]])
+    reference = torch.Tensor(
+        [[-53.6102, -10.3096, 83.7478, 49.6824],
+         [-15.8700, -8.3901, 114.4236, 50.9693],
+         [11.1822, -8.0924, 146.6034, 50.4476],
+         [41.2068, -8.9232, 181.4236, 48.5840]])
+    decoded = YOLOBBoxCoder().decode(anchors, preds, grid_size)
+    assert reference.allclose(decoded)
+
+
+def test_delta_bbox_coder():
+    coder = DeltaXYWHBBoxCoder()
+    # Reference case: four rois decoded with max_shape=(32, 32).
+    rois = torch.Tensor([[0., 0., 1., 1.], [0., 0., 1., 1.], [0., 0., 1., 1.],
+                         [5., 5., 5., 5.]])
+    deltas = torch.Tensor([[0., 0., 0., 0.], [1., 1., 1., 1.],
+                           [0., 0., 2., -1.], [0.7, -1.9, -0.5, 0.3]])
+    expected_decode_bboxes = torch.Tensor([[0.0000, 0.0000, 1.0000, 1.0000],
+                                           [0.1409, 0.1409, 2.8591, 2.8591],
+                                           [0.0000, 0.3161, 4.1945, 0.6839],
+                                           [5.0000, 5.0000, 5.0000, 5.0000]])
+    # max_shape is passed both as a tuple and as a Tensor.
+    out = coder.decode(rois, deltas, max_shape=(32, 32))
+    assert expected_decode_bboxes.allclose(out, atol=1e-04)
+    out = coder.decode(rois, deltas, max_shape=torch.Tensor((32, 32)))
+    assert expected_decode_bboxes.allclose(out, atol=1e-04)
+    # Batched input (two stacked copies) must match the unbatched result.
+    batch_rois = rois.unsqueeze(0).repeat(2, 1, 1)
+    batch_deltas = deltas.unsqueeze(0).repeat(2, 1, 1)
+    batch_out = coder.decode(batch_rois, batch_deltas, max_shape=(32, 32))[0]
+    assert out.allclose(batch_out)
+    batch_out = coder.decode(
+        batch_rois, batch_deltas, max_shape=[(32, 32), (32, 32)])[0]
+    assert out.allclose(batch_out)
+
+    # a max_shape list longer than the batch (3 entries vs 2) must be rejected
+    with pytest.raises(AssertionError):
+        coder.decode(
+            batch_rois, batch_deltas, max_shape=[(32, 32), (32, 32), (32, 32)])
+    # Empty rois/deltas: the output shape must equal the input shape.
+    rois = torch.zeros((0, 4))
+    deltas = torch.zeros((0, 4))
+    out = coder.decode(rois, deltas, max_shape=(32, 32))
+    assert rois.shape == out.shape
+
+    # test add_ctr_clamp (enabled here together with ctr_clamp=2)
+    coder = DeltaXYWHBBoxCoder(add_ctr_clamp=True, ctr_clamp=2)
+
+    rois = torch.Tensor([[0., 0., 6., 6.], [0., 0., 1., 1.], [0., 0., 1., 1.],
+                         [5., 5., 5., 5.]])
+    deltas = torch.Tensor([[1., 1., 2., 2.], [1., 1., 1., 1.],
+                           [0., 0., 2., -1.], [0.7, -1.9, -0.5, 0.3]])
+    expected_decode_bboxes = torch.Tensor([[0.0000, 0.0000, 27.1672, 27.1672],
+                                           [0.1409, 0.1409, 2.8591, 2.8591],
+                                           [0.0000, 0.3161, 4.1945, 0.6839],
+                                           [5.0000, 5.0000, 5.0000, 5.0000]])
+
+    out = coder.decode(rois, deltas, max_shape=(32, 32))
+    assert expected_decode_bboxes.allclose(out, atol=1e-04)
+
+
+def test_tblr_bbox_coder():
+    coder = TBLRBBoxCoder(normalizer=15.)
+    # Reference case: four rois decoded with max_shape=(13, 12).
+    rois = torch.Tensor([[0., 0., 1., 1.], [0., 0., 1., 1.], [0., 0., 1., 1.],
+                         [5., 5., 5., 5.]])
+    deltas = torch.Tensor([[0., 0., 0., 0.], [1., 1., 1., 1.],
+                           [0., 0., 2., -1.], [0.7, -1.9, -0.5, 0.3]])
+    expected_decode_bboxes = torch.Tensor([[0.5000, 0.5000, 0.5000, 0.5000],
+                                           [0.0000, 0.0000, 12.0000, 13.0000],
+                                           [0.0000, 0.5000, 0.0000, 0.5000],
+                                           [5.0000, 5.0000, 5.0000, 5.0000]])
+    # max_shape is passed both as a tuple and as a Tensor.
+    out = coder.decode(rois, deltas, max_shape=(13, 12))
+    assert expected_decode_bboxes.allclose(out)
+    out = coder.decode(rois, deltas, max_shape=torch.Tensor((13, 12)))
+    assert expected_decode_bboxes.allclose(out)
+    # Batched input (two stacked copies) must match the unbatched result.
+    batch_rois = rois.unsqueeze(0).repeat(2, 1, 1)
+    batch_deltas = deltas.unsqueeze(0).repeat(2, 1, 1)
+    batch_out = coder.decode(batch_rois, batch_deltas, max_shape=(13, 12))[0]
+    assert out.allclose(batch_out)
+    batch_out = coder.decode(
+        batch_rois, batch_deltas, max_shape=[(13, 12), (13, 12)])[0]
+    assert out.allclose(batch_out)
+
+    # a max_shape list shorter than the batch (1 entry vs 2) must be rejected
+    with pytest.raises(AssertionError):
+        coder.decode(batch_rois, batch_deltas, max_shape=[(13, 12)])
+    # Empty rois/deltas: the output shape must equal the input shape.
+    rois = torch.zeros((0, 4))
+    deltas = torch.zeros((0, 4))
+    out = coder.decode(rois, deltas, max_shape=(32, 32))
+    assert rois.shape == out.shape
+
+
+def test_distance_point_bbox_coder():
+    coder = DistancePointBBoxCoder()
+    # encode: points against gt_bboxes, compared with reference distances
+    points = torch.Tensor([[74., 61.], [-29., 106.], [138., 61.], [29., 170.]])
+    gt_bboxes = torch.Tensor([[74., 61., 75., 62.], [0., 104., 0., 112.],
+                              [100., 90., 100., 120.], [0., 120., 100., 120.]])
+    expected_distance = torch.Tensor([[0., 0., 1., 1.], [0., 2., 29., 6.],
+                                      [38., 0., 0., 50.], [29., 50., 50., 0.]])
+    out_distance = coder.encode(points, gt_bboxes, max_dis=50, eps=0)
+    assert expected_distance.allclose(out_distance)
+    # decode: with max_shape=(120, 100) these distances must yield gt_bboxes
+    distance = torch.Tensor([[0., 0, 1., 1.], [1., 2., 10., 6.],
+                             [22., -29., 138., 61.], [54., -29., 170., 61.]])
+    out_bbox = coder.decode(points, distance, max_shape=(120, 100))
+    assert gt_bboxes.allclose(out_bbox)
diff --git a/tests/test_utils/test_compat_config.py b/tests/test_utils/test_compat_config.py
new file mode 100755
index 0000000..5f8178a
--- /dev/null
+++ b/tests/test_utils/test_compat_config.py
@@ -0,0 +1,115 @@
+import pytest
+from mmcv import ConfigDict
+
+from mmdet.utils.compat_config import (compat_imgs_per_gpu, compat_loader_args,
+                                       compat_runner_args)
+
+
+def test_compat_runner_args():
+    cfg = ConfigDict(dict(total_epochs=12))
+    with pytest.warns(UserWarning) as record:  # None is rejected by pytest>=8
+        cfg = compat_runner_args(cfg)
+    assert len(record) == 1
+    assert 'runner' in record.list[0].message.args[0]
+    assert 'runner' in cfg
+    assert cfg.runner.type == 'EpochBasedRunner'
+    assert cfg.runner.max_epochs == cfg.total_epochs
+
+
+def test_compat_loader_args():
+    cfg = ConfigDict(dict(data=dict(val=dict(), test=dict(), train=dict())))
+    cfg = compat_loader_args(cfg)
+    # the three `*_dataloader` sections are filled in automatically
+    assert 'val_dataloader' in cfg.data
+    assert 'train_dataloader' in cfg.data
+    assert 'test_dataloader' in cfg.data
+    cfg = ConfigDict(
+        dict(
+            data=dict(
+                samples_per_gpu=1,
+                persistent_workers=True,
+                workers_per_gpu=1,
+                val=dict(samples_per_gpu=3),
+                test=dict(samples_per_gpu=2),
+                train=dict())))
+
+    cfg = compat_loader_args(cfg)
+    # data-level values apply unless val/test set their own samples_per_gpu
+    assert cfg.data.train_dataloader.workers_per_gpu == 1
+    assert cfg.data.train_dataloader.samples_per_gpu == 1
+    assert cfg.data.train_dataloader.persistent_workers
+    assert cfg.data.val_dataloader.workers_per_gpu == 1
+    assert cfg.data.val_dataloader.samples_per_gpu == 3
+    assert cfg.data.test_dataloader.workers_per_gpu == 1
+    assert cfg.data.test_dataloader.samples_per_gpu == 2
+
+    # `test` given as a list of dataset configs; expect samples_per_gpu == 3
+    cfg = ConfigDict(
+        dict(
+            data=dict(
+                samples_per_gpu=1,
+                persistent_workers=True,
+                workers_per_gpu=1,
+                val=dict(samples_per_gpu=3),
+                test=[dict(samples_per_gpu=2),
+                      dict(samples_per_gpu=3)],
+                train=dict())))
+
+    cfg = compat_loader_args(cfg)
+    assert cfg.data.test_dataloader.samples_per_gpu == 3
+
+    # the same loader arg must not be set in two places at the same time
+    cfg = ConfigDict(
+        dict(
+            data=dict(
+                samples_per_gpu=1,
+                persistent_workers=True,
+                workers_per_gpu=1,
+                val=dict(samples_per_gpu=3),
+                test=dict(samples_per_gpu=2),
+                train=dict(),
+                train_dataloader=dict(samples_per_gpu=2))))
+    # samples_per_gpu can not be set in `train_dataloader`
+    # and data field at the same time
+    with pytest.raises(AssertionError):
+        compat_loader_args(cfg)
+    cfg = ConfigDict(
+        dict(
+            data=dict(
+                samples_per_gpu=1,
+                persistent_workers=True,
+                workers_per_gpu=1,
+                val=dict(samples_per_gpu=3),
+                test=dict(samples_per_gpu=2),
+                train=dict(),
+                val_dataloader=dict(samples_per_gpu=2))))
+    # samples_per_gpu can not be set in `val_dataloader`
+    # and data field at the same time
+    with pytest.raises(AssertionError):
+        compat_loader_args(cfg)
+    cfg = ConfigDict(
+        dict(
+            data=dict(
+                samples_per_gpu=1,
+                persistent_workers=True,
+                workers_per_gpu=1,
+                val=dict(samples_per_gpu=3),
+                test=dict(samples_per_gpu=2),
+                test_dataloader=dict(samples_per_gpu=2))))
+    # samples_per_gpu can not be set in `test_dataloader`
+    # and data field at the same time
+    with pytest.raises(AssertionError):
+        compat_loader_args(cfg)
+
+
+def test_compat_imgs_per_gpu():
+    """With both keys set, samples_per_gpu must end up equal imgs_per_gpu."""
+    data_cfg = dict(
+        imgs_per_gpu=1,
+        samples_per_gpu=2,
+        val=dict(),
+        test=dict(),
+        train=dict())
+    cfg = compat_imgs_per_gpu(ConfigDict(dict(data=data_cfg)))
+    data = cfg.data
+    assert data.samples_per_gpu == data.imgs_per_gpu
diff --git a/tests/test_utils/test_general_data.py b/tests/test_utils/test_general_data.py
new file mode 100755
index 0000000..c5525fd
--- /dev/null
+++ b/tests/test_utils/test_general_data.py
@@ -0,0 +1,591 @@
+import copy
+
+import numpy as np
+import pytest
+import torch
+
+from mmdet.core import GeneralData, InstanceData
+
+
+def _equal(a, b):
+    """Compare ``a`` with ``b``; tensors/arrays are reduced with ``all()``."""
+    same = a == b
+    is_array = isinstance(a, (torch.Tensor, np.ndarray))
+    return same.all() if is_array else same
+
+
+def test_general_data():
+    """Exercise GeneralData init, access, immutability and conversions."""
+    # test init
+    meta_info = dict(
+        img_size=[256, 256],
+        path='dadfaff',
+        scale_factor=np.array([1.5, 1.5]),
+        img_shape=torch.rand(4))
+
+    data = dict(
+        bboxes=torch.rand(4, 4),
+        labels=torch.rand(4),
+        masks=np.random.rand(4, 2, 2))
+
+    instance_data = GeneralData(meta_info=meta_info)
+    assert 'img_size' in instance_data
+    assert instance_data.img_size == [256, 256]
+    assert instance_data['img_size'] == [256, 256]
+    assert 'path' in instance_data
+    assert instance_data.path == 'dadfaff'
+
+    # test nice_repr
+    repr_instance_data = instance_data.new(data=data)
+    nice_repr = str(repr_instance_data)
+    for line in nice_repr.split('\n'):
+        if 'masks' in line:
+            assert 'shape' in line
+            assert '(4, 2, 2)' in line
+        if 'bboxes' in line:
+            assert 'shape' in line
+            assert 'torch.Size([4, 4])' in line
+        if 'path' in line:
+            assert 'dadfaff' in line
+        if 'scale_factor' in line:
+            assert '[1.5 1.5]' in line
+
+    instance_data = GeneralData(
+        meta_info=meta_info, data=dict(bboxes=torch.rand(5)))
+    assert 'bboxes' in instance_data
+    assert len(instance_data.bboxes) == 5
+
+    # data should be a dict
+    with pytest.raises(AssertionError):
+        GeneralData(data=1)
+
+    # test set data
+    instance_data = GeneralData()
+    instance_data.set_data(data)
+    assert 'bboxes' in instance_data
+    assert len(instance_data.bboxes) == 4
+    assert 'masks' in instance_data
+    assert len(instance_data.masks) == 4
+    # data should be a dict
+    with pytest.raises(AssertionError):
+        instance_data.set_data(data=1)
+
+    # test set_meta
+    instance_data = GeneralData()
+    instance_data.set_meta_info(meta_info)
+    assert 'img_size' in instance_data
+    assert instance_data.img_size == [256, 256]
+    assert instance_data['img_size'] == [256, 256]
+    assert 'path' in instance_data
+    assert instance_data.path == 'dadfaff'
+    # can skip same value when overwrite
+    instance_data.set_meta_info(meta_info)
+
+    # meta should be a dict
+    with pytest.raises(AssertionError):
+        instance_data.set_meta_info(meta_info='fjhka')
+
+    # attribute in `_meta_info_fields` is immutable once initialized
+    instance_data.set_meta_info(meta_info)
+    # meta should be immutable
+    with pytest.raises(KeyError):
+        instance_data.set_meta_info(dict(img_size=[254, 251]))
+    with pytest.raises(KeyError):
+        duplicate_meta_info = copy.deepcopy(meta_info)
+        duplicate_meta_info['path'] = 'dada'
+        instance_data.set_meta_info(duplicate_meta_info)
+    with pytest.raises(KeyError):
+        duplicate_meta_info = copy.deepcopy(meta_info)
+        duplicate_meta_info['scale_factor'] = np.array([1.5, 1.6])
+        instance_data.set_meta_info(duplicate_meta_info)
+
+    # test new_instance_data
+    instance_data = GeneralData(meta_info)
+    new_instance_data = instance_data.new()
+    for k, v in instance_data.meta_info_items():
+        assert k in new_instance_data
+        assert _equal(v, new_instance_data[k])
+
+    instance_data = GeneralData(meta_info, data=data)
+    temp_meta = copy.deepcopy(meta_info)
+    temp_data = copy.deepcopy(data)
+    temp_data['time'] = '12212'
+    temp_meta['img_norm'] = np.random.random(3)
+
+    new_instance_data = instance_data.new(meta_info=temp_meta, data=temp_data)
+    for k, v in new_instance_data.meta_info_items():
+        if k in instance_data:
+            assert _equal(v, instance_data[k])
+        else:
+            assert _equal(v, temp_meta[k])
+            assert k == 'img_norm'
+
+    for k, v in new_instance_data.items():
+        if k in instance_data:
+            assert _equal(v, instance_data[k])
+        else:
+            assert k == 'time'
+            assert _equal(v, temp_data[k])
+
+    # test keys
+    instance_data = GeneralData(meta_info, data=dict(bboxes=10))
+    assert 'bboxes' in instance_data.keys()
+    instance_data.b = 10
+    assert 'b' in instance_data
+
+    # test meta keys
+    instance_data = GeneralData(meta_info, data=dict(bboxes=10))
+    assert 'path' in instance_data.meta_info_keys()
+    assert len(instance_data.meta_info_keys()) == len(meta_info)
+    instance_data.set_meta_info(dict(workdir='fafaf'))
+    assert 'workdir' in instance_data
+    assert len(instance_data.meta_info_keys()) == len(meta_info) + 1
+
+    # test values
+    instance_data = GeneralData(meta_info, data=dict(bboxes=10))
+    assert 10 in instance_data.values()
+    assert len(instance_data.values()) == 1
+
+    # test meta values
+    instance_data = GeneralData(meta_info, data=dict(bboxes=10))
+    # torch 1.3 eq() can not compare str and tensor
+    from mmdet import digit_version
+    if digit_version(torch.__version__) >= [1, 4]:
+        assert 'dadfaff' in instance_data.meta_info_values()
+    assert len(instance_data.meta_info_values()) == len(meta_info)
+
+    # test items
+    instance_data = GeneralData(data=data)
+    for k, v in instance_data.items():
+        assert k in data
+        assert _equal(v, data[k])
+
+    # test meta_info_items
+    instance_data = GeneralData(meta_info=meta_info)
+    for k, v in instance_data.meta_info_items():
+        assert k in meta_info
+        assert _equal(v, meta_info[k])
+
+    # test __setattr__
+    new_instance_data = GeneralData(data=data)
+    new_instance_data.mask = torch.rand(3, 4, 5)
+    new_instance_data.bboxes = torch.rand(2, 4)
+    assert 'mask' in new_instance_data
+    assert len(new_instance_data.mask) == 3
+    assert len(new_instance_data.bboxes) == 2
+
+    # test that `_data_fields` has been updated
+    assert 'mask' in new_instance_data._data_fields
+    assert 'bboxes' in new_instance_data._data_fields
+
+    for k in data:
+        assert k in new_instance_data._data_fields
+
+    # `_meta_info_fields` and `_data_fields` are immutable.
+    with pytest.raises(AttributeError):
+        new_instance_data._data_fields = None
+    with pytest.raises(AttributeError):
+        new_instance_data._meta_info_fields = None
+    with pytest.raises(AttributeError):
+        del new_instance_data._data_fields
+    with pytest.raises(AttributeError):
+        del new_instance_data._meta_info_fields
+
+    # key in `_meta_info_fields` is immutable
+    new_instance_data.set_meta_info(meta_info)
+    with pytest.raises(KeyError):
+        del new_instance_data.img_size
+    with pytest.raises(KeyError):
+        del new_instance_data.scale_factor
+    for k in new_instance_data.meta_info_keys():
+        with pytest.raises(AttributeError):
+            new_instance_data[k] = None
+
+    # test __delattr__
+    # test key can be removed from `_data_fields`
+    assert 'mask' in new_instance_data._data_fields
+    assert 'mask' in new_instance_data.keys()
+    assert 'mask' in new_instance_data
+    assert hasattr(new_instance_data, 'mask')
+    del new_instance_data.mask
+    assert 'mask' not in new_instance_data.keys()
+    assert 'mask' not in new_instance_data
+    assert 'mask' not in new_instance_data._data_fields
+    assert not hasattr(new_instance_data, 'mask')
+
+    # test __delitem__
+    new_instance_data.mask = torch.rand(1, 2, 3)
+    assert 'mask' in new_instance_data._data_fields
+    assert 'mask' in new_instance_data
+    assert hasattr(new_instance_data, 'mask')
+    del new_instance_data['mask']
+    assert 'mask' not in new_instance_data.keys()
+    assert 'mask' not in new_instance_data._data_fields
+    assert 'mask' not in new_instance_data
+    assert not hasattr(new_instance_data, 'mask')
+
+    # test __setitem__
+    new_instance_data['mask'] = torch.rand(1, 2, 3)
+    assert 'mask' in new_instance_data._data_fields
+    assert 'mask' in new_instance_data.keys()
+    assert hasattr(new_instance_data, 'mask')
+
+    # test data_fields has been updated
+    assert 'mask' in new_instance_data.keys()
+    assert 'mask' in new_instance_data._data_fields
+
+    # `_meta_info_fields` and `_data_fields` are immutable.
+    with pytest.raises(AttributeError):
+        del new_instance_data['_data_fields']
+    with pytest.raises(AttributeError):
+        del new_instance_data['_meta_info_fields']
+
+    #  test __getitem__
+    assert new_instance_data.mask is new_instance_data['mask']
+
+    # test get
+    assert new_instance_data.get('mask') is new_instance_data.mask
+    assert new_instance_data.get('none_attribute', None) is None
+    assert new_instance_data.get('none_attribute', 1) == 1
+
+    # test pop
+    mask = new_instance_data.mask
+    assert new_instance_data.pop('mask') is mask
+    assert new_instance_data.pop('mask', None) is None
+    assert new_instance_data.pop('mask', 1) == 1
+
+    # `_meta_info_fields` and `_data_fields` are immutable.
+    with pytest.raises(KeyError):
+        new_instance_data.pop('_data_fields')
+    with pytest.raises(KeyError):
+        new_instance_data.pop('_meta_info_fields')
+    # attribute in `_meta_info_fields` is immutable
+    with pytest.raises(KeyError):
+        new_instance_data.pop('img_size')
+    # test pop of an attribute held in the data fields
+    new_instance_data['mask'] = torch.rand(1, 2, 3)
+    new_instance_data.pop('mask')
+    # test `_data_fields` has been updated
+    assert 'mask' not in new_instance_data.keys()
+    assert 'mask' not in new_instance_data._data_fields
+    assert 'mask' not in new_instance_data
+
+    # test_keys
+    new_instance_data.mask = torch.ones(1, 2, 3)
+    assert 'mask' in new_instance_data.keys()
+    has_flag = False
+    for key in new_instance_data.keys():
+        if key == 'mask':
+            has_flag = True
+    assert has_flag
+
+    # test values
+    assert len(list(new_instance_data.keys())) == len(
+        list(new_instance_data.values()))
+    mask = new_instance_data.mask
+    has_flag = False
+    for value in new_instance_data.values():
+        if value is mask:
+            has_flag = True
+    assert has_flag
+
+    # test items
+    assert len(list(new_instance_data.keys())) == len(
+        list(new_instance_data.items()))
+    mask = new_instance_data.mask
+    has_flag = False
+    for key, value in new_instance_data.items():
+        if value is mask:
+            assert key == 'mask'
+            has_flag = True
+    assert has_flag
+
+    # test device
+    new_instance_data = GeneralData()
+    if torch.cuda.is_available():
+        newnew_instance_data = new_instance_data.new()
+        devices = ('cpu', 'cuda')
+        for i in range(10):
+            device = devices[i % 2]
+            newnew_instance_data[f'{i}'] = torch.rand(1, 2, 3, device=device)
+        newnew_instance_data = newnew_instance_data.cpu()
+        for value in newnew_instance_data.values():
+            assert not value.is_cuda
+        newnew_instance_data = new_instance_data.new()
+        devices = ('cuda', 'cpu')
+        for i in range(10):
+            device = devices[i % 2]
+            newnew_instance_data[f'{i}'] = torch.rand(1, 2, 3, device=device)
+        newnew_instance_data = newnew_instance_data.cuda()
+        for value in newnew_instance_data.values():
+            assert value.is_cuda
+    # test to: cast the instance that actually holds the long/bool tensors
+    double_instance_data = instance_data.new()
+    double_instance_data.long = torch.LongTensor(1, 2, 3, 4)
+    double_instance_data.bool = torch.BoolTensor(1, 2, 3, 4)
+    double_instance_data = double_instance_data.to(torch.double)
+    for k, v in double_instance_data.items():
+        if isinstance(v, torch.Tensor):
+            assert v.dtype is torch.double
+
+    # test .cpu() .cuda()
+    if torch.cuda.is_available():
+        cpu_instance_data = double_instance_data.new()
+        cpu_instance_data.mask = torch.rand(1)
+        cuda_tensor = torch.rand(1, 2, 3).cuda()
+        cuda_instance_data = cpu_instance_data.to(cuda_tensor.device)
+        for value in cuda_instance_data.values():
+            assert value.is_cuda
+        cpu_instance_data = cuda_instance_data.cpu()
+        for value in cpu_instance_data.values():
+            assert not value.is_cuda
+        cuda_instance_data = cpu_instance_data.cuda()
+        for value in cuda_instance_data.values():
+            assert value.is_cuda
+
+    # test detach
+    grad_instance_data = double_instance_data.new()
+    grad_instance_data.mask = torch.rand(2, requires_grad=True)
+    grad_instance_data.mask_1 = torch.rand(2, requires_grad=True)
+    detach_instance_data = grad_instance_data.detach()
+    for value in detach_instance_data.values():
+        assert not value.requires_grad
+
+    # test numpy
+    tensor_instance_data = double_instance_data.new()
+    tensor_instance_data.mask = torch.rand(2, requires_grad=True)
+    tensor_instance_data.mask_1 = torch.rand(2, requires_grad=True)
+    numpy_instance_data = tensor_instance_data.numpy()
+    for value in numpy_instance_data.values():
+        assert isinstance(value, np.ndarray)
+    if torch.cuda.is_available():
+        tensor_instance_data = double_instance_data.new()
+        tensor_instance_data.mask = torch.rand(2)
+        tensor_instance_data.mask_1 = torch.rand(2)
+        tensor_instance_data = tensor_instance_data.cuda()
+        numpy_instance_data = tensor_instance_data.numpy()
+        for value in numpy_instance_data.values():
+            assert isinstance(value, np.ndarray)
+
+    instance_data['_c'] = 10000
+    assert instance_data.get('dad', None) is None
+    assert hasattr(instance_data, '_c')
+    del instance_data['_c']
+    assert not hasattr(instance_data, '_c')
+    instance_data.a = 1000
+    instance_data['a'] = 2000
+    assert instance_data['a'] == 2000
+    assert instance_data.a == 2000
+    assert instance_data.get('a') == instance_data['a'] == instance_data.a
+    instance_data._meta = 1000
+    assert '_meta' in instance_data.keys()
+    if torch.cuda.is_available():
+        instance_data.bbox = torch.ones(2, 3, 4, 5).cuda()
+        instance_data.score = torch.ones(2, 3, 4, 4)
+    else:
+        instance_data.bbox = torch.ones(2, 3, 4, 5)
+
+    assert len(instance_data.new().keys()) == 0
+    with pytest.raises(AttributeError):
+        instance_data.img_size = 100
+
+    for k, v in instance_data.items():
+        if k == 'bbox':
+            assert isinstance(v, torch.Tensor)
+    assert 'a' in instance_data
+    instance_data.pop('a')
+    assert 'a' not in instance_data
+
+    cpu_instance_data = instance_data.cpu()
+    for k, v in cpu_instance_data.items():
+        if isinstance(v, torch.Tensor):
+            assert not v.is_cuda
+
+    assert isinstance(cpu_instance_data.numpy().bbox, np.ndarray)
+
+    if torch.cuda.is_available():
+        cuda_results = instance_data.cuda()
+        for k, v in cuda_results.items():
+            if isinstance(v, torch.Tensor):
+                assert v.is_cuda
+
+
+def test_instance_data():
+    meta_info = dict(
+        img_size=(256, 256),
+        path='dadfaff',
+        scale_factor=np.array([1.5, 1.5, 1, 1]))
+
+    data = dict(
+        bboxes=torch.rand(4, 4),
+        masks=torch.rand(4, 2, 2),
+        labels=np.random.rand(4),
+        size=[(i, i) for i in range(4)])
+
+    # test init
+    instance_data = InstanceData(meta_info)
+    assert 'path' in instance_data
+    instance_data = InstanceData(meta_info, data=data)
+    assert len(instance_data) == 4
+    instance_data.set_data(data)
+    assert len(instance_data) == 4
+
+    meta_info = copy.deepcopy(meta_info)
+    meta_info['img_name'] = 'flag'
+
+    # test newinstance_data
+    new_instance_data = instance_data.new(meta_info=meta_info)
+    for k, v in new_instance_data.meta_info_items():
+        if k in instance_data:
+            _equal(v, instance_data[k])
+        else:
+            assert _equal(v, meta_info[k])
+            assert k == 'img_name'
+    # meta info is immutable
+    with pytest.raises(KeyError):
+        meta_info = copy.deepcopy(meta_info)
+        meta_info['path'] = 'fdasfdsd'
+        instance_data.new(meta_info=meta_info)
+
+    # data fields should have same length
+    with pytest.raises(AssertionError):
+        temp_data = copy.deepcopy(data)
+        temp_data['bboxes'] = torch.rand(5, 4)
+        instance_data.new(data=temp_data)
+
+    temp_data = copy.deepcopy(data)
+    temp_data['scores'] = torch.rand(4)
+    new_instance_data = instance_data.new(data=temp_data)
+    for k, v in new_instance_data.items():
+        if k in instance_data:
+            _equal(v, instance_data[k])
+        else:
+            assert k == 'scores'
+            assert _equal(v, temp_data[k])
+
+    instance_data = instance_data.new()
+
+    # test __setattr__
+    # '_meta_info_field', '_data_fields' is immutable.
+    with pytest.raises(AttributeError):
+        instance_data._data_fields = dict()
+    with pytest.raises(AttributeError):
+        instance_data._data_fields = dict()
+
+    # all attribute in instance_data_field should be
+    # (torch.Tensor, np.ndarray, list))
+    with pytest.raises(AssertionError):
+        instance_data.a = 1000
+
+    # instance_data field should has same length
+    new_instance_data = instance_data.new()
+    new_instance_data.det_bbox = torch.rand(100, 4)
+    new_instance_data.det_label = torch.arange(100)
+    with pytest.raises(AssertionError):
+        new_instance_data.scores = torch.rand(101, 1)
+    new_instance_data.none = [None] * 100
+    with pytest.raises(AssertionError):
+        new_instance_data.scores = [None] * 101
+    new_instance_data.numpy_det = np.random.random([100, 1])
+    with pytest.raises(AssertionError):
+        new_instance_data.scores = np.random.random([101, 1])
+
+    # isinstance(str, slice, int, torch.LongTensor, torch.BoolTensor)
+    item = torch.Tensor([1, 2, 3, 4])
+    with pytest.raises(AssertionError):
+        new_instance_data[item]
+    len(new_instance_data[item.long()]) == 1
+
+    # when input is a bool tensor, The shape of
+    # the input at index 0 should equal to
+    # the value length in instance_data_field
+    with pytest.raises(AssertionError):
+        new_instance_data[item.bool()]
+
+    for i in range(len(new_instance_data)):
+        assert new_instance_data[i].det_label == i
+        assert len(new_instance_data[i]) == 1
+
+    # assert the index should in 0 ~ len(instance_data) -1
+    with pytest.raises(IndexError):
+        new_instance_data[101]
+
+    # assert the index should not be an empty tensor
+    new_new_instance_data = new_instance_data.new()
+    with pytest.raises(AssertionError):
+        new_new_instance_data[0]
+
+    # test str
+    with pytest.raises(AssertionError):
+        instance_data.img_size_dummmy = meta_info['img_size']
+
+    # test slice
+    ten_ressults = new_instance_data[:10]
+    len(ten_ressults) == 10
+    for v in ten_ressults.values():
+        assert len(v) == 10
+
+    # test Longtensor
+    long_tensor = torch.randint(100, (50, ))
+    long_index_instance_data = new_instance_data[long_tensor]
+    assert len(long_index_instance_data) == len(long_tensor)
+    for key, value in long_index_instance_data.items():
+        if not isinstance(value, list):
+            assert (long_index_instance_data[key] == new_instance_data[key]
+                    [long_tensor]).all()
+        else:
+            len(long_tensor) == len(value)
+
+    # test bool tensor
+    bool_tensor = torch.rand(100) > 0.5
+    bool_index_instance_data = new_instance_data[bool_tensor]
+    assert len(bool_index_instance_data) == bool_tensor.sum()
+    for key, value in bool_index_instance_data.items():
+        if not isinstance(value, list):
+            assert (bool_index_instance_data[key] == new_instance_data[key]
+                    [bool_tensor]).all()
+        else:
+            assert len(value) == bool_tensor.sum()
+
+    num_instance = 1000
+    instance_data_list = []
+
+    # assert len(instance_lists) > 0
+    with pytest.raises(AssertionError):
+        instance_data.cat(instance_data_list)
+
+    for _ in range(2):
+        instance_data['bbox'] = torch.rand(num_instance, 4)
+        instance_data['label'] = torch.rand(num_instance, 1)
+        instance_data['mask'] = torch.rand(num_instance, 224, 224)
+        instance_data['instances_infos'] = [1] * num_instance
+        instance_data['cpu_bbox'] = np.random.random((num_instance, 4))
+        if torch.cuda.is_available():
+            instance_data.cuda_tensor = torch.rand(num_instance).cuda()
+            assert instance_data.cuda_tensor.is_cuda
+            cuda_instance_data = instance_data.cuda()
+            assert cuda_instance_data.cuda_tensor.is_cuda
+
+        assert len(instance_data[0]) == 1
+        with pytest.raises(IndexError):
+            return instance_data[num_instance + 1]
+        with pytest.raises(AssertionError):
+            instance_data.centerness = torch.rand(num_instance + 1, 1)
+
+        mask_tensor = torch.rand(num_instance) > 0.5
+        length = mask_tensor.sum()
+        assert len(instance_data[mask_tensor]) == length
+
+        index_tensor = torch.LongTensor([1, 5, 8, 110, 399])
+        length = len(index_tensor)
+
+        assert len(instance_data[index_tensor]) == length
+
+        instance_data_list.append(instance_data)
+
+    cat_resutls = InstanceData.cat(instance_data_list)
+    assert len(cat_resutls) == num_instance * 2
+
+    instances = InstanceData(data=dict(bboxes=torch.rand(4, 4)))
+    # cat only single instance
+    assert len(InstanceData.cat([instances])) == 4
diff --git a/tests/test_utils/test_hook.py b/tests/test_utils/test_hook.py
new file mode 100755
index 0000000..49cd5ca
--- /dev/null
+++ b/tests/test_utils/test_hook.py
@@ -0,0 +1,415 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+import shutil
+import sys
+import tempfile
+from unittest.mock import MagicMock, Mock, call, patch
+
+import numpy as np
+import pytest
+import torch
+import torch.nn as nn
+from mmcv.runner import (CheckpointHook, IterTimerHook, PaviLoggerHook,
+                         build_runner)
+from torch.nn.init import constant_
+from torch.utils.data import DataLoader, Dataset
+
+from mmdet.core.hook import ExpMomentumEMAHook, YOLOXLrUpdaterHook
+from mmdet.core.hook.sync_norm_hook import SyncNormHook
+from mmdet.core.hook.sync_random_size_hook import SyncRandomSizeHook
+
+
+def _build_demo_runner_without_hook(runner_type='EpochBasedRunner',
+                                    max_epochs=1,
+                                    max_iters=None,
+                                    multi_optimziers=False):
+
+    class Model(nn.Module):
+
+        def __init__(self):
+            super().__init__()
+            self.linear = nn.Linear(2, 1)
+            self.conv = nn.Conv2d(3, 3, 3)
+
+        def forward(self, x):
+            return self.linear(x)
+
+        def train_step(self, x, optimizer, **kwargs):
+            return dict(loss=self(x))
+
+        def val_step(self, x, optimizer, **kwargs):
+            return dict(loss=self(x))
+
+    model = Model()
+
+    if multi_optimziers:
+        optimizer = {
+            'model1':
+            torch.optim.SGD(model.linear.parameters(), lr=0.02, momentum=0.95),
+            'model2':
+            torch.optim.SGD(model.conv.parameters(), lr=0.01, momentum=0.9),
+        }
+    else:
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.02, momentum=0.95)
+
+    tmp_dir = tempfile.mkdtemp()
+    runner = build_runner(
+        dict(type=runner_type),
+        default_args=dict(
+            model=model,
+            work_dir=tmp_dir,
+            optimizer=optimizer,
+            logger=logging.getLogger(),
+            max_epochs=max_epochs,
+            max_iters=max_iters))
+    return runner
+
+
+def _build_demo_runner(runner_type='EpochBasedRunner',
+                       max_epochs=1,
+                       max_iters=None,
+                       multi_optimziers=False):
+    log_config = dict(
+        interval=1, hooks=[
+            dict(type='TextLoggerHook'),
+        ])
+
+    runner = _build_demo_runner_without_hook(runner_type, max_epochs,
+                                             max_iters, multi_optimziers)
+
+    runner.register_checkpoint_hook(dict(interval=1))
+    runner.register_logger_hooks(log_config)
+    return runner
+
+
+@pytest.mark.parametrize('multi_optimziers', (True, False))
+def test_yolox_lrupdater_hook(multi_optimziers):
+    """Run ``YOLOXLrUpdaterHook`` for one epoch and check the logged scalars.
+
+    The learning-rate and momentum values received by the mocked pavi
+    writer are compared with hard-coded expectations, both for a single
+    optimizer and for a dict of two optimizers.
+
+    xdoctest -m tests/test_utils/test_hook.py test_yolox_lrupdater_hook
+    """
+    # Only used to prevent program errors
+    YOLOXLrUpdaterHook(0, min_lr_ratio=0.05)
+
+    # Replace the `pavi` module with a mock; presumably this lets
+    # PaviLoggerHook run without the real package installed.
+    # NOTE(review): the mock is never removed from sys.modules, so it stays
+    # in place for tests that run afterwards.
+    sys.modules['pavi'] = MagicMock()
+    # 10 samples with the default batch size of 1.
+    loader = DataLoader(torch.ones((10, 2)))
+    runner = _build_demo_runner(multi_optimziers=multi_optimziers)
+
+    hook_cfg = dict(
+        type='YOLOXLrUpdaterHook',
+        warmup='exp',
+        by_epoch=False,
+        warmup_by_epoch=True,
+        warmup_ratio=1,
+        warmup_iters=5,  # 5 epoch
+        num_last_epochs=15,
+        min_lr_ratio=0.05)
+    runner.register_hook_from_cfg(hook_cfg)
+    # IterTimerHook is registered twice: once from a config dict and once
+    # as an instance.
+    runner.register_hook_from_cfg(dict(type='IterTimerHook'))
+    runner.register_hook(IterTimerHook())
+
+    # add pavi hook
+    hook = PaviLoggerHook(interval=1, add_graph=False, add_last_ckpt=True)
+    runner.register_hook(hook)
+    runner.run([loader], [('train', 1)])
+    # The temporary work dir is only removed when the run succeeds.
+    shutil.rmtree(runner.work_dir)
+
+    # TODO: use a more elegant way to check values
+    assert hasattr(hook, 'writer')
+    # Expected `add_scalars` calls at steps 1, 7 and 10.
+    if multi_optimziers:
+        calls = [
+            call(
+                'train', {
+                    'learning_rate/model1': 8.000000000000001e-06,
+                    'learning_rate/model2': 4.000000000000001e-06,
+                    'momentum/model1': 0.95,
+                    'momentum/model2': 0.9
+                }, 1),
+            call(
+                'train', {
+                    'learning_rate/model1': 0.00039200000000000004,
+                    'learning_rate/model2': 0.00019600000000000002,
+                    'momentum/model1': 0.95,
+                    'momentum/model2': 0.9
+                }, 7),
+            call(
+                'train', {
+                    'learning_rate/model1': 0.0008000000000000001,
+                    'learning_rate/model2': 0.0004000000000000001,
+                    'momentum/model1': 0.95,
+                    'momentum/model2': 0.9
+                }, 10)
+        ]
+    else:
+        calls = [
+            call('train', {
+                'learning_rate': 8.000000000000001e-06,
+                'momentum': 0.95
+            }, 1),
+            call('train', {
+                'learning_rate': 0.00039200000000000004,
+                'momentum': 0.95
+            }, 7),
+            call('train', {
+                'learning_rate': 0.0008000000000000001,
+                'momentum': 0.95
+            }, 10)
+        ]
+    # any_order=True: only the presence of the calls is checked.
+    hook.writer.add_scalars.assert_has_calls(calls, any_order=True)
+
+
+def test_ema_hook():
+    """Train with ``ExpMomentumEMAHook``, then resume from its checkpoint.
+
+    A first run saves ``epoch_1.pth``; its EMA entries are overwritten with
+    ones and a second run resumes from that file. The test checks the
+    number of EMA entries in both checkpoints and their values in the
+    second one.
+
+    xdoctest -m tests/test_utils/test_hook.py test_ema_hook
+    """
+
+    class DemoModel(nn.Module):
+
+        def __init__(self):
+            super().__init__()
+            self.conv = nn.Conv2d(
+                in_channels=1,
+                out_channels=2,
+                kernel_size=1,
+                padding=1,
+                bias=True)
+            self.bn = nn.BatchNorm2d(2)
+
+            self._init_weight()
+
+        def _init_weight(self):
+            # Start from all-zero weights and biases.
+            constant_(self.conv.weight, 0)
+            constant_(self.conv.bias, 0)
+            constant_(self.bn.weight, 0)
+            constant_(self.bn.bias, 0)
+
+        def forward(self, x):
+            return self.bn(self.conv(x)).sum()
+
+        def train_step(self, x, optimizer, **kwargs):
+            return dict(loss=self(x))
+
+        def val_step(self, x, optimizer, **kwargs):
+            return dict(loss=self(x))
+
+    loader = DataLoader(torch.ones((1, 1, 1, 1)))
+    runner = _build_demo_runner()
+    demo_model = DemoModel()
+    # Swap the runner's default toy model for the conv+bn model.
+    runner.model = demo_model
+    ema_hook = ExpMomentumEMAHook(
+        momentum=0.0002,
+        total_iter=1,
+        skip_buffers=True,
+        interval=2,
+        resume_from=None)
+    checkpointhook = CheckpointHook(interval=1, by_epoch=True)
+    runner.register_hook(ema_hook, priority='HIGHEST')
+    runner.register_hook(checkpointhook)
+    runner.run([loader, loader], [('train', 1), ('val', 1)])
+    checkpoint = torch.load(f'{runner.work_dir}/epoch_1.pth')
+    num_eam_params = 0
+    for name, value in checkpoint['state_dict'].items():
+        if 'ema' in name:
+            num_eam_params += 1
+            # Overwrite the EMA entry with ones; the resumed run below
+            # checks that these values are still present.
+            value.fill_(1)
+    # Presumably conv weight/bias and bn weight/bias - TODO confirm.
+    assert num_eam_params == 4
+    torch.save(checkpoint, f'{runner.work_dir}/epoch_1.pth')
+
+    # Keep the first work dir: `runner` is rebound below.
+    work_dir = runner.work_dir
+    resume_ema_hook = ExpMomentumEMAHook(
+        momentum=0.5,
+        total_iter=10,
+        skip_buffers=True,
+        interval=1,
+        resume_from=f'{work_dir}/epoch_1.pth')
+    runner = _build_demo_runner(max_epochs=2)
+    runner.model = demo_model
+    runner.register_hook(resume_ema_hook, priority='HIGHEST')
+    checkpointhook = CheckpointHook(interval=1, by_epoch=True)
+    runner.register_hook(checkpointhook)
+    runner.run([loader, loader], [('train', 1), ('val', 1)])
+    checkpoint = torch.load(f'{runner.work_dir}/epoch_2.pth')
+    num_eam_params = 0
+    desired_output = [0.9094, 0.9094]
+    for name, value in checkpoint['state_dict'].items():
+        if 'ema' in name:
+            num_eam_params += 1
+            # Each EMA entry holds two elements, so a sum of 2 means the
+            # ones written above are unchanged.
+            assert value.sum() == 2
+        else:
+            if ('weight' in name) or ('bias' in name):
+                # NOTE(review): the result of np.allclose is discarded, so
+                # this comparison can never fail the test. Confirm the
+                # expected values before turning it into an assert.
+                np.allclose(value.data.cpu().numpy().reshape(-1),
+                            desired_output, 1e-4)
+    assert num_eam_params == 4
+    # Remove both temporary work dirs (only reached when the test passes).
+    shutil.rmtree(runner.work_dir)
+    shutil.rmtree(work_dir)
+
+
+def test_sync_norm_hook():
+    # Only used to prevent program errors
+    SyncNormHook()
+
+    loader = DataLoader(torch.ones((5, 2)))
+    runner = _build_demo_runner()
+    runner.register_hook_from_cfg(dict(type='SyncNormHook'))
+    runner.run([loader, loader], [('train', 1), ('val', 1)])
+    shutil.rmtree(runner.work_dir)
+
+
+def test_sync_random_size_hook():
+    # Only used to prevent program errors
+    SyncRandomSizeHook()
+
+    class DemoDataset(Dataset):
+
+        def __getitem__(self, item):
+            return torch.ones(2)
+
+        def __len__(self):
+            return 5
+
+        def update_dynamic_scale(self, dynamic_scale):
+            pass
+
+    loader = DataLoader(DemoDataset())
+    runner = _build_demo_runner()
+    runner.register_hook_from_cfg(
+        dict(type='SyncRandomSizeHook', device='cpu'))
+    runner.run([loader, loader], [('train', 1), ('val', 1)])
+    shutil.rmtree(runner.work_dir)
+
+    if torch.cuda.is_available():
+        runner = _build_demo_runner()
+        runner.register_hook_from_cfg(
+            dict(type='SyncRandomSizeHook', device='cuda'))
+        runner.run([loader, loader], [('train', 1), ('val', 1)])
+        shutil.rmtree(runner.work_dir)
+
+
+@pytest.mark.parametrize('set_loss', [
+    dict(set_loss_nan=False, set_loss_inf=False),
+    dict(set_loss_nan=True, set_loss_inf=False),
+    dict(set_loss_nan=False, set_loss_inf=True)
+])
+def test_check_invalid_loss_hook(set_loss):
+    # Check whether loss is valid during training.
+
+    class DemoModel(nn.Module):
+
+        def __init__(self, set_loss_nan=False, set_loss_inf=False):
+            super().__init__()
+            self.set_loss_nan = set_loss_nan
+            self.set_loss_inf = set_loss_inf
+            self.linear = nn.Linear(2, 1)
+
+        def forward(self, x):
+            return self.linear(x)
+
+        def train_step(self, x, optimizer, **kwargs):
+            if self.set_loss_nan:
+                return dict(loss=torch.tensor(float('nan')))
+            elif self.set_loss_inf:
+                return dict(loss=torch.tensor(float('inf')))
+            else:
+                return dict(loss=self(x))
+
+    loader = DataLoader(torch.ones((5, 2)))
+    runner = _build_demo_runner()
+
+    demo_model = DemoModel(**set_loss)
+    runner.model = demo_model
+    runner.register_hook_from_cfg(
+        dict(type='CheckInvalidLossHook', interval=1))
+    if not set_loss['set_loss_nan'] \
+            and not set_loss['set_loss_inf']:
+        # check loss is valid
+        runner.run([loader], [('train', 1)])
+    else:
+        # check loss is nan or inf
+        with pytest.raises(AssertionError):
+            runner.run([loader], [('train', 1)])
+    shutil.rmtree(runner.work_dir)
+
+
+def test_set_epoch_info_hook():
+    """Test SetEpochInfoHook."""
+
+    class DemoModel(nn.Module):
+
+        def __init__(self):
+            super().__init__()
+            self.epoch = 0
+            self.linear = nn.Linear(2, 1)
+
+        def forward(self, x):
+            return self.linear(x)
+
+        def train_step(self, x, optimizer, **kwargs):
+            return dict(loss=self(x))
+
+        def set_epoch(self, epoch):
+            self.epoch = epoch
+
+    loader = DataLoader(torch.ones((5, 2)))
+    runner = _build_demo_runner(max_epochs=3)
+
+    demo_model = DemoModel()
+    runner.model = demo_model
+    runner.register_hook_from_cfg(dict(type='SetEpochInfoHook'))
+    runner.run([loader], [('train', 1)])
+    assert demo_model.epoch == 2
+
+
+def test_memory_profiler_hook():
+    from collections import namedtuple
+
+    # test ImportError without psutil and memory_profiler
+    with pytest.raises(ImportError):
+        from mmdet.core.hook import MemoryProfilerHook
+        MemoryProfilerHook(1)
+
+    # test ImportError without memory_profiler
+    sys.modules['psutil'] = MagicMock()
+    with pytest.raises(ImportError):
+        from mmdet.core.hook import MemoryProfilerHook
+        MemoryProfilerHook(1)
+
+    sys.modules['memory_profiler'] = MagicMock()
+
+    def _mock_virtual_memory():
+        virtual_memory_type = namedtuple(
+            'virtual_memory', ['total', 'available', 'percent', 'used'])
+        return virtual_memory_type(
+            total=270109085696,
+            available=250416816128,
+            percent=7.3,
+            used=17840881664)
+
+    def _mock_swap_memory():
+        swap_memory_type = namedtuple('swap_memory', [
+            'total',
+            'used',
+            'percent',
+        ])
+        return swap_memory_type(total=8589930496, used=0, percent=0.0)
+
+    def _mock_memory_usage():
+        return [40.22265625]
+
+    mock_virtual_memory = Mock(return_value=_mock_virtual_memory())
+    mock_swap_memory = Mock(return_value=_mock_swap_memory())
+    mock_memory_usage = Mock(return_value=_mock_memory_usage())
+
+    @patch('psutil.swap_memory', mock_swap_memory)
+    @patch('psutil.virtual_memory', mock_virtual_memory)
+    @patch('memory_profiler.memory_usage', mock_memory_usage)
+    def _test_memory_profiler_hook():
+        from mmdet.core.hook import MemoryProfilerHook
+        hook = MemoryProfilerHook(1)
+        runner = _build_demo_runner()
+
+        assert not mock_memory_usage.called
+        assert not mock_swap_memory.called
+        assert not mock_memory_usage.called
+
+        hook.after_iter(runner)
+
+        assert mock_memory_usage.called
+        assert mock_swap_memory.called
+        assert mock_memory_usage.called
+
+    _test_memory_profiler_hook()
diff --git a/tests/test_utils/test_layer_decay_optimizer_constructor.py b/tests/test_utils/test_layer_decay_optimizer_constructor.py
new file mode 100755
index 0000000..674f665
--- /dev/null
+++ b/tests/test_utils/test_layer_decay_optimizer_constructor.py
@@ -0,0 +1,164 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+
+from mmdet.core.optimizers import LearningRateDecayOptimizerConstructor
+
+# Learning rate put into the optimizer config; check_optimizer_lr_wd also
+# expects it in optimizer.defaults['lr'].
+base_lr = 1
+# Passed as `decay_rate` in the paramwise config of the constructor.
+decay_rate = 2
+# Weight decay check_optimizer_lr_wd expects in optimizer.defaults.
+base_wd = 0.05
+# NOTE(review): not referenced anywhere in this file; the optimizer config
+# in the test hard-codes weight_decay=0.05 instead.
+weight_decay = 0.05
+
+expected_stage_wise_lr_wd_convnext = [{
+    'weight_decay': 0.0,
+    'lr_scale': 128
+}, {
+    'weight_decay': 0.0,
+    'lr_scale': 1
+}, {
+    'weight_decay': 0.05,
+    'lr_scale': 64
+}, {
+    'weight_decay': 0.0,
+    'lr_scale': 64
+}, {
+    'weight_decay': 0.05,
+    'lr_scale': 32
+}, {
+    'weight_decay': 0.0,
+    'lr_scale': 32
+}, {
+    'weight_decay': 0.05,
+    'lr_scale': 16
+}, {
+    'weight_decay': 0.0,
+    'lr_scale': 16
+}, {
+    'weight_decay': 0.05,
+    'lr_scale': 8
+}, {
+    'weight_decay': 0.0,
+    'lr_scale': 8
+}, {
+    'weight_decay': 0.05,
+    'lr_scale': 128
+}, {
+    'weight_decay': 0.05,
+    'lr_scale': 1
+}]
+
+expected_layer_wise_lr_wd_convnext = [{
+    'weight_decay': 0.0,
+    'lr_scale': 128
+}, {
+    'weight_decay': 0.0,
+    'lr_scale': 1
+}, {
+    'weight_decay': 0.05,
+    'lr_scale': 64
+}, {
+    'weight_decay': 0.0,
+    'lr_scale': 64
+}, {
+    'weight_decay': 0.05,
+    'lr_scale': 32
+}, {
+    'weight_decay': 0.0,
+    'lr_scale': 32
+}, {
+    'weight_decay': 0.05,
+    'lr_scale': 16
+}, {
+    'weight_decay': 0.0,
+    'lr_scale': 16
+}, {
+    'weight_decay': 0.05,
+    'lr_scale': 2
+}, {
+    'weight_decay': 0.0,
+    'lr_scale': 2
+}, {
+    'weight_decay': 0.05,
+    'lr_scale': 128
+}, {
+    'weight_decay': 0.05,
+    'lr_scale': 1
+}]
+
+
+class ToyConvNeXt(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.stages = nn.ModuleList()
+        for i in range(4):
+            stage = nn.Sequential(ConvModule(3, 4, kernel_size=1, bias=True))
+            self.stages.append(stage)
+        self.norm0 = nn.BatchNorm2d(2)
+
+        # add some variables to meet unit test coverate rate
+        self.cls_token = nn.Parameter(torch.ones(1))
+        self.mask_token = nn.Parameter(torch.ones(1))
+        self.pos_embed = nn.Parameter(torch.ones(1))
+        self.stem_norm = nn.Parameter(torch.ones(1))
+        self.downsample_norm0 = nn.BatchNorm2d(2)
+        self.downsample_norm1 = nn.BatchNorm2d(2)
+        self.downsample_norm2 = nn.BatchNorm2d(2)
+        self.lin = nn.Parameter(torch.ones(1))
+        self.lin.requires_grad = False
+        self.downsample_layers = nn.ModuleList()
+        for _ in range(4):
+            stage = nn.Sequential(nn.Conv2d(3, 4, kernel_size=1, bias=True))
+            self.downsample_layers.append(stage)
+
+
+class ToyDetector(nn.Module):
+
+    def __init__(self, backbone):
+        super().__init__()
+        self.backbone = backbone
+        self.head = nn.Conv2d(2, 2, kernel_size=1, groups=2)
+
+
+class PseudoDataParallel(nn.Module):
+
+    def __init__(self, model):
+        super().__init__()
+        self.module = model
+
+
+def check_optimizer_lr_wd(optimizer, gt_lr_wd):
+    assert isinstance(optimizer, torch.optim.AdamW)
+    assert optimizer.defaults['lr'] == base_lr
+    assert optimizer.defaults['weight_decay'] == base_wd
+    param_groups = optimizer.param_groups
+    print(param_groups)
+    assert len(param_groups) == len(gt_lr_wd)
+    for i, param_dict in enumerate(param_groups):
+        assert param_dict['weight_decay'] == gt_lr_wd[i]['weight_decay']
+        assert param_dict['lr_scale'] == gt_lr_wd[i]['lr_scale']
+        assert param_dict['lr_scale'] == param_dict['lr']
+
+
+def test_learning_rate_decay_optimizer_constructor():
+
+    # Test lr wd for ConvNeXT
+    backbone = ToyConvNeXt()
+    model = PseudoDataParallel(ToyDetector(backbone))
+    optimizer_cfg = dict(
+        type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05)
+    # stagewise decay
+    stagewise_paramwise_cfg = dict(
+        decay_rate=decay_rate, decay_type='stage_wise', num_layers=6)
+    optim_constructor = LearningRateDecayOptimizerConstructor(
+        optimizer_cfg, stagewise_paramwise_cfg)
+    optimizer = optim_constructor(model)
+    check_optimizer_lr_wd(optimizer, expected_stage_wise_lr_wd_convnext)
+    # layerwise decay
+    layerwise_paramwise_cfg = dict(
+        decay_rate=decay_rate, decay_type='layer_wise', num_layers=6)
+    optim_constructor = LearningRateDecayOptimizerConstructor(
+        optimizer_cfg, layerwise_paramwise_cfg)
+    optimizer = optim_constructor(model)
+    check_optimizer_lr_wd(optimizer, expected_layer_wise_lr_wd_convnext)
diff --git a/tests/test_utils/test_logger.py b/tests/test_utils/test_logger.py
new file mode 100755
index 0000000..900d6b6
--- /dev/null
+++ b/tests/test_utils/test_logger.py
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+
+from mmdet.utils import get_caller_name, log_img_scale
+
+
+def callee_func():
+    caller_name = get_caller_name()
+    return caller_name
+
+
+class CallerClassForTest:
+    """Helper that records the caller name seen from inside ``__init__``."""
+
+    def __init__(self):
+        # callee_func must be called directly from __init__: the test
+        # expects the recorded name to be 'CallerClassForTest.__init__'.
+        self.caller_name = callee_func()
+
+
+def test_get_caller_name():
+    # test the case that caller is a function
+    caller_name = callee_func()
+    assert caller_name == 'test_get_caller_name'
+
+    # test the case that caller is a method in a class
+    caller_class = CallerClassForTest()
+    assert caller_class.caller_name == 'CallerClassForTest.__init__'
+
+
+def test_log_img_scale():
+    img_scale = (800, 1333)
+    done_logging = log_img_scale(img_scale)
+    assert done_logging
+
+    img_scale = (1333, 800)
+    done_logging = log_img_scale(img_scale, shape_order='wh')
+    assert done_logging
+
+    with pytest.raises(ValueError):
+        img_scale = (1333, 800)
+        done_logging = log_img_scale(img_scale, shape_order='xywh')
+
+    img_scale = (640, 640)
+    done_logging = log_img_scale(img_scale, skip_square=False)
+    assert done_logging
+
+    img_scale = (640, 640)
+    done_logging = log_img_scale(img_scale, skip_square=True)
+    assert not done_logging
diff --git a/tests/test_utils/test_masks.py b/tests/test_utils/test_masks.py
new file mode 100755
index 0000000..226ca61
--- /dev/null
+++ b/tests/test_utils/test_masks.py
@@ -0,0 +1,713 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+from mmdet.core import BitmapMasks, PolygonMasks, mask2bbox
+
+
+def dummy_raw_bitmap_masks(size):
+    """
+    Args:
+        size (tuple): expected shape of dummy masks, (H, W) or (N, H, W)
+
+    Return:
+        ndarray: dummy mask
+    """
+    return np.random.randint(0, 2, size, dtype=np.uint8)
+
+
+def dummy_raw_polygon_masks(size):
+    """
+    Args:
+        size (tuple): expected shape of dummy masks, (N, H, W)
+
+    Return:
+        list[list[ndarray]]: dummy mask
+    """
+    num_obj, height, width = size
+    polygons = []
+    for _ in range(num_obj):
+        num_points = np.random.randint(5) * 2 + 6
+        polygons.append([np.random.uniform(0, min(height, width), num_points)])
+    return polygons
+
+
+def dummy_bboxes(num, max_height, max_width):
+    x1y1 = np.random.randint(0, min(max_height // 2, max_width // 2), (num, 2))
+    wh = np.random.randint(0, min(max_height // 2, max_width // 2), (num, 2))
+    x2y2 = x1y1 + wh
+    return np.concatenate([x1y1, x2y2], axis=1).squeeze().astype(np.float32)
+
+
+def test_bitmap_mask_init():
+    # init with empty ndarray masks
+    raw_masks = np.empty((0, 28, 28), dtype=np.uint8)
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    assert len(bitmap_masks) == 0
+    assert bitmap_masks.height == 28
+    assert bitmap_masks.width == 28
+
+    # init with empty list masks
+    raw_masks = []
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    assert len(bitmap_masks) == 0
+    assert bitmap_masks.height == 28
+    assert bitmap_masks.width == 28
+
+    # init with ndarray masks contain 3 instances
+    raw_masks = dummy_raw_bitmap_masks((3, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    assert len(bitmap_masks) == 3
+    assert bitmap_masks.height == 28
+    assert bitmap_masks.width == 28
+
+    # init with list masks contain 3 instances
+    raw_masks = [dummy_raw_bitmap_masks((28, 28)) for _ in range(3)]
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    assert len(bitmap_masks) == 3
+    assert bitmap_masks.height == 28
+    assert bitmap_masks.width == 28
+
+    # init with raw masks of unsupported type
+    with pytest.raises(AssertionError):
+        raw_masks = [[dummy_raw_bitmap_masks((28, 28))]]
+        BitmapMasks(raw_masks, 28, 28)
+
+
+def test_bitmap_mask_rescale():
+    # rescale with empty bitmap masks
+    raw_masks = dummy_raw_bitmap_masks((0, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    rescaled_masks = bitmap_masks.rescale((56, 72))
+    assert len(rescaled_masks) == 0
+    assert rescaled_masks.height == 56
+    assert rescaled_masks.width == 56
+
+    # rescale with bitmap masks contain 1 instances
+    raw_masks = np.array([[[1, 0, 0, 0], [0, 1, 0, 1]]])
+    bitmap_masks = BitmapMasks(raw_masks, 2, 4)
+    rescaled_masks = bitmap_masks.rescale((8, 8))
+    assert len(rescaled_masks) == 1
+    assert rescaled_masks.height == 4
+    assert rescaled_masks.width == 8
+    truth = np.array([[[1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0],
+                       [0, 0, 1, 1, 0, 0, 1, 1], [0, 0, 1, 1, 0, 0, 1, 1]]])
+    assert (rescaled_masks.masks == truth).all()
+
+
+def test_bitmap_mask_resize():
+    # resize with empty bitmap masks
+    raw_masks = dummy_raw_bitmap_masks((0, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    resized_masks = bitmap_masks.resize((56, 72))
+    assert len(resized_masks) == 0
+    assert resized_masks.height == 56
+    assert resized_masks.width == 72
+
+    # resize with bitmap masks contain 1 instances
+    raw_masks = np.diag(np.ones(4, dtype=np.uint8))[np.newaxis, ...]
+    bitmap_masks = BitmapMasks(raw_masks, 4, 4)
+    resized_masks = bitmap_masks.resize((8, 8))
+    assert len(resized_masks) == 1
+    assert resized_masks.height == 8
+    assert resized_masks.width == 8
+    truth = np.array([[[1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0],
+                       [0, 0, 1, 1, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0],
+                       [0, 0, 0, 0, 1, 1, 0, 0], [0, 0, 0, 0, 1, 1, 0, 0],
+                       [0, 0, 0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 0, 1, 1]]])
+    assert (resized_masks.masks == truth).all()
+
+    # resize to non-square
+    raw_masks = np.diag(np.ones(4, dtype=np.uint8))[np.newaxis, ...]
+    bitmap_masks = BitmapMasks(raw_masks, 4, 4)
+    resized_masks = bitmap_masks.resize((4, 8))
+    assert len(resized_masks) == 1
+    assert resized_masks.height == 4
+    assert resized_masks.width == 8
+    truth = np.array([[[1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0],
+                       [0, 0, 0, 0, 1, 1, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1]]])
+    assert (resized_masks.masks == truth).all()
+
+
+def test_bitmap_mask_get_bboxes():
+    # get bboxes with empty bitmap masks
+    raw_masks = dummy_raw_bitmap_masks((0, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    bboxes = bitmap_masks.get_bboxes()
+    assert len(bboxes) == 0
+
+    # get bboxes with bitmap masks containing 1 instance (8x8 mask)
+    raw_masks = np.array([[[0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0, 0, 0],
+                           [0, 0, 1, 1, 0, 0, 0, 0], [0, 0, 1, 1, 1, 0, 0, 0],
+                           [0, 0, 1, 1, 1, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0],
+                           [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0,
+                                                      0]]])
+    bitmap_masks = BitmapMasks(raw_masks, 8, 8)
+    bboxes = bitmap_masks.get_bboxes()
+    assert len(bboxes) == 1
+    truth = np.array([[1, 1, 6, 6]])
+    assert (bboxes == truth).all()
+
+    # get bboxes with a non-square bitmap mask (height 4, width 8)
+    raw_masks = np.array([[[1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0],
+                           [0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0,
+                                                      0]]])
+    bitmap_masks = BitmapMasks(raw_masks, 4, 8)
+    bboxes = bitmap_masks.get_bboxes()
+    truth = np.array([[0, 0, 6, 3]])
+    assert (bboxes == truth).all()
+
+
+def test_bitmap_mask_flip():
+    # flip with empty bitmap masks
+    raw_masks = dummy_raw_bitmap_masks((0, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    flipped_masks = bitmap_masks.flip(flip_direction='horizontal')
+    assert len(flipped_masks) == 0
+    assert flipped_masks.height == 28
+    assert flipped_masks.width == 28
+
+    # horizontally flip with bitmap masks contain 3 instances
+    raw_masks = dummy_raw_bitmap_masks((3, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    flipped_masks = bitmap_masks.flip(flip_direction='horizontal')
+    flipped_flipped_masks = flipped_masks.flip(flip_direction='horizontal')
+    assert flipped_masks.masks.shape == (3, 28, 28)
+    assert (bitmap_masks.masks == flipped_flipped_masks.masks).all()
+    assert (flipped_masks.masks == raw_masks[:, :, ::-1]).all()
+
+    # vertically flip with bitmap masks contain 3 instances
+    raw_masks = dummy_raw_bitmap_masks((3, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    flipped_masks = bitmap_masks.flip(flip_direction='vertical')
+    flipped_flipped_masks = flipped_masks.flip(flip_direction='vertical')
+    assert len(flipped_masks) == 3
+    assert flipped_masks.height == 28
+    assert flipped_masks.width == 28
+    assert (bitmap_masks.masks == flipped_flipped_masks.masks).all()
+    assert (flipped_masks.masks == raw_masks[:, ::-1, :]).all()
+
+    # diagonal flip with bitmap masks contain 3 instances
+    raw_masks = dummy_raw_bitmap_masks((3, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    flipped_masks = bitmap_masks.flip(flip_direction='diagonal')
+    flipped_flipped_masks = flipped_masks.flip(flip_direction='diagonal')
+    assert len(flipped_masks) == 3
+    assert flipped_masks.height == 28
+    assert flipped_masks.width == 28
+    assert (bitmap_masks.masks == flipped_flipped_masks.masks).all()
+    assert (flipped_masks.masks == raw_masks[:, ::-1, ::-1]).all()
+
+
+def test_bitmap_mask_pad():
+    # pad with empty bitmap masks
+    raw_masks = dummy_raw_bitmap_masks((0, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    padded_masks = bitmap_masks.pad((56, 56))
+    assert len(padded_masks) == 0
+    assert padded_masks.height == 56
+    assert padded_masks.width == 56
+
+    # pad with bitmap masks contain 3 instances
+    raw_masks = dummy_raw_bitmap_masks((3, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    padded_masks = bitmap_masks.pad((56, 56))
+    assert len(padded_masks) == 3
+    assert padded_masks.height == 56
+    assert padded_masks.width == 56
+    assert (padded_masks.masks[:, 28:, 28:] == 0).all()
+
+
+def test_bitmap_mask_crop():
+    # crop with empty bitmap masks; the bbox is unpacked as (x1, y1, x2, y2)
+    dummy_bbox = np.array([0, 10, 10, 27], dtype=int)
+    raw_masks = dummy_raw_bitmap_masks((0, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    cropped_masks = bitmap_masks.crop(dummy_bbox)
+    assert len(cropped_masks) == 0
+    assert cropped_masks.height == 17
+    assert cropped_masks.width == 10
+
+    # crop with bitmap masks contain 3 instances
+    raw_masks = dummy_raw_bitmap_masks((3, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    cropped_masks = bitmap_masks.crop(dummy_bbox)
+    assert len(cropped_masks) == 3
+    assert cropped_masks.height == 17
+    assert cropped_masks.width == 10
+    x1, y1, x2, y2 = dummy_bbox
+    assert (cropped_masks.masks == raw_masks[:, y1:y2, x1:x2]).all()
+
+    # crop with invalid bbox; build it outside so that only crop() may raise
+    dummy_bbox = dummy_bboxes(2, 28, 28)
+    with pytest.raises(AssertionError):
+        bitmap_masks.crop(dummy_bbox)
+
+
+def test_bitmap_mask_crop_and_resize():
+    dummy_bbox = dummy_bboxes(5, 28, 28)
+    inds = np.random.randint(0, 3, (5, ))
+
+    # crop and resize with empty bitmap masks
+    raw_masks = dummy_raw_bitmap_masks((0, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    cropped_resized_masks = bitmap_masks.crop_and_resize(
+        dummy_bbox, (56, 56), inds)
+    assert len(cropped_resized_masks) == 0
+    assert cropped_resized_masks.height == 56
+    assert cropped_resized_masks.width == 56
+
+    # crop and resize with bitmap masks contain 3 instances
+    raw_masks = dummy_raw_bitmap_masks((3, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    cropped_resized_masks = bitmap_masks.crop_and_resize(
+        dummy_bbox, (56, 56), inds)
+    assert len(cropped_resized_masks) == 5
+    assert cropped_resized_masks.height == 56
+    assert cropped_resized_masks.width == 56
+
+
+def test_bitmap_mask_expand():
+    # expand with empty bitmap masks
+    raw_masks = dummy_raw_bitmap_masks((0, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    expanded_masks = bitmap_masks.expand(56, 56, 12, 14)
+    assert len(expanded_masks) == 0
+    assert expanded_masks.height == 56
+    assert expanded_masks.width == 56
+
+    # expand with bitmap masks contain 3 instances
+    raw_masks = dummy_raw_bitmap_masks((3, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    expanded_masks = bitmap_masks.expand(56, 56, 12, 14)
+    assert len(expanded_masks) == 3
+    assert expanded_masks.height == 56
+    assert expanded_masks.width == 56
+    assert (expanded_masks.masks[:, :12, :14] == 0).all()
+    assert (expanded_masks.masks[:, 12 + 28:, 14 + 28:] == 0).all()
+
+
+def test_bitmap_mask_area():
+    # area of empty bitmap mask
+    raw_masks = dummy_raw_bitmap_masks((0, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    assert bitmap_masks.areas.sum() == 0
+
+    # area of bitmap masks contain 3 instances
+    raw_masks = dummy_raw_bitmap_masks((3, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    areas = bitmap_masks.areas
+    assert len(areas) == 3
+    assert (areas == raw_masks.sum((1, 2))).all()
+
+
+def test_bitmap_mask_to_ndarray():
+    # empty bitmap masks to ndarray
+    raw_masks = dummy_raw_bitmap_masks((0, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    ndarray_masks = bitmap_masks.to_ndarray()
+    assert isinstance(ndarray_masks, np.ndarray)
+    assert ndarray_masks.shape == (0, 28, 28)
+
+    # bitmap masks contain 3 instances to ndarray
+    raw_masks = dummy_raw_bitmap_masks((3, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    ndarray_masks = bitmap_masks.to_ndarray()
+    assert isinstance(ndarray_masks, np.ndarray)
+    assert ndarray_masks.shape == (3, 28, 28)
+    assert (ndarray_masks == raw_masks).all()
+
+
+def test_bitmap_mask_to_tensor():
+    # empty bitmap masks to tensor
+    raw_masks = dummy_raw_bitmap_masks((0, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    tensor_masks = bitmap_masks.to_tensor(dtype=torch.uint8, device='cpu')
+    assert isinstance(tensor_masks, torch.Tensor)
+    assert tensor_masks.shape == (0, 28, 28)
+
+    # bitmap masks contain 3 instances to tensor
+    raw_masks = dummy_raw_bitmap_masks((3, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    tensor_masks = bitmap_masks.to_tensor(dtype=torch.uint8, device='cpu')
+    assert isinstance(tensor_masks, torch.Tensor)
+    assert tensor_masks.shape == (3, 28, 28)
+    assert (tensor_masks.numpy() == raw_masks).all()
+
+
+def test_bitmap_mask_index():
+    raw_masks = dummy_raw_bitmap_masks((3, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    assert (bitmap_masks[0].masks == raw_masks[0]).all()
+    assert (bitmap_masks[range(2)].masks == raw_masks[range(2)]).all()
+
+
+def test_bitmap_mask_iter():
+    raw_masks = dummy_raw_bitmap_masks((3, 28, 28))
+    bitmap_masks = BitmapMasks(raw_masks, 28, 28)
+    for i, bitmap_mask in enumerate(bitmap_masks):
+        assert bitmap_mask.shape == (28, 28)
+        assert (bitmap_mask == raw_masks[i]).all()
+
+
+def test_polygon_mask_init():
+    # init with empty masks
+    raw_masks = []
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    assert len(polygon_masks) == 0
+    assert polygon_masks.height == 28
+    assert polygon_masks.width == 28
+
+    # init with masks contain 3 instances
+    raw_masks = dummy_raw_polygon_masks((3, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    assert isinstance(polygon_masks.masks, list)
+    assert isinstance(polygon_masks.masks[0], list)
+    assert isinstance(polygon_masks.masks[0][0], np.ndarray)
+    assert len(polygon_masks) == 3
+    assert polygon_masks.height == 28
+    assert polygon_masks.width == 28
+    assert polygon_masks.to_ndarray().shape == (3, 28, 28)
+
+    # init with raw masks of unsupported type; each case needs its own
+    # context because code after the first raising call is never reached
+    with pytest.raises(AssertionError):
+        PolygonMasks([[[]]], 28, 28)
+
+    with pytest.raises(AssertionError):
+        PolygonMasks([dummy_raw_polygon_masks((3, 28, 28))], 28, 28)
+
+
+def test_polygon_mask_rescale():
+    # rescale with empty polygon masks (28x28 is expected to become 56x56)
+    raw_masks = dummy_raw_polygon_masks((0, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    rescaled_masks = polygon_masks.rescale((56, 72))
+    assert len(rescaled_masks) == 0
+    assert rescaled_masks.height == 56
+    assert rescaled_masks.width == 56
+    assert rescaled_masks.to_ndarray().shape == (0, 56, 56)
+
+    # rescale with polygon masks contain 1 instance
+    raw_masks = [[np.array([1, 1, 3, 1, 4, 3, 2, 4, 1, 3], dtype=float)]]
+    polygon_masks = PolygonMasks(raw_masks, 5, 5)
+    rescaled_masks = polygon_masks.rescale((12, 10))
+    assert len(rescaled_masks) == 1
+    assert rescaled_masks.height == 10
+    assert rescaled_masks.width == 10
+    assert rescaled_masks.to_ndarray().shape == (1, 10, 10)
+    truth = np.array(
+        [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+         [0, 0, 1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
+         [0, 0, 1, 1, 1, 1, 1, 0, 0, 0], [0, 0, 1, 1, 1, 1, 1, 1, 0, 0],
+         [0, 0, 0, 1, 1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
+         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
+        np.uint8)
+    assert (rescaled_masks.to_ndarray() == truth).all()
+
+
+def test_polygon_mask_resize():
+    # resize with empty polygon masks; the target shape is (height, width)
+    raw_masks = dummy_raw_polygon_masks((0, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    resized_masks = polygon_masks.resize((56, 72))
+    assert len(resized_masks) == 0
+    assert resized_masks.height == 56
+    assert resized_masks.width == 72
+    assert resized_masks.to_ndarray().shape == (0, 56, 72)
+    assert len(resized_masks.get_bboxes()) == 0
+
+    # resize with polygon masks contain 1 instance 1 part
+    raw_masks1 = [[np.array([1, 1, 3, 1, 4, 3, 2, 4, 1, 3], dtype=float)]]
+    polygon_masks1 = PolygonMasks(raw_masks1, 5, 5)
+    resized_masks1 = polygon_masks1.resize((10, 10))
+    assert len(resized_masks1) == 1
+    assert resized_masks1.height == 10
+    assert resized_masks1.width == 10
+    assert resized_masks1.to_ndarray().shape == (1, 10, 10)
+    truth1 = np.array(
+        [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+         [0, 0, 1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
+         [0, 0, 1, 1, 1, 1, 1, 0, 0, 0], [0, 0, 1, 1, 1, 1, 1, 1, 0, 0],
+         [0, 0, 0, 1, 1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
+         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
+        np.uint8)
+    assert (resized_masks1.to_ndarray() == truth1).all()
+    bboxes = resized_masks1.get_bboxes()
+    bbox_truth = np.array([[2, 2, 8, 8]])
+    assert (bboxes == bbox_truth).all()
+
+    # resize with polygon masks contain 1 instance 2 part
+    raw_masks2 = [[
+        np.array([0., 0., 1., 0., 1., 1.]),
+        np.array([1., 1., 2., 1., 2., 2., 1., 2.])
+    ]]
+    polygon_masks2 = PolygonMasks(raw_masks2, 3, 3)
+    resized_masks2 = polygon_masks2.resize((6, 6))
+    assert len(resized_masks2) == 1
+    assert resized_masks2.height == 6
+    assert resized_masks2.width == 6
+    assert resized_masks2.to_ndarray().shape == (1, 6, 6)
+    truth2 = np.array(
+        [[0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0],
+         [0, 0, 1, 1, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], np.uint8)
+    assert (resized_masks2.to_ndarray() == truth2).all()
+
+    # resize with polygon masks contain 2 instances
+    raw_masks3 = [raw_masks1[0], raw_masks2[0]]
+    polygon_masks3 = PolygonMasks(raw_masks3, 5, 5)
+    resized_masks3 = polygon_masks3.resize((10, 10))
+    assert len(resized_masks3) == 2
+    assert resized_masks3.height == 10
+    assert resized_masks3.width == 10
+    assert resized_masks3.to_ndarray().shape == (2, 10, 10)
+    truth3 = np.stack([truth1, np.pad(truth2, ((0, 4), (0, 4)), 'constant')])
+    assert (resized_masks3.to_ndarray() == truth3).all()
+
+    # resize to non-square
+    raw_masks4 = [[np.array([1, 1, 3, 1, 4, 3, 2, 4, 1, 3], dtype=float)]]
+    polygon_masks4 = PolygonMasks(raw_masks4, 5, 5)
+    resized_masks4 = polygon_masks4.resize((5, 10))
+    assert len(resized_masks4) == 1
+    assert resized_masks4.height == 5
+    assert resized_masks4.width == 10
+    assert resized_masks4.to_ndarray().shape == (1, 5, 10)
+    truth4 = np.array(
+        [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
+         [0, 0, 1, 1, 1, 1, 1, 1, 0, 0], [0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
+         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], np.uint8)
+    assert (resized_masks4.to_ndarray() == truth4).all()
+
+
+def test_polygon_mask_flip():
+    # flip with empty polygon masks
+    raw_masks = dummy_raw_polygon_masks((0, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    flipped_masks = polygon_masks.flip(flip_direction='horizontal')
+    assert len(flipped_masks) == 0
+    assert flipped_masks.height == 28
+    assert flipped_masks.width == 28
+    assert flipped_masks.to_ndarray().shape == (0, 28, 28)
+
+    # TODO: fixed flip correctness checking after v2.0_coord is merged
+    # horizontally flip with polygon masks contain 3 instances
+    raw_masks = dummy_raw_polygon_masks((3, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    flipped_masks = polygon_masks.flip(flip_direction='horizontal')
+    flipped_flipped_masks = flipped_masks.flip(flip_direction='horizontal')
+    assert len(flipped_masks) == 3
+    assert flipped_masks.height == 28
+    assert flipped_masks.width == 28
+    assert flipped_masks.to_ndarray().shape == (3, 28, 28)
+    assert (polygon_masks.to_ndarray() == flipped_flipped_masks.to_ndarray()
+            ).all()
+
+    # vertically flip with polygon masks contain 3 instances
+    raw_masks = dummy_raw_polygon_masks((3, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    flipped_masks = polygon_masks.flip(flip_direction='vertical')
+    flipped_flipped_masks = flipped_masks.flip(flip_direction='vertical')
+    assert len(flipped_masks) == 3
+    assert flipped_masks.height == 28
+    assert flipped_masks.width == 28
+    assert flipped_masks.to_ndarray().shape == (3, 28, 28)
+    assert (polygon_masks.to_ndarray() == flipped_flipped_masks.to_ndarray()
+            ).all()
+
+    # diagonal flip with polygon masks contain 3 instances
+    raw_masks = dummy_raw_polygon_masks((3, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    flipped_masks = polygon_masks.flip(flip_direction='diagonal')
+    flipped_flipped_masks = flipped_masks.flip(flip_direction='diagonal')
+    assert len(flipped_masks) == 3
+    assert flipped_masks.height == 28
+    assert flipped_masks.width == 28
+    assert flipped_masks.to_ndarray().shape == (3, 28, 28)
+    assert (polygon_masks.to_ndarray() == flipped_flipped_masks.to_ndarray()
+            ).all()
+
+
+def test_polygon_mask_crop():
+    dummy_bbox = np.array([0, 10, 10, 27], dtype=int)
+    # crop with empty polygon masks
+    raw_masks = dummy_raw_polygon_masks((0, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    cropped_masks = polygon_masks.crop(dummy_bbox)
+    assert len(cropped_masks) == 0
+    assert cropped_masks.height == 17
+    assert cropped_masks.width == 10
+    assert cropped_masks.to_ndarray().shape == (0, 17, 10)
+
+    # crop with polygon masks contain 1 instance
+    raw_masks = [[np.array([1., 3., 5., 1., 5., 6., 1, 6])]]
+    polygon_masks = PolygonMasks(raw_masks, 7, 7)
+    bbox = np.array([0, 0, 3, 4])
+    cropped_masks = polygon_masks.crop(bbox)
+    assert len(cropped_masks) == 1
+    assert cropped_masks.height == 4
+    assert cropped_masks.width == 3
+    assert cropped_masks.to_ndarray().shape == (1, 4, 3)
+    truth = np.array([[0, 0, 0], [0, 0, 0], [0, 0, 1], [0, 1, 1]])
+    assert (cropped_masks.to_ndarray() == truth).all()
+
+    # crop with invalid bbox; build it outside so that only crop() may raise
+    dummy_bbox = dummy_bboxes(2, 28, 28)
+    with pytest.raises(AssertionError):
+        polygon_masks.crop(dummy_bbox)
+
+
+def test_polygon_mask_pad():
+    # pad with empty polygon masks
+    raw_masks = dummy_raw_polygon_masks((0, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    padded_masks = polygon_masks.pad((56, 56))
+    assert len(padded_masks) == 0
+    assert padded_masks.height == 56
+    assert padded_masks.width == 56
+    assert padded_masks.to_ndarray().shape == (0, 56, 56)
+
+    # pad with polygon masks contain 3 instances
+    raw_masks = dummy_raw_polygon_masks((3, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    padded_masks = polygon_masks.pad((56, 56))
+    assert len(padded_masks) == 3
+    assert padded_masks.height == 56
+    assert padded_masks.width == 56
+    assert padded_masks.to_ndarray().shape == (3, 56, 56)
+    assert (padded_masks.to_ndarray()[:, 28:, 28:] == 0).all()
+
+
+def test_polygon_mask_expand():
+    with pytest.raises(NotImplementedError):
+        raw_masks = dummy_raw_polygon_masks((0, 28, 28))
+        polygon_masks = PolygonMasks(raw_masks, 28, 28)
+        polygon_masks.expand(56, 56, 10, 17)
+
+
+def test_polygon_mask_crop_and_resize():
+    dummy_bbox = dummy_bboxes(5, 28, 28)
+    inds = np.random.randint(0, 3, (5, ))
+
+    # crop and resize with empty polygon masks
+    raw_masks = dummy_raw_polygon_masks((0, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    cropped_resized_masks = polygon_masks.crop_and_resize(
+        dummy_bbox, (56, 56), inds)
+    assert len(cropped_resized_masks) == 0
+    assert cropped_resized_masks.height == 56
+    assert cropped_resized_masks.width == 56
+    assert cropped_resized_masks.to_ndarray().shape == (0, 56, 56)
+
+    # crop and resize with polygon masks contain 3 instances
+    raw_masks = dummy_raw_polygon_masks((3, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    cropped_resized_masks = polygon_masks.crop_and_resize(
+        dummy_bbox, (56, 56), inds)
+    assert len(cropped_resized_masks) == 5
+    assert cropped_resized_masks.height == 56
+    assert cropped_resized_masks.width == 56
+    assert cropped_resized_masks.to_ndarray().shape == (5, 56, 56)
+
+
+def test_polygon_mask_area():
+    # area of empty polygon masks
+    raw_masks = dummy_raw_polygon_masks((0, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    assert polygon_masks.areas.sum() == 0
+
+    # area of polygon masks contain 1 instance
+    # here we hack a case that the gap between the area of bitmap and polygon
+    # is minor
+    raw_masks = [[np.array([1, 1, 5, 1, 3, 4])]]
+    polygon_masks = PolygonMasks(raw_masks, 6, 6)
+    polygon_area = polygon_masks.areas
+    bitmap_area = polygon_masks.to_bitmap().areas
+    assert len(polygon_area) == 1
+    assert np.isclose(polygon_area, bitmap_area).all()
+
+
+def test_polygon_mask_to_bitmap():
+    # polygon masks contain 3 instances to bitmap
+    raw_masks = dummy_raw_polygon_masks((3, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    bitmap_masks = polygon_masks.to_bitmap()
+    assert (polygon_masks.to_ndarray() == bitmap_masks.to_ndarray()).all()
+
+
+def test_polygon_mask_to_ndarray():
+    # empty polygon masks to ndarray
+    raw_masks = dummy_raw_polygon_masks((0, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    ndarray_masks = polygon_masks.to_ndarray()
+    assert isinstance(ndarray_masks, np.ndarray)
+    assert ndarray_masks.shape == (0, 28, 28)
+
+    # polygon masks contain 3 instances to ndarray
+    raw_masks = dummy_raw_polygon_masks((3, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    ndarray_masks = polygon_masks.to_ndarray()
+    assert isinstance(ndarray_masks, np.ndarray)
+    assert ndarray_masks.shape == (3, 28, 28)
+
+
+def test_polygon_to_tensor():
+    # empty polygon masks to tensor
+    raw_masks = dummy_raw_polygon_masks((0, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    tensor_masks = polygon_masks.to_tensor(dtype=torch.uint8, device='cpu')
+    assert isinstance(tensor_masks, torch.Tensor)
+    assert tensor_masks.shape == (0, 28, 28)
+
+    # polygon masks contain 3 instances to tensor
+    raw_masks = dummy_raw_polygon_masks((3, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    tensor_masks = polygon_masks.to_tensor(dtype=torch.uint8, device='cpu')
+    assert isinstance(tensor_masks, torch.Tensor)
+    assert tensor_masks.shape == (3, 28, 28)
+    assert (tensor_masks.numpy() == polygon_masks.to_ndarray()).all()
+
+
+def test_polygon_mask_index():
+    raw_masks = dummy_raw_polygon_masks((3, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    # index by integer
+    polygon_masks[0]
+    # index by list
+    polygon_masks[[0, 1]]
+    # index by ndarray
+    polygon_masks[np.asarray([0, 1])]
+    with pytest.raises(ValueError):
+        # invalid index
+        polygon_masks[torch.Tensor([1, 2])]
+
+
+def test_polygon_mask_iter():
+    raw_masks = dummy_raw_polygon_masks((3, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    for i, polygon_mask in enumerate(polygon_masks):
+        assert np.equal(polygon_mask, raw_masks[i]).all()
+
+
+def test_mask2bbox():
+    # no instance
+    masks = torch.zeros((1, 20, 15), dtype=torch.bool)
+    bboxes_empty_gt = torch.tensor([[0, 0, 0, 0]]).float()
+    bboxes = mask2bbox(masks)
+    assert torch.allclose(bboxes_empty_gt.float(), bboxes)
+
+    # the entire mask is an instance
+    bboxes_full_gt = torch.tensor([[0, 0, 15, 20]]).float()
+    masks = torch.ones((1, 20, 15), dtype=torch.bool)
+    bboxes = mask2bbox(masks)
+    assert torch.allclose(bboxes_full_gt, bboxes)
+
+    # a pentagon-shaped instance
+    bboxes_gt = torch.tensor([[2, 2, 7, 6]]).float()
+    masks = torch.zeros((1, 20, 15), dtype=torch.bool)
+    masks[0, 2, 4] = True
+    masks[0, 3, 3:6] = True
+    masks[0, 4, 2:7] = True
+    masks[0, 5, 2:7] = True
+    bboxes = mask2bbox(masks)
+    assert torch.allclose(bboxes_gt, bboxes)
diff --git a/tests/test_utils/test_memory.py b/tests/test_utils/test_memory.py
new file mode 100755
index 0000000..840601c
--- /dev/null
+++ b/tests/test_utils/test_memory.py
@@ -0,0 +1,98 @@
+import numpy as np
+import pytest
+import torch
+
+from mmdet.utils import AvoidOOM
+from mmdet.utils.memory import cast_tensor_type
+
+
+def test_avoidoom():
+    tensor = torch.from_numpy(np.random.random((20, 20)))
+    if torch.cuda.is_available():
+        tensor = tensor.cuda()
+        # get default result
+        default_result = torch.mm(tensor, tensor.transpose(1, 0))
+
+        # when not occurred OOM error
+        AvoidCudaOOM = AvoidOOM()
+        result = AvoidCudaOOM.retry_if_cuda_oom(torch.mm)(tensor,
+                                                          tensor.transpose(
+                                                              1, 0))
+        assert default_result.device == result.device and \
+               default_result.dtype == result.dtype and \
+               torch.equal(default_result, result)
+
+        # calculate with fp16 and convert back to source type
+        AvoidCudaOOM = AvoidOOM(test=True)
+        result = AvoidCudaOOM.retry_if_cuda_oom(torch.mm)(tensor,
+                                                          tensor.transpose(
+                                                              1, 0))
+        assert default_result.device == result.device and \
+               default_result.dtype == result.dtype and \
+               torch.allclose(default_result, result, 1e-3)
+
+        # calculate on cpu and convert back to source device
+        AvoidCudaOOM = AvoidOOM(test=True)
+        result = AvoidCudaOOM.retry_if_cuda_oom(torch.mm)(tensor,
+                                                          tensor.transpose(
+                                                              1, 0))
+        assert result.dtype == default_result.dtype and \
+               result.device == default_result.device and \
+               torch.allclose(default_result, result)
+
+        # do not calculate on cpu and the outputs will be same as input
+        AvoidCudaOOM = AvoidOOM(test=True, to_cpu=False)
+        result = AvoidCudaOOM.retry_if_cuda_oom(torch.mm)(tensor,
+                                                          tensor.transpose(
+                                                              1, 0))
+        assert result.dtype == default_result.dtype and \
+               result.device == default_result.device
+
+    else:
+        default_result = torch.mm(tensor, tensor.transpose(1, 0))
+        AvoidCudaOOM = AvoidOOM()
+        result = AvoidCudaOOM.retry_if_cuda_oom(torch.mm)(tensor,
+                                                          tensor.transpose(
+                                                              1, 0))
+        assert default_result.device == result.device and \
+               default_result.dtype == result.dtype and \
+               torch.equal(default_result, result)
+
+
+def test_cast_tensor_type():
+    inputs = torch.rand(10)
+    if torch.cuda.is_available():
+        inputs = inputs.cuda()
+    with pytest.raises(AssertionError):
+        cast_tensor_type(inputs, src_type=None, dst_type=None)
+    # input is a float
+    out = cast_tensor_type(10., dst_type=torch.half)
+    assert out == 10. and isinstance(out, float)
+    # convert Tensor to fp16 and re-convert to fp32
+    fp16_out = cast_tensor_type(inputs, dst_type=torch.half)
+    assert fp16_out.dtype == torch.half
+    fp32_out = cast_tensor_type(fp16_out, dst_type=torch.float32)
+    assert fp32_out.dtype == torch.float32
+
+    # input is a list
+    list_input = [inputs, inputs]
+    list_outs = cast_tensor_type(list_input, dst_type=torch.half)
+    assert len(list_outs) == len(list_input) and \
+           isinstance(list_outs, list)
+    for out in list_outs:
+        assert out.dtype == torch.half
+    # input is a dict
+    dict_input = {'test1': inputs, 'test2': inputs}
+    dict_outs = cast_tensor_type(dict_input, dst_type=torch.half)
+    assert len(dict_outs) == len(dict_input) and \
+           isinstance(dict_outs, dict)
+
+    # convert the input tensor to CPU and re-convert to GPU
+    if torch.cuda.is_available():
+        cpu_device = torch.empty(0).device
+        gpu_device = inputs.device
+        cpu_out = cast_tensor_type(inputs, dst_type=cpu_device)
+        assert cpu_out.device == cpu_device
+        # round-trip: feed the CPU result back so the GPU move is exercised
+        gpu_out = cast_tensor_type(cpu_out, dst_type=gpu_device)
+        assert gpu_out.device == gpu_device
diff --git a/tests/test_utils/test_misc.py b/tests/test_utils/test_misc.py
new file mode 100755
index 0000000..80d9114
--- /dev/null
+++ b/tests/test_utils/test_misc.py
@@ -0,0 +1,204 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import tempfile
+
+import numpy as np
+import pytest
+import torch
+
+from mmdet.core.bbox import distance2bbox
+from mmdet.core.mask.structures import BitmapMasks, PolygonMasks
+from mmdet.core.utils import (center_of_mass, filter_scores_and_topk,
+                              flip_tensor, mask2ndarray, select_single_mlvl)
+from mmdet.utils import find_latest_checkpoint
+
+
+def dummy_raw_polygon_masks(size):
+    """Build random single-part polygon masks, one per requested object.
+
+    Args:
+        size (tuple): expected shape of dummy masks, (N, H, W)
+
+    Return:
+        list[list[ndarray]]: dummy mask
+    """
+    num_obj, height, width = size
+    bound = min(height, width)  # coordinates are drawn from [0, bound)
+    return [[np.random.uniform(0, bound,
+                               np.random.randint(5) * 2 + 6)]
+            for _ in range(num_obj)]
+
+
+def test_mask2ndarray():
+    raw_masks = np.ones((3, 28, 28))
+    bitmap_mask = BitmapMasks(raw_masks, 28, 28)
+    output_mask = mask2ndarray(bitmap_mask)
+    assert np.allclose(raw_masks, output_mask)
+
+    raw_masks = dummy_raw_polygon_masks((3, 28, 28))
+    polygon_masks = PolygonMasks(raw_masks, 28, 28)
+    output_mask = mask2ndarray(polygon_masks)
+    assert output_mask.shape == (3, 28, 28)
+
+    raw_masks = np.ones((3, 28, 28))
+    output_mask = mask2ndarray(raw_masks)
+    assert np.allclose(raw_masks, output_mask)
+
+    raw_masks = torch.ones((3, 28, 28))
+    output_mask = mask2ndarray(raw_masks)
+    assert np.allclose(raw_masks, output_mask)
+
+    # test unsupported type (a plain list); the call itself must raise
+    raw_masks = []
+    with pytest.raises(TypeError):
+        mask2ndarray(raw_masks)
+
+
+def test_distance2bbox():
+    point = torch.Tensor([[74., 61.], [-29., 106.], [138., 61.], [29., 170.]])
+
+    distance = torch.Tensor([[0., 0, 1., 1.], [1., 2., 10., 6.],
+                             [22., -29., 138., 61.], [54., -29., 170., 61.]])
+    expected_decode_bboxes = torch.Tensor([[74., 61., 75., 62.],
+                                           [0., 104., 0., 112.],
+                                           [100., 90., 100., 120.],
+                                           [0., 120., 100., 120.]])
+    out_bbox = distance2bbox(point, distance, max_shape=(120, 100))
+    assert expected_decode_bboxes.allclose(out_bbox)
+    out = distance2bbox(point, distance, max_shape=torch.Tensor((120, 100)))
+    assert expected_decode_bboxes.allclose(out)
+
+    batch_point = point.unsqueeze(0).repeat(2, 1, 1)
+    batch_distance = distance.unsqueeze(0).repeat(2, 1, 1)
+    batch_out = distance2bbox(
+        batch_point, batch_distance, max_shape=(120, 100))[0]
+    assert out.allclose(batch_out)
+    batch_out = distance2bbox(
+        batch_point, batch_distance, max_shape=[(120, 100), (120, 100)])[0]
+    assert out.allclose(batch_out)
+
+    batch_out = distance2bbox(point, batch_distance, max_shape=(120, 100))[0]
+    assert out.allclose(batch_out)
+
+    # test max_shape is not equal to batch
+    with pytest.raises(AssertionError):
+        distance2bbox(
+            batch_point,
+            batch_distance,
+            max_shape=[(120, 100), (120, 100), (32, 32)])
+
+    rois = torch.zeros((0, 4))
+    deltas = torch.zeros((0, 4))
+    out = distance2bbox(rois, deltas, max_shape=(120, 100))
+    assert rois.shape == out.shape
+
+    rois = torch.zeros((2, 0, 4))
+    deltas = torch.zeros((2, 0, 4))
+    out = distance2bbox(rois, deltas, max_shape=(120, 100))
+    assert rois.shape == out.shape
+
+
+@pytest.mark.parametrize('mask', [
+    torch.ones((28, 28)),
+    torch.zeros((28, 28)),
+    torch.rand(28, 28) > 0.5,
+    torch.tensor([[0, 0, 0, 0], [0, 1, 1, 0], [0, 1, 1, 0], [0, 0, 0, 0]])
+])
+def test_center_of_mass(mask):
+    center_h, center_w = center_of_mass(mask)
+    if mask.shape[0] == 4:
+        assert center_h == 1.5
+        assert center_w == 1.5
+    assert isinstance(center_h, torch.Tensor) \
+           and isinstance(center_w, torch.Tensor)
+    assert 0 <= center_h <= 28 \
+           and 0 <= center_w <= 28
+
+
+def test_flip_tensor():
+    img = np.random.random((1, 3, 10, 10))
+    src_tensor = torch.from_numpy(img)
+
+    # test flip_direction parameter error
+    with pytest.raises(AssertionError):
+        flip_tensor(src_tensor, 'flip')
+
+    # test tensor dimension (a 3-D input must be rejected)
+    with pytest.raises(AssertionError):
+        flip_tensor(src_tensor[0], 'vertical')
+
+    hflip_tensor = flip_tensor(src_tensor, 'horizontal')  # reverses W
+    expected_hflip_tensor = torch.from_numpy(img[..., ::-1].copy())
+    assert expected_hflip_tensor.allclose(hflip_tensor)
+
+    vflip_tensor = flip_tensor(src_tensor, 'vertical')  # reverses H
+    expected_vflip_tensor = torch.from_numpy(img[..., ::-1, :].copy())
+    assert expected_vflip_tensor.allclose(vflip_tensor)
+
+    diag_flip_tensor = flip_tensor(src_tensor, 'diagonal')  # reverses H and W
+    expected_diag_flip_tensor = torch.from_numpy(img[..., ::-1, ::-1].copy())
+    assert expected_diag_flip_tensor.allclose(diag_flip_tensor)
+
+
+def test_select_single_mlvl():
+    mlvl_tensors = [torch.rand(2, 1, 10, 10)] * 5
+    mlvl_tensor_list = select_single_mlvl(mlvl_tensors, 1)
+    assert len(mlvl_tensor_list) == 5 and mlvl_tensor_list[0].ndim == 3
+
+
+def test_filter_scores_and_topk():
+    score = torch.tensor([[0.1, 0.3, 0.2], [0.12, 0.7, 0.9], [0.02, 0.8, 0.08],
+                          [0.4, 0.1, 0.08]])
+    bbox_pred = torch.tensor([[0.2, 0.3], [0.4, 0.7], [0.1, 0.1], [0.5, 0.1]])
+    score_thr = 0.15
+    nms_pre = 4
+    # test results type error
+    with pytest.raises(NotImplementedError):
+        filter_scores_and_topk(score, score_thr, nms_pre, (score, ))
+
+    filtered_results = filter_scores_and_topk(
+        score, score_thr, nms_pre, results=dict(bbox_pred=bbox_pred))
+    filtered_score, labels, keep_idxs, results = filtered_results
+    assert filtered_score.allclose(torch.tensor([0.9, 0.8, 0.7, 0.4]))
+    assert labels.allclose(torch.tensor([2, 1, 1, 0]))
+    assert keep_idxs.allclose(torch.tensor([1, 2, 1, 3]))
+    assert results['bbox_pred'].allclose(
+        torch.tensor([[0.4, 0.7], [0.1, 0.1], [0.4, 0.7], [0.5, 0.1]]))
+
+
+def test_find_latest_checkpoint():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        path = tmpdir
+        latest = find_latest_checkpoint(path)
+        # There are no checkpoints in the path.
+        assert latest is None
+
+        path = osp.join(tmpdir, 'none')
+        latest = find_latest_checkpoint(path)
+        # The path does not exist.
+        assert latest is None
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        with open(osp.join(tmpdir, 'latest.pth'), 'w') as f:
+            f.write('latest')
+        path = tmpdir
+        latest = find_latest_checkpoint(path)
+        assert latest == osp.join(tmpdir, 'latest.pth')
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        with open(osp.join(tmpdir, 'iter_4000.pth'), 'w') as f:
+            f.write('iter_4000')
+        with open(osp.join(tmpdir, 'iter_8000.pth'), 'w') as f:
+            f.write('iter_8000')
+        path = tmpdir
+        latest = find_latest_checkpoint(path)
+        assert latest == osp.join(tmpdir, 'iter_8000.pth')
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        with open(osp.join(tmpdir, 'epoch_1.pth'), 'w') as f:
+            f.write('epoch_1')
+        with open(osp.join(tmpdir, 'epoch_2.pth'), 'w') as f:
+            f.write('epoch_2')
+        path = tmpdir
+        latest = find_latest_checkpoint(path)
+        assert latest == osp.join(tmpdir, 'epoch_2.pth')
diff --git a/tests/test_utils/test_nms.py b/tests/test_utils/test_nms.py
new file mode 100755
index 0000000..5fa92dc
--- /dev/null
+++ b/tests/test_utils/test_nms.py
@@ -0,0 +1,75 @@
+import pytest
+import torch
+
+from mmdet.core.post_processing import mask_matrix_nms
+
+
+def _create_mask(N, h, w):
+    """Return random ``(masks, labels, scores)`` for ``N`` ``h x w`` masks."""
+    # Tuple items are evaluated left to right, so the RNG is consumed in the
+    # same masks -> labels -> scores order as separate assignments would.
+    return torch.rand((N, h, w)) > 0.5, torch.rand(N), torch.rand(N)
+
+
+def test_nms_input_errors():
+    """Check input validation and output sizes of ``mask_matrix_nms``."""
+    with pytest.raises(AssertionError):
+        mask_matrix_nms(
+            torch.rand((10, 28, 28)), torch.rand(11), torch.rand(11))
+    with pytest.raises(AssertionError):
+        # labels and scores match the 10 masks, so the truncated mask_area
+        # is the only inconsistent input left to trigger the assertion
+        # (with length-11 labels this case merely repeated the one above)
+        masks = torch.rand((10, 28, 28))
+        mask_matrix_nms(
+            masks,
+            torch.rand(10),
+            torch.rand(10),
+            mask_area=masks.sum((1, 2)).float()[:8])
+    with pytest.raises(NotImplementedError):
+        mask_matrix_nms(
+            torch.rand((10, 28, 28)),
+            torch.rand(10),
+            torch.rand(10),
+            kernel='None')
+    # test empty results
+    masks, labels, scores = _create_mask(0, 28, 28)
+    score, label, mask, keep_ind = \
+        mask_matrix_nms(masks, labels, scores)
+    assert len(score) == len(label) == len(mask) == len(keep_ind) == 0
+
+    # do not use filter_thr, nms_pre and max_num
+    masks, labels, scores = _create_mask(1000, 28, 28)
+    score, label, mask, keep_ind = \
+        mask_matrix_nms(masks, labels, scores)
+    assert len(score) == len(label) == len(mask) == len(keep_ind) == 1000
+    # only use nms_pre
+    score, label, mask, keep_ind = \
+        mask_matrix_nms(masks, labels, scores, nms_pre=500)
+    assert len(score) == len(label) == len(mask) == len(keep_ind) == 500
+    # use max_num
+    score, label, mask, keep_ind = \
+        mask_matrix_nms(masks, labels, scores,
+                        nms_pre=500, max_num=100)
+    assert len(score) == len(label) == len(mask) == len(keep_ind) == 100
+
+    masks, labels, _ = _create_mask(1, 28, 28)
+    scores = torch.Tensor([1.0])
+    masks = masks.expand(1000, 28, 28)
+    labels = labels.expand(1000)
+    scores = scores.expand(1000)
+
+    # check that scores are decayed and filter_thr is applied:
+    # with identical masks and labels and all scores = 1,
+    # the first score stays 1 while the others decay.
+    score, label, mask, keep_ind = \
+        mask_matrix_nms(masks,
+                        labels,
+                        scores,
+                        nms_pre=500,
+                        max_num=100,
+                        kernel='gaussian',
+                        sigma=2.0,
+                        filter_thr=0.5)
+    assert len(score) == 1
+    assert score[0] == 1
diff --git a/tests/test_utils/test_replace_cfg_vals.py b/tests/test_utils/test_replace_cfg_vals.py
new file mode 100755
index 0000000..85d9d0e
--- /dev/null
+++ b/tests/test_utils/test_replace_cfg_vals.py
@@ -0,0 +1,83 @@
+import os.path as osp
+import tempfile
+from copy import deepcopy
+
+import pytest
+from mmcv.utils import Config
+
+from mmdet.utils import replace_cfg_vals
+
+
+def test_replace_cfg_vals():  # checks ``${key}`` placeholder replacement
+    temp_file = tempfile.NamedTemporaryFile()  # only its name is used below
+    cfg_path = f'{temp_file.name}.py'
+    with open(cfg_path, 'w') as f:  # NOTE(review): never removed afterwards
+        f.write('configs')
+
+    ori_cfg_dict = dict()
+    ori_cfg_dict['cfg_name'] = osp.basename(temp_file.name)
+    ori_cfg_dict['work_dir'] = 'work_dirs/${cfg_name}/${percent}/${fold}'
+    ori_cfg_dict['percent'] = 5
+    ori_cfg_dict['fold'] = 1
+    ori_cfg_dict['model_wrapper'] = dict(
+        type='SoftTeacher', detector='${model}')
+    ori_cfg_dict['model'] = dict(
+        type='FasterRCNN',
+        backbone=dict(type='ResNet'),
+        neck=dict(type='FPN'),
+        rpn_head=dict(type='RPNHead'),
+        roi_head=dict(type='StandardRoIHead'),
+        train_cfg=dict(
+            rpn=dict(
+                assigner=dict(type='MaxIoUAssigner'),
+                sampler=dict(type='RandomSampler'),
+            ),
+            rpn_proposal=dict(nms=dict(type='nms', iou_threshold=0.7)),
+            rcnn=dict(
+                assigner=dict(type='MaxIoUAssigner'),
+                sampler=dict(type='RandomSampler'),
+            ),
+        ),
+        test_cfg=dict(
+            rpn=dict(nms=dict(type='nms', iou_threshold=0.7)),
+            rcnn=dict(nms=dict(type='nms', iou_threshold=0.5)),
+        ),
+    )
+    ori_cfg_dict['iou_threshold'] = dict(
+        rpn_proposal_nms='${model.train_cfg.rpn_proposal.nms.iou_threshold}',
+        test_rpn_nms='${model.test_cfg.rpn.nms.iou_threshold}',
+        test_rcnn_nms='${model.test_cfg.rcnn.nms.iou_threshold}',
+    )
+
+    ori_cfg_dict['str'] = 'Hello, world!'
+    ori_cfg_dict['dict'] = {'Hello': 'world!'}
+    ori_cfg_dict['list'] = [
+        'Hello, world!',
+    ]
+    ori_cfg_dict['tuple'] = ('Hello, world!', )
+    ori_cfg_dict['test_str'] = 'xxx${str}xxx'
+
+    ori_cfg = Config(ori_cfg_dict, filename=cfg_path)
+    updated_cfg = replace_cfg_vals(deepcopy(ori_cfg))  # ori_cfg stays intact
+
+    assert updated_cfg.work_dir \
+        == f'work_dirs/{osp.basename(temp_file.name)}/5/1'
+    assert updated_cfg.model.detector == ori_cfg.model
+    assert updated_cfg.iou_threshold.rpn_proposal_nms \
+        == ori_cfg.model.train_cfg.rpn_proposal.nms.iou_threshold
+    assert updated_cfg.test_str == 'xxxHello, world!xxx'
+    ori_cfg_dict['test_dict'] = 'xxx${dict}xxx'  # NOTE(review): unused below
+    ori_cfg_dict['test_list'] = 'xxx${list}xxx'
+    ori_cfg_dict['test_tuple'] = 'xxx${tuple}xxx'
+    with pytest.raises(AssertionError):  # dict inside a longer string
+        cfg = deepcopy(ori_cfg)
+        cfg['test_dict'] = 'xxx${dict}xxx'
+        updated_cfg = replace_cfg_vals(cfg)
+    with pytest.raises(AssertionError):  # list inside a longer string
+        cfg = deepcopy(ori_cfg)
+        cfg['test_list'] = 'xxx${list}xxx'
+        updated_cfg = replace_cfg_vals(cfg)
+    with pytest.raises(AssertionError):  # tuple inside a longer string
+        cfg = deepcopy(ori_cfg)
+        cfg['test_tuple'] = 'xxx${tuple}xxx'
+        updated_cfg = replace_cfg_vals(cfg)
diff --git a/tests/test_utils/test_setup_env.py b/tests/test_utils/test_setup_env.py
new file mode 100755
index 0000000..70f01b8
--- /dev/null
+++ b/tests/test_utils/test_setup_env.py
@@ -0,0 +1,68 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import multiprocessing as mp
+import os
+import platform
+
+import cv2
+from mmcv import Config
+
+from mmdet.utils import setup_multi_processes
+
+
+def test_setup_multi_processes():
+    """Check env vars, OpenCV threads and start method set by the helper."""
+    # temp save system setting
+    sys_start_method = mp.get_start_method(allow_none=True)
+    sys_cv_threads = cv2.getNumThreads()
+    # pop and temp save system env vars
+    sys_omp_threads = os.environ.pop('OMP_NUM_THREADS', None)
+    sys_mkl_threads = os.environ.pop('MKL_NUM_THREADS', None)
+
+    try:
+        # test config without setting env
+        cfg = Config(dict(data=dict(workers_per_gpu=2)))
+        setup_multi_processes(cfg)
+        assert os.getenv('OMP_NUM_THREADS') == '1'
+        assert os.getenv('MKL_NUM_THREADS') == '1'
+        # when set to 0, the num threads will be 1
+        assert cv2.getNumThreads() == 1
+        if platform.system() != 'Windows':
+            assert mp.get_start_method() == 'fork'
+
+        # test num workers <= 1
+        os.environ.pop('OMP_NUM_THREADS')
+        os.environ.pop('MKL_NUM_THREADS')
+        cfg = Config(dict(data=dict(workers_per_gpu=0)))
+        setup_multi_processes(cfg)
+        assert 'OMP_NUM_THREADS' not in os.environ
+        assert 'MKL_NUM_THREADS' not in os.environ
+
+        # test manually set env var
+        os.environ['OMP_NUM_THREADS'] = '4'
+        cfg = Config(dict(data=dict(workers_per_gpu=2)))
+        setup_multi_processes(cfg)
+        assert os.getenv('OMP_NUM_THREADS') == '4'
+
+        # test manually set opencv threads and mp start method
+        config = dict(
+            data=dict(workers_per_gpu=2),
+            opencv_num_threads=4,
+            mp_start_method='spawn')
+        cfg = Config(config)
+        setup_multi_processes(cfg)
+        assert cv2.getNumThreads() == 4
+        assert mp.get_start_method() == 'spawn'
+    finally:
+        # revert the settings even when an assertion above fails, to avoid
+        # affecting other programs
+        if sys_start_method:
+            mp.set_start_method(sys_start_method, force=True)
+        cv2.setNumThreads(sys_cv_threads)
+        saved_env = (('OMP_NUM_THREADS', sys_omp_threads),
+                     ('MKL_NUM_THREADS', sys_mkl_threads))
+        for key, saved in saved_env:
+            if saved is not None:
+                os.environ[key] = saved
+            else:
+                # may already be absent if the test stopped early
+                os.environ.pop(key, None)
diff --git a/tests/test_utils/test_split_batch.py b/tests/test_utils/test_split_batch.py
new file mode 100755
index 0000000..d770f9f
--- /dev/null
+++ b/tests/test_utils/test_split_batch.py
@@ -0,0 +1,95 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from copy import deepcopy
+
+import mmcv
+import numpy as np
+import torch
+
+from mmdet.utils import split_batch
+
+
+def test_split_batch():  # split_batch must group samples by their tag
+    img_root = osp.join(osp.dirname(__file__), '../data/color.jpg')
+    img = mmcv.imread(img_root, 'color')
+    h, w, _ = img.shape
+    gt_bboxes = np.array([[0.2 * w, 0.2 * h, 0.4 * w, 0.4 * h],
+                          [0.6 * w, 0.6 * h, 0.8 * w, 0.8 * h]],
+                         dtype=np.float32)
+    gt_lables = np.ones(gt_bboxes.shape[0], dtype=np.int64)  # sic: "lables"
+
+    img = torch.tensor(img).permute(2, 0, 1)
+    meta = dict()
+    meta['filename'] = img_root
+    meta['ori_shape'] = img.shape
+    meta['img_shape'] = img.shape
+    meta['img_norm_cfg'] = {
+        'mean': np.array([103.53, 116.28, 123.675], dtype=np.float32),
+        'std': np.array([1., 1., 1.], dtype=np.float32),
+        'to_rgb': False
+    }
+    meta['pad_shape'] = img.shape
+    # The tags used here are sup, unsup_teacher and unsup_student.
+    # To tell the three groups apart, each gets its own scale_factor:
+    # the scale_factor of sup is [0.5, 0.5, 0.5, 0.5]
+    # the scale_factor of unsup_teacher is [1.0, 1.0, 1.0, 1.0]
+    # the scale_factor of unsup_student is [2.0, 2.0, 2.0, 2.0]
+    imgs = img.unsqueeze(0).repeat(9, 1, 1, 1)  # 9 copies, one per tag
+    img_metas = []
+    tags = [
+        'sup', 'unsup_teacher', 'unsup_student', 'unsup_teacher',
+        'unsup_student', 'unsup_teacher', 'unsup_student', 'unsup_teacher',
+        'unsup_student'
+    ]
+    for tag in tags:
+        img_meta = deepcopy(meta)
+        if tag == 'sup':
+            img_meta['scale_factor'] = [0.5, 0.5, 0.5, 0.5]
+            img_meta['tag'] = 'sup'
+        elif tag == 'unsup_teacher':
+            img_meta['scale_factor'] = [1.0, 1.0, 1.0, 1.0]
+            img_meta['tag'] = 'unsup_teacher'
+        elif tag == 'unsup_student':
+            img_meta['scale_factor'] = [2.0, 2.0, 2.0, 2.0]
+            img_meta['tag'] = 'unsup_student'
+        else:  # not reached for the tags listed above
+            continue
+        img_metas.append(img_meta)
+    kwargs = dict()
+    kwargs['gt_bboxes'] = [torch.tensor(gt_bboxes)] + [torch.zeros(0, 4)] * 8
+    kwargs['gt_lables'] = [torch.tensor(gt_lables)] + [torch.zeros(0, )] * 8
+    data_groups = split_batch(imgs, img_metas, kwargs)
+    assert set(data_groups.keys()) == set(tags)
+    assert data_groups['sup']['img'].shape == (1, 3, h, w)
+    assert data_groups['unsup_teacher']['img'].shape == (4, 3, h, w)
+    assert data_groups['unsup_student']['img'].shape == (4, 3, h, w)
+    # the scale_factor of sup is [0.5, 0.5, 0.5, 0.5]
+    assert data_groups['sup']['img_metas'][0]['scale_factor'] == [
+        0.5, 0.5, 0.5, 0.5
+    ]
+    # the scale_factor of unsup_teacher is [1.0, 1.0, 1.0, 1.0]
+    assert data_groups['unsup_teacher']['img_metas'][0]['scale_factor'] == [
+        1.0, 1.0, 1.0, 1.0
+    ]
+    assert data_groups['unsup_teacher']['img_metas'][1]['scale_factor'] == [
+        1.0, 1.0, 1.0, 1.0
+    ]
+    assert data_groups['unsup_teacher']['img_metas'][2]['scale_factor'] == [
+        1.0, 1.0, 1.0, 1.0
+    ]
+    assert data_groups['unsup_teacher']['img_metas'][3]['scale_factor'] == [
+        1.0, 1.0, 1.0, 1.0
+    ]
+    # the scale_factor of unsup_student is [2.0, 2.0, 2.0, 2.0]
+    assert data_groups['unsup_student']['img_metas'][0]['scale_factor'] == [
+        2.0, 2.0, 2.0, 2.0
+    ]
+    assert data_groups['unsup_student']['img_metas'][1]['scale_factor'] == [
+        2.0, 2.0, 2.0, 2.0
+    ]
+    assert data_groups['unsup_student']['img_metas'][2]['scale_factor'] == [
+        2.0, 2.0, 2.0, 2.0
+    ]
+    assert data_groups['unsup_student']['img_metas'][3]['scale_factor'] == [
+        2.0, 2.0, 2.0, 2.0
+    ]
diff --git a/tests/test_utils/test_version.py b/tests/test_utils/test_version.py
new file mode 100755
index 0000000..87d2fab
--- /dev/null
+++ b/tests/test_utils/test_version.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet import digit_version
+
+
+def test_version_check():
+    """Every ``(newer, older)`` pair must compare as strictly greater."""
+    newer_older_pairs = (
+        ('1.0.5', '1.0.5rc0'), ('1.0.5', '1.0.4rc0'),
+        ('1.0.5', '1.0rc0'), ('1.0.0', '0.6.2'),
+        ('1.0.0', '0.2.16'), ('1.0.5rc0', '1.0.0rc0'),
+        ('1.0.0rc1', '1.0.0rc0'), ('1.0.0rc2', '1.0.0rc0'),
+        ('1.0.0rc2', '1.0.0rc1'), ('1.0.1rc1', '1.0.0rc1'),
+        ('1.0.0', '1.0.0rc1'),
+    )
+    for newer, older in newer_older_pairs:
+        assert digit_version(newer) > digit_version(older)
diff --git a/tests/test_utils/test_visualization.py b/tests/test_utils/test_visualization.py
new file mode 100755
index 0000000..1dbdb2b
--- /dev/null
+++ b/tests/test_utils/test_visualization.py
@@ -0,0 +1,173 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+import tempfile
+
+import mmcv
+import numpy as np
+import pytest
+import torch
+
+from mmdet.core import visualization as vis
+from mmdet.datasets import (CityscapesDataset, CocoDataset,
+                            CocoPanopticDataset, VOCDataset)
+
+
+def test_color():
+    """Check color conversion to matplotlib tuples and rejected inputs."""
+    assert vis.color_val_matplotlib(mmcv.Color.blue) == (0., 0., 1.)
+    assert vis.color_val_matplotlib('green') == (0., 1., 0.)
+    assert vis.color_val_matplotlib((1, 2, 3)) == (3 / 255, 2 / 255, 1 / 255)
+    assert vis.color_val_matplotlib(100) == (100 / 255, 100 / 255, 100 / 255)
+    # ``np.int`` is gone since NumPy 1.24; it was an alias of builtin int
+    assert vis.color_val_matplotlib(np.zeros(3, dtype=int)) == (0., 0., 0.)
+    # lists and floats are not accepted as colors
+    for bad_color in ([255, 255, 255], 1.0):
+        with pytest.raises(TypeError):
+            vis.color_val_matplotlib(bad_color)
+    # overflowed
+    with pytest.raises(AssertionError):
+        vis.color_val_matplotlib((0, 0, 500))
+
+
+def test_imshow_det_bboxes():
+    """Draw boxes on color, grayscale, empty and masked inputs."""
+    out_path = osp.join(tempfile.gettempdir(), 'det_bboxes_image', 'image.jpg')
+    img = np.ones((10, 10, 3), np.uint8)
+    boxes = np.array([[2, 1, 3, 3], [3, 4, 6, 6]])
+    labels = np.array([0, 1])
+    drawn = vis.imshow_det_bboxes(
+        img, boxes, labels, out_file=out_path, show=False)
+    assert osp.isfile(out_path)
+    assert img.shape == drawn.shape
+    assert not np.allclose(img, drawn)
+    os.remove(out_path)
+
+    # test grayscale images
+    img = np.ones((10, 10), np.uint8)
+    boxes = np.array([[2, 1, 3, 3], [3, 4, 6, 6]])
+    labels = np.array([0, 1])
+    drawn = vis.imshow_det_bboxes(
+        img, boxes, labels, out_file=out_path, show=False)
+    assert osp.isfile(out_path)
+    assert img.shape == drawn.shape[:2]
+    os.remove(out_path)
+
+    # test shaped (0,)
+    img = np.ones((10, 10, 3), np.uint8)
+    boxes = np.ones((0, 4))
+    labels = np.ones((0, ))
+    vis.imshow_det_bboxes(
+        img, boxes, labels, out_file=out_path, show=False)
+    assert osp.isfile(out_path)
+    os.remove(out_path)
+
+    # test mask
+    img = np.ones((10, 10, 3), np.uint8)
+    boxes = np.array([[2, 1, 3, 3], [3, 4, 6, 6]])
+    labels = np.array([0, 1])
+    masks = np.random.random((2, 10, 10)) > 0.5
+    masks = np.array(masks, np.int32)
+    vis.imshow_det_bboxes(
+        img, boxes, labels, masks, out_file=out_path, show=False)
+    assert osp.isfile(out_path)
+    os.remove(out_path)
+
+    # test tensor mask type error
+    with pytest.raises(AttributeError):
+        masks = torch.tensor(masks)
+        vis.imshow_det_bboxes(img, boxes, labels, masks, show=False)
+
+
+def test_imshow_gt_det_bboxes():
+    """Draw ground truth and detections with no, numpy and tensor masks."""
+    out_path = osp.join(tempfile.gettempdir(), 'det_bboxes_image', 'image.jpg')
+    img = np.ones((10, 10, 3), np.uint8)
+    boxes = np.array([[2, 1, 3, 3], [3, 4, 6, 6]])
+    labels = np.array([0, 1])
+    ann = dict(gt_bboxes=boxes, gt_labels=labels)
+    dets = np.array([[2, 1, 3, 3, 0], [3, 4, 6, 6, 1]])
+    results = [dets]
+    drawn = vis.imshow_gt_det_bboxes(
+        img, ann, results, out_file=out_path, show=False)
+    assert osp.isfile(out_path)
+    assert img.shape == drawn.shape
+    assert not np.allclose(img, drawn)
+    os.remove(out_path)
+
+    # test grayscale images
+    img = np.ones((10, 10), np.uint8)
+    boxes = np.array([[2, 1, 3, 3], [3, 4, 6, 6]])
+    labels = np.array([0, 1])
+    ann = dict(gt_bboxes=boxes, gt_labels=labels)
+    dets = np.array([[2, 1, 3, 3, 0], [3, 4, 6, 6, 1]])
+    results = [dets]
+    vis.imshow_gt_det_bboxes(
+        img, ann, results, out_file=out_path, show=False)
+    assert osp.isfile(out_path)
+    os.remove(out_path)
+
+    # test numpy mask
+    gt_masks = np.ones((2, 10, 10))
+    ann['gt_masks'] = gt_masks
+    vis.imshow_gt_det_bboxes(
+        img, ann, results, out_file=out_path, show=False)
+    assert osp.isfile(out_path)
+    os.remove(out_path)
+
+    # test tensor mask
+    gt_masks = torch.ones((2, 10, 10))
+    ann['gt_masks'] = gt_masks
+    vis.imshow_gt_det_bboxes(
+        img, ann, results, out_file=out_path, show=False)
+    assert osp.isfile(out_path)
+    os.remove(out_path)
+
+    # test unsupported type
+    ann['gt_masks'] = []
+    with pytest.raises(TypeError):
+        vis.imshow_gt_det_bboxes(img, ann, results, show=False)
+
+
+def test_palette():
+    """Check ``palette_val`` scaling and ``get_palette`` for several inputs."""
+    assert vis.palette_val([(1, 2, 3)])[0] == (1 / 255, 2 / 255, 3 / 255)
+
+    # test list
+    colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)]
+    resolved = vis.get_palette(colors, 3)
+    for given, got in zip(colors, resolved):
+        assert given == got
+
+    # test tuple
+    resolved = vis.get_palette((1, 2, 3), 3)
+    assert len(resolved) == 3
+    for got in resolved:
+        assert got == (1, 2, 3)
+
+    # test color str
+    resolved = vis.get_palette('red', 3)
+    assert len(resolved) == 3
+    for got in resolved:
+        assert got == (255, 0, 0)
+
+    # test dataset str
+    dataset_cases = (
+        ('coco', CocoDataset, 0, (220, 20, 60)),
+        ('coco', CocoPanopticDataset, -1, (250, 141, 255)),
+        ('voc', VOCDataset, 0, (106, 0, 228)),
+        ('citys', CityscapesDataset, 0, (220, 20, 60)),
+    )
+    for name, dataset, idx, expected in dataset_cases:
+        num_classes = len(dataset.CLASSES)
+        resolved = vis.get_palette(name, num_classes)
+        assert len(resolved) == num_classes
+        assert resolved[idx] == expected
+
+    # test random
+    random_palette = vis.get_palette('random', 3)
+    default_palette = vis.get_palette(None, 3)
+    for color1, color2 in zip(random_palette, default_palette):
+        assert isinstance(color1, tuple)
+        assert isinstance(color2, tuple)
+        assert color1 == color2
diff --git a/tools/analysis_tools/analyze_logs.py b/tools/analysis_tools/analyze_logs.py
new file mode 100755
index 0000000..ca13ea8
--- /dev/null
+++ b/tools/analysis_tools/analyze_logs.py
@@ -0,0 +1,204 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import json
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import numpy as np
+import seaborn as sns
+
+
+def cal_train_time(log_dicts, args):
+    """Print epoch and iteration timing statistics for each JSON log.
+
+    ``args`` supplies ``json_logs`` (log names) and ``include_outliers``.
+    A log without any epoch record raises ``KeyError``.
+    """
+    # the first iteration of every epoch is dropped unless outliers are kept
+    first = 0 if args.include_outliers else 1
+    for i, log_dict in enumerate(log_dicts):
+        print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}')
+        all_times = [log['time'][first:] for log in log_dict.values()]
+        if not all_times:
+            raise KeyError(
+                'Please reduce the log interval in the config so that '
+                'interval is less than iterations of one epoch.')
+        epoch_ave_time = np.array(all_times).mean(-1)
+        slowest_epoch = epoch_ave_time.argmax()
+        fastest_epoch = epoch_ave_time.argmin()
+        print(f'slowest epoch {slowest_epoch + 1}, '
+              f'average time is {epoch_ave_time[slowest_epoch]:.4f}')
+        print(f'fastest epoch {fastest_epoch + 1}, '
+              f'average time is {epoch_ave_time[fastest_epoch]:.4f}')
+        print(f'time std over epochs is {epoch_ave_time.std():.4f}')
+        print(f'average iter time: {np.mean(all_times):.4f} s/iter')
+        print()
+
+
+def plot_curve(log_dicts, args):
+    """Plot every metric in ``args.keys`` for every log, then show or save.
+
+    Metrics containing ``mAP`` are drawn per epoch, all others per iteration.
+    """
+    if args.backend is not None:
+        plt.switch_backend(args.backend)
+    sns.set_style(args.style)
+    metrics = args.keys
+    num_metrics = len(metrics)
+    # default legend entries are named {filename}_{key}
+    legend = args.legend
+    if legend is None:
+        legend = [f'{json_log}_{metric}'
+                  for json_log in args.json_logs for metric in args.keys]
+    assert len(legend) == (len(args.json_logs) * len(args.keys))
+    for i, log_dict in enumerate(log_dicts):
+        epochs = list(log_dict.keys())
+        for j, metric in enumerate(metrics):
+            print(f'plot curve of {args.json_logs[i]}, metric is {metric}')
+            if metric not in log_dict[epochs[int(args.eval_interval) - 1]]:
+                if 'mAP' in metric:
+                    raise KeyError(
+                        f'{args.json_logs[i]} does not contain metric '
+                        f'{metric}. Please check if "--no-validate" is '
+                        'specified when you trained the model.')
+                raise KeyError(
+                    f'{args.json_logs[i]} does not contain metric {metric}. '
+                    'Please reduce the log interval in the config so that '
+                    'interval is less than iterations of one epoch.')
+
+            if 'mAP' in metric:
+                xs, ys = [], []
+                for epoch in epochs:
+                    ys += log_dict[epoch][metric]
+                    if 'val' in log_dict[epoch]['mode']:
+                        xs.append(epoch)
+                plt.xlabel('epoch')
+                label = legend[i * num_metrics + j]
+                plt.plot(xs, ys, label=label, marker='o')
+            else:
+                xs, ys = [], []
+                num_iters_per_epoch = log_dict[epochs[0]]['iter'][-2]
+                for epoch in epochs:
+                    iters = log_dict[epoch]['iter']
+                    if log_dict[epoch]['mode'][-1] == 'val':
+                        iters = iters[:-1]
+                    xs.append(
+                        np.array(iters) + (epoch - 1) * num_iters_per_epoch)
+                    ys.append(np.array(log_dict[epoch][metric][:len(iters)]))
+                xs = np.concatenate(xs)
+                ys = np.concatenate(ys)
+                plt.xlabel('iter')
+                label = legend[i * num_metrics + j]
+                plt.plot(xs, ys, label=label, linewidth=0.5)
+            plt.legend()
+        if args.title is not None:
+            plt.title(args.title)
+    if args.out is None:
+        plt.show()
+        return
+    print(f'save curve to: {args.out}')
+    plt.savefig(args.out)
+    plt.cla()
+
+
+def add_plot_parser(subparsers):
+    """Register the ``plot_curve`` sub-command and its options.
+
+    Args:
+        subparsers: The action object returned by
+            ``ArgumentParser.add_subparsers``.
+    """
+    plot_parser = subparsers.add_parser(
+        'plot_curve', help='parser for plotting curves')
+    add_option = plot_parser.add_argument
+
+    # input: one or more JSON training logs
+    add_option(
+        'json_logs', type=str, nargs='+',
+        help='path of train log in json format')
+
+    # what to plot
+    add_option(
+        '--keys', type=str, nargs='+', default=['bbox_mAP'],
+        help='the metric that you want to plot')
+    add_option(
+        '--start-epoch', type=str, default='1',
+        help='the epoch that you want to start')
+    add_option(
+        '--eval-interval', type=str, default='1',
+        help='the eval interval when training')
+
+    # figure appearance
+    add_option('--title', type=str, help='title of figure')
+    add_option(
+        '--legend', type=str, nargs='+', default=None,
+        help='legend of each plot')
+    add_option('--backend', type=str, default=None, help='backend of plt')
+    add_option('--style', type=str, default='dark', help='style of plt')
+    # where to save the figure
+    add_option('--out', type=str, default=None)
+
+
+def add_time_parser(subparsers):
+    """Register the ``cal_train_time`` sub-command on ``subparsers``."""
+    time_parser = subparsers.add_parser(
+        'cal_train_time',
+        help='parser for computing the average time per training iteration')
+    # one or more JSON training logs
+    time_parser.add_argument(
+        'json_logs', type=str, nargs='+',
+        help='path of train log in json format')
+    # keep the first timing sample of each epoch in the average
+    include_help = ('include the first value of every epoch when computing '
+                    'the average time')
+    time_parser.add_argument(
+        '--include-outliers', action='store_true', help=include_help)
+
+
+def parse_args():
+    """Parse the command line with both analysis sub-commands registered."""
+    parser = argparse.ArgumentParser(description='Analyze Json Log')
+    # currently only support plot curve and calculate average train time
+    subparsers = parser.add_subparsers(dest='task', help='task parser')
+    for register in (add_plot_parser, add_time_parser):
+        register(subparsers)
+    return parser.parse_args()
+
+
+def load_json_logs(json_logs):
+    """Load each JSON-lines log into a ``{epoch: {key: [values]}}`` dict.
+
+    The first (training info) line, blank lines and lines without an
+    ``epoch`` field are skipped; keys are metrics, e.g. memory, bbox_mAP.
+    """
+    log_dicts = [dict() for _ in json_logs]
+    for json_log, log_dict in zip(json_logs, log_dicts):
+        with open(json_log, 'r') as log_file:
+            for i, line in enumerate(log_file):
+                # skip before parsing: the header need not be valid JSON
+                if i == 0 or not line.strip():
+                    continue
+                log = json.loads(line)
+                if 'epoch' not in log:
+                    continue
+                epoch_log = log_dict.setdefault(
+                    log.pop('epoch'), defaultdict(list))
+                for k, v in log.items():
+                    epoch_log[k].append(v)
+    return log_dicts
+
+
+def main():
+    """Load the given JSON logs and run the selected sub-command on them."""
+    args = parse_args()
+    json_logs = args.json_logs
+    for json_log in json_logs:
+        assert json_log.endswith('.json')
+
+    # look the task up in an explicit table instead of eval()-ing CLI text
+    tasks = dict(plot_curve=plot_curve, cal_train_time=cal_train_time)
+    tasks[args.task](load_json_logs(json_logs), args)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/analysis_tools/analyze_results.py b/tools/analysis_tools/analyze_results.py
new file mode 100755
index 0000000..4d8b60c
--- /dev/null
+++ b/tools/analysis_tools/analyze_results.py
@@ -0,0 +1,369 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+from multiprocessing import Pool
+
+import mmcv
+import numpy as np
+from mmcv import Config, DictAction
+
+from mmdet.core.evaluation import eval_map
+from mmdet.core.visualization import imshow_gt_det_bboxes
+from mmdet.datasets import build_dataset, get_loading_pipeline
+from mmdet.datasets.api_wrappers import pq_compute_single_core
+from mmdet.utils import replace_cfg_vals, update_data_root
+
+
+def bbox_map_eval(det_result, annotation, nproc=4):
+    """Evaluate mAP of single image det result.
+
+    Args:
+        det_result (list | tuple): Detection result of a single image. For
+            a tuple only the first element (the bbox result) is used; the
+            result is wrapped in a one-element list before evaluation.
+        annotation (dict): Ground truth annotations where keys of
+             annotations are:
+
+            - bboxes: numpy array of shape (n, 4)
+            - labels: numpy array of shape (n, )
+            - bboxes_ignore (optional): numpy array of shape (k, 4)
+            - labels_ignore (optional): numpy array of shape (k, )
+
+        nproc (int): Processes used for computing mAP.
+            Default: 4.
+
+    Returns:
+        float: mAP averaged over the IoU thresholds 0.5, 0.55, ..., 0.95.
+    """
+
+    # use only bbox det result
+    if isinstance(det_result, tuple):
+        bbox_det_result = [det_result[0]]
+    else:
+        bbox_det_result = [det_result]
+    # ten evenly spaced IoU thresholds from 0.5 to 0.95 (both included)
+    iou_thrs = np.linspace(
+        .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
+
+    processes = []
+    workers = Pool(processes=nproc)
+    for thr in iou_thrs:
+        p = workers.apply_async(eval_map, (bbox_det_result, [annotation]), {
+            'iou_thr': thr,
+            'logger': 'silent',
+            'nproc': 1
+        })
+        processes.append(p)
+
+    workers.close()
+    workers.join()
+
+    mean_aps = []
+    for p in processes:
+        mean_aps.append(p.get()[0])
+
+    return sum(mean_aps) / len(mean_aps)
+
+
+class ResultVisualizer:
+    """Display and save evaluation results.
+
+    Args:
+        show (bool): Whether to show the image. Default: False.
+        wait_time (float): Value of waitKey param. Default: 0.
+        score_thr (float): Minimum score of bboxes to be shown.
+           Default: 0.
+        overlay_gt_pred (bool): Whether to plot gts and predictions on the
+            same image. If False, predictions and gts will be plotted on two
+            same image which will be concatenated in vertical direction.
+            The image above is drawn with gt, and the image below is drawn
+            with the prediction result. Default: False.
+    """
+
+    def __init__(self,
+                 show=False,
+                 wait_time=0,
+                 score_thr=0,
+                 overlay_gt_pred=False):
+        self.show = show
+        self.wait_time = wait_time
+        self.score_thr = score_thr
+        self.overlay_gt_pred = overlay_gt_pred
+
+    def _save_image_gts_results(self,
+                                dataset,
+                                results,
+                                performances,
+                                out_dir=None):
+        """Display or save image with ground truths and predictions from a
+        model.
+
+        Args:
+            dataset (Dataset): A PyTorch dataset.
+            results (list): Object detection or panoptic segmentation
+                results from test results pkl file.
+            performances (list[tuple]): (index, performance) pairs giving
+                samples's indices in dataset and model's performance on them.
+            out_dir (str, optional): The directory to write the images to.
+                Defaults: None.
+        """
+        mmcv.mkdir_or_exist(out_dir)
+
+        for performance_info in performances:
+            index, performance = performance_info
+            data_info = dataset.prepare_train_img(index)
+
+            # calc save file path
+            filename = data_info['filename']
+            if data_info['img_prefix'] is not None:
+                filename = osp.join(data_info['img_prefix'], filename)
+            else:
+                filename = data_info['filename']
+            fname, name = osp.splitext(osp.basename(filename))
+            save_filename = fname + '_' + str(round(performance, 3)) + name
+            out_file = osp.join(out_dir, save_filename)
+            imshow_gt_det_bboxes(
+                data_info['img'],
+                data_info,
+                results[index],
+                dataset.CLASSES,
+                gt_bbox_color=dataset.PALETTE,
+                gt_text_color=(200, 200, 200),
+                gt_mask_color=dataset.PALETTE,
+                det_bbox_color=dataset.PALETTE,
+                det_text_color=(200, 200, 200),
+                det_mask_color=dataset.PALETTE,
+                show=self.show,
+                score_thr=self.score_thr,
+                wait_time=self.wait_time,
+                out_file=out_file,
+                overlay_gt_pred=self.overlay_gt_pred)
+
+    def evaluate_and_show(self,
+                          dataset,
+                          results,
+                          topk=20,
+                          show_dir='work_dir'):
+        """Evaluate and show results.
+
+        Args:
+            dataset (Dataset): A PyTorch dataset.
+            results (list): Object detection or panoptic segmentation
+                results from test results pkl file.
+            topk (int): Number of the highest topk and
+                lowest topk after evaluation index sorting. Default: 20.
+            show_dir (str, optional): Directory that receives the ``good``
+                and ``bad`` image sub-folders. Default: 'work_dir'
+        Raises TypeError when ``results[0]`` is not a dict, list or tuple.
+        """
+
+        assert topk > 0
+        if (topk * 2) > len(dataset):
+            topk = len(dataset) // 2
+
+        if isinstance(results[0], dict):
+            good_samples, bad_samples = self.panoptic_evaluate(
+                dataset, results, topk=topk)
+        elif isinstance(results[0], list):
+            good_samples, bad_samples = self.detection_evaluate(
+                dataset, results, topk=topk)
+        elif isinstance(results[0], tuple):
+            results_ = [result[0] for result in results]
+            good_samples, bad_samples = self.detection_evaluate(
+                dataset, results_, topk=topk)
+        else:
+            raise TypeError('The format of result is not supported yet. '
+                            'Current dict for panoptic segmentation and list '
+                            'or tuple for object detection are supported.')
+
+        good_dir = osp.abspath(osp.join(show_dir, 'good'))
+        bad_dir = osp.abspath(osp.join(show_dir, 'bad'))
+        self._save_image_gts_results(dataset, results, good_samples, good_dir)
+        self._save_image_gts_results(dataset, results, bad_samples, bad_dir)
+
+    def detection_evaluate(self, dataset, results, topk=20, eval_fn=None):
+        """Evaluation for object detection.
+
+        Args:
+            dataset (Dataset): A PyTorch dataset.
+            results (list): Object detection results from test
+                results pkl file.
+            topk (int): Number of the highest topk and
+                lowest topk after evaluation index sorting. Default: 20.
+            eval_fn (callable, optional): Eval function, Default: None.
+
+        Returns:
+            tuple: A tuple contains good samples and bad samples.
+                good_mAPs (list[tuple[int, float]]): The topk highest
+                    scoring samples as (index in dataset, mAP) pairs,
+                    in ascending mAP order.
+                bad_mAPs (list[tuple[int, float]]): The topk lowest
+                    scoring samples as (index in dataset, mAP) pairs,
+                    in ascending mAP order.
+        """
+        if eval_fn is None:
+            eval_fn = bbox_map_eval
+        else:
+            assert callable(eval_fn)
+
+        prog_bar = mmcv.ProgressBar(len(results))
+        _mAPs = {}
+        for i, result in enumerate(results):
+            # self.dataset[i] should not call directly
+            # because there is a risk of mismatch
+            data_info = dataset.prepare_train_img(i)
+            mAP = eval_fn(result, data_info['ann_info'])
+            _mAPs[i] = mAP
+            prog_bar.update()
+        # sort ascending by mAP: the worst samples come first, the best last
+        _mAPs = list(sorted(_mAPs.items(), key=lambda kv: kv[1]))
+        good_mAPs = _mAPs[-topk:]
+        bad_mAPs = _mAPs[:topk]
+
+        return good_mAPs, bad_mAPs
+
+    def panoptic_evaluate(self, dataset, results, topk=20):
+        """Evaluation for panoptic segmentation.
+
+        Args:
+            dataset (Dataset): A PyTorch dataset.
+            results (list): Panoptic segmentation results from test
+                results pkl file.
+            topk (int): Number of the highest topk and
+                lowest topk after evaluation index sorting. Default: 20.
+
+        Returns:
+            tuple: A tuple contains good samples and bad samples.
+                good_pqs (list[tuple[int, float]]): The topk highest
+                    scoring samples as (index in dataset, PQ) pairs,
+                    in ascending PQ order.
+                bad_pqs (list[tuple[int, float]]): The topk lowest
+                    scoring samples as (index in dataset, PQ) pairs,
+                    in ascending PQ order.
+        """
+        # image to annotations
+        gt_json = dataset.coco.img_ann_map
+
+        result_files, tmp_dir = dataset.format_results(results)
+        pred_json = mmcv.load(result_files['panoptic'])['annotations']
+        pred_folder = osp.join(tmp_dir.name, 'panoptic')
+        gt_folder = dataset.seg_prefix
+
+        pqs = {}
+        prog_bar = mmcv.ProgressBar(len(results))
+        for i in range(len(results)):
+            data_info = dataset.prepare_train_img(i)
+            image_id = data_info['img_info']['id']
+            gt_ann = {
+                'image_id': image_id,
+                'segments_info': gt_json[image_id],
+                'file_name': data_info['img_info']['segm_file']
+            }
+            pred_ann = pred_json[i]
+            pq_stat = pq_compute_single_core(
+                i, [(gt_ann, pred_ann)],
+                gt_folder,
+                pred_folder,
+                dataset.categories,
+                dataset.file_client,
+                print_log=False)
+            pq_results, classwise_results = pq_stat.pq_average(
+                dataset.categories, isthing=None)
+            pqs[i] = pq_results['pq']
+            prog_bar.update()
+
+        if tmp_dir is not None:
+            tmp_dir.cleanup()
+
+        # sort ascending by PQ: the worst samples come first, the best last
+        pqs = list(sorted(pqs.items(), key=lambda kv: kv[1]))
+        good_pqs = pqs[-topk:]
+        bad_pqs = pqs[:topk]
+
+        return good_pqs, bad_pqs
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='MMDet eval image prediction result for each')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument(
+        'prediction_path', help='prediction path where test pkl result')
+    parser.add_argument(
+        'show_dir', help='directory where painted images will be saved')
+    parser.add_argument('--show', action='store_true', help='show results')
+    parser.add_argument(
+        '--wait-time',
+        type=float,
+        default=0,
+        help='the interval of show (s), 0 is block')
+    parser.add_argument(
+        '--topk',
+        default=20,
+        type=int,
+        help='saved Number of the highest topk '
+        'and lowest topk after index sorting')
+    parser.add_argument(
+        '--show-score-thr',
+        type=float,
+        default=0,
+        help='score threshold (default: 0.)')
+    parser.add_argument(
+        '--overlay-gt-pred',
+        action='store_true',  # help pieces are concatenated: keep the spaces
+        help='whether to plot gts and predictions on the same image. '
+        'If False, predictions and gts will be plotted on two same '
+        'image which will be concatenated in vertical direction. '
+        'The image above is drawn with gt, and the image below is '
+        'drawn with the prediction result.')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    # check the prediction file before the config is loaded
+    mmcv.check_file_exist(args.prediction_path)
+
+    cfg = Config.fromfile(args.config)
+
+    # replace the ${key} with the value of cfg.key
+    cfg = replace_cfg_vals(cfg)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+    cfg.data.test.test_mode = True
+    # prepare test cfg: drop samples_per_gpu, reuse train loading pipeline
+    cfg.data.test.pop('samples_per_gpu', 0)
+    if cfg.data.train.type in ('MultiImageMixDataset', 'ClassBalancedDataset',
+                               'RepeatDataset', 'ConcatDataset'):
+        cfg.data.test.pipeline = get_loading_pipeline(
+            cfg.data.train.dataset.pipeline)
+    else:
+        cfg.data.test.pipeline = get_loading_pipeline(cfg.data.train.pipeline)
+
+    dataset = build_dataset(cfg.data.test)
+    outputs = mmcv.load(args.prediction_path)
+    # score every sample and save the topk best / worst visualisations
+    result_visualizer = ResultVisualizer(args.show, args.wait_time,
+                                         args.show_score_thr,
+                                         args.overlay_gt_pred)
+    result_visualizer.evaluate_and_show(
+        dataset, outputs, topk=args.topk, show_dir=args.show_dir)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/analysis_tools/benchmark.py b/tools/analysis_tools/benchmark.py
new file mode 100755
index 0000000..c956968
--- /dev/null
+++ b/tools/analysis_tools/benchmark.py
@@ -0,0 +1,195 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import copy
+import os
+import time
+
+import torch
+from mmcv import Config, DictAction
+from mmcv.cnn import fuse_conv_bn
+from mmcv.parallel import MMDistributedDataParallel
+from mmcv.runner import init_dist, load_checkpoint, wrap_fp16_model
+
+from mmdet.datasets import (build_dataloader, build_dataset,
+                            replace_ImageToTensor)
+from mmdet.models import build_detector
+from mmdet.utils import replace_cfg_vals, update_data_root
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='MMDet benchmark a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument(
+        '--repeat-num',
+        type=int,
+        default=1,
+        help='number of repeat times of measurement for averaging the results')
+    parser.add_argument(
+        '--max-iter', type=int, default=2000, help='num of max iter')
+    parser.add_argument(
+        '--log-interval', type=int, default=50, help='interval of logging')
+    parser.add_argument(
+        '--fuse-conv-bn',
+        action='store_true',  # help pieces are concatenated: keep the space
+        help='Whether to fuse conv and bn, this will slightly increase '
+        'the inference speed')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--local_rank', type=int, default=0)
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+    return args
+
+
+def measure_inference_speed(cfg, checkpoint, max_iter, log_interval,
+                            is_fuse_conv_bn):
+    # measure the inference fps of the model; honour cfg.cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+    cfg.model.pretrained = None
+    cfg.data.test.test_mode = True
+
+    # build the dataloader
+    samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
+    if samples_per_gpu > 1:
+        # Replace 'ImageToTensor' to 'DefaultFormatBundle'
+        cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
+    dataset = build_dataset(cfg.data.test)
+    data_loader = build_dataloader(
+        dataset,
+        samples_per_gpu=1,  # fixed to 1, whatever the config value was
+        # Because multiple processes will occupy additional CPU resources,
+        # FPS statistics will be more unstable when workers_per_gpu is not 0.
+        # It is reasonable to set workers_per_gpu to 0.
+        workers_per_gpu=0,
+        dist=True,
+        shuffle=False)
+
+    # build the model and load checkpoint
+    cfg.model.train_cfg = None
+    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+    fp16_cfg = cfg.get('fp16', None)
+    if fp16_cfg is not None:
+        wrap_fp16_model(model)
+    load_checkpoint(model, checkpoint, map_location='cpu')
+    if is_fuse_conv_bn:
+        model = fuse_conv_bn(model)
+
+    model = MMDistributedDataParallel(
+        model.cuda(),
+        device_ids=[torch.cuda.current_device()],
+        broadcast_buffers=False)
+    model.eval()
+
+    # the first several iterations may be very slow so skip them
+    num_warmup = 5
+    pure_inf_time = 0
+    fps = 0  # returned unchanged if no report below is ever reached
+
+    # time one forward pass per batch, for at most `max_iter` batches
+    for i, data in enumerate(data_loader):
+        # synchronize around the forward pass so the timer covers it
+        torch.cuda.synchronize()
+        start_time = time.perf_counter()
+
+        with torch.no_grad():
+            model(return_loss=False, rescale=True, **data)
+
+        torch.cuda.synchronize()
+        elapsed = time.perf_counter() - start_time
+
+        if i >= num_warmup:
+            pure_inf_time += elapsed
+            if (i + 1) % log_interval == 0:
+                fps = (i + 1 - num_warmup) / pure_inf_time
+                print(
+                    f'Done image [{i + 1:<3}/ {max_iter}], '
+                    f'fps: {fps:.1f} img / s, '
+                    f'times per image: {1000 / fps:.1f} ms / img',
+                    flush=True)
+        # NOTE(review): not reached when the loader has < max_iter batches
+        if (i + 1) == max_iter:
+            fps = (i + 1 - num_warmup) / pure_inf_time  # 0 time if <= warmup
+            print(
+                f'Overall fps: {fps:.1f} img / s, '
+                f'times per image: {1000 / fps:.1f} ms / img',
+                flush=True)
+            break
+    return fps
+
+
+def repeat_measure_inference_speed(cfg,
+                                   checkpoint,
+                                   max_iter,
+                                   log_interval,
+                                   is_fuse_conv_bn,
+                                   repeat_num=1):
+    assert repeat_num >= 1
+
+    fps_list = []
+
+    for _ in range(repeat_num):
+        # measure on a copy: measure_inference_speed() mutates its cfg
+        cp_cfg = copy.deepcopy(cfg)
+
+        fps_list.append(
+            measure_inference_speed(cp_cfg, checkpoint, max_iter, log_interval,
+                                    is_fuse_conv_bn))
+    # NOTE: returns the fps list for repeat_num > 1, else a single float
+    if repeat_num > 1:
+        fps_list_ = [round(fps, 1) for fps in fps_list]
+        times_pre_image_list_ = [round(1000 / fps, 1) for fps in fps_list]
+        mean_fps_ = sum(fps_list_) / len(fps_list_)  # mean of rounded values
+        mean_times_pre_image_ = sum(times_pre_image_list_) / len(
+            times_pre_image_list_)
+        print(
+            f'Overall fps: {fps_list_}[{mean_fps_:.1f}] img / s, '
+            f'times per image: '
+            f'{times_pre_image_list_}[{mean_times_pre_image_:.1f}] ms / img',
+            flush=True)
+        return fps_list
+
+    return fps_list[0]
+
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+
+    # replace the ${key} with the value of cfg.key
+    cfg = replace_cfg_vals(cfg)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+    # the benchmark wraps the model in MMDistributedDataParallel
+    if args.launcher == 'none':
+        raise NotImplementedError('Only supports distributed mode')
+    else:
+        init_dist(args.launcher, **cfg.dist_params)
+
+    repeat_measure_inference_speed(cfg, args.checkpoint, args.max_iter,
+                                   args.log_interval, args.fuse_conv_bn,
+                                   args.repeat_num)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/analysis_tools/coco_error_analysis.py b/tools/analysis_tools/coco_error_analysis.py
new file mode 100755
index 0000000..102ea4e
--- /dev/null
+++ b/tools/analysis_tools/coco_error_analysis.py
@@ -0,0 +1,339 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os
+from argparse import ArgumentParser
+from multiprocessing import Pool
+
+import matplotlib.pyplot as plt
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+
+
+def makeplot(rs, ps, outDir, class_name, iou_type):
+    cs = np.vstack([  # one RGB fill colour per entry of `types`
+        np.ones((2, 3)),  # white for the first two types (C75, C50)
+        np.array([0.31, 0.51, 0.74]),
+        np.array([0.75, 0.31, 0.30]),
+        np.array([0.36, 0.90, 0.38]),
+        np.array([0.50, 0.39, 0.64]),
+        np.array([1, 0.6, 0]),
+    ])
+    areaNames = ['allarea', 'small', 'medium', 'large']
+    types = ['C75', 'C50', 'Loc', 'Sim', 'Oth', 'BG', 'FN']
+    for i in range(len(areaNames)):  # one saved figure per area range
+        area_ps = ps[..., i, 0]  # area range i, index 0 of the last axis
+        figure_title = iou_type + '-' + class_name + '-' + areaNames[i]
+        aps = [ps_.mean() for ps_ in area_ps]
+        ps_curve = [
+            ps_.mean(axis=1) if ps_.ndim > 1 else ps_ for ps_ in area_ps
+        ]
+        ps_curve.insert(0, np.zeros(ps_curve[0].shape))  # zero baseline
+        fig = plt.figure()
+        ax = plt.subplot(111)
+        for k in range(len(types)):
+            ax.plot(rs, ps_curve[k + 1], color=[0, 0, 0], linewidth=0.5)
+            ax.fill_between(  # band between consecutive curves
+                rs,
+                ps_curve[k],
+                ps_curve[k + 1],
+                color=cs[k],
+                label=str(f'[{aps[k]:.3f}]' + types[k]),
+            )
+        plt.xlabel('recall')
+        plt.ylabel('precision')
+        plt.xlim(0, 1.0)
+        plt.ylim(0, 1.0)
+        plt.title(figure_title)
+        plt.legend()
+        # the figure is written to `outDir` and closed, it is never shown
+        fig.savefig(outDir + f'/{figure_title}.png')
+        plt.close(fig)
+
+
+def autolabel(ax, rects):
+    """Attach a text label above each bar in *rects*, displaying its height."""
+    for rect in rects:
+        height = rect.get_height()
+        if height > 0 and height <= 1:  # fractions in (0, 1] are shown x100
+            text_label = '{:2.0f}'.format(height * 100)
+        else:  # any other height is printed as-is, rounded to an integer
+            text_label = '{:2.0f}'.format(height)
+        ax.annotate(
+            text_label,
+            xy=(rect.get_x() + rect.get_width() / 2, height),
+            xytext=(0, 3),  # 3 points vertical offset
+            textcoords='offset points',
+            ha='center',
+            va='bottom',
+            fontsize='x-small',
+        )
+
+
+def makebarplot(rs, ps, outDir, class_name, iou_type):  # `rs` is unused
+    areaNames = ['allarea', 'small', 'medium', 'large']
+    types = ['C75', 'C50', 'Loc', 'Sim', 'Oth', 'BG', 'FN']
+    fig, ax = plt.subplots()
+    x = np.arange(len(areaNames))  # the areaNames locations
+    width = 0.60  # the width of the bars
+    rects_list = []
+    figure_title = iou_type + '-' + class_name + '-' + 'ap bar plot'
+    for i in range(len(types) - 1):  # the last type ('FN') gets no bars
+        type_ps = ps[i, ..., 0]
+        aps = [ps_.mean() for ps_ in type_ps.T]  # one mean per area range
+        rects_list.append(
+            ax.bar(
+                x - width / 2 + (i + 1) * width / len(types),
+                aps,
+                width / len(types),
+                label=types[i],
+            ))
+
+    # Add some text for labels, title and custom x-axis tick labels, etc.
+    ax.set_ylabel('Mean Average Precision (mAP)')
+    ax.set_title(figure_title)
+    ax.set_xticks(x)
+    ax.set_xticklabels(areaNames)
+    ax.legend()
+
+    # Add score texts over bars
+    for rects in rects_list:
+        autolabel(ax, rects)
+
+    # Save plot
+    fig.savefig(outDir + f'/{figure_title}.png')
+    plt.close(fig)
+
+
+def get_gt_area_group_numbers(cocoEval):  # -> {area range label: count}
+    areaRng = cocoEval.params.areaRng
+    areaRngStr = [str(aRng) for aRng in areaRng]  # str() makes ranges hashable
+    areaRngLbl = cocoEval.params.areaRngLbl
+    areaRngStr2areaRngLbl = dict(zip(areaRngStr, areaRngLbl))
+    areaRngLbl2Number = dict.fromkeys(areaRngLbl, 0)
+    for evalImg in cocoEval.evalImgs:
+        if evalImg:  # falsy entries are skipped
+            for gtIgnore in evalImg['gtIgnore']:
+                if not gtIgnore:  # count only the gts not flagged as ignored
+                    aRngLbl = areaRngStr2areaRngLbl[str(evalImg['aRng'])]
+                    areaRngLbl2Number[aRngLbl] += 1
+    return areaRngLbl2Number
+
+
+def make_gt_area_group_numbers_plot(cocoEval, outDir, verbose=True):
+    areaRngLbl2Number = get_gt_area_group_numbers(cocoEval)
+    areaRngLbl = areaRngLbl2Number.keys()
+    if verbose:
+        print('number of annotations per area group:', areaRngLbl2Number)
+
+    # Init figure
+    fig, ax = plt.subplots()
+    x = np.arange(len(areaRngLbl))  # the areaNames locations
+    width = 0.60  # the width of the bars
+    figure_title = 'number of annotations per area group'
+    # one bar per area range label, its height is the annotation count
+    rects = ax.bar(x, areaRngLbl2Number.values(), width)
+
+    # Add some text for labels, title and custom x-axis tick labels, etc.
+    ax.set_ylabel('Number of annotations')
+    ax.set_title(figure_title)
+    ax.set_xticks(x)
+    ax.set_xticklabels(areaRngLbl)
+
+    # Add the counts as text over the bars
+    autolabel(ax, rects)
+
+    # Save plot
+    fig.tight_layout()
+    fig.savefig(outDir + f'/{figure_title}.png')
+    plt.close(fig)
+
+
+def make_gt_area_histogram_plot(cocoEval, outDir):
+    n_bins = 100
+    areas = [ann['area'] for ann in cocoEval.cocoGt.anns.values()]
+
+    # init figure
+    figure_title = 'gt annotation areas histogram plot'
+    fig, ax = plt.subplots()
+
+    # histogram over the square root of every gt annotation area
+    ax.hist(np.sqrt(areas), bins=n_bins)
+
+    # Add some text for labels, title and custom x-axis tick labels, etc.
+    ax.set_xlabel('Squareroot Area')
+    ax.set_ylabel('Number of annotations')
+    ax.set_title(figure_title)
+
+    # Save plot
+    fig.tight_layout()
+    fig.savefig(outDir + f'/{figure_title}.png')
+    plt.close(fig)
+
+
+def analyze_individual_category(k,
+                                cocoDt,
+                                cocoGt,
+                                catId,
+                                iou_type,
+                                areas=None):
+    nm = cocoGt.loadCats(catId)[0]
+    print(f'--------------analyzing {k + 1}-{nm["name"]}---------------')
+    ps_ = {}
+    imgIds = cocoGt.getImgIds()
+    # keep only the detections of this category; work on a copy so the
+    # caller's cocoDt is left untouched
+    dt = copy.deepcopy(cocoDt)
+    dt.dataset['annotations'] = [
+        ann for ann in dt.dataset['annotations']
+        if ann['category_id'] == catId
+    ]
+    # re-index after replacing the annotation list
+    dt.createIndex()
+    # compute precision but ignore superclass confusion
+    gt = copy.deepcopy(cocoGt)
+    child_catIds = gt.getCatIds(supNms=[nm['supercategory']])
+    for idx, ann in enumerate(gt.dataset['annotations']):
+        if ann['category_id'] in child_catIds and ann['category_id'] != catId:
+            gt.dataset['annotations'][idx]['ignore'] = 1
+            gt.dataset['annotations'][idx]['iscrowd'] = 1
+            gt.dataset['annotations'][idx]['category_id'] = catId
+    cocoEval = COCOeval(gt, copy.deepcopy(dt), iou_type)
+    cocoEval.params.imgIds = imgIds
+    cocoEval.params.maxDets = [100]
+    cocoEval.params.iouThrs = [0.1]
+    cocoEval.params.useCats = 1
+    if areas:
+        cocoEval.params.areaRng = [[0**2, areas[2]], [0**2, areas[0]],
+                                   [areas[0], areas[1]], [areas[1], areas[2]]]
+    cocoEval.evaluate()
+    cocoEval.accumulate()
+    ps_supercategory = cocoEval.eval['precision'][0, :, k, :, :]
+    ps_['ps_supercategory'] = ps_supercategory
+    # compute precision but ignore any class confusion
+    gt = copy.deepcopy(cocoGt)
+    for idx, ann in enumerate(gt.dataset['annotations']):
+        if ann['category_id'] != catId:
+            gt.dataset['annotations'][idx]['ignore'] = 1
+            gt.dataset['annotations'][idx]['iscrowd'] = 1
+            gt.dataset['annotations'][idx]['category_id'] = catId
+    cocoEval = COCOeval(gt, copy.deepcopy(dt), iou_type)
+    cocoEval.params.imgIds = imgIds
+    cocoEval.params.maxDets = [100]
+    cocoEval.params.iouThrs = [0.1]
+    cocoEval.params.useCats = 1
+    if areas:
+        cocoEval.params.areaRng = [[0**2, areas[2]], [0**2, areas[0]],
+                                   [areas[0], areas[1]], [areas[1], areas[2]]]
+    cocoEval.evaluate()
+    cocoEval.accumulate()
+    ps_allcategory = cocoEval.eval['precision'][0, :, k, :, :]
+    ps_['ps_allcategory'] = ps_allcategory
+    return k, ps_
+
+
+def analyze_results(res_file,
+                    ann_file,
+                    res_types,
+                    out_dir,
+                    extraplots=None,
+                    areas=None):
+    for res_type in res_types:
+        assert res_type in ['bbox', 'segm']
+    if areas:
+        assert len(areas) == 3, ('3 integers should be specified as areas, '
+                                 'representing 3 area regions')
+
+    directory = os.path.dirname(out_dir + '/')
+    if not os.path.exists(directory):
+        print(f'-------------create {out_dir}-----------------')
+        os.makedirs(directory)
+
+    cocoGt = COCO(ann_file)
+    cocoDt = cocoGt.loadRes(res_file)
+    imgIds = cocoGt.getImgIds()
+    for res_type in res_types:
+        res_out_dir = out_dir + '/' + res_type + '/'
+        res_directory = os.path.dirname(res_out_dir)
+        if not os.path.exists(res_directory):
+            print(f'-------------create {res_out_dir}-----------------')
+            os.makedirs(res_directory)
+        iou_type = res_type
+        cocoEval = COCOeval(
+            copy.deepcopy(cocoGt), copy.deepcopy(cocoDt), iou_type)
+        cocoEval.params.imgIds = imgIds
+        cocoEval.params.iouThrs = [0.75, 0.5, 0.1]
+        cocoEval.params.maxDets = [100]
+        if areas:
+            cocoEval.params.areaRng = [[0**2, areas[2]], [0**2, areas[0]],
+                                       [areas[0], areas[1]],
+                                       [areas[1], areas[2]]]
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+        ps = cocoEval.eval['precision']
+        ps = np.vstack([ps, np.zeros((4, *ps.shape[1:]))])
+        catIds = cocoGt.getCatIds()
+        recThrs = cocoEval.params.recThrs
+        args = [(k, cocoDt, cocoGt, catId, iou_type, areas)
+                for k, catId in enumerate(catIds)]
+        with Pool(processes=max(1, min(48, len(args)))) as pool:
+            cat_results = pool.starmap(analyze_individual_category, args)
+        for k, catId in enumerate(catIds):
+            nm = cocoGt.loadCats(catId)[0]
+            print(f'--------------saving {k + 1}-{nm["name"]}---------------')
+            analyze_result = cat_results[k]
+            assert k == analyze_result[0]
+            ps_supercategory = analyze_result[1]['ps_supercategory']
+            ps_allcategory = analyze_result[1]['ps_allcategory']
+            # compute precision but ignore superclass confusion
+            ps[3, :, k, :, :] = ps_supercategory
+            # compute precision but ignore any class confusion
+            ps[4, :, k, :, :] = ps_allcategory
+            # fill in background and false negative errors and plot
+            ps[ps == -1] = 0
+            ps[5, :, k, :, :] = ps[4, :, k, :, :] > 0
+            ps[6, :, k, :, :] = 1.0
+            makeplot(recThrs, ps[:, :, k], res_out_dir, nm['name'], iou_type)
+            if extraplots:
+                makebarplot(recThrs, ps[:, :, k], res_out_dir, nm['name'],
+                            iou_type)
+        makeplot(recThrs, ps, res_out_dir, 'allclass', iou_type)
+        if extraplots:
+            makebarplot(recThrs, ps, res_out_dir, 'allclass', iou_type)
+            make_gt_area_group_numbers_plot(
+                cocoEval=cocoEval, outDir=res_out_dir, verbose=True)
+            make_gt_area_histogram_plot(cocoEval=cocoEval, outDir=res_out_dir)
+
+
+def main():
+    parser = ArgumentParser(description='COCO Error Analysis Tool')
+    parser.add_argument('result', help='result file (json format) path')
+    parser.add_argument('out_dir', help='dir to save analyze result images')
+    parser.add_argument(
+        '--ann',
+        default='data/coco/annotations/instances_val2017.json',
+        help='annotation file path')
+    parser.add_argument(  # analyze_results() accepts only 'bbox' and 'segm'
+        '--types', type=str, nargs='+', default=['bbox'], help='result types')
+    parser.add_argument(
+        '--extraplots',
+        action='store_true',
+        help='export extra bar/stat plots')
+    parser.add_argument(
+        '--areas',
+        type=int,
+        nargs='+',  # analyze_results() asserts exactly three values
+        default=[1024, 9216, 10000000000],
+        help='area regions')
+    args = parser.parse_args()
+    analyze_results(
+        args.result,
+        args.ann,
+        args.types,
+        out_dir=args.out_dir,
+        extraplots=args.extraplots,
+        areas=args.areas)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/analysis_tools/coco_occluded_separated_recall.py b/tools/analysis_tools/coco_occluded_separated_recall.py
new file mode 100755
index 0000000..cbc0ee2
--- /dev/null
+++ b/tools/analysis_tools/coco_occluded_separated_recall.py
@@ -0,0 +1,44 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from argparse import ArgumentParser
+
+import mmcv
+from mmcv.utils import print_log
+
+from mmdet.datasets import OccludedSeparatedCocoDataset
+
+
+def main():  # CLI entry: evaluate saved pkl results, optionally dump them
+    parser = ArgumentParser(
+        description='Compute recall of COCO occluded and separated masks '
+        'presented in paper https://arxiv.org/abs/2210.10046.')
+    parser.add_argument('result', help='result file (pkl format) path')
+    parser.add_argument('--out', help='file path to save evaluation results')
+    parser.add_argument(
+        '--score-thr',
+        type=float,
+        default=0.3,
+        help='Score threshold for the recall calculation. Defaults to 0.3')
+    parser.add_argument(
+        '--iou-thr',
+        type=float,
+        default=0.75,
+        help='IoU threshold for the recall calculation. Defaults to 0.75.')
+    parser.add_argument(
+        '--ann',
+        default='data/coco/annotations/instances_val2017.json',
+        help='coco annotation file path')
+    args = parser.parse_args()
+    results = mmcv.load(args.result)
+    assert isinstance(results[0], tuple), \
+        'The results must be predicted by instance segmentation model.'
+    dataset = OccludedSeparatedCocoDataset(
+        ann_file=args.ann, pipeline=[], test_mode=True)
+    metric_res = dataset.evaluate(  # apply the thresholds given on the CLI
+        results, score_thr=args.score_thr, iou_thr=args.iou_thr)
+    if args.out is not None:
+        mmcv.dump(metric_res, args.out)
+        print_log(f'Evaluation results have been saved to {args.out}.')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/analysis_tools/confusion_matrix.py b/tools/analysis_tools/confusion_matrix.py
new file mode 100755
index 0000000..5b52ea4
--- /dev/null
+++ b/tools/analysis_tools/confusion_matrix.py
@@ -0,0 +1,273 @@
+import argparse
+import os
+
+import matplotlib.pyplot as plt
+import mmcv
+import numpy as np
+from matplotlib.ticker import MultipleLocator
+from mmcv import Config, DictAction
+from mmcv.ops import nms
+
+from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
+from mmdet.datasets import build_dataset
+from mmdet.utils import replace_cfg_vals, update_data_root
+
+
+def parse_args():  # build and parse the command-line arguments
+    parser = argparse.ArgumentParser(
+        description='Generate confusion matrix from detection results')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument(
+        'prediction_path', help='prediction path where test .pkl result')
+    parser.add_argument(
+        'save_dir', help='directory where confusion matrix will be saved')
+    parser.add_argument(
+        '--show', action='store_true', help='show confusion matrix')
+    parser.add_argument(
+        '--color-theme',
+        default='plasma',
+        help='theme of the matrix color map')
+    parser.add_argument(
+        '--score-thr',
+        type=float,
+        default=0.3,
+        help='score threshold to filter detection bboxes')
+    parser.add_argument(
+        '--tp-iou-thr',
+        type=float,
+        default=0.5,
+        help='IoU threshold to be considered as matched')
+    parser.add_argument(
+        '--nms-iou-thr',
+        type=float,
+        default=None,
+        help='nms IoU threshold, only applied when users want to change the '
+        'nms IoU threshold.')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+
+def calculate_confusion_matrix(dataset,
+                               results,
+                               score_thr=0,
+                               nms_iou_thr=None,
+                               tp_iou_thr=0.5):
+    """Calculate the confusion matrix over a whole dataset.
+
+    Args:
+        dataset (Dataset): Test or val dataset.
+        results (list): Per-image detections; for tuples only item 0 is used.
+        score_thr (float, optional): Score threshold for bboxes. Default: 0.
+        nms_iou_thr (float, optional): IoU threshold of an extra NMS pass on
+            the detections; no extra NMS when falsy. Default: None.
+        tp_iou_thr (float, optional): IoU threshold for a match. Default: 0.5.
+
+    Returns:
+        ndarray: (num_classes + 1, num_classes + 1) counts, indexed [gt, det].
+    """
+    num_classes = len(dataset.CLASSES)
+    confusion_matrix = np.zeros(shape=[num_classes + 1, num_classes + 1])
+    assert len(dataset) == len(results)
+    prog_bar = mmcv.ProgressBar(len(results))
+    for idx, per_img_res in enumerate(results):
+        if isinstance(per_img_res, tuple):
+            res_bboxes, _ = per_img_res
+        else:
+            res_bboxes = per_img_res
+        ann = dataset.get_ann_info(idx)
+        gt_bboxes = ann['bboxes']
+        labels = ann['labels']
+        analyze_per_img_dets(confusion_matrix, gt_bboxes, labels, res_bboxes,
+                             score_thr, tp_iou_thr, nms_iou_thr)
+        prog_bar.update()
+    return confusion_matrix
+
+
+def analyze_per_img_dets(confusion_matrix,
+                         gt_bboxes,
+                         gt_labels,
+                         result,
+                         score_thr=0,
+                         tp_iou_thr=0.5,
+                         nms_iou_thr=None):
+    """Accumulate the detections of one image into ``confusion_matrix``.
+
+    Args:
+        confusion_matrix (ndarray): The confusion matrix, updated in place,
+            has shape (num_classes + 1, num_classes + 1).
+        gt_bboxes (ndarray): Ground truth bboxes, has shape (num_gt, 4).
+        gt_labels (ndarray): Ground truth labels, has shape (num_gt).
+        result (list[ndarray]): Per-class detections, each of shape
+            (num_bboxes, 5): four box coordinates followed by the score.
+        score_thr (float): Score threshold to filter bboxes.
+            Default: 0.
+        tp_iou_thr (float): IoU threshold to be considered as matched.
+            Default: 0.5.
+        nms_iou_thr (float|optional): nms IoU threshold, the detection results
+            have done nms in the detector, only applied when users want to
+            change the nms IoU threshold. Default: None.
+    """
+    true_positives = np.zeros_like(gt_labels)  # per-GT count of correct hits
+    for det_label, det_bboxes in enumerate(result):
+        if nms_iou_thr:
+            det_bboxes, _ = nms(
+                det_bboxes[:, :4],
+                det_bboxes[:, -1],
+                nms_iou_thr,
+                score_threshold=score_thr)
+        ious = bbox_overlaps(det_bboxes[:, :4], gt_bboxes)
+        for i, det_bbox in enumerate(det_bboxes):
+            score = det_bbox[4]
+            det_match = 0  # number of GT boxes this detection overlaps
+            if score >= score_thr:
+                for j, gt_label in enumerate(gt_labels):
+                    if ious[i, j] >= tp_iou_thr:
+                        det_match += 1
+                        if gt_label == det_label:
+                            true_positives[j] += 1  # TP
+                        confusion_matrix[gt_label, det_label] += 1
+                if det_match == 0:  # BG FP
+                    confusion_matrix[-1, det_label] += 1
+    for num_tp, gt_label in zip(true_positives, gt_labels):
+        if num_tp == 0:  # FN
+            confusion_matrix[gt_label, -1] += 1
+
+
+def plot_confusion_matrix(confusion_matrix,
+                          labels,
+                          save_dir=None,
+                          show=True,
+                          title='Normalized Confusion Matrix',
+                          color_theme='plasma'):
+    """Draw confusion matrix with matplotlib.
+
+    Args:
+        confusion_matrix (ndarray): The confusion matrix.
+        labels (list[str]): List of class names.
+        save_dir (str|optional): If set, save the confusion matrix plot to the
+            given path. Default: None.
+        show (bool): Whether to show the plot. Default: True.
+        title (str): Title of the plot. Default: `Normalized Confusion Matrix`.
+        color_theme (str): Theme of the matrix color map. Default: `plasma`.
+    """
+    # normalize each ground-truth row to percentages (all-zero rows give NaN)
+    per_label_sums = confusion_matrix.sum(axis=1)[:, np.newaxis]
+    confusion_matrix = \
+        confusion_matrix.astype(np.float32) / per_label_sums * 100
+
+    num_classes = len(labels)
+    fig, ax = plt.subplots(
+        figsize=(0.5 * num_classes, 0.5 * num_classes * 0.8), dpi=180)
+    cmap = plt.get_cmap(color_theme)
+    im = ax.imshow(confusion_matrix, cmap=cmap)
+    plt.colorbar(mappable=im, ax=ax)
+
+    title_font = {'weight': 'bold', 'size': 12}
+    ax.set_title(title, fontdict=title_font)
+    label_font = {'size': 10}
+    plt.ylabel('Ground Truth Label', fontdict=label_font)
+    plt.xlabel('Prediction Label', fontdict=label_font)
+
+    # draw locator
+    xmajor_locator = MultipleLocator(1)
+    xminor_locator = MultipleLocator(0.5)
+    ax.xaxis.set_major_locator(xmajor_locator)
+    ax.xaxis.set_minor_locator(xminor_locator)
+    ymajor_locator = MultipleLocator(1)
+    yminor_locator = MultipleLocator(0.5)
+    ax.yaxis.set_major_locator(ymajor_locator)
+    ax.yaxis.set_minor_locator(yminor_locator)
+
+    # draw grid
+    ax.grid(True, which='minor', linestyle='-')
+
+    # draw label
+    ax.set_xticks(np.arange(num_classes))
+    ax.set_yticks(np.arange(num_classes))
+    ax.set_xticklabels(labels)
+    ax.set_yticklabels(labels)
+
+    ax.tick_params(
+        axis='x', bottom=False, top=True, labelbottom=False, labeltop=True)
+    plt.setp(
+        ax.get_xticklabels(), rotation=45, ha='left', rotation_mode='anchor')
+
+    # draw confusion matrix values; NaN cells are printed as -1%
+    for i in range(num_classes):
+        for j in range(num_classes):
+            ax.text(
+                j,
+                i,
+                '{}%'.format(
+                    int(confusion_matrix[
+                        i,
+                        j]) if not np.isnan(confusion_matrix[i, j]) else -1),
+                ha='center',
+                va='center',
+                color='w',
+                size=7)
+
+    ax.set_ylim(len(confusion_matrix) - 0.5, -0.5)  # matplotlib>3.1.1
+
+    fig.tight_layout()
+    if save_dir is not None:
+        plt.savefig(
+            os.path.join(save_dir, 'confusion_matrix.png'), format='png')
+    if show:
+        plt.show()
+
+
+def main():  # CLI entry point: build the dataset, then compute and plot
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+
+    # replace the ${key} with the value of cfg.key
+    cfg = replace_cfg_vals(cfg)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    results = mmcv.load(args.prediction_path)
+    assert isinstance(results, list)
+    if isinstance(results[0], list):
+        pass  # list-type results are used as they are
+    elif isinstance(results[0], tuple):
+        results = [result[0] for result in results]  # keep item 0 only
+    else:
+        raise TypeError('invalid type of prediction results')
+
+    if isinstance(cfg.data.test, dict):
+        cfg.data.test.test_mode = True
+    elif isinstance(cfg.data.test, list):
+        for ds_cfg in cfg.data.test:
+            ds_cfg.test_mode = True
+    dataset = build_dataset(cfg.data.test)
+
+    confusion_matrix = calculate_confusion_matrix(dataset, results,
+                                                  args.score_thr,
+                                                  args.nms_iou_thr,
+                                                  args.tp_iou_thr)
+    plot_confusion_matrix(
+        confusion_matrix,
+        dataset.CLASSES + ('background', ),
+        save_dir=args.save_dir,
+        show=args.show,
+        color_theme=args.color_theme)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/analysis_tools/eval_metric.py b/tools/analysis_tools/eval_metric.py
new file mode 100755
index 0000000..7caafe9
--- /dev/null
+++ b/tools/analysis_tools/eval_metric.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+
+import mmcv
+from mmcv import Config, DictAction
+
+from mmdet.datasets import build_dataset
+from mmdet.utils import replace_cfg_vals, update_data_root
+
+
+def parse_args():  # build and parse the command-line arguments
+    parser = argparse.ArgumentParser(description='Evaluate metric of the '
+                                     'results saved in pkl format')
+    parser.add_argument('config', help='Config of the model')
+    parser.add_argument('pkl_results', help='Results in pickle format')
+    parser.add_argument(
+        '--format-only',
+        action='store_true',
+        help='Format the output results without perform evaluation. It is '
+        'useful when you want to format the result to a specific format and '
+        'submit it to the test server')
+    parser.add_argument(
+        '--eval',
+        type=str,
+        nargs='+',
+        help='Evaluation metrics, which depends on the dataset, e.g., "bbox",'
+        ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--eval-options',
+        nargs='+',
+        action=DictAction,
+        help='custom options for evaluation, the key-value pair in xxx=yyy '
+        'format will be kwargs for dataset.evaluate() function')
+    args = parser.parse_args()
+    return args
+
+
+def main():  # CLI entry point: format or evaluate results saved as pkl
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+
+    # replace the ${key} with the value of cfg.key
+    cfg = replace_cfg_vals(cfg)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
+    assert args.eval or args.format_only, (
+        'Please specify at least one operation (eval/format the results) with '
+        'the argument "--eval", "--format-only"')
+    if args.eval and args.format_only:
+        raise ValueError('--eval and --format-only cannot be both specified')
+
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+    cfg.data.test.test_mode = True
+
+    dataset = build_dataset(cfg.data.test)
+    outputs = mmcv.load(args.pkl_results)
+
+    kwargs = {} if args.eval_options is None else args.eval_options
+    if args.format_only:
+        dataset.format_results(outputs, **kwargs)
+    if args.eval:
+        eval_kwargs = cfg.get('evaluation', {}).copy()
+        # hard-code way to remove EvalHook args
+        for key in [
+                'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
+                'rule'
+        ]:
+            eval_kwargs.pop(key, None)
+        eval_kwargs.update(dict(metric=args.eval, **kwargs))
+        print(dataset.evaluate(outputs, **eval_kwargs))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/analysis_tools/get_flops.py b/tools/analysis_tools/get_flops.py
new file mode 100755
index 0000000..4df8732
--- /dev/null
+++ b/tools/analysis_tools/get_flops.py
@@ -0,0 +1,97 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+
+import numpy as np
+import torch
+from mmcv import Config, DictAction
+
+from mmdet.models import build_detector
+
+try:
+    from mmcv.cnn import get_model_complexity_info
+except ImportError:
+    raise ImportError('Please upgrade mmcv to >0.6.2')
+
+
+def parse_args():  # build and parse the command-line arguments
+    parser = argparse.ArgumentParser(description='Get the FLOPs of a detector')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument(
+        '--shape',
+        type=int,
+        nargs='+',
+        default=[1280, 800],
+        help='input image size')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--size-divisor',
+        type=int,
+        default=32,
+        help='Pad the input image, the minimum size that is divisible '
+        'by size_divisor, -1 means do not pad the image.')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    """Build the detector from the config and print its FLOPs and params."""
+    args = parse_args()
+
+    if len(args.shape) == 1:
+        h = w = args.shape[0]
+    elif len(args.shape) == 2:
+        h, w = args.shape
+    else:
+        raise ValueError('invalid input shape')
+    ori_shape = (3, h, w)
+    divisor = args.size_divisor
+    if divisor > 0:  # round each side up to a multiple of the divisor
+        h = int(np.ceil(h / divisor)) * divisor
+        w = int(np.ceil(w / divisor)) * divisor
+
+    input_shape = (3, h, w)
+
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    model = build_detector(
+        cfg.model,
+        train_cfg=cfg.get('train_cfg'),
+        test_cfg=cfg.get('test_cfg'))
+    if torch.cuda.is_available():
+        model.cuda()
+    model.eval()
+
+    if hasattr(model, 'forward_dummy'):
+        model.forward = model.forward_dummy
+    else:
+        raise NotImplementedError(
+            'FLOPs counter is currently not supported with {}'.
+            format(model.__class__.__name__))
+
+    flops, params = get_model_complexity_info(model, input_shape)
+    split_line = '=' * 30
+
+    if divisor > 0 and \
+            input_shape != ori_shape:
+        print(f'{split_line}\nUse size divisor set input shape '
+              f'from {ori_shape} to {input_shape}\n')
+    print(f'{split_line}\nInput shape: {input_shape}\n'
+          f'Flops: {flops}\nParams: {params}\n{split_line}')
+    print('!!!Please be cautious if you use the results in papers. '
+          'You may need to check if all ops are supported and verify that the '
+          'flops computation is correct.')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/analysis_tools/optimize_anchors.py b/tools/analysis_tools/optimize_anchors.py
new file mode 100755
index 0000000..421998f
--- /dev/null
+++ b/tools/analysis_tools/optimize_anchors.py
@@ -0,0 +1,376 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Optimize anchor settings on a specific dataset.
+
+This script provides two method to optimize YOLO anchors including k-means
+anchor cluster and differential evolution. You can use ``--algorithm k-means``
+and ``--algorithm differential_evolution`` to switch two method.
+
+Example:
+    Use k-means anchor cluster::
+
+        python tools/analysis_tools/optimize_anchors.py ${CONFIG} \
+        --algorithm k-means --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} \
+        --output-dir ${OUTPUT_DIR}
+    Use differential evolution to optimize anchors::
+
+        python tools/analysis_tools/optimize_anchors.py ${CONFIG} \
+        --algorithm differential_evolution \
+        --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} \
+        --output-dir ${OUTPUT_DIR}
+"""
+import argparse
+import os.path as osp
+
+import mmcv
+import numpy as np
+import torch
+from mmcv import Config
+from scipy.optimize import differential_evolution
+
+from mmdet.core import bbox_cxcywh_to_xyxy, bbox_overlaps, bbox_xyxy_to_cxcywh
+from mmdet.datasets import build_dataset
+from mmdet.utils import get_root_logger, replace_cfg_vals, update_data_root
+
+
+def parse_args():  # build and parse the command-line arguments
+    parser = argparse.ArgumentParser(description='Optimize anchor parameters.')
+    parser.add_argument('config', help='Train config file path.')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for calculating.')
+    parser.add_argument(
+        '--input-shape',
+        type=int,
+        nargs='+',
+        default=[608, 608],
+        help='input image size')
+    parser.add_argument(
+        '--algorithm',
+        default='differential_evolution',
+        help='Algorithm used for anchor optimizing. '
+        'Support k-means and differential_evolution for YOLO.')
+    parser.add_argument(
+        '--iters',
+        default=1000,
+        type=int,
+        help='Maximum iterations for optimizer.')
+    parser.add_argument(
+        '--output-dir',
+        default=None,
+        type=str,
+        help='Path to save anchor optimize result.')
+
+    args = parser.parse_args()
+    return args
+
+
+class BaseAnchorOptimizer:
+    """Base class for anchor optimizer.
+
+    Args:
+        dataset (obj:`Dataset`): Dataset object.
+        input_shape (list[int]): Input image shape of the model.
+            Format in [width, height].
+        logger (obj:`logging.Logger`): The logger for logging.
+        device (str, optional): Device used for calculating.
+            Default: 'cuda:0'
+        out_dir (str, optional): Path to save anchor optimize result.
+            Default: None
+    """
+
+    def __init__(self,
+                 dataset,
+                 input_shape,
+                 logger,
+                 device='cuda:0',
+                 out_dir=None):
+        self.dataset = dataset
+        self.input_shape = input_shape
+        self.logger = logger
+        self.device = device
+        self.out_dir = out_dir
+        bbox_whs, img_shapes = self.get_whs_and_shapes()
+        ratios = img_shapes.max(1, keepdims=True) / np.array([input_shape])
+
+        # rescale bbox sizes by longer image side / input (width, height)
+        self.bbox_whs = bbox_whs / ratios
+
+    def get_whs_and_shapes(self):
+        """Get widths and heights of bboxes and shapes of images.
+
+        Returns:
+            tuple[np.ndarray]: Array of bbox shapes and array of image
+            shapes with shape (num_bboxes, 2) in [width, height] format.
+        """
+        self.logger.info('Collecting bboxes from annotation...')
+        bbox_whs = []
+        img_shapes = []
+        prog_bar = mmcv.ProgressBar(len(self.dataset))
+        for idx in range(len(self.dataset)):
+            ann = self.dataset.get_ann_info(idx)
+            data_info = self.dataset.data_infos[idx]
+            img_shape = np.array([data_info['width'], data_info['height']])
+            gt_bboxes = ann['bboxes']
+            for bbox in gt_bboxes:
+                wh = bbox[2:4] - bbox[0:2]
+                img_shapes.append(img_shape)  # one image shape per bbox
+                bbox_whs.append(wh)
+            prog_bar.update()
+        print('\n')
+        bbox_whs = np.array(bbox_whs)
+        img_shapes = np.array(img_shapes)
+        self.logger.info(f'Collected {bbox_whs.shape[0]} bboxes.')
+        return bbox_whs, img_shapes
+
+    def get_zero_center_bbox_tensor(self):
+        """Get a tensor of bboxes centered at (0, 0).
+
+        Returns:
+            Tensor: Tensor of bboxes with shape (num_bboxes, 4)
+            in [xmin, ymin, xmax, ymax] format.
+        """
+        whs = torch.from_numpy(self.bbox_whs).to(
+            self.device, dtype=torch.float32)
+        bboxes = bbox_cxcywh_to_xyxy(
+            torch.cat([torch.zeros_like(whs), whs], dim=1))
+        return bboxes
+
+    def optimize(self):  # to be implemented by subclasses
+        raise NotImplementedError
+
+    def save_result(self, anchors, path=None):  # log; dump JSON if path set
+        anchor_results = []
+        for w, h in anchors:
+            anchor_results.append([round(w), round(h)])
+        self.logger.info(f'Anchor optimize result:{anchor_results}')
+        if path:
+            json_path = osp.join(path, 'anchor_optimize_result.json')
+            mmcv.dump(anchor_results, json_path)
+            self.logger.info(f'Result saved in {json_path}')
+
+
+class YOLOKMeansAnchorOptimizer(BaseAnchorOptimizer):
+    r"""YOLO anchor optimizer using k-means. Code refer to `AlexeyAB/darknet.
+    <https://github.com/AlexeyAB/darknet/blob/master/src/detector.c>`_.
+
+    Args:
+        num_anchors (int) : Number of anchors.
+        iters (int): Maximum iterations for k-means.
+    """
+
+    def __init__(self, num_anchors, iters, **kwargs):
+
+        super(YOLOKMeansAnchorOptimizer, self).__init__(**kwargs)
+        self.num_anchors = num_anchors
+        self.iters = iters
+
+    def optimize(self):  # cluster the anchors, then log/save them
+        anchors = self.kmeans_anchors()
+        self.save_result(anchors, self.out_dir)
+
+    def kmeans_anchors(self):  # returns anchors sorted by area (w * h)
+        self.logger.info(
+            f'Start cluster {self.num_anchors} YOLO anchors with K-means...')
+        bboxes = self.get_zero_center_bbox_tensor()
+        cluster_center_idx = torch.randint(
+            0, bboxes.shape[0], (self.num_anchors, )).to(self.device)
+
+        assignments = torch.zeros((bboxes.shape[0], )).to(self.device)
+        cluster_centers = bboxes[cluster_center_idx]
+        if self.num_anchors == 1:  # single cluster: mean of all bboxes
+            cluster_centers = self.kmeans_maximization(bboxes, assignments,
+                                                       cluster_centers)
+            anchors = bbox_xyxy_to_cxcywh(cluster_centers)[:, 2:].cpu().numpy()
+            anchors = sorted(anchors, key=lambda x: x[0] * x[1])
+            return anchors
+
+        prog_bar = mmcv.ProgressBar(self.iters)
+        for i in range(self.iters):
+            converged, assignments = self.kmeans_expectation(
+                bboxes, assignments, cluster_centers)
+            if converged:
+                self.logger.info(f'K-means process has converged at iter {i}.')
+                break
+            cluster_centers = self.kmeans_maximization(bboxes, assignments,
+                                                       cluster_centers)
+            prog_bar.update()
+        print('\n')
+        avg_iou = bbox_overlaps(bboxes,
+                                cluster_centers).max(1)[0].mean().item()
+
+        anchors = bbox_xyxy_to_cxcywh(cluster_centers)[:, 2:].cpu().numpy()
+        anchors = sorted(anchors, key=lambda x: x[0] * x[1])
+        self.logger.info(f'Anchor cluster finish. Average IOU: {avg_iou}')
+
+        return anchors
+
+    def kmeans_maximization(self, bboxes, assignments, centers):
+        """Maximization part of EM algorithm(Expectation-Maximization)"""
+        new_centers = torch.zeros_like(centers)
+        for i in range(centers.shape[0]):
+            mask = (assignments == i)
+            if mask.sum():  # clusters without members stay all-zero
+                new_centers[i, :] = bboxes[mask].mean(0)
+        return new_centers
+
+    def kmeans_expectation(self, bboxes, assignments, centers):
+        """Expectation part of EM algorithm(Expectation-Maximization)"""
+        ious = bbox_overlaps(bboxes, centers)
+        closest = ious.argmax(1)  # index of the highest-IoU center per bbox
+        converged = (closest == assignments).all()  # no assignment changed
+        return converged, closest
+
+
+class YOLODEAnchorOptimizer(BaseAnchorOptimizer):
+    """YOLO anchor optimizer using differential evolution algorithm.
+
+    Args:
+        num_anchors (int) : Number of anchors.
+        iters (int): Maximum iterations for differential evolution.
+        strategy (str): The differential evolution strategy to use.
+            Should be one of:
+
+                - 'best1bin'
+                - 'best1exp'
+                - 'rand1exp'
+                - 'randtobest1exp'
+                - 'currenttobest1exp'
+                - 'best2exp'
+                - 'rand2exp'
+                - 'randtobest1bin'
+                - 'currenttobest1bin'
+                - 'best2bin'
+                - 'rand2bin'
+                - 'rand1bin'
+
+            Default: 'best1bin'.
+        population_size (int): Total population size of evolution algorithm.
+            Default: 15.
+        convergence_thr (float): Tolerance for convergence, the
+            optimizing stops when ``np.std(pop) <= abs(convergence_thr)
+            + convergence_thr * np.abs(np.mean(population_energies))``,
+            respectively. Default: 0.0001.
+        mutation (tuple[float]): Range of dithering randomly changes the
+            mutation constant. Default: (0.5, 1).
+        recombination (float): Recombination constant of crossover probability.
+            Default: 0.7.
+    """
+
+    def __init__(self,
+                 num_anchors,
+                 iters,
+                 strategy='best1bin',
+                 population_size=15,
+                 convergence_thr=0.0001,
+                 mutation=(0.5, 1),
+                 recombination=0.7,
+                 **kwargs):
+
+        super(YOLODEAnchorOptimizer, self).__init__(**kwargs)
+
+        self.num_anchors = num_anchors
+        self.iters = iters
+        self.strategy = strategy
+        self.population_size = population_size
+        self.convergence_thr = convergence_thr
+        self.mutation = mutation
+        self.recombination = recombination
+
+    def optimize(self):  # evolve the anchors, then log/save them
+        anchors = self.differential_evolution()
+        self.save_result(anchors, self.out_dir)
+
+    def differential_evolution(self):  # returns (w, h) sorted by area
+        bboxes = self.get_zero_center_bbox_tensor()
+
+        bounds = []  # one (min, max) pair per anchor width and height
+        for i in range(self.num_anchors):
+            bounds.extend([(0, self.input_shape[0]), (0, self.input_shape[1])])
+
+        result = differential_evolution(
+            func=self.avg_iou_cost,
+            bounds=bounds,
+            args=(bboxes, ),
+            strategy=self.strategy,
+            maxiter=self.iters,
+            popsize=self.population_size,
+            tol=self.convergence_thr,
+            mutation=self.mutation,
+            recombination=self.recombination,
+            updating='immediate',
+            disp=True)
+        self.logger.info(
+            f'Anchor evolution finish. Average IOU: {1 - result.fun}')
+        anchors = [(w, h) for w, h in zip(result.x[::2], result.x[1::2])]
+        anchors = sorted(anchors, key=lambda x: x[0] * x[1])
+        return anchors
+
+    @staticmethod
+    def avg_iou_cost(anchor_params, bboxes):  # params: flat w0, h0, w1, h1...
+        assert len(anchor_params) % 2 == 0
+        anchor_whs = torch.tensor(
+            [[w, h]
+             for w, h in zip(anchor_params[::2], anchor_params[1::2])]).to(
+                 bboxes.device, dtype=bboxes.dtype)
+        anchor_boxes = bbox_cxcywh_to_xyxy(
+            torch.cat([torch.zeros_like(anchor_whs), anchor_whs], dim=1))
+        ious = bbox_overlaps(bboxes, anchor_boxes)
+        max_ious, _ = ious.max(1)  # best anchor IoU for every bbox
+        cost = 1 - max_ious.mean().item()  # lower cost = higher mean IoU
+        return cost
+
+
+def main():  # CLI entry point: build the dataset and run the optimizer
+    logger = get_root_logger()
+    args = parse_args()
+    cfg = args.config
+    cfg = Config.fromfile(cfg)
+
+    # replace the ${key} with the value of cfg.key
+    cfg = replace_cfg_vals(cfg)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
+    input_shape = args.input_shape
+    assert len(input_shape) == 2
+
+    anchor_type = cfg.model.bbox_head.anchor_generator.type
+    assert anchor_type == 'YOLOAnchorGenerator', \
+        f'Only support optimize YOLOAnchor, but get {anchor_type}.'
+
+    base_sizes = cfg.model.bbox_head.anchor_generator.base_sizes
+    num_anchors = sum([len(sizes) for sizes in base_sizes])
+
+    train_data_cfg = cfg.data.train
+    while 'dataset' in train_data_cfg:  # descend to the innermost dataset cfg
+        train_data_cfg = train_data_cfg['dataset']
+    dataset = build_dataset(train_data_cfg)
+
+    if args.algorithm == 'k-means':
+        optimizer = YOLOKMeansAnchorOptimizer(
+            dataset=dataset,
+            input_shape=input_shape,
+            device=args.device,
+            num_anchors=num_anchors,
+            iters=args.iters,
+            logger=logger,
+            out_dir=args.output_dir)
+    elif args.algorithm == 'differential_evolution':
+        optimizer = YOLODEAnchorOptimizer(
+            dataset=dataset,
+            input_shape=input_shape,
+            device=args.device,
+            num_anchors=num_anchors,
+            iters=args.iters,
+            logger=logger,
+            out_dir=args.output_dir)
+    else:
+        raise NotImplementedError(
+            f'Only support k-means and differential_evolution, '
+            f'but get {args.algorithm}')
+
+    optimizer.optimize()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/analysis_tools/robustness_eval.py b/tools/analysis_tools/robustness_eval.py
new file mode 100755
index 0000000..da5ec28
--- /dev/null
+++ b/tools/analysis_tools/robustness_eval.py
@@ -0,0 +1,251 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from argparse import ArgumentParser
+
+import mmcv
+import numpy as np
+
+
+def print_coco_results(results):
+    """Print 12 COCO-style summary rows and return the printed values.
+
+    Args:
+        results (Sequence[float]): The 12 values, in the order of ``rows``.
+
+    Returns:
+        np.ndarray: Array of shape ``(12, )`` holding the printed values.
+    """
+    # (ap, iouThr, areaRng, maxDets) of each summary row, in print order.
+    rows = [(1, None, 'all', 100), (1, .5, 'all', 100), (1, .75, 'all', 100),
+            (1, None, 'small', 100), (1, None, 'medium', 100),
+            (1, None, 'large', 100), (0, None, 'all', 1), (0, None, 'all', 10),
+            (0, None, 'all', 100), (0, None, 'small', 100),
+            (0, None, 'medium', 100), (0, None, 'large', 100)]
+    stats = np.zeros((12, ))
+    for i, (ap, iouThr, areaRng, maxDets) in enumerate(rows):
+        stats[i] = value = results[i]
+        titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
+        typeStr = '(AP)' if ap == 1 else '(AR)'
+        iouStr = '0.50:0.95' if iouThr is None else f'{iouThr:0.2f}'
+        print(f' {titleStr:<18} {typeStr} @[ IoU={iouStr:<9} | '
+              f'area={areaRng:>6s} | maxDets={maxDets:>3d} ] = {value:0.3f}')
+    return stats
+
+
+def get_coco_style_results(filename,
+                           task='bbox',
+                           metric=None,
+                           prints='mPC',
+                           aggregate='benchmark'):
+    """Print and return COCO-style robustness results loaded from a file.
+
+    Args:
+        filename (str): Result file passed to ``mmcv.load``. It is indexed
+            as ``data[distortion][severity][task][metric_name]``.
+        task (str): Task key to read from every severity entry.
+        metric (str | list[str] | None): Metric name(s) to report. ``None``
+            reports all 12 metrics through ``print_coco_results``.
+        prints (str | list[str]): 'all' or any of 'P', 'mPC', 'rPC'.
+        aggregate (str): 'benchmark' averages the first 15 distortions,
+            'all' averages every distortion in the file.
+
+    Returns:
+        np.ndarray: float32 array, shape (num_distortions, 6, num_metrics).
+    """
+    known_metrics = [
+        'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'AR1', 'AR10', 'AR100',
+        'ARs', 'ARm', 'ARl'
+    ]
+    known_prints = ['P', 'mPC', 'rPC']
+    assert aggregate in ['benchmark', 'all']
+
+    if prints == 'all':
+        prints = known_prints
+    elif isinstance(prints, str):
+        prints = [prints]
+    for p in prints:
+        assert p in known_prints
+
+    if metric is None:
+        metrics = known_metrics
+    else:
+        metrics = metric if isinstance(metric, list) else [metric]
+    for metric_name in metrics:
+        assert metric_name in known_metrics
+
+    eval_output = mmcv.load(filename)
+    # Axis 1 has one slot per severity 0..5; slot 0 holds the clean results.
+    results = np.zeros((len(eval_output), 6, len(metrics)), dtype='float32')
+    for corr_i, per_severity in enumerate(eval_output.values()):
+        for severity in per_severity:
+            for metric_j, metric_name in enumerate(metrics):
+                value = per_severity[severity][task][metric_name]
+                results[corr_i, severity, metric_j] = value
+
+    P = results[0, 0, :]
+    subset = results[:15, 1:] if aggregate == 'benchmark' else results[:, 1:]
+    mPC = np.mean(subset, axis=(0, 1))
+    rPC = mPC / P
+
+    print(f'\nmodel: {osp.basename(filename)}')
+    sections = [
+        ('P', f'Performance on Clean Data [P] ({task})', P),
+        ('mPC', f'Mean Performance under Corruption [mPC] ({task})', mPC),
+        ('rPC', f'Relative Performance under Corruption [rPC] ({task})', rPC),
+    ]
+    for key, title, values in sections:
+        if key not in prints:
+            continue
+        print(title)
+        if metric is None:
+            print_coco_results(values)
+            continue
+        for metric_i, metric_name in enumerate(metrics):
+            if key == 'rPC':
+                print(f'{metric_name:5} => {values[metric_i] * 100:0.1f} %')
+            else:
+                print(f'{metric_name:5} =  {values[metric_i]:0.3f}')
+
+    return results
+
+
+def get_voc_style_results(filename, prints='mPC', aggregate='benchmark'):
+    """Print and return Pascal VOC style robustness results from a file.
+
+    Args:
+        filename (str): Result file passed to ``mmcv.load``; every
+            ``data[distortion][severity][j]`` must provide an ``'ap'`` entry.
+        prints (str | list[str]): 'all' or any of 'P', 'mPC', 'rPC'.
+        aggregate (str): 'benchmark' averages the first 15 distortions only.
+
+    Returns:
+        np.ndarray: Mean over the last axis, shape (num_distortions, 6, 1).
+    """
+    assert aggregate in ['benchmark', 'all']
+    if prints == 'all':
+        prints = ['P', 'mPC', 'rPC']
+    elif isinstance(prints, str):
+        prints = [prints]
+    for p in prints:
+        assert p in ['P', 'mPC', 'rPC']
+
+    eval_output = mmcv.load(filename)
+    results = np.zeros((len(eval_output), 6, 20), dtype='float32')
+    for i, per_severity in enumerate(eval_output.values()):
+        for severity in per_severity:
+            class_results = per_severity[severity]
+            results[i, severity, :] = [
+                class_results[j]['ap'] for j in range(len(class_results))]
+
+    P = results[0, 0, :]
+    subset = results[:15, 1:] if aggregate == 'benchmark' else results[:, 1:]
+    mPC = np.mean(subset, axis=(0, 1))
+    rPC = mPC / P
+    print(f'\nmodel: {osp.basename(filename)}')
+    if 'P' in prints:
+        print(f'Performance on Clean Data [P] in AP50 = {np.mean(P):0.3f}')
+    if 'mPC' in prints:
+        print('Mean Performance under Corruption [mPC] in AP50 = '
+              f'{np.mean(mPC):0.3f}')
+    if 'rPC' in prints:
+        print('Relative Performance under Corruption [rPC] in % = '
+              f'{np.mean(rPC) * 100:0.1f}')
+    return np.mean(results, axis=2, keepdims=True)
+
+
+def get_results(filename,
+                dataset='coco',
+                task='bbox',
+                metric=None,
+                prints='mPC',
+                aggregate='benchmark'):
+    """Print and return the robustness results for the given dataset style.
+
+    'coco' and 'cityscapes' are handled by ``get_coco_style_results``; 'voc'
+    by ``get_voc_style_results``, which takes neither ``task`` nor ``metric``.
+    """
+    assert dataset in ['coco', 'voc', 'cityscapes']
+    shared = dict(prints=prints, aggregate=aggregate)
+
+    if dataset in ('coco', 'cityscapes'):
+        results = get_coco_style_results(
+            filename, task=task, metric=metric, **shared)
+    elif dataset == 'voc':
+        if task != 'bbox':
+            print('Only bbox analysis is supported for Pascal VOC')
+            print('Will report bbox results\n')
+        if metric not in [None, ['AP'], ['AP50']]:
+            print('Only the AP50 metric is supported for Pascal VOC')
+            print('Will report AP50 metric\n')
+        results = get_voc_style_results(filename, **shared)
+    return results
+
+
+def get_distortions_from_file(filename):
+    """Return the distortion names of the results stored in ``filename``."""
+    # ``mmcv.load`` reads the result file; the names are extracted (and
+    # formatted) by ``get_distortions_from_results``.
+    return get_distortions_from_results(mmcv.load(filename))
+
+
+def get_distortions_from_results(eval_output):
+    """Return the distortion names (keys) with '_' replaced by spaces."""
+    # Iterating the mapping yields its keys; for example 'gaussian_noise'
+    # becomes 'gaussian noise'.
+    return [distortion.replace('_', ' ') for distortion in eval_output]
+
+
+def main():
+    """Report corruption results for every task given on the command line.
+
+    Calls ``get_results`` once per ``--task`` entry with the parsed options.
+    """
+    metric_names = [
+        'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'AR1', 'AR10', 'AR100',
+        'ARs', 'ARm', 'ARl'
+    ]
+    parser = ArgumentParser(description='Corruption Result Analysis')
+    parser.add_argument('filename', help='result file path')
+    parser.add_argument(
+        '--dataset',
+        choices=['coco', 'voc', 'cityscapes'],
+        default='coco',
+        type=str,
+        help='dataset type')
+    parser.add_argument(
+        '--task',
+        choices=['bbox', 'segm'],
+        default=['bbox'],
+        nargs='+',
+        type=str,
+        help='task to report')
+    parser.add_argument(
+        '--metric',
+        choices=[None] + metric_names,
+        default=None,
+        nargs='+',
+        help='metric to report')
+    parser.add_argument(
+        '--prints',
+        choices=['P', 'mPC', 'rPC'],
+        default='mPC',
+        nargs='+',
+        type=str,
+        help='corruption benchmark metric to print')
+    parser.add_argument(
+        '--aggregate',
+        choices=['all', 'benchmark'],
+        default='benchmark',
+        type=str,
+        help='aggregate all results or only those '
+        '        for benchmark corruptions')
+    args = parser.parse_args()
+
+    options = dict(dataset=args.dataset, metric=args.metric,
+                   prints=args.prints, aggregate=args.aggregate)
+    for task in args.task:
+        get_results(args.filename, task=task, **options)
+
+
+if __name__ == '__main__':  # only when this module is the entry script
+    main()
diff --git a/tools/analysis_tools/test_robustness.py b/tools/analysis_tools/test_robustness.py
new file mode 100755
index 0000000..0c1ddbe
--- /dev/null
+++ b/tools/analysis_tools/test_robustness.py
@@ -0,0 +1,387 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import copy
+import os
+import os.path as osp
+
+import mmcv
+import torch
+from mmcv import DictAction
+from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
+from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
+                         wrap_fp16_model)
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+
+from mmdet import datasets
+from mmdet.apis import multi_gpu_test, set_random_seed, single_gpu_test
+from mmdet.core import eval_map
+from mmdet.datasets import build_dataloader, build_dataset
+from mmdet.models import build_detector
+from tools.analysis_tools.robustness_eval import get_results
+
+
+def coco_eval_with_return(result_files,
+                          result_types,
+                          coco,
+                          max_dets=(100, 300, 1000)):
+    """Run ``COCOeval`` for each result type and return the summary stats.
+
+    The returned dict maps 'bbox'/'segm' to a metric name -> value dict and
+    any other result type to the raw ``COCOeval.stats``.
+    """
+    metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'AR1', 'AR10',
+                    'AR100', 'ARs', 'ARm', 'ARl']
+    for res_type in result_types:
+        assert res_type in ['proposal', 'bbox', 'segm', 'keypoints']
+
+    if mmcv.is_str(coco):
+        coco = COCO(coco)
+    assert isinstance(coco, COCO)
+
+    eval_results = {}
+    for res_type in result_types:
+        result_file = result_files[res_type]
+        assert result_file.endswith('.json')
+        # Proposals are scored as class-agnostic boxes with custom maxDets.
+        is_proposal = res_type == 'proposal'
+        coco_dets = coco.loadRes(result_file)
+        img_ids = coco.getImgIds()
+        cocoEval = COCOeval(coco, coco_dets,
+                            'bbox' if is_proposal else res_type)
+        cocoEval.params.imgIds = img_ids
+        if is_proposal:
+            cocoEval.params.useCats = 0
+            cocoEval.params.maxDets = list(max_dets)
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+        cocoEval.summarize()
+        if res_type in ('segm', 'bbox'):
+            eval_results[res_type] = {
+                name: cocoEval.stats[i] for i, name in enumerate(metric_names)}
+        else:
+            eval_results[res_type] = cocoEval.stats
+    return eval_results
+
+
+def voc_eval_with_return(result_file,
+                         dataset,
+                         iou_thr=0.5,
+                         logger='print',
+                         only_ap=True):
+    """Evaluate the detections in ``result_file`` with ``eval_map``.
+
+    Loads the detections, collects the annotations of ``dataset`` and
+    returns ``(mean_ap, eval_results)``. With ``only_ap`` every per-class
+    result is reduced to a dict that only holds its ``'ap'`` entry.
+    """
+    det_results = mmcv.load(result_file)
+    annotations = [dataset.get_ann_info(i) for i in range(len(dataset))]
+    # 'voc07' is passed when ``dataset.year`` is 2007, else the class names.
+    is_voc07 = hasattr(dataset, 'year') and dataset.year == 2007
+    mean_ap, eval_results = eval_map(
+        det_results,
+        annotations,
+        scale_ranges=None,
+        iou_thr=iou_thr,
+        dataset='voc07' if is_voc07 else dataset.CLASSES,
+        logger=logger)
+    if only_ap:
+        eval_results = [{'ap': res['ap']} for res in eval_results]
+    return mean_ap, eval_results
+
+
+def parse_args():
+    """Parse the command line arguments of the robustness test script.
+
+    If ``LOCAL_RANK`` is not set in the environment, it is set to the value
+    of ``--local_rank``.
+
+    Returns:
+        argparse.Namespace: The parsed arguments.
+    """
+
+    def str2bool(value):
+        """Interpret a command line string as a boolean."""
+        # ``type=bool`` would map every non-empty string (even 'False') to
+        # True; here only the usual "false" spellings give False.
+        return value.lower() not in ('', '0', 'false', 'f', 'no', 'n', 'off')
+
+    parser = argparse.ArgumentParser(description='MMDet test detector')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument('--out', help='output result file')
+    parser.add_argument(
+        '--corruptions',
+        type=str,
+        nargs='+',
+        # a list, like the value argparse produces for ``nargs='+'``
+        default=['benchmark'],
+        choices=[
+            'all', 'benchmark', 'noise', 'blur', 'weather', 'digital',
+            'holdout', 'None', 'gaussian_noise', 'shot_noise', 'impulse_noise',
+            'defocus_blur', 'glass_blur', 'motion_blur', 'zoom_blur', 'snow',
+            'frost', 'fog', 'brightness', 'contrast', 'elastic_transform',
+            'pixelate', 'jpeg_compression', 'speckle_noise', 'gaussian_blur',
+            'spatter', 'saturate'
+        ],
+        help='corruptions')
+    parser.add_argument(
+        '--severities', type=int, nargs='+', default=[0, 1, 2, 3, 4, 5],
+        help='corruption severity levels')
+    parser.add_argument(
+        '--eval', type=str, nargs='+', help='eval types',
+        choices=['proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints'])
+    parser.add_argument(
+        '--iou-thr', type=float, default=0.5,
+        help='IoU threshold for pascal voc evaluation')
+    parser.add_argument(
+        '--summaries', type=str2bool, default=False,
+        help='Print summaries for every corruption and severity')
+    parser.add_argument(
+        '--workers', type=int, default=32, help='workers per gpu')
+    parser.add_argument('--show', action='store_true', help='show results')
+    parser.add_argument(
+        '--show-dir', help='directory where painted images will be saved')
+    parser.add_argument(
+        '--show-score-thr', type=float, default=0.3,
+        help='score threshold (default: 0.3)')
+    parser.add_argument('--tmpdir', help='tmp dir for writing some results')
+    parser.add_argument('--seed', type=int, default=None, help='random seed')
+    parser.add_argument(
+        '--launcher', default='none', help='job launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'])
+    parser.add_argument('--local_rank', type=int, default=0)
+    parser.add_argument(
+        '--final-prints',
+        type=str,
+        nargs='+',
+        choices=['P', 'mPC', 'rPC'],
+        default='mPC',
+        help='corruption benchmark metric to print at the end')
+    parser.add_argument(
+        '--final-prints-aggregate', type=str, default='benchmark',
+        choices=['all', 'benchmark'],
+        help='aggregate all results or only those for benchmark corruptions')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+    return args
+
+
+def main():
+    """Test a detector on corrupted data and report robustness results.
+
+    For each selected corruption and severity a ``Corrupt`` transform is
+    inserted into the test pipeline and the detector is run. With ``--out``
+    the outputs are dumped and, if ``--eval`` is given, evaluated; the
+    aggregated results are dumped to ``<out>_results<ext>`` and summarized
+    at the end.
+
+    Raises:
+        AssertionError: If none of --out, --show and --show-dir is given.
+        ValueError: If ``--out`` does not end with '.pkl' or '.pickle'.
+        NotImplementedError: If only 'proposal_fast' is to be evaluated.
+    """
+    args = parse_args()
+
+    assert args.out or args.show or args.show_dir, \
+        ('Please specify at least one operation (save or show the results) '
+         'with the argument "--out", "--show" or "show-dir"')
+
+    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
+        raise ValueError('The output file must be a pkl file.')
+
+    # Derived up front so that the final summary can test whether it exists.
+    eval_results_filename = None
+    if args.out:
+        out_stem, out_ext = osp.splitext(args.out)
+        eval_results_filename = out_stem + '_results' + out_ext
+
+    cfg = mmcv.Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+    cfg.model.pretrained = None
+    cfg.data.test.test_mode = True
+    if args.workers == 0:
+        args.workers = cfg.data.workers_per_gpu
+
+    # init distributed env first, since logger depends on the dist info.
+    distributed = args.launcher != 'none'
+    if distributed:
+        init_dist(args.launcher, **cfg.dist_params)
+
+    # set random seeds
+    if args.seed is not None:
+        set_random_seed(args.seed)
+
+    benchmark = [
+        'gaussian_noise', 'shot_noise', 'impulse_noise', 'defocus_blur',
+        'glass_blur', 'motion_blur', 'zoom_blur', 'snow', 'frost', 'fog',
+        'brightness', 'contrast', 'elastic_transform', 'pixelate',
+        'jpeg_compression'
+    ]
+    holdout = ['speckle_noise', 'gaussian_blur', 'spatter', 'saturate']
+    # Group names are tested in this order; the first one present wins.
+    corruption_groups = {
+        'all': benchmark + holdout,
+        'benchmark': benchmark,
+        'noise': ['gaussian_noise', 'shot_noise', 'impulse_noise'],
+        'blur': ['defocus_blur', 'glass_blur', 'motion_blur', 'zoom_blur'],
+        'weather': ['snow', 'frost', 'fog', 'brightness'],
+        'digital': [
+            'contrast', 'elastic_transform', 'pixelate', 'jpeg_compression'
+        ],
+        'holdout': holdout,
+        'None': ['None'],
+    }
+    group = next(
+        (name for name in corruption_groups if name in args.corruptions),
+        None)
+    corruptions = corruption_groups.get(group, args.corruptions)
+    if group == 'None':
+        args.severities = [0]
+
+    rank, _ = get_dist_info()
+    aggregated_results = {}
+    for corr_i, corruption in enumerate(corruptions):
+        aggregated_results[corruption] = {}
+        for corruption_severity in args.severities:
+            # evaluate severity 0 (= no corruption) only once
+            if corr_i > 0 and corruption_severity == 0:
+                # The clean entry only exists if it was evaluated (rank 0
+                # with --out and --eval), so do not assume it is there.
+                clean_results = aggregated_results[corruptions[0]]
+                if 0 in clean_results:
+                    aggregated_results[corruption][0] = clean_results[0]
+                continue
+
+            test_data_cfg = copy.deepcopy(cfg.data.test)
+            # assign corruption and severity
+            if corruption_severity > 0:
+                corruption_trans = dict(
+                    type='Corrupt',
+                    corruption=corruption,
+                    severity=corruption_severity)
+                # TODO: hard coded "1", we assume that the first step is
+                # loading images, which needs to be fixed in the future
+                test_data_cfg['pipeline'].insert(1, corruption_trans)
+
+            # print info
+            print(f'\nTesting {corruption} at severity {corruption_severity}')
+
+            # build the dataloader
+            # TODO: support multiple images per gpu
+            #       (only minor changes are needed)
+            dataset = build_dataset(test_data_cfg)
+            data_loader = build_dataloader(
+                dataset,
+                samples_per_gpu=1,
+                workers_per_gpu=args.workers,
+                dist=distributed,
+                shuffle=False)
+
+            # build the model and load checkpoint
+            cfg.model.train_cfg = None
+            model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+            fp16_cfg = cfg.get('fp16', None)
+            if fp16_cfg is not None:
+                wrap_fp16_model(model)
+            checkpoint = load_checkpoint(
+                model, args.checkpoint, map_location='cpu')
+            # old versions did not save class info in checkpoints,
+            # this workaround is for backward compatibility
+            if 'CLASSES' in checkpoint.get('meta', {}):
+                model.CLASSES = checkpoint['meta']['CLASSES']
+            else:
+                model.CLASSES = dataset.CLASSES
+
+            if not distributed:
+                model = MMDataParallel(model, device_ids=[0])
+                show_dir = args.show_dir
+                if show_dir is not None:
+                    show_dir = osp.join(show_dir, corruption,
+                                        str(corruption_severity))
+                    os.makedirs(show_dir, exist_ok=True)
+                outputs = single_gpu_test(model, data_loader, args.show,
+                                          show_dir, args.show_score_thr)
+            else:
+                model = MMDistributedDataParallel(
+                    model.cuda(),
+                    device_ids=[torch.cuda.current_device()],
+                    broadcast_buffers=False)
+                outputs = multi_gpu_test(model, data_loader, args.tmpdir)
+
+            if args.out and rank == 0:
+                mmcv.dump(outputs, args.out)
+                eval_types = args.eval
+                if cfg.dataset_type == 'VOCDataset':
+                    if eval_types:
+                        for eval_type in eval_types:
+                            if eval_type == 'bbox':
+                                test_dataset = mmcv.runner.obj_from_dict(
+                                    cfg.data.test, datasets)
+                                logger = 'print' if args.summaries else None
+                                mean_ap, eval_results = \
+                                    voc_eval_with_return(
+                                        args.out, test_dataset,
+                                        args.iou_thr, logger)
+                                aggregated_results[corruption][
+                                    corruption_severity] = eval_results
+                            else:
+                                print('\nOnly "bbox" evaluation '
+                                      'is supported for pascal voc')
+                else:
+                    if eval_types:
+                        print(f'Starting evaluate {" and ".join(eval_types)}')
+                        if eval_types == ['proposal_fast']:
+                            # No json result files are produced for it and
+                            # ``coco_eval_with_return`` does not accept it.
+                            raise NotImplementedError(
+                                '"proposal_fast" evaluation is not supported')
+                        if not isinstance(outputs[0], dict):
+                            result_files = dataset.results2json(
+                                outputs, args.out)
+                        else:
+                            for name in outputs[0]:
+                                print(f'\nEvaluating {name}')
+                                outputs_ = [out[name] for out in outputs]
+                                result_file = args.out + f'.{name}'
+                                result_files = dataset.results2json(
+                                    outputs_, result_file)
+                        eval_results = coco_eval_with_return(
+                            result_files, eval_types, dataset.coco)
+                        aggregated_results[corruption][
+                            corruption_severity] = eval_results
+                    else:
+                        print('\nNo task was selected for evaluation;'
+                              '\nUse --eval to select a task')
+
+                # save results after each evaluation
+                mmcv.dump(aggregated_results, eval_results_filename)
+
+    # Without ``--out`` nothing was dumped, so there is nothing to summarize.
+    if rank == 0 and eval_results_filename is not None:
+        # print final results
+        print('\nAggregated results:')
+        get_results(
+            eval_results_filename,
+            dataset='voc' if cfg.dataset_type == 'VOCDataset' else 'coco',
+            prints=args.final_prints,
+            aggregate=args.final_prints_aggregate)
+
+
+if __name__ == '__main__':  # only when this module is the entry script
+    main()
diff --git a/tools/convert_datasets/cityscapes_local.py b/tools/convert_datasets/cityscapes_local.py
new file mode 100755
index 0000000..461b14e
--- /dev/null
+++ b/tools/convert_datasets/cityscapes_local.py
@@ -0,0 +1,153 @@
+import argparse
+import glob
+import os.path as osp
+
+import cityscapesscripts.helpers.labels as CSLabels
+import mmcv
+import numpy as np
+import pycocotools.mask as maskUtils
+
+
+def collect_files(img_dir, gt_dir):
+    """Pair every ``*leftImg8bit.png`` image with its annotation files.
+
+    Returns a list of ``(img_file, inst_file, segm_file)`` tuples; other
+    png files matched by the glob pattern are skipped.
+    """
+    suffix = 'leftImg8bit.png'
+    files = []
+    for img_file in glob.glob(osp.join(img_dir, '**/*.png')):
+        if img_file.endswith(suffix):
+            stem = gt_dir + img_file[len(img_dir):-len(suffix)]
+            # Note that labelIds are not converted to trainId for seg map
+            files.append((img_file, stem + 'gtFine_instanceIds.png',
+                          stem + 'gtFine_labelIds.png'))
+    assert len(files), f'No images found in {img_dir}'
+    print(f'Loaded {len(files)} images from {img_dir}')
+    return files
+
+
+def collect_annotations(files, nproc=1):
+    """Apply ``load_img_info`` to every entry of ``files``.
+
+    ``mmcv.track_parallel_progress`` is used when ``nproc > 1``, otherwise
+    ``mmcv.track_progress``.
+    """
+    print('Loading annotation images')
+    if nproc > 1:
+        return mmcv.track_parallel_progress(load_img_info, files, nproc=nproc)
+    return mmcv.track_progress(load_img_info, files)
+
+
+def load_img_info(files):
+    """Collect the image info and instance annotations of one sample.
+
+    ``files`` is an ``(img_file, inst_file, segm_file)`` tuple. Returns a
+    dict with ``file_name``, ``height``, ``width``, ``anno_info`` (a list of
+    annotation dicts) and ``segm_file``.
+    """
+    img_file, inst_file, segm_file = files
+    inst_img = mmcv.imread(inst_file, 'unchanged')
+    anno_info = []
+    # ids < 24 are stuff labels (filtering them first is about 5% faster)
+    for inst_id in np.unique(inst_img[inst_img >= 24]):
+        # Crowd annotations have instance ids < 1000, which are used as the
+        # label id directly; otherwise inst_id // 1000 is the label id.
+        is_crowd = inst_id < 1000
+        label = CSLabels.id2label[inst_id if is_crowd else inst_id // 1000]
+        if not label.hasInstances or label.ignoreInEval:
+            continue
+
+        mask = np.asarray(inst_img == inst_id, dtype=np.uint8, order='F')
+        mask_rle = maskUtils.encode(mask[:, :, None])[0]
+        area = maskUtils.area(mask_rle)
+        # convert to COCO style XYWH format
+        bbox = maskUtils.toBbox(mask_rle)
+        # for json encoding
+        mask_rle['counts'] = mask_rle['counts'].decode()
+
+        anno_info.append(
+            dict(
+                iscrowd=int(is_crowd),
+                category_id=label.id,
+                bbox=bbox.tolist(),
+                area=area.tolist(),
+                segmentation=mask_rle))
+
+    video_name = osp.basename(osp.dirname(img_file))
+    return dict(
+        # remove img_prefix for filename
+        file_name=osp.join(video_name, osp.basename(img_file)),
+        height=inst_img.shape[0],
+        width=inst_img.shape[1],
+        anno_info=anno_info,
+        segm_file=osp.join(video_name, osp.basename(segm_file)))
+
+
+def cvt_annotations(image_infos, out_json_name):
+    """Assemble the image infos into one COCO-style dict and dump it.
+
+    Image and annotation ids are consecutive integers starting at 0. Each
+    entry of ``image_infos`` is modified in place (``id`` added,
+    ``anno_info`` removed). The 'annotations' key is left out when there
+    are no annotations. Returns the dumped dict.
+    """
+    images, annotations = [], []
+    for img_id, image_info in enumerate(image_infos):
+        image_info['id'] = img_id
+        images.append(image_info)
+        for anno_info in image_info.pop('anno_info'):
+            anno_info['image_id'] = img_id
+            anno_info['id'] = len(annotations)
+            annotations.append(anno_info)
+
+    categories = [
+        dict(id=label.id, name=label.name) for label in CSLabels.labels
+        if label.hasInstances and not label.ignoreInEval
+    ]
+    out_json = dict(images=images, categories=categories)
+    if annotations:
+        out_json['annotations'] = annotations
+
+    mmcv.dump(out_json, out_json_name)
+    return out_json
+
+
+def parse_args():
+    """Parse the command line of the Cityscapes-to-COCO converter."""
+    parser = argparse.ArgumentParser(
+        description='Convert Cityscapes annotations to COCO format')
+    parser.add_argument('cityscapes_path', help='cityscapes data path')
+    parser.add_argument('--img-dir', type=str, default='leftImg8bit')
+    parser.add_argument('--gt-dir', type=str, default='gtFine')
+    parser.add_argument('-o', '--out-dir', help='output path')
+    parser.add_argument(
+        '--nproc', type=int, default=1, help='number of process')
+    return parser.parse_args()
+
+
+def main():
+    """Convert the train, val and test splits to COCO-style json files."""
+    args = parse_args()
+    data_root = args.cityscapes_path
+    # Fall back to the data root when no output directory is given.
+    out_dir = args.out_dir or data_root
+    mmcv.mkdir_or_exist(out_dir)
+
+    json_names = dict(
+        train='instancesonly_filtered_gtFine_train.json',
+        val='instancesonly_filtered_gtFine_val.json',
+        test='instancesonly_filtered_gtFine_test.json')
+
+    for split, json_name in json_names.items():
+        img_dir = osp.join(data_root, args.img_dir, split)
+        gt_dir = osp.join(data_root, args.gt_dir, split)
+        print(f'Converting {split} into {json_name}')
+        with mmcv.Timer(
+                print_tmpl='It tooks {}s to convert Cityscapes annotation'):
+            files = collect_files(img_dir, gt_dir)
+            image_infos = collect_annotations(files, nproc=args.nproc)
+            cvt_annotations(image_infos, osp.join(out_dir, json_name))
+
+
+if __name__ == '__main__':  # only when this module is the entry script
+    main()
diff --git a/tools/dataset_converters/cityscapes.py b/tools/dataset_converters/cityscapes.py
new file mode 100755
index 0000000..c8e44b9
--- /dev/null
+++ b/tools/dataset_converters/cityscapes.py
@@ -0,0 +1,152 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import glob
+import os.path as osp
+
+import cityscapesscripts.helpers.labels as CSLabels
+import mmcv
+import numpy as np
+import pycocotools.mask as maskUtils
+
+
+def collect_files(img_dir, gt_dir):
+    suffix = 'leftImg8bit.png'
+    files = []
+    for img_file in glob.glob(osp.join(img_dir, '**/*.png')):
+        assert img_file.endswith(suffix), img_file
+        inst_file = gt_dir + img_file[
+            len(img_dir):-len(suffix)] + 'gtFine_instanceIds.png'
+        # Note that labelIds are not converted to trainId for seg map
+        segm_file = gt_dir + img_file[
+            len(img_dir):-len(suffix)] + 'gtFine_labelIds.png'
+        files.append((img_file, inst_file, segm_file))
+    assert len(files), f'No images found in {img_dir}'
+    print(f'Loaded {len(files)} images from {img_dir}')
+
+    return files
+
+
+def collect_annotations(files, nproc=1):
+    print('Loading annotation images')
+    if nproc > 1:
+        images = mmcv.track_parallel_progress(
+            load_img_info, files, nproc=nproc)
+    else:
+        images = mmcv.track_progress(load_img_info, files)
+
+    return images
+
+
+def load_img_info(files):
+    img_file, inst_file, segm_file = files
+    inst_img = mmcv.imread(inst_file, 'unchanged')
+    # ids < 24 are stuff labels (filtering them first is about 5% faster)
+    unique_inst_ids = np.unique(inst_img[inst_img >= 24])
+    anno_info = []
+    for inst_id in unique_inst_ids:
+        # For non-crowd annotations, inst_id // 1000 is the label_id
+        # Crowd annotations have <1000 instance ids
+        label_id = inst_id // 1000 if inst_id >= 1000 else inst_id
+        label = CSLabels.id2label[label_id]
+        if not label.hasInstances or label.ignoreInEval:
+            continue
+
+        category_id = label.id
+        iscrowd = int(inst_id < 1000)
+        mask = np.asarray(inst_img == inst_id, dtype=np.uint8, order='F')
+        mask_rle = maskUtils.encode(mask[:, :, None])[0]
+
+        area = maskUtils.area(mask_rle)
+        # convert to COCO style XYWH format
+        bbox = maskUtils.toBbox(mask_rle)
+
+        # for json encoding
+        mask_rle['counts'] = mask_rle['counts'].decode()
+
+        anno = dict(
+            iscrowd=iscrowd,
+            category_id=category_id,
+            bbox=bbox.tolist(),
+            area=area.tolist(),
+            segmentation=mask_rle)
+        anno_info.append(anno)
+    video_name = osp.basename(osp.dirname(img_file))
+    img_info = dict(
+        # remove img_prefix for filename
+        file_name=osp.join(video_name, osp.basename(img_file)),
+        height=inst_img.shape[0],
+        width=inst_img.shape[1],
+        anno_info=anno_info,
+        segm_file=osp.join(video_name, osp.basename(segm_file)))
+
+    return img_info
+
+
+def cvt_annotations(image_infos, out_json_name):
+    out_json = dict()
+    img_id = 0
+    ann_id = 0
+    out_json['images'] = []
+    out_json['categories'] = []
+    out_json['annotations'] = []
+    for image_info in image_infos:
+        image_info['id'] = img_id
+        anno_infos = image_info.pop('anno_info')
+        out_json['images'].append(image_info)
+        for anno_info in anno_infos:
+            anno_info['image_id'] = img_id
+            anno_info['id'] = ann_id
+            out_json['annotations'].append(anno_info)
+            ann_id += 1
+        img_id += 1
+    for label in CSLabels.labels:
+        if label.hasInstances and not label.ignoreInEval:
+            cat = dict(id=label.id, name=label.name)
+            out_json['categories'].append(cat)
+
+    if len(out_json['annotations']) == 0:
+        out_json.pop('annotations')
+
+    mmcv.dump(out_json, out_json_name)
+    return out_json
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert Cityscapes annotations to COCO format')
+    parser.add_argument('cityscapes_path', help='cityscapes data path')
+    parser.add_argument('--img-dir', default='leftImg8bit', type=str)
+    parser.add_argument('--gt-dir', default='gtFine', type=str)
+    parser.add_argument('-o', '--out-dir', help='output path')
+    parser.add_argument(
+        '--nproc', default=1, type=int, help='number of process')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    cityscapes_path = args.cityscapes_path
+    out_dir = args.out_dir if args.out_dir else cityscapes_path
+    mmcv.mkdir_or_exist(out_dir)
+
+    img_dir = osp.join(cityscapes_path, args.img_dir)
+    gt_dir = osp.join(cityscapes_path, args.gt_dir)
+
+    set_name = dict(
+        train='instancesonly_filtered_gtFine_train.json',
+        val='instancesonly_filtered_gtFine_val.json',
+        test='instancesonly_filtered_gtFine_test.json')
+
+    for split, json_name in set_name.items():
+        print(f'Converting {split} into {json_name}')
+        with mmcv.Timer(
+                print_tmpl='It took {}s to convert Cityscapes annotation'):
+            files = collect_files(
+                osp.join(img_dir, split), osp.join(gt_dir, split))
+            image_infos = collect_annotations(files, nproc=args.nproc)
+            cvt_annotations(image_infos, osp.join(out_dir, json_name))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/dataset_converters/images2coco.py b/tools/dataset_converters/images2coco.py
new file mode 100755
index 0000000..1c4e2f1
--- /dev/null
+++ b/tools/dataset_converters/images2coco.py
@@ -0,0 +1,101 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+
+import mmcv
+from PIL import Image
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert images to coco format without annotations')
+    parser.add_argument('img_path', help='The root path of images')
+    parser.add_argument(
+        'classes', type=str, help='The text file name of storage class list')
+    parser.add_argument(
+        'out',
+        type=str,
+        help='The output annotation json file name, The save dir is in the '
+        'same directory as img_path')
+    parser.add_argument(
+        '-e',
+        '--exclude-extensions',
+        type=str,
+        nargs='+',
+        help='The suffix of images to be excluded, such as "png" and "bmp"')
+    args = parser.parse_args()
+    return args
+
+
+def collect_image_infos(path, exclude_extensions=None):
+    """List filename/width/height of images under ``path``, minus excluded."""
+    if isinstance(exclude_extensions, str):
+        exclude_extensions = (exclude_extensions, )
+    # str.endswith() rejects the list that argparse ``nargs='+'`` produces.
+    excluded = tuple(exclude_extensions or ())
+    img_infos = []
+    images_generator = mmcv.scandir(path, recursive=True)
+    for image_path in mmcv.track_iter_progress(list(images_generator)):
+        if image_path.lower().endswith(excluded):
+            continue
+        image_path = os.path.join(path, image_path)
+        # The context manager closes the file handle once the size is read.
+        with Image.open(image_path) as img:
+            img_infos.append(
+                dict(filename=image_path, width=img.width, height=img.height))
+    return img_infos
+
+
+def cvt_to_coco_json(img_infos, classes):
+    image_id = 0
+    coco = dict()
+    coco['images'] = []
+    coco['type'] = 'instance'
+    coco['categories'] = []
+    coco['annotations'] = []
+    image_set = set()
+
+    for category_id, name in enumerate(classes):
+        category_item = dict()
+        category_item['supercategory'] = str('none')
+        category_item['id'] = int(category_id)
+        category_item['name'] = str(name)
+        coco['categories'].append(category_item)
+
+    for img_dict in img_infos:
+        file_name = img_dict['filename']
+        assert file_name not in image_set
+        image_item = dict()
+        image_item['id'] = int(image_id)
+        image_item['file_name'] = str(file_name)
+        image_item['height'] = int(img_dict['height'])
+        image_item['width'] = int(img_dict['width'])
+        coco['images'].append(image_item)
+        image_set.add(file_name)
+
+        image_id += 1
+    return coco
+
+
+def main():
+    args = parse_args()
+    assert args.out.endswith(
+        'json'), 'The output file name must be json suffix'
+
+    # 1 load image list info
+    img_infos = collect_image_infos(args.img_path, args.exclude_extensions)
+
+    # 2 convert to coco format data
+    classes = mmcv.list_from_file(args.classes)
+    coco_info = cvt_to_coco_json(img_infos, classes)
+
+    # 3 dump
+    save_dir = os.path.join(args.img_path, '..', 'annotations')
+    mmcv.mkdir_or_exist(save_dir)
+    save_path = os.path.join(save_dir, args.out)
+    mmcv.dump(coco_info, save_path)
+    print(f'save json file: {save_path}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/dataset_converters/pascal_voc.py b/tools/dataset_converters/pascal_voc.py
new file mode 100755
index 0000000..20f8801
--- /dev/null
+++ b/tools/dataset_converters/pascal_voc.py
@@ -0,0 +1,237 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+import xml.etree.ElementTree as ET
+
+import mmcv
+import numpy as np
+
+from mmdet.core import voc_classes
+
+label_ids = {name: i for i, name in enumerate(voc_classes())}
+
+
+def parse_xml(args):
+    xml_path, img_path = args
+    tree = ET.parse(xml_path)
+    root = tree.getroot()
+    size = root.find('size')
+    w = int(size.find('width').text)
+    h = int(size.find('height').text)
+    bboxes = []
+    labels = []
+    bboxes_ignore = []
+    labels_ignore = []
+    for obj in root.findall('object'):
+        name = obj.find('name').text
+        label = label_ids[name]
+        difficult = int(obj.find('difficult').text)
+        bnd_box = obj.find('bndbox')
+        bbox = [
+            int(bnd_box.find('xmin').text),
+            int(bnd_box.find('ymin').text),
+            int(bnd_box.find('xmax').text),
+            int(bnd_box.find('ymax').text)
+        ]
+        if difficult:
+            bboxes_ignore.append(bbox)
+            labels_ignore.append(label)
+        else:
+            bboxes.append(bbox)
+            labels.append(label)
+    if not bboxes:
+        bboxes = np.zeros((0, 4))
+        labels = np.zeros((0, ))
+    else:
+        bboxes = np.array(bboxes, ndmin=2) - 1
+        labels = np.array(labels)
+    if not bboxes_ignore:
+        bboxes_ignore = np.zeros((0, 4))
+        labels_ignore = np.zeros((0, ))
+    else:
+        bboxes_ignore = np.array(bboxes_ignore, ndmin=2) - 1
+        labels_ignore = np.array(labels_ignore)
+    annotation = {
+        'filename': img_path,
+        'width': w,
+        'height': h,
+        'ann': {
+            'bboxes': bboxes.astype(np.float32),
+            'labels': labels.astype(np.int64),
+            'bboxes_ignore': bboxes_ignore.astype(np.float32),
+            'labels_ignore': labels_ignore.astype(np.int64)
+        }
+    }
+    return annotation
+
+
+def cvt_annotations(devkit_path, years, split, out_file):
+    if not isinstance(years, list):
+        years = [years]
+    annotations = []
+    for year in years:
+        filelist = osp.join(devkit_path,
+                            f'VOC{year}/ImageSets/Main/{split}.txt')
+        if not osp.isfile(filelist):
+            print(f'filelist does not exist: {filelist}, '
+                  f'skip voc{year} {split}')
+            return
+        img_names = mmcv.list_from_file(filelist)
+        xml_paths = [
+            osp.join(devkit_path, f'VOC{year}/Annotations/{img_name}.xml')
+            for img_name in img_names
+        ]
+        img_paths = [
+            f'VOC{year}/JPEGImages/{img_name}.jpg' for img_name in img_names
+        ]
+        part_annotations = mmcv.track_progress(parse_xml,
+                                               list(zip(xml_paths, img_paths)))
+        annotations.extend(part_annotations)
+    if out_file.endswith('json'):
+        annotations = cvt_to_coco_json(annotations)
+    mmcv.dump(annotations, out_file)
+    return annotations
+
+
+def cvt_to_coco_json(annotations):
+    image_id = 0
+    annotation_id = 0
+    coco = dict()
+    coco['images'] = []
+    coco['type'] = 'instance'
+    coco['categories'] = []
+    coco['annotations'] = []
+    image_set = set()
+
+    def addAnnItem(annotation_id, image_id, category_id, bbox, difficult_flag):
+        annotation_item = dict()
+        annotation_item['segmentation'] = []
+
+        seg = []
+        # bbox[] is x1,y1,x2,y2
+        # left_top
+        seg.append(int(bbox[0]))
+        seg.append(int(bbox[1]))
+        # left_bottom
+        seg.append(int(bbox[0]))
+        seg.append(int(bbox[3]))
+        # right_bottom
+        seg.append(int(bbox[2]))
+        seg.append(int(bbox[3]))
+        # right_top
+        seg.append(int(bbox[2]))
+        seg.append(int(bbox[1]))
+
+        annotation_item['segmentation'].append(seg)
+
+        xywh = np.array(
+            [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]])
+        annotation_item['area'] = int(xywh[2] * xywh[3])
+        if difficult_flag == 1:
+            annotation_item['ignore'] = 0
+            annotation_item['iscrowd'] = 1
+        else:
+            annotation_item['ignore'] = 0
+            annotation_item['iscrowd'] = 0
+        annotation_item['image_id'] = int(image_id)
+        annotation_item['bbox'] = xywh.astype(int).tolist()
+        annotation_item['category_id'] = int(category_id)
+        annotation_item['id'] = int(annotation_id)
+        coco['annotations'].append(annotation_item)
+        return annotation_id + 1
+
+    for category_id, name in enumerate(voc_classes()):
+        category_item = dict()
+        category_item['supercategory'] = str('none')
+        category_item['id'] = int(category_id)
+        category_item['name'] = str(name)
+        coco['categories'].append(category_item)
+
+    for ann_dict in annotations:
+        file_name = ann_dict['filename']
+        ann = ann_dict['ann']
+        assert file_name not in image_set
+        image_item = dict()
+        image_item['id'] = int(image_id)
+        image_item['file_name'] = str(file_name)
+        image_item['height'] = int(ann_dict['height'])
+        image_item['width'] = int(ann_dict['width'])
+        coco['images'].append(image_item)
+        image_set.add(file_name)
+
+        bboxes = ann['bboxes'][:, :4]
+        labels = ann['labels']
+        for bbox_id in range(len(bboxes)):
+            bbox = bboxes[bbox_id]
+            label = labels[bbox_id]
+            annotation_id = addAnnItem(
+                annotation_id, image_id, label, bbox, difficult_flag=0)
+
+        bboxes_ignore = ann['bboxes_ignore'][:, :4]
+        labels_ignore = ann['labels_ignore']
+        for bbox_id in range(len(bboxes_ignore)):
+            bbox = bboxes_ignore[bbox_id]
+            label = labels_ignore[bbox_id]
+            annotation_id = addAnnItem(
+                annotation_id, image_id, label, bbox, difficult_flag=1)
+
+        image_id += 1
+
+    return coco
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert PASCAL VOC annotations to mmdetection format')
+    parser.add_argument('devkit_path', help='pascal voc devkit path')
+    parser.add_argument('-o', '--out-dir', help='output path')
+    parser.add_argument(
+        '--out-format',
+        default='pkl',
+        choices=('pkl', 'coco'),
+        help='output format, "coco" indicates coco annotation format')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    devkit_path = args.devkit_path
+    out_dir = args.out_dir if args.out_dir else devkit_path
+    mmcv.mkdir_or_exist(out_dir)
+
+    years = []
+    if osp.isdir(osp.join(devkit_path, 'VOC2007')):
+        years.append('2007')
+    if osp.isdir(osp.join(devkit_path, 'VOC2012')):
+        years.append('2012')
+    if '2007' in years and '2012' in years:
+        years.append(['2007', '2012'])
+    if not years:
+        raise IOError(f'The devkit path {devkit_path} contains neither '
+                      '"VOC2007" nor "VOC2012" subfolder')
+    out_fmt = f'.{args.out_format}'
+    if args.out_format == 'coco':
+        out_fmt = '.json'
+    for year in years:
+        if year == '2007':
+            prefix = 'voc07'
+        elif year == '2012':
+            prefix = 'voc12'
+        elif year == ['2007', '2012']:
+            prefix = 'voc0712'
+        for split in ['train', 'val', 'trainval']:
+            dataset_name = prefix + '_' + split
+            print(f'processing {dataset_name} ...')
+            cvt_annotations(devkit_path, year, split,
+                            osp.join(out_dir, dataset_name + out_fmt))
+        if not isinstance(year, list):
+            dataset_name = prefix + '_test'
+            print(f'processing {dataset_name} ...')
+            cvt_annotations(devkit_path, year, 'test',
+                            osp.join(out_dir, dataset_name + out_fmt))
+    print('Done!')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/deployment/mmdet2torchserve.py b/tools/deployment/mmdet2torchserve.py
new file mode 100755
index 0000000..70a081a
--- /dev/null
+++ b/tools/deployment/mmdet2torchserve.py
@@ -0,0 +1,110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from argparse import ArgumentParser, Namespace
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import mmcv
+
+try:
+    from model_archiver.model_packaging import package_model
+    from model_archiver.model_packaging_utils import ModelExportUtils
+except ImportError:
+    package_model = None
+
+
+def mmdet2torchserve(
+    config_file: str,
+    checkpoint_file: str,
+    output_folder: str,
+    model_name: str,
+    model_version: str = '1.0',
+    force: bool = False,
+):
+    """Converts MMDetection model (config + checkpoint) to TorchServe `.mar`.
+
+    Args:
+        config_file:
+            In MMDetection config format.
+            The contents vary for each task repository.
+        checkpoint_file:
+            In MMDetection checkpoint format.
+            The contents vary for each task repository.
+        output_folder:
+            Folder where `{model_name}.mar` will be created.
+            The file created will be in TorchServe archive format.
+        model_name:
+            If not None, used for naming the `{model_name}.mar` file
+            that will be created under `output_folder`.
+            If None, `{Path(checkpoint_file).stem}` will be used.
+        model_version:
+            Model's version.
+        force:
+            If True, if there is an existing `{model_name}.mar`
+            file under `output_folder` it will be overwritten.
+    """
+    mmcv.mkdir_or_exist(output_folder)
+
+    config = mmcv.Config.fromfile(config_file)
+
+    with TemporaryDirectory() as tmpdir:
+        config.dump(f'{tmpdir}/config.py')
+
+        args = Namespace(
+            **{
+                'model_file': f'{tmpdir}/config.py',
+                'serialized_file': checkpoint_file,
+                'handler': f'{Path(__file__).parent}/mmdet_handler.py',
+                'model_name': model_name or Path(checkpoint_file).stem,
+                'version': model_version,
+                'export_path': output_folder,
+                'force': force,
+                'requirements_file': None,
+                'extra_files': None,
+                'runtime': 'python',
+                'archive_format': 'default'
+            })
+        manifest = ModelExportUtils.generate_manifest_json(args)
+        package_model(args, manifest)
+
+
+def parse_args():
+    """Parse the command-line options of the TorchServe conversion script."""
+    parser = ArgumentParser(
+        description='Convert MMDetection models to TorchServe `.mar` format.')
+    parser.add_argument('config', type=str, help='config file path')
+    parser.add_argument('checkpoint', type=str, help='checkpoint file path')
+    parser.add_argument(
+        '--output-folder',
+        type=str,
+        required=True,
+        help='Folder where `{model_name}.mar` will be created.')
+    parser.add_argument(
+        '--model-name',
+        type=str,
+        default=None,
+        help='If not None, used for naming the `{model_name}.mar` '
+        'file that will be created under `output_folder`. '
+        'If None, `{Path(checkpoint_file).stem}` will be used.')
+    parser.add_argument(
+        '--model-version',
+        type=str,
+        default='1.0',
+        help='Number used for versioning.')
+    parser.add_argument(
+        '-f',
+        '--force',
+        action='store_true',
+        help='overwrite the existing `{model_name}.mar`')
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    # The archiver import is optional at load time, so fail here if missing.
+    if package_model is None:
+        raise ImportError('`torch-model-archiver` is required. '
+                          'Try: pip install torch-model-archiver')
+
+    mmdet2torchserve(args.config, args.checkpoint, args.output_folder,
+                     args.model_name, args.model_version, args.force)
diff --git a/tools/deployment/mmdet_handler.py b/tools/deployment/mmdet_handler.py
new file mode 100755
index 0000000..18fc230
--- /dev/null
+++ b/tools/deployment/mmdet_handler.py
@@ -0,0 +1,71 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import base64
+import os
+
+import mmcv
+import torch
+from ts.torch_handler.base_handler import BaseHandler
+
+from mmdet.apis import inference_detector, init_detector
+
+
+class MMdetHandler(BaseHandler):
+    threshold = 0.5
+
+    def initialize(self, context):
+        properties = context.system_properties
+        self.map_location = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.device = torch.device(self.map_location + ':' +
+                                   str(properties.get('gpu_id')) if torch.cuda.
+                                   is_available() else self.map_location)
+        self.manifest = context.manifest
+
+        model_dir = properties.get('model_dir')
+        serialized_file = self.manifest['model']['serializedFile']
+        checkpoint = os.path.join(model_dir, serialized_file)
+        self.config_file = os.path.join(model_dir, 'config.py')
+
+        self.model = init_detector(self.config_file, checkpoint, self.device)
+        self.initialized = True
+
+    def preprocess(self, data):
+        images = []
+
+        for row in data:
+            image = row.get('data') or row.get('body')
+            if isinstance(image, str):
+                image = base64.b64decode(image)
+            image = mmcv.imfrombytes(image)
+            images.append(image)
+
+        return images
+
+    def inference(self, data, *args, **kwargs):
+        results = inference_detector(self.model, data)
+        return results
+
+    def postprocess(self, data):
+        # Format output following the example ObjectDetectionHandler format
+        output = []
+        for image_index, image_result in enumerate(data):
+            output.append([])
+            if isinstance(image_result, tuple):
+                bbox_result, segm_result = image_result
+                if isinstance(segm_result, tuple):
+                    segm_result = segm_result[0]  # ms rcnn
+            else:
+                bbox_result, segm_result = image_result, None
+
+            for class_index, class_result in enumerate(bbox_result):
+                class_name = self.model.CLASSES[class_index]
+                for bbox in class_result:
+                    bbox_coords = bbox[:-1].tolist()
+                    score = float(bbox[-1])
+                    if score >= self.threshold:
+                        output[image_index].append({
+                            'class_name': class_name,
+                            'bbox': bbox_coords,
+                            'score': score
+                        })
+
+        return output
diff --git a/tools/deployment/onnx2tensorrt.py b/tools/deployment/onnx2tensorrt.py
new file mode 100755
index 0000000..b59e52a
--- /dev/null
+++ b/tools/deployment/onnx2tensorrt.py
@@ -0,0 +1,266 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+import os.path as osp
+import warnings
+
+import numpy as np
+import onnx
+import torch
+from mmcv import Config
+from mmcv.tensorrt import is_tensorrt_plugin_loaded, onnx2trt, save_trt_engine
+
+from mmdet.core.export import preprocess_example_input
+from mmdet.core.export.model_wrappers import (ONNXRuntimeDetector,
+                                              TensorRTDetector)
+from mmdet.datasets import DATASETS
+
+
+def get_GiB(x: int):
+    """return x GiB."""
+    return x * (1 << 30)
+
+
+def onnx2tensorrt(onnx_file,
+                  trt_file,
+                  input_config,
+                  verify=False,
+                  show=False,
+                  workspace_size=1,
+                  verbose=False):
+    """Build and save a TensorRT engine; optionally check it vs ONNXRuntime."""
+    import tensorrt as trt
+    onnx_model = onnx.load(onnx_file)
+    max_shape = input_config['max_shape']
+    min_shape = input_config['min_shape']
+    opt_shape = input_config['opt_shape']
+    # create trt engine and wrapper
+    opt_shape_dict = {'input': [min_shape, opt_shape, max_shape]}
+    trt_engine = onnx2trt(
+        onnx_model,
+        opt_shape_dict,
+        log_level=trt.Logger.VERBOSE if verbose else trt.Logger.ERROR,
+        fp16_mode=False,
+        max_workspace_size=get_GiB(workspace_size))
+    save_dir, _ = osp.split(trt_file)
+    if save_dir:
+        os.makedirs(save_dir, exist_ok=True)
+    save_trt_engine(trt_engine, trt_file)
+    print(f'Successfully created TensorRT engine: {trt_file}')
+
+    if verify:
+        # prepare input
+        one_img, one_meta = preprocess_example_input(input_config)
+        img_list, img_meta_list = [one_img], [[one_meta]]
+        img_list = [_.cuda().contiguous() for _ in img_list]
+
+        # wrap ONNX and TensorRT model
+        onnx_model = ONNXRuntimeDetector(onnx_file, CLASSES, device_id=0)
+        trt_model = TensorRTDetector(trt_file, CLASSES, device_id=0)
+
+        # inference with wrapped model
+        with torch.no_grad():
+            onnx_results = onnx_model(
+                img_list, img_metas=img_meta_list, return_loss=False)[0]
+            trt_results = trt_model(
+                img_list, img_metas=img_meta_list, return_loss=False)[0]
+
+        if show:
+            out_file_ort, out_file_trt = None, None
+        else:
+            out_file_ort, out_file_trt = 'show-ort.png', 'show-trt.png'
+        show_img = one_meta['show_img']
+        score_thr = 0.3
+        onnx_model.show_result(
+            show_img,
+            onnx_results,
+            score_thr=score_thr,
+            show=True,
+            win_name='ONNXRuntime',
+            out_file=out_file_ort)
+        trt_model.show_result(
+            show_img,
+            trt_results,
+            score_thr=score_thr,
+            show=True,
+            win_name='TensorRT',
+            out_file=out_file_trt)
+        with_mask = trt_model.with_masks
+        # compare a part of result
+        if with_mask:
+            compare_pairs = list(zip(onnx_results, trt_results))
+        else:
+            compare_pairs = [(onnx_results, trt_results)]
+        err_msg = 'The numerical values are different between ONNXRuntime' + \
+                  ' and TensorRT, but it does not necessarily mean the' + \
+                  ' exported TensorRT engine is problematic.'
+        # check the numerical value
+        for onnx_res, trt_res in compare_pairs:
+            for o_res, t_res in zip(onnx_res, trt_res):
+                np.testing.assert_allclose(
+                    o_res, t_res, rtol=1e-03, atol=1e-05, err_msg=err_msg)
+        print('The numerical values are the same between ONNXRuntime and '
+              'TensorRT')
+
+
+def parse_normalize_cfg(test_pipeline):
+    transforms = None
+    for pipeline in test_pipeline:
+        if 'transforms' in pipeline:
+            transforms = pipeline['transforms']
+            break
+    assert transforms is not None, 'Failed to find `transforms`'
+    norm_config_li = [_ for _ in transforms if _['type'] == 'Normalize']
+    assert len(norm_config_li) == 1, '`norm_config` should only have one'
+    norm_config = norm_config_li[0]
+    return norm_config
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert MMDetection models from ONNX to TensorRT')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('model', help='Filename of input ONNX model')
+    parser.add_argument(
+        '--trt-file',
+        type=str,
+        default='tmp.trt',
+        help='Filename of output TensorRT engine')
+    parser.add_argument(
+        '--input-img', type=str, default='', help='Image for test')
+    parser.add_argument(
+        '--show', action='store_true', help='Whether to show output results')
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        default='coco',
+        help='Dataset name. This argument is deprecated and will be \
+        removed in future releases.')
+    parser.add_argument(
+        '--verify',
+        action='store_true',
+        help='Verify the outputs of ONNXRuntime and TensorRT')
+    parser.add_argument(
+        '--verbose',
+        action='store_true',
+        help='Whether to verbose logging messages while creating \
+                TensorRT engine. Defaults to False.')
+    parser.add_argument(
+        '--to-rgb',
+        action='store_false',
+        help='Feed model with RGB or BGR image. Default is RGB. This \
+        argument is deprecated and will be removed in future releases.')
+    parser.add_argument(
+        '--shape',
+        type=int,
+        nargs='+',
+        default=[400, 600],
+        help='Input size of the model')
+    parser.add_argument(
+        '--mean',
+        type=float,
+        nargs='+',
+        default=[123.675, 116.28, 103.53],
+        help='Mean value used for preprocess input data. This argument \
+        is deprecated and will be removed in future releases.')
+    parser.add_argument(
+        '--std',
+        type=float,
+        nargs='+',
+        default=[58.395, 57.12, 57.375],
+        help='Variance value used for preprocess input data. \
+        This argument is deprecated and will be removed in future releases.')
+    parser.add_argument(
+        '--min-shape',
+        type=int,
+        nargs='+',
+        default=None,
+        help='Minimum input size of the model in TensorRT')
+    parser.add_argument(
+        '--max-shape',
+        type=int,
+        nargs='+',
+        default=None,
+        help='Maximum input size of the model in TensorRT')
+    parser.add_argument(
+        '--workspace-size',
+        type=int,
+        default=1,
+        help='Max workspace size in GiB')
+
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == '__main__':
+
+    assert is_tensorrt_plugin_loaded(), 'TensorRT plugin should be compiled.'
+    args = parse_args()
+    warnings.warn(
+        'Arguments like `--to-rgb`, `--mean`, `--std`, `--dataset` would be \
+        parsed directly from config file and are deprecated and will be \
+        removed in future releases.')
+    if not args.input_img:
+        args.input_img = osp.join(osp.dirname(__file__), '../../demo/demo.jpg')
+
+    cfg = Config.fromfile(args.config)
+
+    def parse_shape(shape):
+        """Expand ``[S]`` or ``[H, W]`` into the NCHW tuple (1, 3, H, W)."""
+        if len(shape) == 1:
+            return (1, 3, shape[0], shape[0])
+        # Inspect `shape` itself so --min-shape/--max-shape are checked too.
+        if len(shape) == 2:
+            return (1, 3) + tuple(shape)
+        raise ValueError('invalid input shape')
+
+    if args.shape:
+        input_shape = parse_shape(args.shape)
+    else:
+        img_scale = cfg.test_pipeline[1]['img_scale']
+        input_shape = (1, 3, img_scale[1], img_scale[0])
+
+    if not args.max_shape:
+        max_shape = input_shape
+    else:
+        max_shape = parse_shape(args.max_shape)
+
+    if not args.min_shape:
+        min_shape = input_shape
+    else:
+        min_shape = parse_shape(args.min_shape)
+
+    dataset = DATASETS.get(cfg.data.test['type'])
+    assert (dataset is not None)
+    CLASSES = dataset.CLASSES
+    normalize_cfg = parse_normalize_cfg(cfg.test_pipeline)
+
+    input_config = {
+        'min_shape': min_shape,
+        'opt_shape': input_shape,
+        'max_shape': max_shape,
+        'input_shape': input_shape,
+        'input_path': args.input_img,
+        'normalize_cfg': normalize_cfg
+    }
+    # Create TensorRT engine
+    onnx2tensorrt(
+        args.model,
+        args.trt_file,
+        input_config,
+        verify=args.verify,
+        show=args.show,
+        workspace_size=args.workspace_size,
+        verbose=args.verbose)
+
+    # Following strings of text style are from colorama package
+    bright_style, reset_style = '\x1b[1m', '\x1b[0m'
+    red_text, blue_text = '\x1b[31m', '\x1b[34m'
+    white_background = '\x1b[107m'
+
+    msg = white_background + bright_style + red_text
+    msg += 'DeprecationWarning: This tool will be deprecated in future. '
+    msg += blue_text + 'Welcome to use the unified model deployment toolbox '
+    msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy'
+    msg += reset_style
+    warnings.warn(msg)
diff --git a/tools/deployment/pytorch2onnx.py b/tools/deployment/pytorch2onnx.py
new file mode 100755
index 0000000..ee856cc
--- /dev/null
+++ b/tools/deployment/pytorch2onnx.py
@@ -0,0 +1,343 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+import warnings
+from functools import partial
+
+import numpy as np
+import onnx
+import torch
+from mmcv import Config, DictAction
+
+from mmdet.core.export import build_model_from_cfg, preprocess_example_input
+from mmdet.core.export.model_wrappers import ONNXRuntimeDetector
+
+
+def pytorch2onnx(model,
+                 input_img,
+                 input_shape,
+                 normalize_cfg,
+                 opset_version=11,
+                 show=False,
+                 output_file='tmp.onnx',
+                 verify=False,
+                 test_img=None,
+                 do_simplify=False,
+                 dynamic_export=None,
+                 skip_postprocess=False):
+
+    input_config = {
+        'input_shape': input_shape,
+        'input_path': input_img,
+        'normalize_cfg': normalize_cfg
+    }
+    # prepare input
+    one_img, one_meta = preprocess_example_input(input_config)
+    img_list, img_meta_list = [one_img], [[one_meta]]
+
+    if skip_postprocess:
+        warnings.warn('Not all models support export onnx without post '
+                      'process, especially two stage detectors!')
+        model.forward = model.forward_dummy
+        torch.onnx.export(
+            model,
+            one_img,
+            output_file,
+            input_names=['input'],
+            export_params=True,
+            keep_initializers_as_inputs=True,
+            do_constant_folding=True,
+            verbose=show,
+            opset_version=opset_version)
+
+        print(f'Successfully exported ONNX model without '
+              f'post process: {output_file}')
+        return
+
+    # replace original forward function
+    origin_forward = model.forward
+    model.forward = partial(
+        model.forward,
+        img_metas=img_meta_list,
+        return_loss=False,
+        rescale=False)
+
+    output_names = ['dets', 'labels']
+    if model.with_mask:
+        output_names.append('masks')
+    input_name = 'input'
+    dynamic_axes = None
+    if dynamic_export:
+        dynamic_axes = {
+            input_name: {
+                0: 'batch',
+                2: 'height',
+                3: 'width'
+            },
+            'dets': {
+                0: 'batch',
+                1: 'num_dets',
+            },
+            'labels': {
+                0: 'batch',
+                1: 'num_dets',
+            },
+        }
+        if model.with_mask:
+            dynamic_axes['masks'] = {0: 'batch', 1: 'num_dets'}
+
+    torch.onnx.export(
+        model,
+        img_list,
+        output_file,
+        input_names=[input_name],
+        output_names=output_names,
+        export_params=True,
+        keep_initializers_as_inputs=True,
+        do_constant_folding=True,
+        verbose=show,
+        opset_version=opset_version,
+        dynamic_axes=dynamic_axes)
+
+    model.forward = origin_forward
+
+    if do_simplify:
+        import onnxsim
+
+        from mmdet import digit_version
+
+        min_required_version = '0.4.0'
+        assert digit_version(onnxsim.__version__) >= digit_version(
+            min_required_version
+        ), f'Requires to install onnxsim>={min_required_version}'
+
+        model_opt, check_ok = onnxsim.simplify(output_file)
+        if check_ok:
+            onnx.save(model_opt, output_file)
+            print(f'Successfully simplified ONNX model: {output_file}')
+        else:
+            warnings.warn('Failed to simplify ONNX model.')
+    print(f'Successfully exported ONNX model: {output_file}')
+
+    if verify:
+        # check by onnx
+        onnx_model = onnx.load(output_file)
+        onnx.checker.check_model(onnx_model)
+
+        # wrap onnx model
+        onnx_model = ONNXRuntimeDetector(output_file, model.CLASSES, 0)
+        if dynamic_export:
+            # scale up to test dynamic shape
+            h, w = [int((_ * 1.5) // 32 * 32) for _ in input_shape[2:]]
+            h, w = min(1344, h), min(1344, w)
+            input_config['input_shape'] = (1, 3, h, w)
+
+        if test_img is None:
+            input_config['input_path'] = input_img
+
+        # prepare input once again
+        one_img, one_meta = preprocess_example_input(input_config)
+        img_list, img_meta_list = [one_img], [[one_meta]]
+
+        # get pytorch output
+        with torch.no_grad():
+            pytorch_results = model(
+                img_list,
+                img_metas=img_meta_list,
+                return_loss=False,
+                rescale=True)[0]
+
+        img_list = [_.cuda().contiguous() for _ in img_list]
+        if dynamic_export:
+            img_list = img_list + [_.flip(-1).contiguous() for _ in img_list]
+            img_meta_list = img_meta_list * 2
+        # get onnx output
+        onnx_results = onnx_model(
+            img_list, img_metas=img_meta_list, return_loss=False)[0]
+        # visualize predictions
+        score_thr = 0.3
+        if show:
+            out_file_ort, out_file_pt = None, None
+        else:
+            out_file_ort, out_file_pt = 'show-ort.png', 'show-pt.png'
+
+        show_img = one_meta['show_img']
+        model.show_result(
+            show_img,
+            pytorch_results,
+            score_thr=score_thr,
+            show=True,
+            win_name='PyTorch',
+            out_file=out_file_pt)
+        onnx_model.show_result(
+            show_img,
+            onnx_results,
+            score_thr=score_thr,
+            show=True,
+            win_name='ONNXRuntime',
+            out_file=out_file_ort)
+
+        # compare a part of result
+        if model.with_mask:
+            compare_pairs = list(zip(onnx_results, pytorch_results))
+        else:
+            compare_pairs = [(onnx_results, pytorch_results)]
+        err_msg = 'The numerical values are different between Pytorch' + \
+                  ' and ONNX, but it does not necessarily mean the' + \
+                  ' exported ONNX model is problematic.'
+        # check the numerical value
+        for onnx_res, pytorch_res in compare_pairs:
+            for o_res, p_res in zip(onnx_res, pytorch_res):
+                np.testing.assert_allclose(
+                    o_res, p_res, rtol=1e-03, atol=1e-05, err_msg=err_msg)
+        print('The numerical values are the same between Pytorch and ONNX')
+
+
+def parse_normalize_cfg(test_pipeline):
+    transforms = None
+    for pipeline in test_pipeline:
+        if 'transforms' in pipeline:
+            transforms = pipeline['transforms']
+            break
+    assert transforms is not None, 'Failed to find `transforms`'
+    norm_config_li = [_ for _ in transforms if _['type'] == 'Normalize']
+    assert len(norm_config_li) == 1, '`norm_config` should only have one'
+    norm_config = norm_config_li[0]
+    return norm_config
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert MMDetection models to ONNX')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument('--input-img', type=str, help='Images for input')
+    parser.add_argument(
+        '--show',
+        action='store_true',
+        help='Show onnx graph and detection outputs')
+    parser.add_argument('--output-file', type=str, default='tmp.onnx')
+    parser.add_argument('--opset-version', type=int, default=11)
+    parser.add_argument(
+        '--test-img', type=str, default=None, help='Images for test')
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        default='coco',
+        help='Dataset name. This argument is deprecated and will be removed \
+        in future releases.')
+    parser.add_argument(
+        '--verify',
+        action='store_true',
+        help='verify the onnx model output against pytorch output')
+    parser.add_argument(
+        '--simplify',
+        action='store_true',
+        help='Whether to simplify onnx model.')
+    parser.add_argument(
+        '--shape',
+        type=int,
+        nargs='+',
+        default=[800, 1216],
+        help='input image size')
+    parser.add_argument(
+        '--mean',
+        type=float,
+        nargs='+',
+        default=[123.675, 116.28, 103.53],
+        help='mean value used for preprocess input data.This argument \
+        is deprecated and will be removed in future releases.')
+    parser.add_argument(
+        '--std',
+        type=float,
+        nargs='+',
+        default=[58.395, 57.12, 57.375],
+        help='variance value used for preprocess input data. '
+        'This argument is deprecated and will be removed in future releases.')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='Override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--dynamic-export',
+        action='store_true',
+        help='Whether to export onnx with dynamic axis.')
+    parser.add_argument(
+        '--skip-postprocess',
+        action='store_true',
+        help='Whether to export model without post process. Experimental '
+        'option. We do not guarantee the correctness of the exported '
+        'model.')
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    warnings.warn('Arguments like `--mean`, `--std`, `--dataset` would be \
+        parsed directly from config file and are deprecated and \
+        will be removed in future releases.')
+
+    assert args.opset_version == 11, 'MMDet only support opset 11 now'
+
+    try:
+        from mmcv.onnx.symbolic import register_extra_symbolics
+    except ModuleNotFoundError:
+        raise NotImplementedError('please update mmcv to version>=v1.0.4')
+    register_extra_symbolics(args.opset_version)
+
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    if args.shape is None:
+        img_scale = cfg.test_pipeline[1]['img_scale']
+        input_shape = (1, 3, img_scale[1], img_scale[0])
+    elif len(args.shape) == 1:
+        input_shape = (1, 3, args.shape[0], args.shape[0])
+    elif len(args.shape) == 2:
+        input_shape = (1, 3) + tuple(args.shape)
+    else:
+        raise ValueError('invalid input shape')
+
+    # build the model and load checkpoint
+    model = build_model_from_cfg(args.config, args.checkpoint,
+                                 args.cfg_options)
+
+    if not args.input_img:
+        args.input_img = osp.join(osp.dirname(__file__), '../../demo/demo.jpg')
+
+    normalize_cfg = parse_normalize_cfg(cfg.test_pipeline)
+
+    # convert model to onnx file
+    pytorch2onnx(
+        model,
+        args.input_img,
+        input_shape,
+        normalize_cfg,
+        opset_version=args.opset_version,
+        show=args.show,
+        output_file=args.output_file,
+        verify=args.verify,
+        test_img=args.test_img,
+        do_simplify=args.simplify,
+        dynamic_export=args.dynamic_export,
+        skip_postprocess=args.skip_postprocess)
+
+    # Following strings of text style are from colorama package
+    bright_style, reset_style = '\x1b[1m', '\x1b[0m'
+    red_text, blue_text = '\x1b[31m', '\x1b[34m'
+    white_background = '\x1b[107m'
+
+    msg = white_background + bright_style + red_text
+    msg += 'DeprecationWarning: This tool will be deprecated in future. '
+    msg += blue_text + 'Welcome to use the unified model deployment toolbox '
+    msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy'
+    msg += reset_style
+    warnings.warn(msg)
diff --git a/tools/deployment/test.py b/tools/deployment/test.py
new file mode 100755
index 0000000..db8d696
--- /dev/null
+++ b/tools/deployment/test.py
@@ -0,0 +1,157 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import warnings
+
+import mmcv
+from mmcv import Config, DictAction
+from mmcv.parallel import MMDataParallel
+
+from mmdet.apis import single_gpu_test
+from mmdet.datasets import (build_dataloader, build_dataset,
+                            replace_ImageToTensor)
+from mmdet.utils import compat_cfg
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='MMDet test (and eval) an ONNX model using ONNXRuntime')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('model', help='Input model file')
+    parser.add_argument('--out', help='output result file in pickle format')
+    parser.add_argument(
+        '--format-only',
+        action='store_true',
+        help='Format the output results without perform evaluation. It is'
+        'useful when you want to format the result to a specific format and '
+        'submit it to the test server')
+    parser.add_argument(
+        '--backend',
+        required=True,
+        choices=['onnxruntime', 'tensorrt'],
+        help='Backend for input model to run. ')
+    parser.add_argument(
+        '--eval',
+        type=str,
+        nargs='+',
+        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
+        ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
+    parser.add_argument('--show', action='store_true', help='show results')
+    parser.add_argument(
+        '--show-dir', help='directory where painted images will be saved')
+    parser.add_argument(
+        '--show-score-thr',
+        type=float,
+        default=0.3,
+        help='score threshold (default: 0.3)')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--eval-options',
+        nargs='+',
+        action=DictAction,
+        help='custom options for evaluation, the key-value pair in xxx=yyy '
+        'format will be kwargs for dataset.evaluate() function')
+
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+
+    assert args.out or args.eval or args.format_only or args.show \
+        or args.show_dir, \
+        ('Please specify at least one operation (save/eval/format/show the '
+         'results / save the results) with the argument "--out", "--eval"'
+         ', "--format-only", "--show" or "--show-dir"')
+
+    if args.eval and args.format_only:
+        raise ValueError('--eval and --format_only cannot be both specified')
+
+    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
+        raise ValueError('The output file must be a pkl file.')
+
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+    cfg = compat_cfg(cfg)
+    # in case the test dataset is concatenated
+    samples_per_gpu = 1
+    if isinstance(cfg.data.test, dict):
+        cfg.data.test.test_mode = True
+        samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
+        if samples_per_gpu > 1:
+            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
+            cfg.data.test.pipeline = replace_ImageToTensor(
+                cfg.data.test.pipeline)
+    elif isinstance(cfg.data.test, list):
+        for ds_cfg in cfg.data.test:
+            ds_cfg.test_mode = True
+        samples_per_gpu = max(
+            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
+        if samples_per_gpu > 1:
+            for ds_cfg in cfg.data.test:
+                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
+
+    # build the dataloader
+    dataset = build_dataset(cfg.data.test)
+    data_loader = build_dataloader(
+        dataset,
+        samples_per_gpu=samples_per_gpu,
+        workers_per_gpu=cfg.data.workers_per_gpu,
+        dist=False,
+        shuffle=False)
+
+    if args.backend == 'onnxruntime':
+        from mmdet.core.export.model_wrappers import ONNXRuntimeDetector
+        model = ONNXRuntimeDetector(
+            args.model, class_names=dataset.CLASSES, device_id=0)
+    elif args.backend == 'tensorrt':
+        from mmdet.core.export.model_wrappers import TensorRTDetector
+        model = TensorRTDetector(
+            args.model, class_names=dataset.CLASSES, device_id=0)
+
+    model = MMDataParallel(model, device_ids=[0])
+    outputs = single_gpu_test(model, data_loader, args.show, args.show_dir,
+                              args.show_score_thr)
+
+    if args.out:
+        print(f'\nwriting results to {args.out}')
+        mmcv.dump(outputs, args.out)
+    kwargs = {} if args.eval_options is None else args.eval_options
+    if args.format_only:
+        dataset.format_results(outputs, **kwargs)
+    if args.eval:
+        eval_kwargs = cfg.get('evaluation', {}).copy()
+        # hard-code way to remove EvalHook args
+        for key in [
+                'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
+                'rule'
+        ]:
+            eval_kwargs.pop(key, None)
+        eval_kwargs.update(dict(metric=args.eval, **kwargs))
+        print(dataset.evaluate(outputs, **eval_kwargs))
+
+
+if __name__ == '__main__':
+    main()
+
+    # Following strings of text style are from colorama package
+    bright_style, reset_style = '\x1b[1m', '\x1b[0m'
+    red_text, blue_text = '\x1b[31m', '\x1b[34m'
+    white_background = '\x1b[107m'
+
+    msg = white_background + bright_style + red_text
+    msg += 'DeprecationWarning: This tool will be deprecated in future. '
+    msg += blue_text + 'Welcome to use the unified model deployment toolbox '
+    msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy'
+    msg += reset_style
+    warnings.warn(msg)
diff --git a/tools/deployment/test_torchserver.py b/tools/deployment/test_torchserver.py
new file mode 100755
index 0000000..dd45234
--- /dev/null
+++ b/tools/deployment/test_torchserver.py
@@ -0,0 +1,74 @@
+from argparse import ArgumentParser
+
+import numpy as np
+import requests
+
+from mmdet.apis import inference_detector, init_detector, show_result_pyplot
+from mmdet.core import bbox2result
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('img', help='Image file')
+    parser.add_argument('config', help='Config file')
+    parser.add_argument('checkpoint', help='Checkpoint file')
+    parser.add_argument('model_name', help='The model name in the server')
+    parser.add_argument(
+        '--inference-addr',
+        default='127.0.0.1:8080',
+        help='Address and port of the inference server')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for inference')
+    parser.add_argument(
+        '--score-thr', type=float, default=0.5, help='bbox score threshold')
+    args = parser.parse_args()
+    return args
+
+
+def parse_result(input, model_class):
+    bbox = []
+    label = []
+    score = []
+    for anchor in input:
+        bbox.append(anchor['bbox'])
+        label.append(model_class.index(anchor['class_name']))
+        score.append([anchor['score']])
+    bboxes = np.append(bbox, score, axis=1)
+    labels = np.array(label)
+    result = bbox2result(bboxes, labels, len(model_class))
+    return result
+
+
+def main(args):
+    # build the model from a config file and a checkpoint file
+    model = init_detector(args.config, args.checkpoint, device=args.device)
+    # test a single image
+    model_result = inference_detector(model, args.img)
+    for i, anchor_set in enumerate(model_result):
+        anchor_set = anchor_set[anchor_set[:, 4] >= 0.5]
+        model_result[i] = anchor_set
+    # show the results
+    show_result_pyplot(
+        model,
+        args.img,
+        model_result,
+        score_thr=args.score_thr,
+        title='pytorch_result')
+    url = 'http://' + args.inference_addr + '/predictions/' + args.model_name
+    with open(args.img, 'rb') as image:
+        response = requests.post(url, image)
+    server_result = parse_result(response.json(), model.CLASSES)
+    show_result_pyplot(
+        model,
+        args.img,
+        server_result,
+        score_thr=args.score_thr,
+        title='server_result')
+
+    for i in range(len(model.CLASSES)):
+        assert np.allclose(model_result[i], server_result[i])
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(args)
diff --git a/tools/dist_test.sh b/tools/dist_test.sh
new file mode 100755
index 0000000..dea131b
--- /dev/null
+++ b/tools/dist_test.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Usage: dist_test.sh CONFIG CHECKPOINT GPUS [extra test.py arguments...]
+CONFIG=$1
+CHECKPOINT=$2
+GPUS=$3
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
+PORT=${PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+# Every expansion is quoted so paths/arguments with spaces stay intact.
+PYTHONPATH="$(dirname "$0")/..:$PYTHONPATH" \
+python -m torch.distributed.launch \
+    --nnodes="$NNODES" \
+    --node_rank="$NODE_RANK" \
+    --master_addr="$MASTER_ADDR" \
+    --nproc_per_node="$GPUS" \
+    --master_port="$PORT" \
+    "$(dirname "$0")/test.py" \
+    "$CONFIG" \
+    "$CHECKPOINT" \
+    --launcher pytorch \
+    "${@:4}"
diff --git a/tools/dist_train.sh b/tools/dist_train.sh
new file mode 100755
index 0000000..039175e
--- /dev/null
+++ b/tools/dist_train.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Usage: dist_train.sh CONFIG GPUS [extra train.py arguments...]
+CONFIG=$1
+GPUS=$2
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
+PORT=${PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+# Every expansion is quoted so paths/arguments with spaces stay intact.
+PYTHONPATH="$(dirname "$0")/..:$PYTHONPATH" \
+python -m torch.distributed.launch \
+    --nnodes="$NNODES" \
+    --node_rank="$NODE_RANK" \
+    --master_addr="$MASTER_ADDR" \
+    --nproc_per_node="$GPUS" \
+    --master_port="$PORT" \
+    "$(dirname "$0")/train.py" \
+    "$CONFIG" \
+    --seed 0 \
+    --launcher pytorch "${@:3}" \
+    --auto-scale-lr
diff --git a/tools/misc/browse_dataset.py b/tools/misc/browse_dataset.py
new file mode 100755
index 0000000..d9fb285
--- /dev/null
+++ b/tools/misc/browse_dataset.py
@@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+from collections.abc import Sequence
+from pathlib import Path
+
+import mmcv
+import numpy as np
+from mmcv import Config, DictAction
+
+from mmdet.core.utils import mask2ndarray
+from mmdet.core.visualization import imshow_det_bboxes
+from mmdet.datasets.builder import build_dataset
+from mmdet.utils import replace_cfg_vals, update_data_root
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Browse a dataset')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument(
+        '--skip-type',
+        type=str,
+        nargs='+',
+        default=['DefaultFormatBundle', 'Normalize', 'Collect'],
+        help='skip some useless pipeline')
+    parser.add_argument(
+        '--output-dir',
+        default=None,
+        type=str,
+        help='If there is no display interface, you can save it')
+    parser.add_argument('--not-show', default=False, action='store_true')
+    parser.add_argument(
+        '--show-interval',
+        type=float,
+        default=2,
+        help='the interval of show (s)')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+
+def retrieve_data_cfg(config_path, skip_type, cfg_options):
+
+    def skip_pipeline_steps(config):
+        config['pipeline'] = [
+            x for x in config.pipeline if x['type'] not in skip_type
+        ]
+
+    cfg = Config.fromfile(config_path)
+
+    # replace the ${key} with the value of cfg.key
+    cfg = replace_cfg_vals(cfg)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
+    if cfg_options is not None:
+        cfg.merge_from_dict(cfg_options)
+    train_data_cfg = cfg.data.train
+    while 'dataset' in train_data_cfg and train_data_cfg[
+            'type'] != 'MultiImageMixDataset':
+        train_data_cfg = train_data_cfg['dataset']
+
+    if isinstance(train_data_cfg, Sequence):
+        [skip_pipeline_steps(c) for c in train_data_cfg]
+    else:
+        skip_pipeline_steps(train_data_cfg)
+
+    return cfg
+
+
+def main():
+    args = parse_args()
+    cfg = retrieve_data_cfg(args.config, args.skip_type, args.cfg_options)
+
+    if 'gt_semantic_seg' in cfg.train_pipeline[-1]['keys']:
+        cfg.data.train.pipeline = [
+            p for p in cfg.data.train.pipeline if p['type'] != 'SegRescale'
+        ]
+    dataset = build_dataset(cfg.data.train)
+
+    progress_bar = mmcv.ProgressBar(len(dataset))
+
+    for item in dataset:
+        filename = os.path.join(args.output_dir,
+                                Path(item['filename']).name
+                                ) if args.output_dir is not None else None
+
+        gt_bboxes = item['gt_bboxes']
+        gt_labels = item['gt_labels']
+        gt_masks = item.get('gt_masks', None)
+        if gt_masks is not None:
+            gt_masks = mask2ndarray(gt_masks)
+
+        gt_seg = item.get('gt_semantic_seg', None)
+        if gt_seg is not None:
+            pad_value = 255  # the padding value of gt_seg
+            sem_labels = np.unique(gt_seg)
+            all_labels = np.concatenate((gt_labels, sem_labels), axis=0)
+            all_labels, counts = np.unique(all_labels, return_counts=True)
+            stuff_labels = all_labels[np.logical_and(counts < 2,
+                                                     all_labels != pad_value)]
+            stuff_masks = gt_seg[None] == stuff_labels[:, None, None]
+            gt_labels = np.concatenate((gt_labels, stuff_labels), axis=0)
+            gt_masks = np.concatenate((gt_masks, stuff_masks.astype(np.uint8)),
+                                      axis=0)
+            # If you need to show the bounding boxes,
+            # please comment the following line
+            gt_bboxes = None
+
+        imshow_det_bboxes(
+            item['img'],
+            gt_bboxes,
+            gt_labels,
+            gt_masks,
+            class_names=dataset.CLASSES,
+            show=not args.not_show,
+            wait_time=args.show_interval,
+            out_file=filename,
+            bbox_color=dataset.PALETTE,
+            text_color=(200, 200, 200),
+            mask_color=dataset.PALETTE)
+
+        progress_bar.update()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/misc/download_dataset.py b/tools/misc/download_dataset.py
new file mode 100755
index 0000000..ac37cd2
--- /dev/null
+++ b/tools/misc/download_dataset.py
@@ -0,0 +1,190 @@
+import argparse
+import tarfile
+from itertools import repeat
+from multiprocessing.pool import ThreadPool
+from pathlib import Path
+from tarfile import TarFile
+from zipfile import ZipFile
+
+import torch
+from mmcv.utils.path import mkdir_or_exist
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Download datasets for training')
+    parser.add_argument(
+        '--dataset-name', type=str, help='dataset name', default='coco2017')
+    parser.add_argument(
+        '--save-dir',
+        type=str,
+        help='the dir to save dataset',
+        default='data/coco')
+    parser.add_argument(
+        '--unzip',
+        action='store_true',
+        help='whether unzip dataset or not, zipped files will be saved')
+    parser.add_argument(
+        '--delete',
+        action='store_true',
+        help='delete the download zipped files')
+    parser.add_argument(
+        '--threads', type=int, help='number of threading', default=4)
+    args = parser.parse_args()
+    return args
+
+
+def download(url, dir, unzip=True, delete=False, threads=1):
+
+    def download_one(url, dir):
+        f = dir / Path(url).name
+        if Path(url).is_file():
+            Path(url).rename(f)
+        elif not f.exists():
+            print(f'Downloading {url} to {f}')
+            torch.hub.download_url_to_file(url, f, progress=True)
+        if unzip and f.suffix in ('.zip', '.tar'):
+            print(f'Unzipping {f.name}')
+            if f.suffix == '.zip':
+                ZipFile(f).extractall(path=dir)
+            elif f.suffix == '.tar':
+                TarFile(f).extractall(path=dir)
+            if delete:
+                f.unlink()
+                print(f'Delete {f}')
+
+    dir = Path(dir)
+    if threads > 1:
+        pool = ThreadPool(threads)
+        pool.imap(lambda x: download_one(*x), zip(url, repeat(dir)))
+        pool.close()
+        pool.join()
+    else:
+        for u in [url] if isinstance(url, (str, Path)) else url:
+            download_one(u, dir)
+
+
+def download_objects365v2(url, dir, unzip=True, delete=False, threads=1):
+
+    def download_single(url, dir):
+
+        if 'train' in url:
+            saving_dir = dir / Path('train_zip')
+            mkdir_or_exist(saving_dir)
+            f = saving_dir / Path(url).name
+
+            unzip_dir = dir / Path('train')
+            mkdir_or_exist(unzip_dir)
+        elif 'val' in url:
+            saving_dir = dir / Path('val')
+            mkdir_or_exist(saving_dir)
+            f = saving_dir / Path(url).name
+
+            unzip_dir = dir / Path('val')
+            mkdir_or_exist(unzip_dir)
+        else:
+            raise NotImplementedError
+
+        if Path(url).is_file():
+            Path(url).rename(f)
+        elif not f.exists():
+            print(f'Downloading {url} to {f}')
+            torch.hub.download_url_to_file(url, f, progress=True)
+
+        if unzip and str(f).endswith('.tar.gz'):
+            print(f'Unzipping {f.name}')
+            tar = tarfile.open(f)
+            tar.extractall(path=unzip_dir)
+            if delete:
+                f.unlink()
+                print(f'Delete {f}')
+
+    # process annotations
+    full_url = []
+    for _url in url:
+        if 'zhiyuan_objv2_train.tar.gz' in _url or \
+                'zhiyuan_objv2_val.json' in _url:
+            full_url.append(_url)
+        elif 'train' in _url:
+            for i in range(51):
+                full_url.append(f'{_url}patch{i}.tar.gz')
+        elif 'val/images/v1' in _url:
+            for i in range(16):
+                full_url.append(f'{_url}patch{i}.tar.gz')
+        elif 'val/images/v2' in _url:
+            for i in range(16, 44):
+                full_url.append(f'{_url}patch{i}.tar.gz')
+        else:
+            raise NotImplementedError
+
+    dir = Path(dir)
+    if threads > 1:
+        pool = ThreadPool(threads)
+        pool.imap(lambda x: download_single(*x), zip(full_url, repeat(dir)))
+        pool.close()
+        pool.join()
+    else:
+        for u in full_url:
+            download_single(u, dir)
+
+
+def main():
+    args = parse_args()
+    path = Path(args.save_dir)
+    if not path.exists():
+        path.mkdir(parents=True, exist_ok=True)
+    data2url = dict(
+        # TODO: Support for downloading Panoptic Segmentation of COCO
+        coco2017=[
+            'http://images.cocodataset.org/zips/train2017.zip',
+            'http://images.cocodataset.org/zips/val2017.zip',
+            'http://images.cocodataset.org/zips/test2017.zip',
+            'http://images.cocodataset.org/annotations/' +
+            'annotations_trainval2017.zip'
+        ],
+        lvis=[
+            'https://s3-us-west-2.amazonaws.com/dl.fbaipublicfiles.com/LVIS/lvis_v1_train.json.zip',  # noqa
+            'https://s3-us-west-2.amazonaws.com/dl.fbaipublicfiles.com/LVIS/lvis_v1_train.json.zip',  # noqa
+        ],
+        voc2007=[
+            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar',  # noqa
+            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar',  # noqa
+            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCdevkit_08-Jun-2007.tar',  # noqa
+        ],
+        # Note: There is no download link for Objects365-V1 right now. If you
+        # would like to download Objects365-V1, please visit
+        # http://www.objects365.org/ to concat the author.
+        objects365v2=[
+            # training annotations
+            'https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/train/zhiyuan_objv2_train.tar.gz',  # noqa
+            # validation annotations
+            'https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/val/zhiyuan_objv2_val.json',  # noqa
+            # training url root
+            'https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/train/',  # noqa
+            # validation url root_1
+            'https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/val/images/v1/',  # noqa
+            # validation url root_2
+            'https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/val/images/v2/'  # noqa
+        ])
+    url = data2url.get(args.dataset_name, None)
+    if url is None:
+        print('Only support COCO, VOC, LVIS, and Objects365v2 now!')
+        return
+    if args.dataset_name == 'objects365v2':
+        download_objects365v2(
+            url,
+            dir=path,
+            unzip=args.unzip,
+            delete=args.delete,
+            threads=args.threads)
+    else:
+        download(
+            url,
+            dir=path,
+            unzip=args.unzip,
+            delete=args.delete,
+            threads=args.threads)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/misc/gen_coco_panoptic_test_info.py b/tools/misc/gen_coco_panoptic_test_info.py
new file mode 100755
index 0000000..5ad315d
--- /dev/null
+++ b/tools/misc/gen_coco_panoptic_test_info.py
@@ -0,0 +1,35 @@
+import argparse
+import os.path as osp
+
+import mmcv
+
+
+def parse_args():
+    """Parse the command line: a single positional COCO annotation dir."""
+    arg_parser = argparse.ArgumentParser(
+        description='Generate COCO test image information '
+        'for COCO panoptic segmentation.')
+    arg_parser.add_argument(
+        'data_root', help='Path to COCO annotation directory.')
+    return arg_parser.parse_args()
+
+
+def main():
+    """Write ``panoptic_image_info_test-dev2017.json`` into ``data_root``.
+
+    The output is ``image_info_test-dev2017.json`` with its ``categories``
+    entry replaced by the one from ``panoptic_val2017.json``, which has
+    the attribute ``isthing``.
+    """
+    root = parse_args().data_root
+    panoptic_val = mmcv.load(osp.join(root, 'panoptic_val2017.json'))
+    test_info = mmcv.load(osp.join(root, 'image_info_test-dev2017.json'))
+
+    # Overwrite the categories in place, then dump the merged info.
+    test_info['categories'] = panoptic_val['categories']
+    out_path = osp.join(root, 'panoptic_image_info_test-dev2017.json')
+    mmcv.dump(test_info, out_path)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/misc/get_image_metas.py b/tools/misc/get_image_metas.py
new file mode 100755
index 0000000..4a5570f
--- /dev/null
+++ b/tools/misc/get_image_metas.py
@@ -0,0 +1,155 @@
+
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Get test image metas on a specific dataset.
+
+Here is an example to run this script.
+
+Example:
+    python tools/misc/get_image_metas.py ${CONFIG} \
+    --out ${OUTPUT FILE NAME}
+"""
+import argparse
+import csv
+import os.path as osp
+from multiprocessing import Pool
+
+import mmcv
+from mmcv import Config
+
+
+def parse_args():
+    """Parse the config path and the ``--out`` / ``--nproc`` options."""
+    parser = argparse.ArgumentParser(description='Collect image metas')
+    parser.add_argument('config', help='Config file path')
+    parser.add_argument(
+        '--out',
+        default='validation-image-metas.pkl',
+        help='The output image metas file name. The save dir is in the '
+        'same directory as `dataset.ann_file` path')
+    parser.add_argument(
+        '--nproc',
+        default=4,
+        type=int,
+        help='Processes used for get image metas')
+    args = parser.parse_args()
+    return args
+
+
+def get_metas_from_csv_style_ann_file(ann_file):
+    """Collect one ``dict(filename=...)`` per image from a CSV annotation.
+
+    The first row is skipped as a header. The first column is used as the
+    image id and ``.jpg`` is appended to form the file name. Consecutive
+    rows with the same file name yield a single entry.
+
+    Args:
+        ann_file (str): Path to the CSV annotation file.
+
+    Returns:
+        list[dict]: Entries of the form ``{'filename': '<img_id>.jpg'}``.
+    """
+    data_infos = []
+    cp_filename = None
+    with open(ann_file, 'r') as f:
+        reader = csv.reader(f)
+        for i, line in enumerate(reader):
+            if i == 0:
+                continue
+            img_id = line[0]
+            filename = f'{img_id}.jpg'
+            if filename != cp_filename:
+                data_infos.append(dict(filename=filename))
+                cp_filename = filename
+    return data_infos
+
+
+def get_metas_from_txt_style_ann_file(ann_file):
+    """Collect one ``dict(filename=...)`` per record from a TXT annotation.
+
+    Each record starts with a file name line; the line two below it holds
+    an integer ``n`` and the record spans ``n + 3`` lines in total.
+
+    Args:
+        ann_file (str): Path to the TXT annotation file.
+
+    Returns:
+        list[dict]: Entries of the form ``{'filename': <first line>}``.
+    """
+    with open(ann_file) as f:
+        lines = f.readlines()
+    i = 0
+    data_infos = []
+    while i < len(lines):
+        filename = lines[i].rstrip()
+        data_infos.append(dict(filename=filename))
+        skip_lines = int(lines[i + 2]) + 3
+        i += skip_lines
+    return data_infos
+
+
+def get_image_metas(data_info, img_prefix):
+    """Read one image from disk and return its file name and shape.
+
+    Args:
+        data_info (dict): Must contain the key ``filename``.
+        img_prefix (str | None): Directory joined in front of the file
+            name when not None.
+
+    Returns:
+        dict: ``filename`` (the joined path) and ``ori_shape`` (the shape
+            of the image decoded with ``flag='color'``).
+
+    Raises:
+        NotImplementedError: If ``data_info`` has no ``filename``.
+    """
+    file_client = mmcv.FileClient(backend='disk')
+    filename = data_info.get('filename', None)
+    if filename is None:
+        raise NotImplementedError('Missing `filename` in data_info')
+    if img_prefix is not None:
+        filename = osp.join(img_prefix, filename)
+    img_bytes = file_client.get(filename)
+    img = mmcv.imfrombytes(img_bytes, flag='color')
+    return dict(filename=filename, ori_shape=img.shape)
+
+
+def main():
+    """Collect metas of all test images and dump them beside ann_file."""
+    args = parse_args()
+    assert args.out.endswith('pkl'), 'The output file name must be pkl suffix'
+
+    # load config files
+    cfg = Config.fromfile(args.config)
+    ann_file = cfg.data.test.ann_file
+    img_prefix = cfg.data.test.img_prefix
+
+    print(f'{"-" * 5} Start Processing {"-" * 5}')
+    if ann_file.endswith('csv'):
+        data_infos = get_metas_from_csv_style_ann_file(ann_file)
+    elif ann_file.endswith('txt'):
+        data_infos = get_metas_from_txt_style_ann_file(ann_file)
+    else:
+        suffix = ann_file.split('.')[-1]
+        raise NotImplementedError('File name must be csv or txt suffix but '
+                                  f'get {suffix}')
+
+    print(f'Successfully load annotation file from {ann_file}')
+    print(f'Processing {len(data_infos)} images...')
+    # get image metas with multiple processes; the context manager makes
+    # sure the workers are shut down even if a worker raises
+    with Pool(args.nproc) as pool:
+        image_metas = pool.starmap(
+            get_image_metas,
+            [(data_info, img_prefix) for data_info in data_infos])
+
+    # save image metas next to the annotation file; osp.dirname also works
+    # when ann_file has no directory part (rsplit('/') would return the
+    # file name itself in that case)
+    root_path = osp.dirname(ann_file)
+    save_path = osp.join(root_path, args.out)
+    mmcv.dump(image_metas, save_path)
+    print(f'Image meta file save to: {save_path}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/misc/print_config.py b/tools/misc/print_config.py
new file mode 100755
index 0000000..f10f538
--- /dev/null
+++ b/tools/misc/print_config.py
@@ -0,0 +1,67 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import warnings
+
+from mmcv import Config, DictAction
+
+from mmdet.utils import replace_cfg_vals, update_data_root
+
+
+def parse_args():
+    """Parse CLI arguments and fold ``--options`` into ``--cfg-options``.
+
+    Raises:
+        ValueError: If both ``--options`` and ``--cfg-options`` are given.
+    """
+    arg_parser = argparse.ArgumentParser(
+        description='Print the whole config')
+    arg_parser.add_argument('config', help='config file path')
+    arg_parser.add_argument(
+        '--options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file (deprecate), '
+        'change to --cfg-options instead.')
+    arg_parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parsed = arg_parser.parse_args()
+
+    both_given = bool(parsed.options) and bool(parsed.cfg_options)
+    if both_given:
+        raise ValueError(
+            '--options and --cfg-options cannot be both '
+            'specified, --options is deprecated in favor of --cfg-options')
+    if parsed.options:
+        warnings.warn('--options is deprecated in favor of --cfg-options')
+        parsed.cfg_options = parsed.options
+
+    return parsed
+
+
+def main():
+    """Load the config, resolve placeholders and data root, then print it."""
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+    # replace the ${key} with the value of cfg.key
+    cfg = replace_cfg_vals(cfg)
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
+    overrides = args.cfg_options
+    if overrides is not None:
+        cfg.merge_from_dict(overrides)
+    print(f'Config:\n{cfg.pretty_text}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/misc/split_coco.py b/tools/misc/split_coco.py
new file mode 100755
index 0000000..78cc655
--- /dev/null
+++ b/tools/misc/split_coco.py
@@ -0,0 +1,123 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+
+import mmcv
+import numpy as np
+
+prog_description = '''K-Fold coco split.
+
+To split coco data for semi-supervised object detection:
+    python tools/misc/split_coco.py
+'''
+
+
+def parse_args():
+    """Parse the data root, output dir, labeled percents and fold count."""
+    # Show ``prog_description`` in ``--help``; the raw formatter keeps its
+    # line breaks and the indented example command intact.
+    parser = argparse.ArgumentParser(
+        description=prog_description,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument(
+        '--data-root',
+        type=str,
+        help='The data root of coco dataset.',
+        default='./data/coco/')
+    parser.add_argument(
+        '--out-dir',
+        type=str,
+        help='The output directory of coco semi-supervised annotations.',
+        default='./data/coco_semi_annos/')
+    parser.add_argument(
+        '--labeled-percent',
+        type=float,
+        nargs='+',
+        help='The percentage of labeled data in the training set.',
+        default=[1, 2, 5, 10])
+    parser.add_argument(
+        '--fold',
+        type=int,
+        help='K-fold cross validation for semi-supervised object detection.',
+        default=5)
+    args = parser.parse_args()
+    return args
+
+
+def split_coco(data_root, out_dir, percent, fold):
+    """Split COCO data for Semi-supervised object detection.
+
+    Two files are written to ``out_dir``:
+    ``instances_train2017.{fold}@{percent}.json`` (labeled part) and
+    ``instances_train2017.{fold}@{percent}-unlabeled.json`` (the rest).
+
+    Args:
+        data_root (str): The data root of coco dataset.
+        out_dir (str): The output directory of coco semi-supervised
+            annotations.
+        percent (float): The percentage of labeled data in the training set.
+        fold (int): The fold of dataset and set as random seed for data split.
+    """
+
+    def save_anns(name, images, annotations):
+        # every subset shares the licenses/categories/info of the full file
+        sub_anns = dict()
+        sub_anns['images'] = images
+        sub_anns['annotations'] = annotations
+        sub_anns['licenses'] = anns['licenses']
+        sub_anns['categories'] = anns['categories']
+        sub_anns['info'] = anns['info']
+
+        mmcv.mkdir_or_exist(out_dir)
+        mmcv.dump(sub_anns, osp.join(out_dir, f'{name}.json'))
+
+    # set random seed with the fold
+    np.random.seed(fold)
+    ann_file = osp.join(data_root, 'annotations/instances_train2017.json')
+    anns = mmcv.load(ann_file)
+
+    image_list = anns['images']
+    labeled_total = int(percent / 100. * len(image_list))
+    # NOTE(review): np.random.choice samples with replacement by default, so
+    # the set may hold fewer than ``labeled_total`` indices. Left unchanged
+    # because changing the sampling would change the generated splits.
+    labeled_inds = set(
+        np.random.choice(range(len(image_list)), size=labeled_total))
+    labeled_ids, labeled_images, unlabeled_images = [], [], []
+
+    for i, image in enumerate(image_list):
+        if i in labeled_inds:
+            labeled_images.append(image)
+            labeled_ids.append(image['id'])
+        else:
+            unlabeled_images.append(image)
+
+    # get all annotations of labeled images
+    labeled_ids = set(labeled_ids)
+    labeled_annotations, unlabeled_annotations = [], []
+
+    for ann in anns['annotations']:
+        if ann['image_id'] in labeled_ids:
+            labeled_annotations.append(ann)
+        else:
+            unlabeled_annotations.append(ann)
+
+    # save labeled and unlabeled
+    labeled_name = f'instances_train2017.{fold}@{percent}'
+    unlabeled_name = f'instances_train2017.{fold}@{percent}-unlabeled'
+
+    save_anns(labeled_name, labeled_images, labeled_annotations)
+    save_anns(unlabeled_name, unlabeled_images, unlabeled_annotations)
+
+
+def multi_wrapper(args):
+    """Unpack an argument tuple for ``split_coco`` (parallel-map helper)."""
+    return split_coco(*args)
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    arguments_list = [(args.data_root, args.out_dir, p, f)
+                      for f in range(1, args.fold + 1)
+                      for p in args.labeled_percent]
+    mmcv.track_parallel_progress(multi_wrapper, arguments_list, args.fold)
diff --git a/tools/model_converters/detectron2pytorch.py b/tools/model_converters/detectron2pytorch.py
new file mode 100755
index 0000000..b7264d5
--- /dev/null
+++ b/tools/model_converters/detectron2pytorch.py
@@ -0,0 +1,96 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+from collections import OrderedDict
+
+import mmcv
+import torch
+
+arch_settings = {50: (3, 4, 6, 3), 101: (3, 4, 23, 3)}
+
+
+def convert_bn(blobs, state_dict, caffe_name, torch_name, converted_names):
+    """Convert an affine-channel layer into BatchNorm parameters.
+
+    Detectron replaces bn with an affine channel layer, so only the scale
+    (``_s``) and bias (``_b``) blobs are read; running mean/var are filled
+    with zeros/ones of the same size as the weight.
+    """
+    bias_blob = caffe_name + '_b'
+    scale_blob = caffe_name + '_s'
+    state_dict[torch_name + '.bias'] = torch.from_numpy(blobs[bias_blob])
+    weight = torch.from_numpy(blobs[scale_blob])
+    state_dict[torch_name + '.weight'] = weight
+    state_dict[torch_name + '.running_mean'] = torch.zeros(weight.size())
+    state_dict[torch_name + '.running_var'] = torch.ones(weight.size())
+    converted_names.update((bias_blob, scale_blob))
+
+
+def convert_conv_fc(blobs, state_dict, caffe_name, torch_name,
+                    converted_names):
+    """Copy the ``_w`` blob (and ``_b`` if present) of a conv/fc layer."""
+    weight_blob = caffe_name + '_w'
+    bias_blob = caffe_name + '_b'
+    state_dict[torch_name + '.weight'] = torch.from_numpy(blobs[weight_blob])
+    converted_names.add(weight_blob)
+    if bias_blob in blobs:
+        state_dict[torch_name + '.bias'] = torch.from_numpy(blobs[bias_blob])
+        converted_names.add(bias_blob)
+
+
+def convert(src, dst, depth):
+    """Convert keys in detectron pretrained ResNet models to pytorch style."""
+    # load arch_settings
+    if depth not in arch_settings:
+        raise ValueError('Only support ResNet-50 and ResNet-101 currently')
+    block_nums = arch_settings[depth]
+    # load caffe model
+    caffe_model = mmcv.load(src, encoding='latin1')
+    if 'blobs' in caffe_model:
+        blobs = caffe_model['blobs']
+    else:
+        blobs = caffe_model
+    # convert to pytorch style
+    state_dict = OrderedDict()
+    converted_names = set()
+    convert_conv_fc(blobs, state_dict, 'conv1', 'conv1', converted_names)
+    convert_bn(blobs, state_dict, 'res_conv1_bn', 'bn1', converted_names)
+    for layer_idx, num_blocks in enumerate(block_nums, start=1):
+        for block_idx in range(num_blocks):
+            caffe_prefix = f'res{layer_idx + 1}_{block_idx}'
+            torch_prefix = f'layer{layer_idx}.{block_idx}'
+            if block_idx == 0:
+                # branch1 (downsample) is converted for the first block only
+                convert_conv_fc(blobs, state_dict, f'{caffe_prefix}_branch1',
+                                f'{torch_prefix}.downsample.0',
+                                converted_names)
+                convert_bn(blobs, state_dict, f'{caffe_prefix}_branch1_bn',
+                           f'{torch_prefix}.downsample.1', converted_names)
+            for conv_idx, letter in enumerate('abc', start=1):
+                branch = f'{caffe_prefix}_branch2{letter}'
+                convert_conv_fc(blobs, state_dict, branch,
+                                f'{torch_prefix}.conv{conv_idx}',
+                                converted_names)
+                convert_bn(blobs, state_dict, f'{branch}_bn',
+                           f'{torch_prefix}.bn{conv_idx}', converted_names)
+    # check if all layers are converted
+    for key in blobs:
+        if key not in converted_names:
+            print(f'Not Convert: {key}')
+    # save checkpoint
+    checkpoint = dict()
+    checkpoint['state_dict'] = state_dict
+    torch.save(checkpoint, dst)
+
+
+def main():
+    """Command line entry: ``src dst depth``."""
+    parser = argparse.ArgumentParser(description='Convert model keys')
+    parser.add_argument('src', help='src detectron model path')
+    parser.add_argument('dst', help='save path')
+    parser.add_argument('depth', type=int, help='ResNet model depth')
+    args = parser.parse_args()
+    convert(args.src, args.dst, args.depth)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/model_converters/publish_model.py b/tools/model_converters/publish_model.py
new file mode 100755
index 0000000..219fcdf
--- /dev/null
+++ b/tools/model_converters/publish_model.py
@@ -0,0 +1,72 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import hashlib
+import re
+import shutil
+
+import torch
+
+
+def parse_args():
+    """Parse the input and output checkpoint file names."""
+    parser = argparse.ArgumentParser(
+        description='Process a checkpoint to be published')
+    parser.add_argument('in_file', help='input checkpoint filename')
+    parser.add_argument('out_file', help='output checkpoint filename')
+    args = parser.parse_args()
+    return args
+
+
+def _version_tuple(version):
+    """Return the leading ``(major, minor)`` of a version string as ints."""
+    return tuple(int(part) for part in re.findall(r'\d+', version)[:2])
+
+
+def _sha256_hex(path, chunk_size=1 << 20):
+    """Return the hex SHA-256 digest of the file at ``path``."""
+    digest = hashlib.sha256()
+    with open(path, 'rb') as f:
+        for chunk in iter(lambda: f.read(chunk_size), b''):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def process_checkpoint(in_file, out_file):
+    """Strip a checkpoint for release and tag its name with a hash.
+
+    The optimizer state is dropped, the checkpoint is saved to ``out_file``
+    and that file is then renamed to ``<out_file stem>-<sha256[:8]>.pth``.
+
+    Args:
+        in_file (str): Input checkpoint filename.
+        out_file (str): Output checkpoint filename.
+    """
+    checkpoint = torch.load(in_file, map_location='cpu')
+    # remove optimizer for smaller file size
+    checkpoint.pop('optimizer', None)
+    # if it is necessary to remove some sensitive data in checkpoint['meta'],
+    # add the code here.
+    # Compare numerically: as plain strings '1.10' sorts before '1.6'.
+    if _version_tuple(torch.__version__) >= (1, 6):
+        torch.save(checkpoint, out_file, _use_new_zipfile_serialization=False)
+    else:
+        torch.save(checkpoint, out_file)
+    # Hash and rename in-process instead of spawning `sha256sum` / `mv`, so
+    # the tool does not depend on those binaries and failures are raised.
+    sha = _sha256_hex(out_file)
+    if out_file.endswith('.pth'):
+        out_file_name = out_file[:-4]
+    else:
+        out_file_name = out_file
+    final_file = out_file_name + f'-{sha[:8]}.pth'
+    shutil.move(out_file, final_file)
+
+
+def main():
+    """Command line entry point."""
+    args = parse_args()
+    process_checkpoint(args.in_file, args.out_file)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/model_converters/regnet2mmdet.py b/tools/model_converters/regnet2mmdet.py
new file mode 100755
index 0000000..fbf8c8f
--- /dev/null
+++ b/tools/model_converters/regnet2mmdet.py
@@ -0,0 +1,104 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+from collections import OrderedDict
+
+import torch
+
+# pycls sub-module name under ``f`` -> mmdet module name inside a block
+F_MODULE_NAMES = {
+    'a_bn': 'bn1',
+    'b_bn': 'bn2',
+    'c_bn': 'bn3',
+    'a': 'conv1',
+    'b': 'conv2',
+    'c': 'conv3',
+}
+
+
+def convert_stem(model_key, model_weight, state_dict, converted_names):
+    """Map ``stem.conv`` to ``conv1`` and ``stem.bn`` to ``bn1``."""
+    new_key = model_key.replace('stem.conv', 'conv1')
+    new_key = new_key.replace('stem.bn', 'bn1')
+    state_dict[new_key] = model_weight
+    converted_names.add(model_key)
+    print(f'Convert {model_key} to {new_key}')
+
+
+def convert_head(model_key, model_weight, state_dict, converted_names):
+    """Map ``head.fc`` to ``fc``."""
+    new_key = model_key.replace('head.fc', 'fc')
+    state_dict[new_key] = model_weight
+    converted_names.add(model_key)
+    print(f'Convert {model_key} to {new_key}')
+
+
+def convert_reslayer(model_key, model_weight, state_dict, converted_names):
+    """Convert a stage key ``s<L>.b<B>.<module>...`` to mmdet style.
+
+    ``s<L>`` becomes ``layer<L>`` and ``b<B>`` becomes the zero-based block
+    index ``B - 1``. In the first block, ``proj``/``bn`` map to
+    ``downsample.0``/``downsample.1``; sub-modules of ``f`` are renamed
+    through ``F_MODULE_NAMES``.
+
+    Raises:
+        ValueError: If the key does not match any supported pattern.
+    """
+    split_keys = model_key.split('.')
+    layer, block, module = split_keys[:3]
+    block_id = int(block[1:])
+    layer_name = f'layer{int(layer[1:])}'
+    block_name = f'{block_id - 1}'
+
+    if block_id == 1 and module == 'bn':
+        new_key = f'{layer_name}.{block_name}.downsample.1.{split_keys[-1]}'
+    elif block_id == 1 and module == 'proj':
+        new_key = f'{layer_name}.{block_name}.downsample.0.{split_keys[-1]}'
+    elif module == 'f' and split_keys[3] in F_MODULE_NAMES:
+        module_name = F_MODULE_NAMES[split_keys[3]]
+        new_key = f'{layer_name}.{block_name}.{module_name}.{split_keys[-1]}'
+    else:
+        # Also reached for unknown sub-modules of ``f``, which previously
+        # crashed with an unbound ``module_name``.
+        raise ValueError(f'Unsupported conversion of key {model_key}')
+    print(f'Convert {model_key} to {new_key}')
+    state_dict[new_key] = model_weight
+    converted_names.add(model_key)
+
+
+def convert(src, dst):
+    """Convert keys in pycls pretrained RegNet models to mmdet style."""
+    # load the pycls checkpoint on CPU so that no GPU is required
+    regnet_model = torch.load(src, map_location='cpu')
+    blobs = regnet_model['model_state']
+    # convert to pytorch style
+    state_dict = OrderedDict()
+    converted_names = set()
+    for key, weight in blobs.items():
+        if 'stem' in key:
+            convert_stem(key, weight, state_dict, converted_names)
+        elif 'head' in key:
+            convert_head(key, weight, state_dict, converted_names)
+        elif key.startswith('s'):
+            convert_reslayer(key, weight, state_dict, converted_names)
+
+    # check if all layers are converted
+    for key in blobs:
+        if key not in converted_names:
+            print(f'not converted: {key}')
+    # save checkpoint
+    checkpoint = dict()
+    checkpoint['state_dict'] = state_dict
+    torch.save(checkpoint, dst)
+
+
+def main():
+    """Command line entry: ``src dst``."""
+    parser = argparse.ArgumentParser(description='Convert model keys')
+    parser.add_argument('src', help='src detectron model path')
+    parser.add_argument('dst', help='save path')
+    args = parser.parse_args()
+    convert(args.src, args.dst)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/model_converters/selfsup2mmdet.py b/tools/model_converters/selfsup2mmdet.py
new file mode 100755
index 0000000..bc8cce1
--- /dev/null
+++ b/tools/model_converters/selfsup2mmdet.py
@@ -0,0 +1,50 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+from collections import OrderedDict
+
+import torch
+
+
+def moco_convert(src, dst):
+    """Convert keys in pycls pretrained moco models to mmdet style.
+
+    Only keys starting with ``module.encoder_q.`` are kept, with that
+    substring removed; the result is saved to ``dst`` under ``state_dict``.
+    """
+    # load the MoCo checkpoint on CPU so that no GPU is required
+    moco_model = torch.load(src, map_location='cpu')
+    blobs = moco_model['state_dict']
+    # convert to pytorch style
+    state_dict = OrderedDict()
+    for k, v in blobs.items():
+        if not k.startswith('module.encoder_q.'):
+            continue
+        old_k = k
+        k = k.replace('module.encoder_q.', '')
+        state_dict[k] = v
+        print(old_k, '->', k)
+    # save checkpoint
+    checkpoint = dict()
+    checkpoint['state_dict'] = state_dict
+    torch.save(checkpoint, dst)
+
+
+def main():
+    """Command line entry: ``src dst --selfsup {moco,swav}``."""
+    parser = argparse.ArgumentParser(description='Convert model keys')
+    parser.add_argument('src', help='src detectron model path')
+    parser.add_argument('dst', help='save path')
+    parser.add_argument(
+        '--selfsup',
+        type=str,
+        choices=['moco', 'swav'],
+        help='self-supervised method the source checkpoint was trained with')
+    args = parser.parse_args()
+    if args.selfsup == 'moco':
+        moco_convert(args.src, args.dst)
+    elif args.selfsup == 'swav':
+        print('SWAV does not need to convert the keys')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/model_converters/upgrade_model_version.py b/tools/model_converters/upgrade_model_version.py
new file mode 100755
index 0000000..36ee607
--- /dev/null
+++ b/tools/model_converters/upgrade_model_version.py
@@ -0,0 +1,248 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+import re
+import tempfile
+from collections import OrderedDict
+
+import torch
+from mmcv import Config
+
+
+def is_head(key):
+    """Return True if ``key`` starts with one of the known head names."""
+    valid_head_list = [
+        'bbox_head', 'mask_head', 'semantic_head', 'grid_head', 'mask_iou_head'
+    ]
+
+    return any(key.startswith(h) for h in valid_head_list)
+
+
+def _version_tuple(version, length=3):
+    """Parse the leading dotted numbers of ``version`` into an int tuple.
+
+    Anything after the leading ``N.N...`` part (e.g. ``rc0`` or ``+hash``)
+    is ignored and the result is zero-padded to ``length`` items, e.g.
+    ``'2.10.0' -> (2, 10, 0)`` and ``'1.0rc0' -> (1, 0, 0)``.
+    """
+    matched = re.match(r'\d+(?:\.\d+)*', version)
+    parts = [int(p) for p in matched.group().split('.')] if matched else []
+    parts += [0] * (length - len(parts))
+    return tuple(parts[:length])
+
+
+def parse_config(config_strings):
+    """Inspect the config text stored in a checkpoint.
+
+    Returns:
+        tuple: ``(is_two_stage, is_ssd, is_retina, reg_cls_agnostic)``.
+    """
+    # Write the config to a ``.py`` file for Config.fromfile inside a
+    # temporary directory, so that the file is removed again afterwards.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        config_path = osp.join(tmp_dir, 'config.py')
+        with open(config_path, 'w') as f:
+            f.write(config_strings)
+        config = Config.fromfile(config_path)
+
+    is_two_stage = True
+    is_ssd = False
+    is_retina = False
+    reg_cls_agnostic = False
+    if 'rpn_head' not in config.model:
+        is_two_stage = False
+        # check whether it is SSD
+        if config.model.bbox_head.type == 'SSDHead':
+            is_ssd = True
+        elif config.model.bbox_head.type == 'RetinaHead':
+            is_retina = True
+    elif isinstance(config.model['bbox_head'], list):
+        reg_cls_agnostic = True
+    elif 'reg_class_agnostic' in config.model.bbox_head:
+        reg_cls_agnostic = config.model.bbox_head \
+            .reg_class_agnostic
+    return is_two_stage, is_ssd, is_retina, reg_cls_agnostic
+
+
+def reorder_cls_channel(val, num_classes=81):
+    """Move the first class channel to the end.
+
+    Weights whose leading dimension is neither ``num_classes`` nor a
+    multiple of it are returned unchanged; 1-d tensors are always rotated.
+    """
+    # bias
+    if val.dim() == 1:
+        new_val = torch.cat((val[1:], val[:1]), dim=0)
+    # weight
+    else:
+        out_channels, in_channels = val.shape[:2]
+        # conv_cls for softmax output
+        if out_channels != num_classes and out_channels % num_classes == 0:
+            new_val = val.reshape(-1, num_classes, in_channels, *val.shape[2:])
+            new_val = torch.cat((new_val[:, 1:], new_val[:, :1]), dim=1)
+            new_val = new_val.reshape(val.size())
+        # fc_cls
+        elif out_channels == num_classes:
+            new_val = torch.cat((val[1:], val[:1]), dim=0)
+        # agnostic | retina_cls | rpn_cls
+        else:
+            new_val = val
+
+    return new_val
+
+
+def truncate_cls_channel(val, num_classes=81):
+    """Drop a class channel when the size is a multiple of ``num_classes``.
+
+    A 1-d tensor keeps its first ``num_classes - 1`` entries; a weight
+    drops the first of its class channels. Others are returned unchanged.
+    """
+    # bias
+    if val.dim() == 1:
+        if val.size(0) % num_classes == 0:
+            new_val = val[:num_classes - 1]
+        else:
+            new_val = val
+    # weight
+    else:
+        out_channels, in_channels = val.shape[:2]
+        # conv_logits
+        if out_channels % num_classes == 0:
+            new_val = val.reshape(num_classes, in_channels, *val.shape[2:])[1:]
+            new_val = new_val.reshape(-1, *val.shape[1:])
+        # agnostic
+        else:
+            new_val = val
+
+    return new_val
+
+
+def truncate_reg_channel(val, num_classes=81):
+    """Drop the regression outputs of one class when they are per-class.
+
+    Applies when the leading dimension is a multiple of ``num_classes``: a
+    1-d tensor loses its last per-class group, a weight its first one.
+    Other (class-agnostic) tensors are returned unchanged.
+    """
+    # bias
+    if val.dim() == 1:
+        # fc_reg | rpn_reg
+        if val.size(0) % num_classes == 0:
+            new_val = val.reshape(num_classes, -1)[:num_classes - 1]
+            new_val = new_val.reshape(-1)
+        # agnostic
+        else:
+            new_val = val
+    # weight
+    else:
+        out_channels, in_channels = val.shape[:2]
+        # fc_reg | rpn_reg
+        if out_channels % num_classes == 0:
+            new_val = val.reshape(num_classes, -1, in_channels,
+                                  *val.shape[2:])[1:]
+            new_val = new_val.reshape(-1, *val.shape[1:])
+        # agnostic
+        else:
+            new_val = val
+
+    return new_val
+
+
+def convert(in_file, out_file, num_classes):
+    """Convert keys in checkpoints.
+
+    There can be some breaking changes during the development of mmdetection,
+    and this tool is used for upgrading checkpoints trained with old versions
+    to the latest one.
+
+    Args:
+        in_file (str): Input checkpoint file.
+        out_file (str): Output checkpoint file.
+        num_classes (int): Number of classes of the original model.
+    """
+    checkpoint = torch.load(in_file)
+    in_state_dict = checkpoint.pop('state_dict')
+    out_state_dict = OrderedDict()
+    meta_info = checkpoint['meta']
+    is_two_stage, is_ssd, is_retina, reg_cls_agnostic = parse_config(
+        '#' + meta_info['config'])
+    # Compare versions numerically: as plain strings '2.10.0' < '2.5.0'.
+    mmdet_version = _version_tuple(meta_info['mmdet_version'])
+    upgrade_retina = mmdet_version <= (0, 5, 3) and is_retina
+
+    # MMDetection v2.5.0 unifies the class order in RPN
+    # if the model is trained in version<v2.5.0
+    # The RPN model should be upgraded to be used in version>=2.5.0
+    upgrade_rpn = mmdet_version < (2, 5, 0)
+
+    for key, val in in_state_dict.items():
+        new_key = key
+        new_val = val
+        if is_two_stage and is_head(key):
+            new_key = 'roi_head.{}'.format(key)
+
+        # classification
+        if upgrade_rpn:
+            m = re.search(
+                r'(conv_cls|retina_cls|rpn_cls|fc_cls|fcos_cls|'
+                r'fovea_cls).(weight|bias)', new_key)
+        else:
+            m = re.search(
+                r'(conv_cls|retina_cls|fc_cls|fcos_cls|'
+                r'fovea_cls).(weight|bias)', new_key)
+        if m is not None:
+            print(f'reorder cls channels of {new_key}')
+            new_val = reorder_cls_channel(val, num_classes)
+
+        # regression
+        if upgrade_rpn:
+            m = re.search(r'(fc_reg).(weight|bias)', new_key)
+        else:
+            m = re.search(r'(fc_reg|rpn_reg).(weight|bias)', new_key)
+        if m is not None and not reg_cls_agnostic:
+            print(f'truncate regression channels of {new_key}')
+            new_val = truncate_reg_channel(val, num_classes)
+
+        # mask head
+        m = re.search(r'(conv_logits).(weight|bias)', new_key)
+        if m is not None:
+            print(f'truncate mask prediction channels of {new_key}')
+            new_val = truncate_cls_channel(val, num_classes)
+
+        m = re.search(r'(cls_convs|reg_convs).\d.(weight|bias)', key)
+        # Legacy issues in RetinaNet since V1.x
+        # Use ConvModule instead of nn.Conv2d in RetinaNet
+        # cls_convs.0.weight -> cls_convs.0.conv.weight
+        if m is not None and upgrade_retina:
+            param = m.groups()[1]
+            new_key = key.replace(param, f'conv.{param}')
+            out_state_dict[new_key] = val
+            print(f'rename the name of {key} to {new_key}')
+            continue
+
+        m = re.search(r'(cls_convs).\d.(weight|bias)', key)
+        if m is not None and is_ssd:
+            print(f'reorder cls channels of {new_key}')
+            new_val = reorder_cls_channel(val, num_classes)
+
+        out_state_dict[new_key] = new_val
+    checkpoint['state_dict'] = out_state_dict
+    torch.save(checkpoint, out_file)
+
+
+def main():
+    """Command line entry: ``in_file out_file [--num-classes N]``."""
+    parser = argparse.ArgumentParser(description='Upgrade model version')
+    parser.add_argument('in_file', help='input checkpoint file')
+    parser.add_argument('out_file', help='output checkpoint file')
+    parser.add_argument(
+        '--num-classes',
+        type=int,
+        default=81,
+        help='number of classes of the original model')
+    args = parser.parse_args()
+    convert(args.in_file, args.out_file, args.num_classes)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/model_converters/upgrade_ssd_version.py b/tools/model_converters/upgrade_ssd_version.py
new file mode 100755
index 0000000..befff45
--- /dev/null
+++ b/tools/model_converters/upgrade_ssd_version.py
@@ -0,0 +1,80 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+import re
+import tempfile
+from collections import OrderedDict
+
+import torch
+from mmcv import Config
+
+
+def _version_tuple(version):
+    """Return the leading ``(major, minor)`` of a version string as ints."""
+    return tuple(int(part) for part in re.findall(r'\d+', version)[:2])
+
+
+def parse_config(config_strings):
+    """Check that the config text stored in a checkpoint describes SSD.
+
+    Raises:
+        AssertionError: If ``model.bbox_head.type`` is not ``'SSDHead'``.
+    """
+    # Write the config to a ``.py`` file for Config.fromfile inside a
+    # temporary directory, so that the file is removed again afterwards.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        config_path = osp.join(tmp_dir, 'config.py')
+        with open(config_path, 'w') as f:
+            f.write(config_strings)
+        config = Config.fromfile(config_path)
+
+    # check whether it is SSD
+    if config.model.bbox_head.type != 'SSDHead':
+        raise AssertionError('This is not a SSD model.')
+
+
+def convert(in_file, out_file):
+    """Rename the keys of an old SSD checkpoint and save it to ``out_file``.
+
+    Keys containing ``extra`` move to ``neck.extra_layers.*``, keys
+    containing ``l2_norm`` become ``neck.l2_norm.weight`` and keys
+    containing ``bbox_head`` get ``.0`` inserted at offset 21.
+    """
+    checkpoint = torch.load(in_file)
+    in_state_dict = checkpoint.pop('state_dict')
+    out_state_dict = OrderedDict()
+    meta_info = checkpoint['meta']
+    parse_config('#' + meta_info['config'])
+    for key, value in in_state_dict.items():
+        if 'extra' in key:
+            layer_idx = int(key.split('.')[2])
+            new_key = 'neck.extra_layers.{}.{}.conv.'.format(
+                layer_idx // 2, layer_idx % 2) + key.split('.')[-1]
+        elif 'l2_norm' in key:
+            new_key = 'neck.l2_norm.weight'
+        elif 'bbox_head' in key:
+            new_key = key[:21] + '.0' + key[21:]
+        else:
+            new_key = key
+        out_state_dict[new_key] = value
+    checkpoint['state_dict'] = out_state_dict
+
+    # Compare numerically: as plain strings '1.10' sorts before '1.6'.
+    if _version_tuple(torch.__version__) >= (1, 6):
+        torch.save(checkpoint, out_file, _use_new_zipfile_serialization=False)
+    else:
+        torch.save(checkpoint, out_file)
+
+
+def main():
+    """Command line entry: ``in_file out_file``."""
+    parser = argparse.ArgumentParser(description='Upgrade SSD version')
+    parser.add_argument('in_file', help='input checkpoint file')
+    parser.add_argument('out_file', help='output checkpoint file')
+
+    args = parser.parse_args()
+    convert(args.in_file, args.out_file)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/slurm_test.sh b/tools/slurm_test.sh
new file mode 100755
index 0000000..6dd67e5
--- /dev/null
+++ b/tools/slurm_test.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# Launch tools/test.py through srun.
+# Usage: slurm_test.sh PARTITION JOB_NAME CONFIG CHECKPOINT [PY_ARGS...]
+# Env overrides: GPUS, GPUS_PER_NODE, CPUS_PER_TASK (defaults 8/8/5), SRUN_ARGS.
+
+set -x
+
+PARTITION=$1
+JOB_NAME=$2
+CONFIG=$3
+CHECKPOINT=$4
+GPUS=${GPUS:-8}
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+CPUS_PER_TASK=${CPUS_PER_TASK:-5}
+PY_ARGS=${@:5}
+SRUN_ARGS=${SRUN_ARGS:-""}
+
+# Quote $0 so that a checkout path containing spaces still resolves.
+PYTHONPATH="$(dirname "$0")/..":$PYTHONPATH \
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}
diff --git a/tools/slurm_train.sh b/tools/slurm_train.sh
new file mode 100755
index 0000000..b3feb3d
--- /dev/null
+++ b/tools/slurm_train.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# Launch tools/train.py through srun.
+# Usage: slurm_train.sh PARTITION JOB_NAME CONFIG WORK_DIR [PY_ARGS...]
+# Env overrides: GPUS, GPUS_PER_NODE, CPUS_PER_TASK (defaults 8/8/5), SRUN_ARGS.
+
+set -x
+
+PARTITION=$1
+JOB_NAME=$2
+CONFIG=$3
+WORK_DIR=$4
+GPUS=${GPUS:-8}
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+CPUS_PER_TASK=${CPUS_PER_TASK:-5}
+SRUN_ARGS=${SRUN_ARGS:-""}
+PY_ARGS=${@:5}
+
+# Quote $0 so that a checkout path containing spaces still resolves.
+PYTHONPATH="$(dirname "$0")/..":$PYTHONPATH \
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS}
diff --git a/tools/test.py b/tools/test.py
new file mode 100755
index 0000000..5051c2f
--- /dev/null
+++ b/tools/test.py
@@ -0,0 +1,286 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+import os.path as osp
+import time
+import warnings
+
+import mmcv
+import torch
+from mmcv import Config, DictAction
+from mmcv.cnn import fuse_conv_bn
+from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
+                         wrap_fp16_model)
+
+from mmdet.apis import multi_gpu_test, single_gpu_test
+from mmdet.datasets import (build_dataloader, build_dataset,
+                            replace_ImageToTensor)
+from mmdet.models import build_detector
+from mmdet.utils import (build_ddp, build_dp, compat_cfg, get_device,
+                         replace_cfg_vals, rfnext_init_model,
+                         setup_multi_processes, update_data_root)
+
+
+def parse_args():
+    """Parse the command-line arguments of the test script.
+
+    Besides parsing, this exports ``LOCAL_RANK`` (taken from
+    ``--local_rank``) when the variable is not already set, and copies the
+    deprecated ``--options`` into ``eval_options`` after emitting a warning.
+
+    Returns:
+        argparse.Namespace: The parsed arguments.
+
+    Raises:
+        ValueError: If both ``--options`` and ``--eval-options`` are given.
+    """
+    parser = argparse.ArgumentParser(
+        description='MMDet test (and eval) a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument(
+        '--work-dir',
+        help='the directory to save the file containing evaluation metrics')
+    parser.add_argument('--out', help='output result file in pickle format')
+    parser.add_argument(
+        '--fuse-conv-bn', action='store_true',
+        help='Whether to fuse conv and bn, this will slightly increase '
+        'the inference speed')
+    parser.add_argument(
+        '--gpu-ids', type=int, nargs='+',
+        help='(Deprecated, please use --gpu-id) ids of gpus to use '
+        '(only applicable to non-distributed testing)')
+    parser.add_argument(
+        '--gpu-id', type=int, default=0,
+        help='id of gpu to use '
+        '(only applicable to non-distributed testing)')
+    parser.add_argument(
+        '--format-only', action='store_true',
+        help='Format the output results without perform evaluation. It is '
+        'useful when you want to format the result to a specific format and '
+        'submit it to the test server')
+    parser.add_argument(
+        '--eval', type=str, nargs='+',
+        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
+        ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
+    parser.add_argument('--show', action='store_true', help='show results')
+    parser.add_argument(
+        '--show-dir', help='directory where painted images will be saved')
+    parser.add_argument(
+        '--show-score-thr', type=float, default=0.3,
+        help='score threshold (default: 0.3)')
+    parser.add_argument(
+        '--gpu-collect', action='store_true',
+        help='whether to use gpu to collect results.')
+    parser.add_argument(
+        '--tmpdir',
+        help='tmp directory used for collecting results from multiple '
+        'workers, available when gpu-collect is not specified')
+    parser.add_argument(
+        '--cfg-options', nargs='+', action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--options', nargs='+', action=DictAction,
+        help='custom options for evaluation, the key-value pair in xxx=yyy '
+        'format will be kwargs for dataset.evaluate() function (deprecate), '
+        'change to --eval-options instead.')
+    parser.add_argument(
+        '--eval-options', nargs='+', action=DictAction,
+        help='custom options for evaluation, the key-value pair in xxx=yyy '
+        'format will be kwargs for dataset.evaluate() function')
+    parser.add_argument(
+        '--launcher', default='none',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        help='job launcher')
+    parser.add_argument('--local_rank', type=int, default=0)
+
+    args = parser.parse_args()
+    # main() reads LOCAL_RANK from the environment, so make sure it exists;
+    # a value that is already set there takes precedence over --local_rank.
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+    # --options is the deprecated spelling of --eval-options; accepting both
+    # at once would be ambiguous.
+    if args.options and args.eval_options:
+        raise ValueError(
+            '--options and --eval-options cannot be both '
+            'specified, --options is deprecated in favor of --eval-options')
+    if args.options:
+        warnings.warn('--options is deprecated in favor of --eval-options')
+        args.eval_options = args.options
+
+    return args
+
+
+def main():
+    """Test (and optionally evaluate) a detector from a config and checkpoint.
+
+    Raises:
+        ValueError: If ``--eval`` is combined with ``--format-only``, or if
+            ``--out`` does not end in ``.pkl``/``.pickle``.
+    """
+    args = parse_args()
+
+    assert args.out or args.eval or args.format_only or args.show \
+        or args.show_dir, \
+        ('Please specify at least one operation (save/eval/format/show the '
+         'results / save the results) with the argument "--out", "--eval"'
+         ', "--format-only", "--show" or "--show-dir"')
+
+    if args.eval and args.format_only:
+        raise ValueError('--eval and --format-only cannot be both specified')
+
+    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
+        raise ValueError('The output file must be a pkl file.')
+
+    cfg = Config.fromfile(args.config)
+
+    # replace the ${key} with the value of cfg.key
+    cfg = replace_cfg_vals(cfg)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    cfg = compat_cfg(cfg)
+
+    # set multi-process settings
+    setup_multi_processes(cfg)
+
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+
+    if 'pretrained' in cfg.model:
+        cfg.model.pretrained = None
+    elif 'init_cfg' in cfg.model.backbone:
+        cfg.model.backbone.init_cfg = None
+
+    neck_cfgs = cfg.model.get('neck') or []
+    if not isinstance(neck_cfgs, list):
+        neck_cfgs = [neck_cfgs]
+    for neck_cfg in neck_cfgs:
+        rfp_backbone = neck_cfg.get('rfp_backbone')
+        if rfp_backbone and rfp_backbone.get('pretrained'):
+            rfp_backbone.pretrained = None
+
+    if args.gpu_ids is not None:
+        cfg.gpu_ids = args.gpu_ids[0:1]
+        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
+                      'Because we only support single GPU mode in '
+                      'non-distributed testing. Use the first GPU '
+                      'in `gpu_ids` now.')
+    else:
+        cfg.gpu_ids = [args.gpu_id]
+    cfg.device = get_device()
+    # init distributed env first, since logger depends on the dist info.
+    if args.launcher == 'none':
+        distributed = False
+    else:
+        distributed = True
+        init_dist(args.launcher, **cfg.dist_params)
+
+    test_dataloader_default_args = dict(
+        samples_per_gpu=1, workers_per_gpu=2, dist=distributed, shuffle=False)
+    test_loader_cfg = {
+        **test_dataloader_default_args,
+        **cfg.data.get('test_dataloader', {})
+    }
+    # with samples_per_gpu > 1, replace 'ImageToTensor' by 'DefaultFormatBundle'
+    batched = test_loader_cfg['samples_per_gpu'] > 1
+
+    # in case the test dataset is concatenated
+    if isinstance(cfg.data.test, dict):
+        cfg.data.test.test_mode = True
+        if batched:
+            cfg.data.test.pipeline = replace_ImageToTensor(
+                cfg.data.test.pipeline)
+    elif isinstance(cfg.data.test, list):
+        for ds_cfg in cfg.data.test:
+            ds_cfg.test_mode = True
+            if batched:
+                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
+
+    rank, _ = get_dist_info()
+    # the eval json is only prepared on rank 0 and when --work-dir is given
+    json_file = None
+    if args.work_dir is not None and rank == 0:
+        mmcv.mkdir_or_exist(osp.abspath(args.work_dir))
+        timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+        json_file = osp.join(args.work_dir, f'eval_{timestamp}.json')
+
+    # build the dataloader
+    dataset = build_dataset(cfg.data.test)
+    data_loader = build_dataloader(dataset, **test_loader_cfg)
+
+    # build the model and load checkpoint
+    cfg.model.train_cfg = None
+    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+    # init rfnext if 'RFSearchHook' is defined in cfg
+    rfnext_init_model(model, cfg=cfg)
+    fp16_cfg = cfg.get('fp16', None)
+    if fp16_cfg is None and cfg.get('device', None) == 'npu':
+        fp16_cfg = dict(loss_scale='dynamic')
+    if fp16_cfg is not None:
+        wrap_fp16_model(model)
+    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
+    if args.fuse_conv_bn:
+        model = fuse_conv_bn(model)
+    # old versions did not save class info in checkpoints, this workaround is
+    # for backward compatibility
+    if 'CLASSES' in checkpoint.get('meta', {}):
+        model.CLASSES = checkpoint['meta']['CLASSES']
+    else:
+        model.CLASSES = dataset.CLASSES
+
+    if not distributed:
+        model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids)
+        outputs = single_gpu_test(model, data_loader, args.show, args.show_dir,
+                                  args.show_score_thr)
+    else:
+        model = build_ddp(
+            model,
+            cfg.device,
+            device_ids=[int(os.environ['LOCAL_RANK'])],
+            broadcast_buffers=False)
+
+        # In multi_gpu_test, if tmpdir is None, some tensors
+        # will init on cuda by default, and no device choice supported.
+        # Init a tmpdir to avoid error on npu here.
+        if cfg.device == 'npu' and args.tmpdir is None:
+            args.tmpdir = './npu_tmpdir'
+
+        gpu_collect = args.gpu_collect or cfg.get('evaluation', {}).get(
+            'gpu_collect', False)
+        outputs = multi_gpu_test(model, data_loader, args.tmpdir, gpu_collect)
+
+    if rank == 0:
+        if args.out:
+            print(f'\nwriting results to {args.out}')
+            mmcv.dump(outputs, args.out)
+        kwargs = {} if args.eval_options is None else args.eval_options
+        if args.format_only:
+            dataset.format_results(outputs, **kwargs)
+        if args.eval:
+            eval_kwargs = cfg.get('evaluation', {}).copy()
+            # hard-code way to remove EvalHook args
+            for key in ('interval', 'tmpdir', 'start', 'gpu_collect',
+                        'save_best', 'rule', 'dynamic_intervals'):
+                eval_kwargs.pop(key, None)
+            eval_kwargs.update(dict(metric=args.eval, **kwargs))
+            metric = dataset.evaluate(outputs, **eval_kwargs)
+            print(metric)
+            if json_file is not None:
+                mmcv.dump(dict(config=args.config, metric=metric), json_file)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/train.py b/tools/train.py
new file mode 100755
index 0000000..7ac514b
--- /dev/null
+++ b/tools/train.py
@@ -0,0 +1,248 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import copy
+import os
+import os.path as osp
+import time
+import warnings
+
+import mmcv
+import torch
+import torch.distributed as dist
+from mmcv import Config, DictAction
+from mmcv.runner import get_dist_info, init_dist
+from mmcv.utils import get_git_hash
+
+from mmdet import __version__
+from mmdet.apis import init_random_seed, set_random_seed, train_detector
+from mmdet.datasets import build_dataset
+from mmdet.models import build_detector
+from mmdet.utils import (collect_env, get_device, get_root_logger,
+                         replace_cfg_vals, rfnext_init_model,
+                         setup_multi_processes, update_data_root)
+
+
+def parse_args():
+    """Build the training CLI and return the parsed arguments.
+
+    ``LOCAL_RANK`` is exported from ``--local_rank`` when it is not already
+    in the environment, and the deprecated ``--options`` is folded into
+    ``cfg_options`` with a warning.
+
+    Returns:
+        argparse.Namespace: The parsed arguments.
+
+    Raises:
+        ValueError: If both ``--options`` and ``--cfg-options`` are given.
+    """
+    cli = argparse.ArgumentParser(description='Train a detector')
+    cli.add_argument('config', help='train config file path')
+    cli.add_argument('--work-dir', help='the dir to save logs and models')
+    cli.add_argument(
+        '--resume-from', help='the checkpoint file to resume from')
+    cli.add_argument(
+        '--auto-resume', action='store_true',
+        help='resume from the latest checkpoint automatically')
+    cli.add_argument(
+        '--no-validate', action='store_true',
+        help='whether not to evaluate the checkpoint during training')
+    # only one of the three GPU selection flags may be given
+    gpu_flags = cli.add_mutually_exclusive_group()
+    gpu_flags.add_argument(
+        '--gpus', type=int,
+        help='(Deprecated, please use --gpu-id) number of gpus to use '
+        '(only applicable to non-distributed training)')
+    gpu_flags.add_argument(
+        '--gpu-ids', type=int, nargs='+',
+        help='(Deprecated, please use --gpu-id) ids of gpus to use '
+        '(only applicable to non-distributed training)')
+    gpu_flags.add_argument(
+        '--gpu-id', type=int, default=0,
+        help='id of gpu to use '
+        '(only applicable to non-distributed training)')
+    cli.add_argument('--seed', type=int, default=None, help='random seed')
+    cli.add_argument(
+        '--diff-seed', action='store_true',
+        help='Whether or not set different seeds for different ranks')
+    cli.add_argument(
+        '--deterministic', action='store_true',
+        help='whether to set deterministic options for CUDNN backend.')
+    cli.add_argument(
+        '--options', nargs='+', action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file (deprecate), '
+        'change to --cfg-options instead.')
+    cli.add_argument(
+        '--cfg-options', nargs='+', action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    cli.add_argument(
+        '--launcher', default='none',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        help='job launcher')
+    cli.add_argument('--local_rank', type=int, default=0)
+    cli.add_argument(
+        '--auto-scale-lr', action='store_true',
+        help='enable automatically scaling LR.')
+
+    args = cli.parse_args()
+    # keep an externally provided LOCAL_RANK; otherwise mirror --local_rank
+    os.environ.setdefault('LOCAL_RANK', str(args.local_rank))
+
+    # --options is the deprecated alias of --cfg-options
+    if args.options:
+        if args.cfg_options:
+            raise ValueError(
+                '--options and --cfg-options cannot be both '
+                'specified, --options is deprecated in favor of --cfg-options')
+        warnings.warn('--options is deprecated in favor of --cfg-options')
+        args.cfg_options = args.options
+
+    return args
+
+
+def main():
+    """Train a detector from a config file, honouring the CLI overrides.
+
+    Sets up the work dir, logging, distributed mode and random seed, builds
+    the model and dataset(s), then hands everything to ``train_detector``.
+    """
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+
+    # replace the ${key} with the value of cfg.key
+    cfg = replace_cfg_vals(cfg)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    if args.auto_scale_lr:
+        if 'auto_scale_lr' in cfg and \
+                'enable' in cfg.auto_scale_lr and \
+                'base_batch_size' in cfg.auto_scale_lr:
+            cfg.auto_scale_lr.enable = True
+        else:
+            warnings.warn('Can not find "auto_scale_lr" or '
+                          '"auto_scale_lr.enable" or '
+                          '"auto_scale_lr.base_batch_size" in your'
+                          ' configuration file. Please update all the '
+                          'configuration files to mmdet >= 2.24.1.')
+
+    # set multi-process settings
+    setup_multi_processes(cfg)
+
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+
+    # work_dir is determined in this priority: CLI > segment in file > filename
+    if args.work_dir is not None:
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        cfg.work_dir = osp.join('./work_dirs',
+                                osp.splitext(osp.basename(args.config))[0])
+
+    if args.resume_from is not None:
+        cfg.resume_from = args.resume_from
+    cfg.auto_resume = args.auto_resume
+    if args.gpus is not None:
+        cfg.gpu_ids = range(1)
+        warnings.warn('`--gpus` is deprecated because we only support '
+                      'single GPU mode in non-distributed training. '
+                      'Use `gpus=1` now.')
+    if args.gpu_ids is not None:
+        cfg.gpu_ids = args.gpu_ids[0:1]
+        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
+                      'Because we only support single GPU mode in '
+                      'non-distributed training. Use the first GPU '
+                      'in `gpu_ids` now.')
+    if args.gpus is None and args.gpu_ids is None:
+        cfg.gpu_ids = [args.gpu_id]
+
+    # init distributed env first, since logger depends on the dist info.
+    if args.launcher == 'none':
+        distributed = False
+    else:
+        distributed = True
+        init_dist(args.launcher, **cfg.dist_params)
+        # re-set gpu_ids with distributed training mode
+        _, world_size = get_dist_info()
+        cfg.gpu_ids = range(world_size)
+
+    # create work_dir
+    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
+    # dump config
+    cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
+    # init the logger before other steps
+    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
+    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
+
+    # meta records environment info, config and seed; it will be logged
+    meta = dict()
+    # log env info
+    env_info_dict = collect_env()
+    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
+    dash_line = '-' * 60 + '\n'
+    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
+                dash_line)
+    meta['env_info'] = env_info
+    meta['config'] = cfg.pretty_text
+    # log some basic info
+    logger.info(f'Distributed training: {distributed}')
+    logger.info(f'Config:\n{cfg.pretty_text}')
+
+    cfg.device = get_device()
+    # set random seeds
+    seed = init_random_seed(args.seed, device=cfg.device)
+    if args.diff_seed:
+        # get_dist_info() also works without an initialized process group
+        seed += get_dist_info()[0]
+    logger.info(f'Set random seed to {seed}, '
+                f'deterministic: {args.deterministic}')
+    set_random_seed(seed, deterministic=args.deterministic)
+    cfg.seed = seed
+    meta['seed'] = seed
+    meta['exp_name'] = osp.basename(args.config)
+
+    model = build_detector(
+        cfg.model, train_cfg=cfg.get('train_cfg'),
+        test_cfg=cfg.get('test_cfg'))
+    model.init_weights()
+    # init rfnext if 'RFSearchHook' is defined in cfg
+    rfnext_init_model(model, cfg=cfg)
+
+    datasets = [build_dataset(cfg.data.train)]
+    if len(cfg.workflow) == 2:
+        assert 'val' in [mode for (mode, _) in cfg.workflow]
+        val_dataset = copy.deepcopy(cfg.data.val)
+        train_data = cfg.data.train
+        if 'pipeline' in train_data:
+            val_dataset.pipeline = train_data.pipeline
+        else:  # no own pipeline: use the one of the nested ``dataset``
+            val_dataset.pipeline = train_data.dataset.get('pipeline')
+        datasets.append(build_dataset(val_dataset))
+    if cfg.checkpoint_config is not None:
+        # save mmdet version, config file content and class names in
+        # checkpoints as meta data
+        cfg.checkpoint_config.meta = dict(
+            mmdet_version=__version__ + get_git_hash()[:7],
+            CLASSES=datasets[0].CLASSES)
+    # add an attribute for visualization convenience
+    model.CLASSES = datasets[0].CLASSES
+    model = model.float()
+    train_detector(
+        model, datasets, cfg, distributed=distributed,
+        validate=(not args.no_validate), timestamp=timestamp, meta=meta)
+
+
+if __name__ == '__main__':
+    main()