Spaces:

KyanChen
/

ai-photo-gallery

Runtime error

File size: 44,103 Bytes

f549064

# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
from mmcv.ops import DeformConv2d, MaskedConv2d
from mmengine.model import BaseModule
from mmengine.structures import InstanceData
from torch import Tensor

from mmdet.registry import MODELS, TASK_UTILS
from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType,
                         OptInstanceList)
from ..layers import multiclass_nms
from ..task_modules.prior_generators import anchor_inside_flags, calc_region
from ..task_modules.samplers import PseudoSampler
from ..utils import images_to_levels, multi_apply, unmap
from .anchor_head import AnchorHead


class FeatureAdaption(BaseModule):
    """Feature Adaption Module.

    Feature Adaption Module is implemented based on DCN v1.
    It uses anchor shape prediction rather than feature map to
    predict offsets of deform conv layer.

    Args:
        in_channels (int): Number of channels in the input feature map.
        out_channels (int): Number of channels in the output feature map.
        kernel_size (int): Deformable conv kernel size. Defaults to 3.
        deform_groups (int): Deformable conv group size. Defaults to 4.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \
            list[dict], optional): Initialization config dict.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        deform_groups: int = 4,
        init_cfg: MultiConfig = dict(
            type='Normal',
            layer='Conv2d',
            std=0.1,
            override=dict(type='Normal', name='conv_adaption', std=0.01))
    ) -> None:
        super().__init__(init_cfg=init_cfg)
        offset_channels = kernel_size * kernel_size * 2
        self.conv_offset = nn.Conv2d(
            2, deform_groups * offset_channels, 1, bias=False)
        self.conv_adaption = DeformConv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            padding=(kernel_size - 1) // 2,
            deform_groups=deform_groups)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x: Tensor, shape: Tensor) -> Tensor:
        offset = self.conv_offset(shape.detach())
        x = self.relu(self.conv_adaption(x, offset))
        return x


@MODELS.register_module()
class GuidedAnchorHead(AnchorHead):
    """Guided-Anchor-based head (GA-RPN, GA-RetinaNet, etc.).

    This GuidedAnchorHead will predict high-quality feature guided
    anchors and locations where anchors will be kept in inference.
    There are mainly 3 categories of bounding-boxes.

    - Sampled 9 pairs for target assignment. (approxes)
    - The square boxes where the predicted anchors are based on. (squares)
    - Guided anchors.

    Please refer to https://arxiv.org/abs/1901.03278 for more details.

    Args:
        num_classes (int): Number of classes.
        in_channels (int): Number of channels in the input feature map.
        feat_channels (int): Number of hidden channels. Defaults to 256.
        approx_anchor_generator (:obj:`ConfigDict` or dict): Config dict
            for approx generator
        square_anchor_generator (:obj:`ConfigDict` or dict): Config dict
            for square generator
        anchor_coder (:obj:`ConfigDict` or dict): Config dict for anchor coder
        bbox_coder (:obj:`ConfigDict` or dict): Config dict for bbox coder
        reg_decoded_bbox (bool): If true, the regression loss would be
            applied directly on decoded bounding boxes, converting both
            the predicted boxes and regression targets to absolute
            coordinates format. Defaults to False. It should be `True` when
            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
        deform_groups: (int): Group number of DCN in FeatureAdaption module.
            Defaults to 4.
        loc_filter_thr (float): Threshold to filter out unconcerned regions.
            Defaults to 0.01.
        loss_loc (:obj:`ConfigDict` or dict): Config of location loss.
        loss_shape (:obj:`ConfigDict` or dict): Config of anchor shape loss.
        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
        loss_bbox (:obj:`ConfigDict` or dict): Config of bbox regression loss.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \
            list[dict], optional): Initialization config dict.
    """

    def __init__(
        self,
        num_classes: int,
        in_channels: int,
        feat_channels: int = 256,
        approx_anchor_generator: ConfigType = dict(
            type='AnchorGenerator',
            octave_base_scale=8,
            scales_per_octave=3,
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        square_anchor_generator: ConfigType = dict(
            type='AnchorGenerator',
            ratios=[1.0],
            scales=[8],
            strides=[4, 8, 16, 32, 64]),
        anchor_coder: ConfigType = dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        bbox_coder: ConfigType = dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        reg_decoded_bbox: bool = False,
        deform_groups: int = 4,
        loc_filter_thr: float = 0.01,
        train_cfg: OptConfigType = None,
        test_cfg: OptConfigType = None,
        loss_loc: ConfigType = dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_shape: ConfigType = dict(
            type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
        loss_cls: ConfigType = dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox: ConfigType = dict(
            type='SmoothL1Loss', beta=1.0, loss_weight=1.0),
        init_cfg: MultiConfig = dict(
            type='Normal',
            layer='Conv2d',
            std=0.01,
            override=dict(
                type='Normal', name='conv_loc', std=0.01, lbias_prob=0.01))
    ) -> None:
        super(AnchorHead, self).__init__(init_cfg=init_cfg)
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.feat_channels = feat_channels
        self.deform_groups = deform_groups
        self.loc_filter_thr = loc_filter_thr

        # build approx_anchor_generator and square_anchor_generator
        assert (approx_anchor_generator['octave_base_scale'] ==
                square_anchor_generator['scales'][0])
        assert (approx_anchor_generator['strides'] ==
                square_anchor_generator['strides'])
        self.approx_anchor_generator = TASK_UTILS.build(
            approx_anchor_generator)
        self.square_anchor_generator = TASK_UTILS.build(
            square_anchor_generator)
        self.approxs_per_octave = self.approx_anchor_generator \
            .num_base_priors[0]

        self.reg_decoded_bbox = reg_decoded_bbox

        # one anchor per location
        self.num_base_priors = self.square_anchor_generator.num_base_priors[0]

        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
        self.loc_focal_loss = loss_loc['type'] in ['FocalLoss']
        if self.use_sigmoid_cls:
            self.cls_out_channels = self.num_classes
        else:
            self.cls_out_channels = self.num_classes + 1

        # build bbox_coder
        self.anchor_coder = TASK_UTILS.build(anchor_coder)
        self.bbox_coder = TASK_UTILS.build(bbox_coder)

        # build losses
        self.loss_loc = MODELS.build(loss_loc)
        self.loss_shape = MODELS.build(loss_shape)
        self.loss_cls = MODELS.build(loss_cls)
        self.loss_bbox = MODELS.build(loss_bbox)

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        if self.train_cfg:
            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
            # use PseudoSampler when no sampler in train_cfg
            if train_cfg.get('sampler', None) is not None:
                self.sampler = TASK_UTILS.build(
                    self.train_cfg['sampler'], default_args=dict(context=self))
            else:
                self.sampler = PseudoSampler()

            self.ga_assigner = TASK_UTILS.build(self.train_cfg['ga_assigner'])
            if train_cfg.get('ga_sampler', None) is not None:
                self.ga_sampler = TASK_UTILS.build(
                    self.train_cfg['ga_sampler'],
                    default_args=dict(context=self))
            else:
                self.ga_sampler = PseudoSampler()

        self._init_layers()

    def _init_layers(self) -> None:
        """Initialize layers of the head."""
        self.relu = nn.ReLU(inplace=True)
        self.conv_loc = nn.Conv2d(self.in_channels, 1, 1)
        self.conv_shape = nn.Conv2d(self.in_channels, self.num_base_priors * 2,
                                    1)
        self.feature_adaption = FeatureAdaption(
            self.in_channels,
            self.feat_channels,
            kernel_size=3,
            deform_groups=self.deform_groups)
        self.conv_cls = MaskedConv2d(
            self.feat_channels, self.num_base_priors * self.cls_out_channels,
            1)
        self.conv_reg = MaskedConv2d(self.feat_channels,
                                     self.num_base_priors * 4, 1)

    def forward_single(self, x: Tensor) -> Tuple[Tensor]:
        """Forward feature of a single scale level."""
        loc_pred = self.conv_loc(x)
        shape_pred = self.conv_shape(x)
        x = self.feature_adaption(x, shape_pred)
        # masked conv is only used during inference for speed-up
        if not self.training:
            mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr
        else:
            mask = None
        cls_score = self.conv_cls(x, mask)
        bbox_pred = self.conv_reg(x, mask)
        return cls_score, bbox_pred, shape_pred, loc_pred

    def forward(self, x: List[Tensor]) -> Tuple[List[Tensor]]:
        """Forward features from the upstream network."""
        return multi_apply(self.forward_single, x)

    def get_sampled_approxs(self,
                            featmap_sizes: List[Tuple[int, int]],
                            batch_img_metas: List[dict],
                            device: str = 'cuda') -> tuple:
        """Get sampled approxs and inside flags according to feature map sizes.

        Args:
            featmap_sizes (list[tuple]): Multi-level feature map sizes.
            batch_img_metas (list[dict]): Image meta info.
            device (str): device for returned tensors

        Returns:
            tuple: approxes of each image, inside flags of each image
        """
        num_imgs = len(batch_img_metas)

        # since feature map sizes of all images are the same, we only compute
        # approxes for one time
        multi_level_approxs = self.approx_anchor_generator.grid_priors(
            featmap_sizes, device=device)
        approxs_list = [multi_level_approxs for _ in range(num_imgs)]

        # for each image, we compute inside flags of multi level approxes
        inside_flag_list = []
        for img_id, img_meta in enumerate(batch_img_metas):
            multi_level_flags = []
            multi_level_approxs = approxs_list[img_id]

            # obtain valid flags for each approx first
            multi_level_approx_flags = self.approx_anchor_generator \
                .valid_flags(featmap_sizes,
                             img_meta['pad_shape'],
                             device=device)

            for i, flags in enumerate(multi_level_approx_flags):
                approxs = multi_level_approxs[i]
                inside_flags_list = []
                for j in range(self.approxs_per_octave):
                    split_valid_flags = flags[j::self.approxs_per_octave]
                    split_approxs = approxs[j::self.approxs_per_octave, :]
                    inside_flags = anchor_inside_flags(
                        split_approxs, split_valid_flags,
                        img_meta['img_shape'][:2],
                        self.train_cfg['allowed_border'])
                    inside_flags_list.append(inside_flags)
                # inside_flag for a position is true if any anchor in this
                # position is true
                inside_flags = (
                    torch.stack(inside_flags_list, 0).sum(dim=0) > 0)
                multi_level_flags.append(inside_flags)
            inside_flag_list.append(multi_level_flags)
        return approxs_list, inside_flag_list

    def get_anchors(self,
                    featmap_sizes: List[Tuple[int, int]],
                    shape_preds: List[Tensor],
                    loc_preds: List[Tensor],
                    batch_img_metas: List[dict],
                    use_loc_filter: bool = False,
                    device: str = 'cuda') -> tuple:
        """Get squares according to feature map sizes and guided anchors.

        Args:
            featmap_sizes (list[tuple]): Multi-level feature map sizes.
            shape_preds (list[tensor]): Multi-level shape predictions.
            loc_preds (list[tensor]): Multi-level location predictions.
            batch_img_metas (list[dict]): Image meta info.
            use_loc_filter (bool): Use loc filter or not. Defaults to False
            device (str): device for returned tensors.
                Defaults to `cuda`.

        Returns:
            tuple: square approxs of each image, guided anchors of each image,
            loc masks of each image.
        """
        num_imgs = len(batch_img_metas)
        num_levels = len(featmap_sizes)

        # since feature map sizes of all images are the same, we only compute
        # squares for one time
        multi_level_squares = self.square_anchor_generator.grid_priors(
            featmap_sizes, device=device)
        squares_list = [multi_level_squares for _ in range(num_imgs)]

        # for each image, we compute multi level guided anchors
        guided_anchors_list = []
        loc_mask_list = []
        for img_id, img_meta in enumerate(batch_img_metas):
            multi_level_guided_anchors = []
            multi_level_loc_mask = []
            for i in range(num_levels):
                squares = squares_list[img_id][i]
                shape_pred = shape_preds[i][img_id]
                loc_pred = loc_preds[i][img_id]
                guided_anchors, loc_mask = self._get_guided_anchors_single(
                    squares,
                    shape_pred,
                    loc_pred,
                    use_loc_filter=use_loc_filter)
                multi_level_guided_anchors.append(guided_anchors)
                multi_level_loc_mask.append(loc_mask)
            guided_anchors_list.append(multi_level_guided_anchors)
            loc_mask_list.append(multi_level_loc_mask)
        return squares_list, guided_anchors_list, loc_mask_list

    def _get_guided_anchors_single(
            self,
            squares: Tensor,
            shape_pred: Tensor,
            loc_pred: Tensor,
            use_loc_filter: bool = False) -> Tuple[Tensor]:
        """Get guided anchors and loc masks for a single level.

        Args:
            squares (tensor): Squares of a single level.
            shape_pred (tensor): Shape predictions of a single level.
            loc_pred (tensor): Loc predictions of a single level.
            use_loc_filter (list[tensor]): Use loc filter or not.
                Defaults to False.

        Returns:
            tuple: guided anchors, location masks
        """
        # calculate location filtering mask
        loc_pred = loc_pred.sigmoid().detach()
        if use_loc_filter:
            loc_mask = loc_pred >= self.loc_filter_thr
        else:
            loc_mask = loc_pred >= 0.0
        mask = loc_mask.permute(1, 2, 0).expand(-1, -1, self.num_base_priors)
        mask = mask.contiguous().view(-1)
        # calculate guided anchors
        squares = squares[mask]
        anchor_deltas = shape_pred.permute(1, 2, 0).contiguous().view(
            -1, 2).detach()[mask]
        bbox_deltas = anchor_deltas.new_full(squares.size(), 0)
        bbox_deltas[:, 2:] = anchor_deltas
        guided_anchors = self.anchor_coder.decode(
            squares, bbox_deltas, wh_ratio_clip=1e-6)
        return guided_anchors, mask

    def ga_loc_targets(self, batch_gt_instances: InstanceList,
                       featmap_sizes: List[Tuple[int, int]]) -> tuple:
        """Compute location targets for guided anchoring.

        Each feature map is divided into positive, negative and ignore regions.
        - positive regions: target 1, weight 1
        - ignore regions: target 0, weight 0
        - negative regions: target 0, weight 0.1

        Args:
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            featmap_sizes (list[tuple]): Multi level sizes of each feature
                maps.

        Returns:
            tuple: Returns a tuple containing location targets.
        """
        anchor_scale = self.approx_anchor_generator.octave_base_scale
        anchor_strides = self.approx_anchor_generator.strides
        # Currently only supports same stride in x and y direction.
        for stride in anchor_strides:
            assert (stride[0] == stride[1])
        anchor_strides = [stride[0] for stride in anchor_strides]

        center_ratio = self.train_cfg['center_ratio']
        ignore_ratio = self.train_cfg['ignore_ratio']
        img_per_gpu = len(batch_gt_instances)
        num_lvls = len(featmap_sizes)
        r1 = (1 - center_ratio) / 2
        r2 = (1 - ignore_ratio) / 2
        all_loc_targets = []
        all_loc_weights = []
        all_ignore_map = []
        for lvl_id in range(num_lvls):
            h, w = featmap_sizes[lvl_id]
            loc_targets = torch.zeros(
                img_per_gpu,
                1,
                h,
                w,
                device=batch_gt_instances[0].bboxes.device,
                dtype=torch.float32)
            loc_weights = torch.full_like(loc_targets, -1)
            ignore_map = torch.zeros_like(loc_targets)
            all_loc_targets.append(loc_targets)
            all_loc_weights.append(loc_weights)
            all_ignore_map.append(ignore_map)
        for img_id in range(img_per_gpu):
            gt_bboxes = batch_gt_instances[img_id].bboxes
            scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
                               (gt_bboxes[:, 3] - gt_bboxes[:, 1]))
            min_anchor_size = scale.new_full(
                (1, ), float(anchor_scale * anchor_strides[0]))
            # assign gt bboxes to different feature levels w.r.t. their scales
            target_lvls = torch.floor(
                torch.log2(scale) - torch.log2(min_anchor_size) + 0.5)
            target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long()
            for gt_id in range(gt_bboxes.size(0)):
                lvl = target_lvls[gt_id].item()
                # rescaled to corresponding feature map
                gt_ = gt_bboxes[gt_id, :4] / anchor_strides[lvl]
                # calculate ignore regions
                ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
                    gt_, r2, featmap_sizes[lvl])
                # calculate positive (center) regions
                ctr_x1, ctr_y1, ctr_x2, ctr_y2 = calc_region(
                    gt_, r1, featmap_sizes[lvl])
                all_loc_targets[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
                                     ctr_x1:ctr_x2 + 1] = 1
                all_loc_weights[lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
                                     ignore_x1:ignore_x2 + 1] = 0
                all_loc_weights[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
                                     ctr_x1:ctr_x2 + 1] = 1
                # calculate ignore map on nearby low level feature
                if lvl > 0:
                    d_lvl = lvl - 1
                    # rescaled to corresponding feature map
                    gt_ = gt_bboxes[gt_id, :4] / anchor_strides[d_lvl]
                    ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
                        gt_, r2, featmap_sizes[d_lvl])
                    all_ignore_map[d_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
                                          ignore_x1:ignore_x2 + 1] = 1
                # calculate ignore map on nearby high level feature
                if lvl < num_lvls - 1:
                    u_lvl = lvl + 1
                    # rescaled to corresponding feature map
                    gt_ = gt_bboxes[gt_id, :4] / anchor_strides[u_lvl]
                    ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
                        gt_, r2, featmap_sizes[u_lvl])
                    all_ignore_map[u_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
                                          ignore_x1:ignore_x2 + 1] = 1
        for lvl_id in range(num_lvls):
            # ignore negative regions w.r.t. ignore map
            all_loc_weights[lvl_id][(all_loc_weights[lvl_id] < 0)
                                    & (all_ignore_map[lvl_id] > 0)] = 0
            # set negative regions with weight 0.1
            all_loc_weights[lvl_id][all_loc_weights[lvl_id] < 0] = 0.1
        # loc average factor to balance loss
        loc_avg_factor = sum(
            [t.size(0) * t.size(-1) * t.size(-2)
             for t in all_loc_targets]) / 200
        return all_loc_targets, all_loc_weights, loc_avg_factor

    def _ga_shape_target_single(self,
                                flat_approxs: Tensor,
                                inside_flags: Tensor,
                                flat_squares: Tensor,
                                gt_instances: InstanceData,
                                gt_instances_ignore: Optional[InstanceData],
                                img_meta: dict,
                                unmap_outputs: bool = True) -> tuple:
        """Compute guided anchoring targets.

        This function returns sampled anchors and gt bboxes directly
        rather than calculates regression targets.

        Args:
            flat_approxs (Tensor): flat approxs of a single image,
                shape (n, 4)
            inside_flags (Tensor): inside flags of a single image,
                shape (n, ).
            flat_squares (Tensor): flat squares of a single image,
                shape (approxs_per_octave * n, 4)
            gt_instances (:obj:`InstanceData`): Ground truth of instance
                annotations. It usually includes ``bboxes`` and ``labels``
                attributes.
            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
                to be ignored during training. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
            img_meta (dict): Meta info of a single image.
            unmap_outputs (bool): unmap outputs or not.

        Returns:
            tuple: Returns a tuple containing shape targets of each image.
        """
        if not inside_flags.any():
            raise ValueError(
                'There is no valid anchor inside the image boundary. Please '
                'check the image size and anchor sizes, or set '
                '``allowed_border`` to -1 to skip the condition.')
        # assign gt and sample anchors
        num_square = flat_squares.size(0)
        approxs = flat_approxs.view(num_square, self.approxs_per_octave, 4)
        approxs = approxs[inside_flags, ...]
        squares = flat_squares[inside_flags, :]

        pred_instances = InstanceData()
        pred_instances.priors = squares
        pred_instances.approxs = approxs

        assign_result = self.ga_assigner.assign(
            pred_instances=pred_instances,
            gt_instances=gt_instances,
            gt_instances_ignore=gt_instances_ignore)
        sampling_result = self.ga_sampler.sample(
            assign_result=assign_result,
            pred_instances=pred_instances,
            gt_instances=gt_instances)

        bbox_anchors = torch.zeros_like(squares)
        bbox_gts = torch.zeros_like(squares)
        bbox_weights = torch.zeros_like(squares)

        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds
        if len(pos_inds) > 0:
            bbox_anchors[pos_inds, :] = sampling_result.pos_bboxes
            bbox_gts[pos_inds, :] = sampling_result.pos_gt_bboxes
            bbox_weights[pos_inds, :] = 1.0

        # map up to original set of anchors
        if unmap_outputs:
            num_total_anchors = flat_squares.size(0)
            bbox_anchors = unmap(bbox_anchors, num_total_anchors, inside_flags)
            bbox_gts = unmap(bbox_gts, num_total_anchors, inside_flags)
            bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)

        return (bbox_anchors, bbox_gts, bbox_weights, pos_inds, neg_inds,
                sampling_result)

    def ga_shape_targets(self,
                         approx_list: List[List[Tensor]],
                         inside_flag_list: List[List[Tensor]],
                         square_list: List[List[Tensor]],
                         batch_gt_instances: InstanceList,
                         batch_img_metas: List[dict],
                         batch_gt_instances_ignore: OptInstanceList = None,
                         unmap_outputs: bool = True) -> tuple:
        """Compute guided anchoring targets.

        Args:
            approx_list (list[list[Tensor]]): Multi level approxs of each
                image.
            inside_flag_list (list[list[Tensor]]): Multi level inside flags
                of each image.
            square_list (list[list[Tensor]]): Multi level squares of each
                image.
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.
            unmap_outputs (bool): unmap outputs or not. Defaults to None.

        Returns:
            tuple:  Returns a tuple containing shape targets.
        """
        num_imgs = len(batch_img_metas)
        assert len(approx_list) == len(inside_flag_list) == len(
            square_list) == num_imgs
        # anchor number of multi levels
        num_level_squares = [squares.size(0) for squares in square_list[0]]
        # concat all level anchors and flags to a single tensor
        inside_flag_flat_list = []
        approx_flat_list = []
        square_flat_list = []
        for i in range(num_imgs):
            assert len(square_list[i]) == len(inside_flag_list[i])
            inside_flag_flat_list.append(torch.cat(inside_flag_list[i]))
            approx_flat_list.append(torch.cat(approx_list[i]))
            square_flat_list.append(torch.cat(square_list[i]))

        # compute targets for each image
        if batch_gt_instances_ignore is None:
            batch_gt_instances_ignore = [None for _ in range(num_imgs)]
        (all_bbox_anchors, all_bbox_gts, all_bbox_weights, pos_inds_list,
         neg_inds_list, sampling_results_list) = multi_apply(
             self._ga_shape_target_single,
             approx_flat_list,
             inside_flag_flat_list,
             square_flat_list,
             batch_gt_instances,
             batch_gt_instances_ignore,
             batch_img_metas,
             unmap_outputs=unmap_outputs)
        # sampled anchors of all images
        avg_factor = sum(
            [results.avg_factor for results in sampling_results_list])
        # split targets to a list w.r.t. multiple levels
        bbox_anchors_list = images_to_levels(all_bbox_anchors,
                                             num_level_squares)
        bbox_gts_list = images_to_levels(all_bbox_gts, num_level_squares)
        bbox_weights_list = images_to_levels(all_bbox_weights,
                                             num_level_squares)
        return (bbox_anchors_list, bbox_gts_list, bbox_weights_list,
                avg_factor)

    def loss_shape_single(self, shape_pred: Tensor, bbox_anchors: Tensor,
                          bbox_gts: Tensor, anchor_weights: Tensor,
                          avg_factor: int) -> Tensor:
        """Compute shape loss in single level."""
        shape_pred = shape_pred.permute(0, 2, 3, 1).contiguous().view(-1, 2)
        bbox_anchors = bbox_anchors.contiguous().view(-1, 4)
        bbox_gts = bbox_gts.contiguous().view(-1, 4)
        anchor_weights = anchor_weights.contiguous().view(-1, 4)
        bbox_deltas = bbox_anchors.new_full(bbox_anchors.size(), 0)
        bbox_deltas[:, 2:] += shape_pred
        # filter out negative samples to speed-up weighted_bounded_iou_loss
        inds = torch.nonzero(
            anchor_weights[:, 0] > 0, as_tuple=False).squeeze(1)
        bbox_deltas_ = bbox_deltas[inds]
        bbox_anchors_ = bbox_anchors[inds]
        bbox_gts_ = bbox_gts[inds]
        anchor_weights_ = anchor_weights[inds]
        pred_anchors_ = self.anchor_coder.decode(
            bbox_anchors_, bbox_deltas_, wh_ratio_clip=1e-6)
        loss_shape = self.loss_shape(
            pred_anchors_, bbox_gts_, anchor_weights_, avg_factor=avg_factor)
        return loss_shape

    def loss_loc_single(self, loc_pred: Tensor, loc_target: Tensor,
                        loc_weight: Tensor, avg_factor: float) -> Tensor:
        """Compute location loss in single level."""
        loss_loc = self.loss_loc(
            loc_pred.reshape(-1, 1),
            loc_target.reshape(-1).long(),
            loc_weight.reshape(-1),
            avg_factor=avg_factor)
        return loss_loc

    def loss_by_feat(
            self,
            cls_scores: List[Tensor],
            bbox_preds: List[Tensor],
            shape_preds: List[Tensor],
            loc_preds: List[Tensor],
            batch_gt_instances: InstanceList,
            batch_img_metas: List[dict],
            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
        """Calculate the loss based on the features extracted by the detection
        head.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level
                has shape (N, num_anchors * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level with shape (N, num_anchors * 4, H, W).
            shape_preds (list[Tensor]): shape predictions for each scale
                level with shape (N, 1, H, W).
            loc_preds (list[Tensor]): location predictions for each scale
                level with shape (N, num_anchors * 2, H, W).
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.

        Returns:
            dict: A dictionary of loss components.
        """

        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        assert len(featmap_sizes) == self.approx_anchor_generator.num_levels

        device = cls_scores[0].device

        # get loc targets
        loc_targets, loc_weights, loc_avg_factor = self.ga_loc_targets(
            batch_gt_instances, featmap_sizes)

        # get sampled approxes
        approxs_list, inside_flag_list = self.get_sampled_approxs(
            featmap_sizes, batch_img_metas, device=device)
        # get squares and guided anchors
        squares_list, guided_anchors_list, _ = self.get_anchors(
            featmap_sizes,
            shape_preds,
            loc_preds,
            batch_img_metas,
            device=device)

        # get shape targets
        shape_targets = self.ga_shape_targets(approxs_list, inside_flag_list,
                                              squares_list, batch_gt_instances,
                                              batch_img_metas)
        (bbox_anchors_list, bbox_gts_list, anchor_weights_list,
         ga_avg_factor) = shape_targets

        # get anchor targets
        cls_reg_targets = self.get_targets(
            guided_anchors_list,
            inside_flag_list,
            batch_gt_instances,
            batch_img_metas,
            batch_gt_instances_ignore=batch_gt_instances_ignore)
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         avg_factor) = cls_reg_targets

        # anchor number of multi levels
        num_level_anchors = [
            anchors.size(0) for anchors in guided_anchors_list[0]
        ]
        # concat all level anchors to a single tensor
        concat_anchor_list = []
        for i in range(len(guided_anchors_list)):
            concat_anchor_list.append(torch.cat(guided_anchors_list[i]))
        all_anchor_list = images_to_levels(concat_anchor_list,
                                           num_level_anchors)

        # get classification and bbox regression losses
        losses_cls, losses_bbox = multi_apply(
            self.loss_by_feat_single,
            cls_scores,
            bbox_preds,
            all_anchor_list,
            labels_list,
            label_weights_list,
            bbox_targets_list,
            bbox_weights_list,
            avg_factor=avg_factor)

        # get anchor location loss
        losses_loc = []
        for i in range(len(loc_preds)):
            loss_loc = self.loss_loc_single(
                loc_preds[i],
                loc_targets[i],
                loc_weights[i],
                avg_factor=loc_avg_factor)
            losses_loc.append(loss_loc)

        # get anchor shape loss
        losses_shape = []
        for i in range(len(shape_preds)):
            loss_shape = self.loss_shape_single(
                shape_preds[i],
                bbox_anchors_list[i],
                bbox_gts_list[i],
                anchor_weights_list[i],
                avg_factor=ga_avg_factor)
            losses_shape.append(loss_shape)

        return dict(
            loss_cls=losses_cls,
            loss_bbox=losses_bbox,
            loss_shape=losses_shape,
            loss_loc=losses_loc)

    def predict_by_feat(self,
                        cls_scores: List[Tensor],
                        bbox_preds: List[Tensor],
                        shape_preds: List[Tensor],
                        loc_preds: List[Tensor],
                        batch_img_metas: List[dict],
                        cfg: OptConfigType = None,
                        rescale: bool = False) -> InstanceList:
        """Transform a batch of output features extracted from the head into
        bbox results.

        Args:
            cls_scores (list[Tensor]): Classification scores for all
                scale levels, each is a 4D-tensor, has shape
                (batch_size, num_priors * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas for all
                scale levels, each is a 4D-tensor, has shape
                (batch_size, num_priors * 4, H, W).
            shape_preds (list[Tensor]): shape predictions for each scale
                level with shape (N, 1, H, W).
            loc_preds (list[Tensor]): location predictions for each scale
                level with shape (N, num_anchors * 2, H, W).
            batch_img_metas (list[dict], Optional): Batch image meta info.
                Defaults to None.
            cfg (ConfigDict, optional): Test / postprocessing
                configuration, if None, test_cfg would be used.
                Defaults to None.
            rescale (bool): If True, return boxes in original image space.
                Defaults to False.

        Returns:
            list[:obj:`InstanceData`]: Object detection results of each image
            after the post process. Each item usually contains following keys.

            - scores (Tensor): Classification scores, has a shape
              (num_instance, )
            - labels (Tensor): Labels of bboxes, has a shape (num_instances, ).
            - bboxes (Tensor): Has a shape (num_instances, 4), the last
              dimension 4 arrange as (x1, y1, x2, y2).
        """
        assert len(cls_scores) == len(bbox_preds) == len(shape_preds) == len(
            loc_preds)
        num_levels = len(cls_scores)
        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        device = cls_scores[0].device
        # get guided anchors
        _, guided_anchors, loc_masks = self.get_anchors(
            featmap_sizes,
            shape_preds,
            loc_preds,
            batch_img_metas,
            use_loc_filter=not self.training,
            device=device)
        result_list = []
        for img_id in range(len(batch_img_metas)):
            cls_score_list = [
                cls_scores[i][img_id].detach() for i in range(num_levels)
            ]
            bbox_pred_list = [
                bbox_preds[i][img_id].detach() for i in range(num_levels)
            ]
            guided_anchor_list = [
                guided_anchors[img_id][i].detach() for i in range(num_levels)
            ]
            loc_mask_list = [
                loc_masks[img_id][i].detach() for i in range(num_levels)
            ]
            proposals = self._predict_by_feat_single(
                cls_scores=cls_score_list,
                bbox_preds=bbox_pred_list,
                mlvl_anchors=guided_anchor_list,
                mlvl_masks=loc_mask_list,
                img_meta=batch_img_metas[img_id],
                cfg=cfg,
                rescale=rescale)
            result_list.append(proposals)
        return result_list

    def _predict_by_feat_single(self,
                                cls_scores: List[Tensor],
                                bbox_preds: List[Tensor],
                                mlvl_anchors: List[Tensor],
                                mlvl_masks: List[Tensor],
                                img_meta: dict,
                                cfg: ConfigType,
                                rescale: bool = False) -> InstanceData:
        """Transform a single image's features extracted from the head into
        bbox results.

        Args:
            cls_scores (list[Tensor]): Box scores from all scale
                levels of a single image, each item has shape
                (num_priors * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas from
                all scale levels of a single image, each item has shape
                (num_priors * 4, H, W).
            mlvl_anchors (list[Tensor]): Each element in the list is
                the anchors of a single level in feature pyramid. it has
                shape (num_priors, 4).
            mlvl_masks (list[Tensor]): Each element in the list is location
                masks of a single level.
            img_meta (dict): Image meta info.
            cfg (:obj:`ConfigDict` or dict): Test / postprocessing
                configuration, if None, test_cfg would be used.
            rescale (bool): If True, return boxes in original image space.
                Defaults to False.

        Returns:
            :obj:`InstanceData`: Detection results of each image
            after the post process.
            Each item usually contains following keys.

            - scores (Tensor): Classification scores, has a shape
              (num_instance, )
            - labels (Tensor): Labels of bboxes, has a shape (num_instances, ).
            - bboxes (Tensor): Has a shape (num_instances, 4), the last
              dimension 4 arrange as (x1, y1, x2, y2).
        """
        cfg = self.test_cfg if cfg is None else cfg
        assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
        mlvl_bbox_preds = []
        mlvl_valid_anchors = []
        mlvl_scores = []
        for cls_score, bbox_pred, anchors, mask in zip(cls_scores, bbox_preds,
                                                       mlvl_anchors,
                                                       mlvl_masks):
            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
            # if no location is kept, end.
            if mask.sum() == 0:
                continue
            # reshape scores and bbox_pred
            cls_score = cls_score.permute(1, 2,
                                          0).reshape(-1, self.cls_out_channels)
            if self.use_sigmoid_cls:
                scores = cls_score.sigmoid()
            else:
                scores = cls_score.softmax(-1)
            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
            # filter scores, bbox_pred w.r.t. mask.
            # anchors are filtered in get_anchors() beforehand.
            scores = scores[mask, :]
            bbox_pred = bbox_pred[mask, :]
            if scores.dim() == 0:
                anchors = anchors.unsqueeze(0)
                scores = scores.unsqueeze(0)
                bbox_pred = bbox_pred.unsqueeze(0)
            # filter anchors, bbox_pred, scores w.r.t. scores
            nms_pre = cfg.get('nms_pre', -1)
            if nms_pre > 0 and scores.shape[0] > nms_pre:
                if self.use_sigmoid_cls:
                    max_scores, _ = scores.max(dim=1)
                else:
                    # remind that we set FG labels to [0, num_class-1]
                    # since mmdet v2.0
                    # BG cat_id: num_class
                    max_scores, _ = scores[:, :-1].max(dim=1)
                _, topk_inds = max_scores.topk(nms_pre)
                anchors = anchors[topk_inds, :]
                bbox_pred = bbox_pred[topk_inds, :]
                scores = scores[topk_inds, :]

            mlvl_bbox_preds.append(bbox_pred)
            mlvl_valid_anchors.append(anchors)
            mlvl_scores.append(scores)

        mlvl_bbox_preds = torch.cat(mlvl_bbox_preds)
        mlvl_anchors = torch.cat(mlvl_valid_anchors)
        mlvl_scores = torch.cat(mlvl_scores)
        mlvl_bboxes = self.bbox_coder.decode(
            mlvl_anchors, mlvl_bbox_preds, max_shape=img_meta['img_shape'])

        if rescale:
            assert img_meta.get('scale_factor') is not None
            mlvl_bboxes /= mlvl_bboxes.new_tensor(
                img_meta['scale_factor']).repeat((1, 2))

        if self.use_sigmoid_cls:
            # Add a dummy background class to the backend when using sigmoid
            # remind that we set FG labels to [0, num_class-1] since mmdet v2.0
            # BG cat_id: num_class
            padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
            mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
        # multi class NMS
        det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores,
                                                cfg.score_thr, cfg.nms,
                                                cfg.max_per_img)

        results = InstanceData()
        results.bboxes = det_bboxes[:, :-1]
        results.scores = det_bboxes[:, -1]
        results.labels = det_labels
        return results