sadimanna committed
Commit d6def08 · 1 Parent(s): 451fc17

Upload 20 files

backbone/base.py ADDED
@@ -0,0 +1,29 @@
+from typing import Tuple, Type
+
+from torch import nn
+
+
+class Base(object):
+
+    OPTIONS = ['resnet18', 'resnet50', 'resnet101']
+
+    @staticmethod
+    def from_name(name: str) -> Type['Base']:
+        if name == 'resnet18':
+            from backbone.resnet18 import ResNet18
+            return ResNet18
+        elif name == 'resnet50':
+            from backbone.resnet50 import ResNet50
+            return ResNet50
+        elif name == 'resnet101':
+            from backbone.resnet101 import ResNet101
+            return ResNet101
+        else:
+            raise ValueError
+
+    def __init__(self, pretrained: bool):
+        super().__init__()
+        self._pretrained = pretrained
+
+    def features(self) -> Tuple[nn.Module, nn.Module, int, int]:
+        raise NotImplementedError
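
Editor's note — a minimal usage sketch (not part of the commit) of how the backbone registry above is typically consumed. Only `Base.from_name`, the constructor and `features()` come from the file; the call site, variable names and the remark about how the detector uses the two halves are assumptions.

    from backbone.base import Base

    backbone_class = Base.from_name('resnet50')      # one of Base.OPTIONS
    backbone = backbone_class(pretrained=True)       # load ImageNet-pretrained weights
    features, hidden, num_features_out, num_hidden_out = backbone.features()
    # `features` is the truncated ResNet up to its penultimate stage (1024 channels for
    # ResNet-50/101, 256 for ResNet-18); `hidden` is the final residual stage, presumably
    # applied per pooled RoI by the detection head that consumes this API.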
backbone/resnet101.py ADDED
@@ -0,0 +1,37 @@
+from typing import Tuple
+
+import torchvision
+from torch import nn
+
+import backbone.base
+
+
+class ResNet101(backbone.base.Base):
+
+    def __init__(self, pretrained: bool):
+        super().__init__(pretrained)
+
+    def features(self) -> Tuple[nn.Module, nn.Module, int, int]:
+        resnet101 = torchvision.models.resnet101(pretrained=self._pretrained)
+
+        # list(resnet101.children()) consists of following modules
+        #   [0] = Conv2d, [1] = BatchNorm2d, [2] = ReLU,
+        #   [3] = MaxPool2d, [4] = Sequential(Bottleneck...),
+        #   [5] = Sequential(Bottleneck...),
+        #   [6] = Sequential(Bottleneck...),
+        #   [7] = Sequential(Bottleneck...),
+        #   [8] = AvgPool2d, [9] = Linear
+        children = list(resnet101.children())
+        features = children[:-3]
+        num_features_out = 1024
+
+        hidden = children[-3]
+        num_hidden_out = 2048
+
+        for parameters in [feature.parameters() for i, feature in enumerate(features) if i <= 4]:
+            for parameter in parameters:
+                parameter.requires_grad = False
+
+        features = nn.Sequential(*features)
+
+        return features, hidden, num_features_out, num_hidden_out
backbone/resnet18.py ADDED
@@ -0,0 +1,37 @@
+from typing import Tuple
+
+import torchvision
+from torch import nn
+
+import backbone.base
+
+
+class ResNet18(backbone.base.Base):
+
+    def __init__(self, pretrained: bool):
+        super().__init__(pretrained)
+
+    def features(self) -> Tuple[nn.Module, nn.Module, int, int]:
+        resnet18 = torchvision.models.resnet18(pretrained=self._pretrained)
+
+        # list(resnet18.children()) consists of following modules
+        #   [0] = Conv2d, [1] = BatchNorm2d, [2] = ReLU,
+        #   [3] = MaxPool2d, [4] = Sequential(BasicBlock...),
+        #   [5] = Sequential(BasicBlock...),
+        #   [6] = Sequential(BasicBlock...),
+        #   [7] = Sequential(BasicBlock...),
+        #   [8] = AvgPool2d, [9] = Linear
+        children = list(resnet18.children())
+        features = children[:-3]
+        num_features_out = 256
+
+        hidden = children[-3]
+        num_hidden_out = 512
+
+        for parameters in [feature.parameters() for i, feature in enumerate(features) if i <= 4]:
+            for parameter in parameters:
+                parameter.requires_grad = False
+
+        features = nn.Sequential(*features)
+
+        return features, hidden, num_features_out, num_hidden_out
backbone/resnet50.py ADDED
@@ -0,0 +1,37 @@
+from typing import Tuple
+
+import torchvision
+from torch import nn
+
+import backbone.base
+
+
+class ResNet50(backbone.base.Base):
+
+    def __init__(self, pretrained: bool):
+        super().__init__(pretrained)
+
+    def features(self) -> Tuple[nn.Module, nn.Module, int, int]:
+        resnet50 = torchvision.models.resnet50(pretrained=self._pretrained)
+
+        # list(resnet50.children()) consists of following modules
+        #   [0] = Conv2d, [1] = BatchNorm2d, [2] = ReLU,
+        #   [3] = MaxPool2d, [4] = Sequential(Bottleneck...),
+        #   [5] = Sequential(Bottleneck...),
+        #   [6] = Sequential(Bottleneck...),
+        #   [7] = Sequential(Bottleneck...),
+        #   [8] = AvgPool2d, [9] = Linear
+        children = list(resnet50.children())
+        features = children[:-3]
+        num_features_out = 1024
+
+        hidden = children[-3]
+        num_hidden_out = 2048
+
+        for parameters in [feature.parameters() for i, feature in enumerate(features) if i <= 4]:
+            for parameter in parameters:
+                parameter.requires_grad = False
+
+        features = nn.Sequential(*features)
+
+        return features, hidden, num_features_out, num_hidden_out
config/config.py ADDED
@@ -0,0 +1,37 @@
+import ast
+from typing import Tuple, List
+
+from roi.pooler import Pooler
+
+
+class Config(object):
+
+    IMAGE_MIN_SIDE: float = 600.0
+    IMAGE_MAX_SIDE: float = 1000.0
+
+    ANCHOR_RATIOS: List[Tuple[int, int]] = [(1, 2), (1, 1), (2, 1)]
+    ANCHOR_SIZES: List[int] = [128, 256, 512]
+    POOLER_MODE: Pooler.Mode = Pooler.Mode.POOLING
+
+    @classmethod
+    def describe(cls):
+        text = '\nConfig:\n'
+        attrs = [attr for attr in dir(cls) if not callable(getattr(cls, attr)) and not attr.startswith('__')]
+        text += '\n'.join(['\t{:s} = {:s}'.format(attr, str(getattr(cls, attr))) for attr in attrs]) + '\n'
+
+        return text
+
+    @classmethod
+    def setup(cls, image_min_side: float = None, image_max_side: float = None,
+              anchor_ratios: List[Tuple[int, int]] = None, anchor_sizes: List[int] = None, pooler_mode: str = None):
+        if image_min_side is not None:
+            cls.IMAGE_MIN_SIDE = image_min_side
+        if image_max_side is not None:
+            cls.IMAGE_MAX_SIDE = image_max_side
+
+        if anchor_ratios is not None:
+            cls.ANCHOR_RATIOS = ast.literal_eval(anchor_ratios)
+        if anchor_sizes is not None:
+            cls.ANCHOR_SIZES = ast.literal_eval(anchor_sizes)
+        if pooler_mode is not None:
+            cls.POOLER_MODE = Pooler.Mode(pooler_mode)
config/eval_config.py ADDED
@@ -0,0 +1,20 @@
+from typing import List, Tuple
+
+from config.config import Config
+
+
+class EvalConfig(Config):
+
+    RPN_PRE_NMS_TOP_N: int = 6000
+    RPN_POST_NMS_TOP_N: int = 300
+
+    @classmethod
+    def setup(cls, image_min_side: float = None, image_max_side: float = None,
+              anchor_ratios: List[Tuple[int, int]] = None, anchor_sizes: List[int] = None, pooler_mode: str = None,
+              rpn_pre_nms_top_n: int = None, rpn_post_nms_top_n: int = None):
+        super().setup(image_min_side, image_max_side, anchor_ratios, anchor_sizes, pooler_mode)
+
+        if rpn_pre_nms_top_n is not None:
+            cls.RPN_PRE_NMS_TOP_N = rpn_pre_nms_top_n
+        if rpn_post_nms_top_n is not None:
+            cls.RPN_POST_NMS_TOP_N = rpn_post_nms_top_n
config/train_config.py ADDED
@@ -0,0 +1,71 @@
+import ast
+from typing import List, Tuple
+
+from config.config import Config
+
+
+class TrainConfig(Config):
+
+    RPN_PRE_NMS_TOP_N: int = 12000
+    RPN_POST_NMS_TOP_N: int = 2000
+
+    ANCHOR_SMOOTH_L1_LOSS_BETA: float = 1.0
+    PROPOSAL_SMOOTH_L1_LOSS_BETA: float = 1.0
+
+    BATCH_SIZE: int = 1
+    LEARNING_RATE: float = 0.001
+    MOMENTUM: float = 0.9
+    WEIGHT_DECAY: float = 0.0005
+    STEP_LR_SIZES: List[int] = [50000, 70000]
+    STEP_LR_GAMMA: float = 0.1
+    WARM_UP_FACTOR: float = 0.3333
+    WARM_UP_NUM_ITERS: int = 500
+
+    NUM_STEPS_TO_DISPLAY: int = 20
+    NUM_STEPS_TO_SNAPSHOT: int = 10000
+    NUM_STEPS_TO_FINISH: int = 90000
+
+    @classmethod
+    def setup(cls, image_min_side: float = None, image_max_side: float = None,
+              anchor_ratios: List[Tuple[int, int]] = None, anchor_sizes: List[int] = None, pooler_mode: str = None,
+              rpn_pre_nms_top_n: int = None, rpn_post_nms_top_n: int = None,
+              anchor_smooth_l1_loss_beta: float = None, proposal_smooth_l1_loss_beta: float = None,
+              batch_size: int = None, learning_rate: float = None, momentum: float = None, weight_decay: float = None,
+              step_lr_sizes: List[int] = None, step_lr_gamma: float = None,
+              warm_up_factor: float = None, warm_up_num_iters: int = None,
+              num_steps_to_display: int = None, num_steps_to_snapshot: int = None, num_steps_to_finish: int = None):
+        super().setup(image_min_side, image_max_side, anchor_ratios, anchor_sizes, pooler_mode)
+
+        if rpn_pre_nms_top_n is not None:
+            cls.RPN_PRE_NMS_TOP_N = rpn_pre_nms_top_n
+        if rpn_post_nms_top_n is not None:
+            cls.RPN_POST_NMS_TOP_N = rpn_post_nms_top_n
+
+        if anchor_smooth_l1_loss_beta is not None:
+            cls.ANCHOR_SMOOTH_L1_LOSS_BETA = anchor_smooth_l1_loss_beta
+        if proposal_smooth_l1_loss_beta is not None:
+            cls.PROPOSAL_SMOOTH_L1_LOSS_BETA = proposal_smooth_l1_loss_beta
+
+        if batch_size is not None:
+            cls.BATCH_SIZE = batch_size
+        if learning_rate is not None:
+            cls.LEARNING_RATE = learning_rate
+        if momentum is not None:
+            cls.MOMENTUM = momentum
+        if weight_decay is not None:
+            cls.WEIGHT_DECAY = weight_decay
+        if step_lr_sizes is not None:
+            cls.STEP_LR_SIZES = ast.literal_eval(step_lr_sizes)
+        if step_lr_gamma is not None:
+            cls.STEP_LR_GAMMA = step_lr_gamma
+        if warm_up_factor is not None:
+            cls.WARM_UP_FACTOR = warm_up_factor
+        if warm_up_num_iters is not None:
+            cls.WARM_UP_NUM_ITERS = warm_up_num_iters
+
+        if num_steps_to_display is not None:
+            cls.NUM_STEPS_TO_DISPLAY = num_steps_to_display
+        if num_steps_to_snapshot is not None:
+            cls.NUM_STEPS_TO_SNAPSHOT = num_steps_to_snapshot
+        if num_steps_to_finish is not None:
+            cls.NUM_STEPS_TO_FINISH = num_steps_to_finish
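
Editor's note — a small sketch (not part of the commit) of how `TrainConfig.setup` appears intended to be called, e.g. from a command-line entry point. The list-valued options are expected as strings and parsed with `ast.literal_eval`; the concrete values below are illustrative assumptions only.

    from config.train_config import TrainConfig

    TrainConfig.setup(image_min_side=600.0, image_max_side=1000.0,
                      anchor_sizes='[64, 128, 256, 512]',    # passed as a string, not a list
                      step_lr_sizes='[60000, 80000]',        # passed as a string, not a list
                      learning_rate=0.0025, batch_size=2)
    print(TrainConfig.describe())                            # dump the effective configuration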
dataset/base.py ADDED
@@ -0,0 +1,155 @@
+import random
+from enum import Enum
+from typing import Tuple, List, Type, Iterator
+
+import PIL
+import torch.utils.data.dataset
+import torch.utils.data.sampler
+from PIL import Image
+from torch import Tensor
+from torch.nn import functional as F
+from torchvision.transforms import transforms
+
+
+class Base(torch.utils.data.dataset.Dataset):
+
+    class Mode(Enum):
+        TRAIN = 'train'
+        EVAL = 'eval'
+
+    OPTIONS = ['voc2007', 'coco2017', 'voc2007-cat-dog', 'coco2017-person', 'coco2017-car', 'coco2017-animal']
+
+    @staticmethod
+    def from_name(name: str) -> Type['Base']:
+        if name == 'voc2007':
+            from dataset.voc2007 import VOC2007
+            return VOC2007
+        elif name == 'coco2017':
+            from dataset.coco2017 import COCO2017
+            return COCO2017
+        elif name == 'voc2007-cat-dog':
+            from dataset.voc2007_cat_dog import VOC2007CatDog
+            return VOC2007CatDog
+        elif name == 'coco2017-person':
+            from dataset.coco2017_person import COCO2017Person
+            return COCO2017Person
+        elif name == 'coco2017-car':
+            from dataset.coco2017_car import COCO2017Car
+            return COCO2017Car
+        elif name == 'coco2017-animal':
+            from dataset.coco2017_animal import COCO2017Animal
+            return COCO2017Animal
+        else:
+            raise ValueError
+
+    def __init__(self, path_to_data_dir: str, mode: Mode, image_min_side: float, image_max_side: float):
+        self._path_to_data_dir = path_to_data_dir
+        self._mode = mode
+        self._image_min_side = image_min_side
+        self._image_max_side = image_max_side
+
+    def __len__(self) -> int:
+        raise NotImplementedError
+
+    def __getitem__(self, index: int) -> Tuple[str, Tensor, Tensor, Tensor, Tensor]:
+        raise NotImplementedError
+
+    def evaluate(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]) -> Tuple[float, str]:
+        raise NotImplementedError
+
+    def _write_results(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]):
+        raise NotImplementedError
+
+    @property
+    def image_ratios(self) -> List[float]:
+        raise NotImplementedError
+
+    @staticmethod
+    def num_classes() -> int:
+        raise NotImplementedError
+
+    @staticmethod
+    def preprocess(image: PIL.Image.Image, image_min_side: float, image_max_side: float) -> Tuple[Tensor, float]:
+        # resize according to the rules:
+        #   1. scale shorter side to IMAGE_MIN_SIDE
+        #   2. after scaling, if longer side > IMAGE_MAX_SIDE, scale longer side to IMAGE_MAX_SIDE
+        scale_for_shorter_side = image_min_side / min(image.width, image.height)
+        longer_side_after_scaling = max(image.width, image.height) * scale_for_shorter_side
+        scale_for_longer_side = (image_max_side / longer_side_after_scaling) if longer_side_after_scaling > image_max_side else 1
+        scale = scale_for_shorter_side * scale_for_longer_side
+
+        transform = transforms.Compose([
+            transforms.Resize((round(image.height * scale), round(image.width * scale))),  # interpolation `BILINEAR` is applied by default
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        ])
+        image = transform(image)
+
+        return image, scale
+
+    @staticmethod
+    def padding_collate_fn(batch: List[Tuple[str, Tensor, Tensor, Tensor, Tensor]]) -> Tuple[List[str], Tensor, Tensor, Tensor, Tensor]:
+        image_id_batch, image_batch, scale_batch, bboxes_batch, labels_batch = zip(*batch)
+
+        max_image_width = max([it.shape[2] for it in image_batch])
+        max_image_height = max([it.shape[1] for it in image_batch])
+        max_bboxes_length = max([len(it) for it in bboxes_batch])
+        max_labels_length = max([len(it) for it in labels_batch])
+
+        padded_image_batch = []
+        padded_bboxes_batch = []
+        padded_labels_batch = []
+
+        for image in image_batch:
+            padded_image = F.pad(input=image, pad=(0, max_image_width - image.shape[2], 0, max_image_height - image.shape[1]))  # pad has format (left, right, top, bottom)
+            padded_image_batch.append(padded_image)
+
+        for bboxes in bboxes_batch:
+            padded_bboxes = torch.cat([bboxes, torch.zeros(max_bboxes_length - len(bboxes), 4).to(bboxes)])
+            padded_bboxes_batch.append(padded_bboxes)
+
+        for labels in labels_batch:
+            padded_labels = torch.cat([labels, torch.zeros(max_labels_length - len(labels)).to(labels)])
+            padded_labels_batch.append(padded_labels)
+
+        image_id_batch = list(image_id_batch)
+        padded_image_batch = torch.stack(padded_image_batch, dim=0)
+        scale_batch = torch.stack(scale_batch, dim=0)
+        padded_bboxes_batch = torch.stack(padded_bboxes_batch, dim=0)
+        padded_labels_batch = torch.stack(padded_labels_batch, dim=0)
+
+        return image_id_batch, padded_image_batch, scale_batch, padded_bboxes_batch, padded_labels_batch
+
+    class NearestRatioRandomSampler(torch.utils.data.sampler.Sampler):
+
+        def __init__(self, image_ratios: List[float], num_neighbors: int):
+            super().__init__(data_source=None)
+            self._image_ratios = image_ratios
+            self._num_neighbors = num_neighbors
+
+        def __len__(self) -> int:
+            return len(self._image_ratios)
+
+        def __iter__(self) -> Iterator[int]:
+            image_ratios = torch.tensor(self._image_ratios)
+            tall_indices = (image_ratios < 1).nonzero().view(-1)
+            fat_indices = (image_ratios >= 1).nonzero().view(-1)
+
+            tall_indices_length = len(tall_indices)
+            fat_indices_length = len(fat_indices)
+
+            tall_indices = tall_indices[torch.randperm(tall_indices_length)]
+            fat_indices = fat_indices[torch.randperm(fat_indices_length)]
+
+            num_tall_remainder = tall_indices_length % self._num_neighbors
+            num_fat_remainder = fat_indices_length % self._num_neighbors
+
+            tall_indices = tall_indices[:tall_indices_length - num_tall_remainder]
+            fat_indices = fat_indices[:fat_indices_length - num_fat_remainder]
+
+            tall_indices = tall_indices.view(-1, self._num_neighbors)
+            fat_indices = fat_indices.view(-1, self._num_neighbors)
+            merge_indices = torch.cat([tall_indices, fat_indices], dim=0)
+            merge_indices = merge_indices[torch.randperm(len(merge_indices))].view(-1)
+
+            return iter(merge_indices.tolist())
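
Editor's note — a minimal sketch (not part of the commit) wiring a concrete dataset into a `DataLoader` with the aspect-ratio-aware sampler and the zero-padding collate function above. The data directory and batch size are assumptions, and the sampler is referenced as a nested class of `Base`, matching the layout shown in this file.

    from torch.utils.data import DataLoader
    from dataset.base import Base as DatasetBase

    dataset_class = DatasetBase.from_name('voc2007')
    dataset = dataset_class('data', DatasetBase.Mode.TRAIN, image_min_side=600.0, image_max_side=1000.0)
    sampler = DatasetBase.NearestRatioRandomSampler(dataset.image_ratios, num_neighbors=2)
    loader = DataLoader(dataset, batch_size=2, sampler=sampler,
                        collate_fn=DatasetBase.padding_collate_fn)
    for image_ids, images, scales, bboxes, labels in loader:
        # images: (B, 3, H, W), zero-padded to the largest height/width in the batch;
        # bboxes/labels are zero-padded along the object dimension in the same way.
        break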
dataset/coco2017.py ADDED
@@ -0,0 +1,212 @@
+import json
+import os
+import pickle
+import random
+from typing import List, Tuple, Dict
+
+import torch
+import torch.utils.data.dataset
+from PIL import Image, ImageOps
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from torch import Tensor
+from torchvision.datasets import CocoDetection
+from tqdm import tqdm
+
+from bbox import BBox
+from dataset.base import Base
+from io import StringIO
+import sys
+
+
+class COCO2017(Base):
+
+    class Annotation(object):
+        class Object(object):
+            def __init__(self, bbox: BBox, label: int):
+                super().__init__()
+                self.bbox = bbox
+                self.label = label
+
+            def __repr__(self) -> str:
+                return 'Object[label={:d}, bbox={!s}]'.format(
+                    self.label, self.bbox)
+
+        def __init__(self, filename: str, objects: List[Object]):
+            super().__init__()
+            self.filename = filename
+            self.objects = objects
+
+    CATEGORY_TO_LABEL_DICT = {
+        'background': 0, 'person': 1, 'bicycle': 2, 'car': 3, 'motorcycle': 4,
+        'airplane': 5, 'bus': 6, 'train': 7, 'truck': 8, 'boat': 9,
+        'traffic light': 10, 'fire hydrant': 11, 'street sign': 12, 'stop sign': 13, 'parking meter': 14,
+        'bench': 15, 'bird': 16, 'cat': 17, 'dog': 18, 'horse': 19,
+        'sheep': 20, 'cow': 21, 'elephant': 22, 'bear': 23, 'zebra': 24,
+        'giraffe': 25, 'hat': 26, 'backpack': 27, 'umbrella': 28, 'shoe': 29,
+        'eye glasses': 30, 'handbag': 31, 'tie': 32, 'suitcase': 33, 'frisbee': 34,
+        'skis': 35, 'snowboard': 36, 'sports ball': 37, 'kite': 38, 'baseball bat': 39,
+        'baseball glove': 40, 'skateboard': 41, 'surfboard': 42, 'tennis racket': 43, 'bottle': 44,
+        'plate': 45, 'wine glass': 46, 'cup': 47, 'fork': 48, 'knife': 49,
+        'spoon': 50, 'bowl': 51, 'banana': 52, 'apple': 53, 'sandwich': 54,
+        'orange': 55, 'broccoli': 56, 'carrot': 57, 'hot dog': 58, 'pizza': 59,
+        'donut': 60, 'cake': 61, 'chair': 62, 'couch': 63, 'potted plant': 64,
+        'bed': 65, 'mirror': 66, 'dining table': 67, 'window': 68, 'desk': 69,
+        'toilet': 70, 'door': 71, 'tv': 72, 'laptop': 73, 'mouse': 74,
+        'remote': 75, 'keyboard': 76, 'cell phone': 77, 'microwave': 78, 'oven': 79,
+        'toaster': 80, 'sink': 81, 'refrigerator': 82, 'blender': 83, 'book': 84,
+        'clock': 85, 'vase': 86, 'scissors': 87, 'teddy bear': 88, 'hair drier': 89,
+        'toothbrush': 90, 'hair brush': 91
+    }
+
+    LABEL_TO_CATEGORY_DICT = {v: k for k, v in CATEGORY_TO_LABEL_DICT.items()}
+
+    def __init__(self, path_to_data_dir: str, mode: Base.Mode, image_min_side: float, image_max_side: float):
+        super().__init__(path_to_data_dir, mode, image_min_side, image_max_side)
+
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_caches_dir = os.path.join('caches', 'coco2017', f'{self._mode.value}')
+        path_to_image_ids_pickle = os.path.join(path_to_caches_dir, 'image-ids.pkl')
+        path_to_image_id_dict_pickle = os.path.join(path_to_caches_dir, 'image-id-dict.pkl')
+        path_to_image_ratios_pickle = os.path.join(path_to_caches_dir, 'image-ratios.pkl')
+
+        if self._mode == COCO2017.Mode.TRAIN:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'train2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_train2017.json')
+        elif self._mode == COCO2017.Mode.EVAL:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'val2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+        else:
+            raise ValueError('invalid mode')
+
+        coco_dataset = CocoDetection(root=path_to_jpeg_images_dir, annFile=path_to_annotation)
+
+        if os.path.exists(path_to_image_ids_pickle) and os.path.exists(path_to_image_id_dict_pickle):
+            print('loading cache files...')
+
+            with open(path_to_image_ids_pickle, 'rb') as f:
+                self._image_ids = pickle.load(f)
+
+            with open(path_to_image_id_dict_pickle, 'rb') as f:
+                self._image_id_to_annotation_dict = pickle.load(f)
+
+            with open(path_to_image_ratios_pickle, 'rb') as f:
+                self._image_ratios = pickle.load(f)
+        else:
+            print('generating cache files...')
+
+            os.makedirs(path_to_caches_dir, exist_ok=True)
+
+            self._image_ids: List[str] = []
+            self._image_id_to_annotation_dict: Dict[str, COCO2017.Annotation] = {}
+            self._image_ratios = []
+
+            for idx, (image, annotation) in enumerate(tqdm(coco_dataset)):
+                if len(annotation) > 0:
+                    image_id = str(annotation[0]['image_id'])  # all image_id in annotation are the same
+                    self._image_ids.append(image_id)
+                    self._image_id_to_annotation_dict[image_id] = COCO2017.Annotation(
+                        filename=os.path.join(path_to_jpeg_images_dir, '{:012d}.jpg'.format(int(image_id))),
+                        objects=[COCO2017.Annotation.Object(
+                            bbox=BBox(  # `ann['bbox']` is in the format [left, top, width, height]
+                                left=ann['bbox'][0],
+                                top=ann['bbox'][1],
+                                right=ann['bbox'][0] + ann['bbox'][2],
+                                bottom=ann['bbox'][1] + ann['bbox'][3]
+                            ),
+                            label=ann['category_id'])
+                            for ann in annotation]
+                    )
+
+                    ratio = float(image.width / image.height)
+                    self._image_ratios.append(ratio)
+
+            with open(path_to_image_ids_pickle, 'wb') as f:
+                pickle.dump(self._image_ids, f)
+
+            with open(path_to_image_id_dict_pickle, 'wb') as f:
+                pickle.dump(self._image_id_to_annotation_dict, f)
+
+            with open(path_to_image_ratios_pickle, 'wb') as f:
+                pickle.dump(self.image_ratios, f)
+
+    def __len__(self) -> int:
+        return len(self._image_id_to_annotation_dict)
+
+    def __getitem__(self, index: int) -> Tuple[str, Tensor, Tensor, Tensor, Tensor]:
+        image_id = self._image_ids[index]
+        annotation = self._image_id_to_annotation_dict[image_id]
+
+        bboxes = [obj.bbox.tolist() for obj in annotation.objects]
+        labels = [obj.label for obj in annotation.objects]
+
+        bboxes = torch.tensor(bboxes, dtype=torch.float)
+        labels = torch.tensor(labels, dtype=torch.long)
+
+        image = Image.open(annotation.filename).convert('RGB')  # for some grayscale images
+
+        # random flip on only training mode
+        if self._mode == COCO2017.Mode.TRAIN and random.random() > 0.5:
+            image = ImageOps.mirror(image)
+            bboxes[:, [0, 2]] = image.width - bboxes[:, [2, 0]]  # index 0 and 2 represent `left` and `right` respectively
+
+        image, scale = COCO2017.preprocess(image, self._image_min_side, self._image_max_side)
+        scale = torch.tensor(scale, dtype=torch.float)
+        bboxes *= scale
+
+        return image_id, image, scale, bboxes, labels
+
+    def evaluate(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]) -> Tuple[float, str]:
+        self._write_results(path_to_results_dir, image_ids, bboxes, classes, probs)
+
+        annType = 'bbox'
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+
+        cocoGt = COCO(path_to_annotation)
+        cocoDt = cocoGt.loadRes(os.path.join(path_to_results_dir, 'results.json'))
+
+        cocoEval = COCOeval(cocoGt, cocoDt, annType)
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+
+        original_stdout = sys.stdout
+        string_stdout = StringIO()
+        sys.stdout = string_stdout
+        cocoEval.summarize()
+        sys.stdout = original_stdout
+
+        mean_ap = cocoEval.stats[0].item()  # stats[0] records AP@[0.5:0.95]
+        detail = string_stdout.getvalue()
+
+        return mean_ap, detail
+
+    def _write_results(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]):
+        results = []
+        for image_id, bbox, cls, prob in zip(image_ids, bboxes, classes, probs):
+            results.append(
+                {
+                    'image_id': int(image_id),  # COCO evaluation requires `image_id` to be type `int`
+                    'category_id': cls,
+                    'bbox': [  # format [left, top, width, height] is expected
+                        bbox[0],
+                        bbox[1],
+                        bbox[2] - bbox[0],
+                        bbox[3] - bbox[1]
+                    ],
+                    'score': prob
+                }
+            )
+
+        with open(os.path.join(path_to_results_dir, 'results.json'), 'w') as f:
+            json.dump(results, f)
+
+    @property
+    def image_ratios(self) -> List[float]:
+        return self._image_ratios
+
+    @staticmethod
+    def num_classes() -> int:
+        return 92
dataset/coco2017_animal.py ADDED
@@ -0,0 +1,205 @@
+import json
+import os
+import pickle
+import random
+import sys
+from io import StringIO
+from typing import List, Tuple, Dict
+
+import torch
+import torch.utils.data.dataset
+from PIL import Image, ImageOps
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from torch import Tensor
+from torchvision.datasets import CocoDetection
+from tqdm import tqdm
+
+from bbox import BBox
+from dataset.base import Base
+from dataset.coco2017 import COCO2017
+
+
+class COCO2017Animal(Base):
+
+    class Annotation(object):
+        class Object(object):
+            def __init__(self, bbox: BBox, label: int):
+                super().__init__()
+                self.bbox = bbox
+                self.label = label
+
+            def __repr__(self) -> str:
+                return 'Object[label={:d}, bbox={!s}]'.format(
+                    self.label, self.bbox)
+
+        def __init__(self, filename: str, objects: List[Object]):
+            super().__init__()
+            self.filename = filename
+            self.objects = objects
+
+    CATEGORY_TO_LABEL_DICT = {
+        'background': 0,
+        'bird': 1, 'cat': 2, 'dog': 3, 'horse': 4, 'sheep': 5,
+        'cow': 6, 'elephant': 7, 'bear': 8, 'zebra': 9, 'giraffe': 10
+    }
+
+    LABEL_TO_CATEGORY_DICT = {v: k for k, v in CATEGORY_TO_LABEL_DICT.items()}
+
+    def __init__(self, path_to_data_dir: str, mode: Base.Mode, image_min_side: float, image_max_side: float):
+        super().__init__(path_to_data_dir, mode, image_min_side, image_max_side)
+
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_caches_dir = os.path.join('caches', 'coco2017-animal', f'{self._mode.value}')
+        path_to_image_ids_pickle = os.path.join(path_to_caches_dir, 'image-ids.pkl')
+        path_to_image_id_dict_pickle = os.path.join(path_to_caches_dir, 'image-id-dict.pkl')
+        path_to_image_ratios_pickle = os.path.join(path_to_caches_dir, 'image-ratios.pkl')
+
+        if self._mode == COCO2017Animal.Mode.TRAIN:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'train2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_train2017.json')
+        elif self._mode == COCO2017Animal.Mode.EVAL:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'val2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+        else:
+            raise ValueError('invalid mode')
+
+        coco_dataset = CocoDetection(root=path_to_jpeg_images_dir, annFile=path_to_annotation)
+
+        if os.path.exists(path_to_image_ids_pickle) and os.path.exists(path_to_image_id_dict_pickle):
+            print('loading cache files...')
+
+            with open(path_to_image_ids_pickle, 'rb') as f:
+                self._image_ids = pickle.load(f)
+
+            with open(path_to_image_id_dict_pickle, 'rb') as f:
+                self._image_id_to_annotation_dict = pickle.load(f)
+
+            with open(path_to_image_ratios_pickle, 'rb') as f:
+                self._image_ratios = pickle.load(f)
+        else:
+            print('generating cache files...')
+
+            os.makedirs(path_to_caches_dir, exist_ok=True)
+
+            self._image_id_to_annotation_dict: Dict[str, COCO2017Animal.Annotation] = {}
+            self._image_ratios = []
+
+            for idx, (image, annotation) in enumerate(tqdm(coco_dataset)):
+                if len(annotation) > 0:
+                    image_id = str(annotation[0]['image_id'])  # all image_id in annotation are the same
+                    annotation = COCO2017Animal.Annotation(
+                        filename=os.path.join(path_to_jpeg_images_dir, '{:012d}.jpg'.format(int(image_id))),
+                        objects=[COCO2017Animal.Annotation.Object(
+                            bbox=BBox(  # `ann['bbox']` is in the format [left, top, width, height]
+                                left=ann['bbox'][0],
+                                top=ann['bbox'][1],
+                                right=ann['bbox'][0] + ann['bbox'][2],
+                                bottom=ann['bbox'][1] + ann['bbox'][3]
+                            ),
+                            label=ann['category_id'])
+                            for ann in annotation]
+                    )
+                    annotation.objects = [obj for obj in annotation.objects
+                                          if obj.label in [COCO2017.CATEGORY_TO_LABEL_DICT[category]  # filtering label should refer to original `COCO2017` dataset
+                                                           for category in COCO2017Animal.CATEGORY_TO_LABEL_DICT.keys()][1:]]
+
+                    if len(annotation.objects) > 0:
+                        self._image_id_to_annotation_dict[image_id] = annotation
+
+                        ratio = float(image.width / image.height)
+                        self._image_ratios.append(ratio)
+
+            self._image_ids = list(self._image_id_to_annotation_dict.keys())
+
+            with open(path_to_image_ids_pickle, 'wb') as f:
+                pickle.dump(self._image_ids, f)
+
+            with open(path_to_image_id_dict_pickle, 'wb') as f:
+                pickle.dump(self._image_id_to_annotation_dict, f)
+
+            with open(path_to_image_ratios_pickle, 'wb') as f:
+                pickle.dump(self.image_ratios, f)
+
+    def __len__(self) -> int:
+        return len(self._image_id_to_annotation_dict)
+
+    def __getitem__(self, index: int) -> Tuple[str, Tensor, Tensor, Tensor, Tensor]:
+        image_id = self._image_ids[index]
+        annotation = self._image_id_to_annotation_dict[image_id]
+
+        bboxes = [obj.bbox.tolist() for obj in annotation.objects]
+        labels = [COCO2017Animal.CATEGORY_TO_LABEL_DICT[COCO2017.LABEL_TO_CATEGORY_DICT[obj.label]] for obj in annotation.objects]  # mapping from original `COCO2017` dataset
+
+        bboxes = torch.tensor(bboxes, dtype=torch.float)
+        labels = torch.tensor(labels, dtype=torch.long)
+
+        image = Image.open(annotation.filename).convert('RGB')  # for some grayscale images
+
+        # random flip on only training mode
+        if self._mode == COCO2017Animal.Mode.TRAIN and random.random() > 0.5:
+            image = ImageOps.mirror(image)
+            bboxes[:, [0, 2]] = image.width - bboxes[:, [2, 0]]  # index 0 and 2 represent `left` and `right` respectively
+
+        image, scale = COCO2017Animal.preprocess(image, self._image_min_side, self._image_max_side)
+        scale = torch.tensor(scale, dtype=torch.float)
+        bboxes *= scale
+
+        return image_id, image, scale, bboxes, labels
+
+    def evaluate(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]) -> Tuple[float, str]:
+        self._write_results(path_to_results_dir, image_ids, bboxes, classes, probs)
+
+        annType = 'bbox'
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+
+        cocoGt = COCO(path_to_annotation)
+        cocoDt = cocoGt.loadRes(os.path.join(path_to_results_dir, 'results.json'))
+
+        cocoEval = COCOeval(cocoGt, cocoDt, annType)
+        cocoEval.params.catIds = [COCO2017.CATEGORY_TO_LABEL_DICT[category]  # filtering label should refer to original `COCO2017` dataset
+                                  for category in COCO2017Animal.CATEGORY_TO_LABEL_DICT.keys()]
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+
+        original_stdout = sys.stdout
+        string_stdout = StringIO()
+        sys.stdout = string_stdout
+        cocoEval.summarize()
+        sys.stdout = original_stdout
+
+        mean_ap = cocoEval.stats[0].item()  # stats[0] records AP@[0.5:0.95]
+        detail = string_stdout.getvalue()
+
+        return mean_ap, detail
+
+    def _write_results(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]):
+        results = []
+        for image_id, bbox, cls, prob in zip(image_ids, bboxes, classes, probs):
+            results.append(
+                {
+                    'image_id': int(image_id),  # COCO evaluation requires `image_id` to be type `int`
+                    'category_id': COCO2017.CATEGORY_TO_LABEL_DICT[COCO2017Animal.LABEL_TO_CATEGORY_DICT[cls]],  # mapping to original `COCO2017` dataset
+                    'bbox': [  # format [left, top, width, height] is expected
+                        bbox[0],
+                        bbox[1],
+                        bbox[2] - bbox[0],
+                        bbox[3] - bbox[1]
+                    ],
+                    'score': prob
+                }
+            )
+
+        with open(os.path.join(path_to_results_dir, 'results.json'), 'w') as f:
+            json.dump(results, f)
+
+    @property
+    def image_ratios(self) -> List[float]:
+        return self._image_ratios
+
+    @staticmethod
+    def num_classes() -> int:
+        return 11
dataset/coco2017_car.py ADDED
@@ -0,0 +1,201 @@
+import json
+import os
+import pickle
+import random
+import sys
+from io import StringIO
+from typing import List, Tuple, Dict
+
+import torch
+import torch.utils.data.dataset
+from PIL import Image, ImageOps
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from torch import Tensor
+from torchvision.datasets import CocoDetection
+from tqdm import tqdm
+
+from bbox import BBox
+from dataset.base import Base
+from dataset.coco2017 import COCO2017
+
+
+class COCO2017Car(Base):
+
+    class Annotation(object):
+        class Object(object):
+            def __init__(self, bbox: BBox, label: int):
+                super().__init__()
+                self.bbox = bbox
+                self.label = label
+
+            def __repr__(self) -> str:
+                return 'Object[label={:d}, bbox={!s}]'.format(
+                    self.label, self.bbox)
+
+        def __init__(self, filename: str, objects: List[Object]):
+            super().__init__()
+            self.filename = filename
+            self.objects = objects
+
+    CATEGORY_TO_LABEL_DICT = {
+        'background': 0, 'car': 1
+    }
+
+    LABEL_TO_CATEGORY_DICT = {v: k for k, v in CATEGORY_TO_LABEL_DICT.items()}
+
+    def __init__(self, path_to_data_dir: str, mode: Base.Mode, image_min_side: float, image_max_side: float):
+        super().__init__(path_to_data_dir, mode, image_min_side, image_max_side)
+
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_caches_dir = os.path.join('caches', 'coco2017-car', f'{self._mode.value}')
+        path_to_image_ids_pickle = os.path.join(path_to_caches_dir, 'image-ids.pkl')
+        path_to_image_id_dict_pickle = os.path.join(path_to_caches_dir, 'image-id-dict.pkl')
+        path_to_image_ratios_pickle = os.path.join(path_to_caches_dir, 'image-ratios.pkl')
+
+        if self._mode == COCO2017Car.Mode.TRAIN:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'train2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_train2017.json')
+        elif self._mode == COCO2017Car.Mode.EVAL:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'val2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+        else:
+            raise ValueError('invalid mode')
+
+        coco_dataset = CocoDetection(root=path_to_jpeg_images_dir, annFile=path_to_annotation)
+
+        if os.path.exists(path_to_image_ids_pickle) and os.path.exists(path_to_image_id_dict_pickle):
+            print('loading cache files...')
+
+            with open(path_to_image_ids_pickle, 'rb') as f:
+                self._image_ids = pickle.load(f)
+
+            with open(path_to_image_id_dict_pickle, 'rb') as f:
+                self._image_id_to_annotation_dict = pickle.load(f)
+
+            with open(path_to_image_ratios_pickle, 'rb') as f:
+                self._image_ratios = pickle.load(f)
+        else:
+            print('generating cache files...')
+
+            os.makedirs(path_to_caches_dir, exist_ok=True)
+
+            self._image_id_to_annotation_dict: Dict[str, COCO2017Car.Annotation] = {}
+            self._image_ratios = []
+
+            for idx, (image, annotation) in enumerate(tqdm(coco_dataset)):
+                if len(annotation) > 0:
+                    image_id = str(annotation[0]['image_id'])  # all image_id in annotation are the same
+                    annotation = COCO2017Car.Annotation(
+                        filename=os.path.join(path_to_jpeg_images_dir, '{:012d}.jpg'.format(int(image_id))),
+                        objects=[COCO2017Car.Annotation.Object(
+                            bbox=BBox(  # `ann['bbox']` is in the format [left, top, width, height]
+                                left=ann['bbox'][0],
+                                top=ann['bbox'][1],
+                                right=ann['bbox'][0] + ann['bbox'][2],
+                                bottom=ann['bbox'][1] + ann['bbox'][3]
+                            ),
+                            label=ann['category_id'])
+                            for ann in annotation]
+                    )
+                    annotation.objects = [obj for obj in annotation.objects
+                                          if obj.label in [COCO2017.CATEGORY_TO_LABEL_DICT['car']]]  # filtering label should refer to original `COCO2017` dataset
+
+                    if len(annotation.objects) > 0:
+                        self._image_id_to_annotation_dict[image_id] = annotation
+
+                        ratio = float(image.width / image.height)
+                        self._image_ratios.append(ratio)
+
+            self._image_ids = list(self._image_id_to_annotation_dict.keys())
+
+            with open(path_to_image_ids_pickle, 'wb') as f:
+                pickle.dump(self._image_ids, f)
+
+            with open(path_to_image_id_dict_pickle, 'wb') as f:
+                pickle.dump(self._image_id_to_annotation_dict, f)
+
+            with open(path_to_image_ratios_pickle, 'wb') as f:
+                pickle.dump(self.image_ratios, f)
+
+    def __len__(self) -> int:
+        return len(self._image_id_to_annotation_dict)
+
+    def __getitem__(self, index: int) -> Tuple[str, Tensor, Tensor, Tensor, Tensor]:
+        image_id = self._image_ids[index]
+        annotation = self._image_id_to_annotation_dict[image_id]
+
+        bboxes = [obj.bbox.tolist() for obj in annotation.objects]
+        labels = [COCO2017Car.CATEGORY_TO_LABEL_DICT[COCO2017.LABEL_TO_CATEGORY_DICT[obj.label]] for obj in annotation.objects]  # mapping from original `COCO2017` dataset
+
+        bboxes = torch.tensor(bboxes, dtype=torch.float)
+        labels = torch.tensor(labels, dtype=torch.long)
+
+        image = Image.open(annotation.filename).convert('RGB')  # for some grayscale images
+
+        # random flip on only training mode
+        if self._mode == COCO2017Car.Mode.TRAIN and random.random() > 0.5:
+            image = ImageOps.mirror(image)
+            bboxes[:, [0, 2]] = image.width - bboxes[:, [2, 0]]  # index 0 and 2 represent `left` and `right` respectively
+
+        image, scale = COCO2017Car.preprocess(image, self._image_min_side, self._image_max_side)
+        scale = torch.tensor(scale, dtype=torch.float)
+        bboxes *= scale
+
+        return image_id, image, scale, bboxes, labels
+
+    def evaluate(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]) -> Tuple[float, str]:
+        self._write_results(path_to_results_dir, image_ids, bboxes, classes, probs)
+
+        annType = 'bbox'
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+
+        cocoGt = COCO(path_to_annotation)
+        cocoDt = cocoGt.loadRes(os.path.join(path_to_results_dir, 'results.json'))
+
+        cocoEval = COCOeval(cocoGt, cocoDt, annType)
+        cocoEval.params.catIds = COCO2017.CATEGORY_TO_LABEL_DICT['car']  # filtering label should refer to original `COCO2017` dataset
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+
+        original_stdout = sys.stdout
+        string_stdout = StringIO()
+        sys.stdout = string_stdout
+        cocoEval.summarize()
+        sys.stdout = original_stdout
+
+        mean_ap = cocoEval.stats[0].item()  # stats[0] records AP@[0.5:0.95]
+        detail = string_stdout.getvalue()
+
+        return mean_ap, detail
+
+    def _write_results(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]):
+        results = []
+        for image_id, bbox, cls, prob in zip(image_ids, bboxes, classes, probs):
+            results.append(
+                {
+                    'image_id': int(image_id),  # COCO evaluation requires `image_id` to be type `int`
+                    'category_id': COCO2017.CATEGORY_TO_LABEL_DICT[COCO2017Car.LABEL_TO_CATEGORY_DICT[cls]],  # mapping to original `COCO2017` dataset
+                    'bbox': [  # format [left, top, width, height] is expected
+                        bbox[0],
+                        bbox[1],
+                        bbox[2] - bbox[0],
+                        bbox[3] - bbox[1]
+                    ],
+                    'score': prob
+                }
+            )
+
+        with open(os.path.join(path_to_results_dir, 'results.json'), 'w') as f:
+            json.dump(results, f)
+
+    @property
+    def image_ratios(self) -> List[float]:
+        return self._image_ratios
+
+    @staticmethod
+    def num_classes() -> int:
+        return 2
dataset/coco2017_person.py ADDED
@@ -0,0 +1,201 @@
+import json
+import os
+import pickle
+import random
+import sys
+from io import StringIO
+from typing import List, Tuple, Dict
+
+import torch
+import torch.utils.data.dataset
+from PIL import Image, ImageOps
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from torch import Tensor
+from torchvision.datasets import CocoDetection
+from tqdm import tqdm
+
+from bbox import BBox
+from dataset.base import Base
+from dataset.coco2017 import COCO2017
+
+
+class COCO2017Person(Base):
+
+    class Annotation(object):
+        class Object(object):
+            def __init__(self, bbox: BBox, label: int):
+                super().__init__()
+                self.bbox = bbox
+                self.label = label
+
+            def __repr__(self) -> str:
+                return 'Object[label={:d}, bbox={!s}]'.format(
+                    self.label, self.bbox)
+
+        def __init__(self, filename: str, objects: List[Object]):
+            super().__init__()
+            self.filename = filename
+            self.objects = objects
+
+    CATEGORY_TO_LABEL_DICT = {
+        'background': 0, 'person': 1
+    }
+
+    LABEL_TO_CATEGORY_DICT = {v: k for k, v in CATEGORY_TO_LABEL_DICT.items()}
+
+    def __init__(self, path_to_data_dir: str, mode: Base.Mode, image_min_side: float, image_max_side: float):
+        super().__init__(path_to_data_dir, mode, image_min_side, image_max_side)
+
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_caches_dir = os.path.join('caches', 'coco2017-person', f'{self._mode.value}')
+        path_to_image_ids_pickle = os.path.join(path_to_caches_dir, 'image-ids.pkl')
+        path_to_image_id_dict_pickle = os.path.join(path_to_caches_dir, 'image-id-dict.pkl')
+        path_to_image_ratios_pickle = os.path.join(path_to_caches_dir, 'image-ratios.pkl')
+
+        if self._mode == COCO2017Person.Mode.TRAIN:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'train2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_train2017.json')
+        elif self._mode == COCO2017Person.Mode.EVAL:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'val2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+        else:
+            raise ValueError('invalid mode')
+
+        coco_dataset = CocoDetection(root=path_to_jpeg_images_dir, annFile=path_to_annotation)
+
+        if os.path.exists(path_to_image_ids_pickle) and os.path.exists(path_to_image_id_dict_pickle):
+            print('loading cache files...')
+
+            with open(path_to_image_ids_pickle, 'rb') as f:
+                self._image_ids = pickle.load(f)
+
+            with open(path_to_image_id_dict_pickle, 'rb') as f:
+                self._image_id_to_annotation_dict = pickle.load(f)
+
+            with open(path_to_image_ratios_pickle, 'rb') as f:
+                self._image_ratios = pickle.load(f)
+        else:
+            print('generating cache files...')
+
+            os.makedirs(path_to_caches_dir, exist_ok=True)
+
+            self._image_id_to_annotation_dict: Dict[str, COCO2017Person.Annotation] = {}
+            self._image_ratios = []
+
+            for idx, (image, annotation) in enumerate(tqdm(coco_dataset)):
+                if len(annotation) > 0:
+                    image_id = str(annotation[0]['image_id'])  # all image_id in annotation are the same
+                    annotation = COCO2017Person.Annotation(
+                        filename=os.path.join(path_to_jpeg_images_dir, '{:012d}.jpg'.format(int(image_id))),
+                        objects=[COCO2017Person.Annotation.Object(
+                            bbox=BBox(  # `ann['bbox']` is in the format [left, top, width, height]
+                                left=ann['bbox'][0],
+                                top=ann['bbox'][1],
+                                right=ann['bbox'][0] + ann['bbox'][2],
+                                bottom=ann['bbox'][1] + ann['bbox'][3]
+                            ),
+                            label=ann['category_id'])
+                            for ann in annotation]
+                    )
+                    annotation.objects = [obj for obj in annotation.objects
+                                          if obj.label in [COCO2017.CATEGORY_TO_LABEL_DICT['person']]]  # filtering label should refer to original `COCO2017` dataset
+
+                    if len(annotation.objects) > 0:
+                        self._image_id_to_annotation_dict[image_id] = annotation
+
+                        ratio = float(image.width / image.height)
+                        self._image_ratios.append(ratio)
+
+            self._image_ids = list(self._image_id_to_annotation_dict.keys())
+
+            with open(path_to_image_ids_pickle, 'wb') as f:
+                pickle.dump(self._image_ids, f)
+
+            with open(path_to_image_id_dict_pickle, 'wb') as f:
+                pickle.dump(self._image_id_to_annotation_dict, f)
+
+            with open(path_to_image_ratios_pickle, 'wb') as f:
+                pickle.dump(self.image_ratios, f)
+
+    def __len__(self) -> int:
+        return len(self._image_id_to_annotation_dict)
+
+    def __getitem__(self, index: int) -> Tuple[str, Tensor, Tensor, Tensor, Tensor]:
+        image_id = self._image_ids[index]
+        annotation = self._image_id_to_annotation_dict[image_id]
+
+        bboxes = [obj.bbox.tolist() for obj in annotation.objects]
+        labels = [COCO2017Person.CATEGORY_TO_LABEL_DICT[COCO2017.LABEL_TO_CATEGORY_DICT[obj.label]] for obj in annotation.objects]  # mapping from original `COCO2017` dataset
+
+        bboxes = torch.tensor(bboxes, dtype=torch.float)
+        labels = torch.tensor(labels, dtype=torch.long)
+
+        image = Image.open(annotation.filename).convert('RGB')  # for some grayscale images
+
+        # random flip on only training mode
+        if self._mode == COCO2017Person.Mode.TRAIN and random.random() > 0.5:
+            image = ImageOps.mirror(image)
+            bboxes[:, [0, 2]] = image.width - bboxes[:, [2, 0]]  # index 0 and 2 represent `left` and `right` respectively
+
+        image, scale = COCO2017Person.preprocess(image, self._image_min_side, self._image_max_side)
+        scale = torch.tensor(scale, dtype=torch.float)
+        bboxes *= scale
+
+        return image_id, image, scale, bboxes, labels
+
+    def evaluate(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]) -> Tuple[float, str]:
+        self._write_results(path_to_results_dir, image_ids, bboxes, classes, probs)
+
+        annType = 'bbox'
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+
+        cocoGt = COCO(path_to_annotation)
+        cocoDt = cocoGt.loadRes(os.path.join(path_to_results_dir, 'results.json'))
+
+        cocoEval = COCOeval(cocoGt, cocoDt, annType)
+        cocoEval.params.catIds = COCO2017.CATEGORY_TO_LABEL_DICT['person']  # filtering label should refer to original `COCO2017` dataset
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+
+        original_stdout = sys.stdout
+        string_stdout = StringIO()
+        sys.stdout = string_stdout
+        cocoEval.summarize()
+        sys.stdout = original_stdout
+
+        mean_ap = cocoEval.stats[0].item()  # stats[0] records AP@[0.5:0.95]
+        detail = string_stdout.getvalue()
+
+        return mean_ap, detail
+
+    def _write_results(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]):
+        results = []
+        for image_id, bbox, cls, prob in zip(image_ids, bboxes, classes, probs):
+            results.append(
+                {
+                    'image_id': int(image_id),  # COCO evaluation requires `image_id` to be type `int`
+                    'category_id': COCO2017.CATEGORY_TO_LABEL_DICT[COCO2017Person.LABEL_TO_CATEGORY_DICT[cls]],  # mapping to original `COCO2017` dataset
+                    'bbox': [  # format [left, top, width, height] is expected
+                        bbox[0],
+                        bbox[1],
+                        bbox[2] - bbox[0],
+                        bbox[3] - bbox[1]
+                    ],
+                    'score': prob
+                }
+            )
+
+        with open(os.path.join(path_to_results_dir, 'results.json'), 'w') as f:
+            json.dump(results, f)
+
+    @property
+    def image_ratios(self) -> List[float]:
+        return self._image_ratios
+
+    @staticmethod
+    def num_classes() -> int:
+        return 2
dataset/voc2007.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ import xml.etree.ElementTree as ET
4
+ from typing import List, Tuple
5
+
6
+ import numpy as np
7
+ import torch.utils.data
8
+ from PIL import Image, ImageOps
9
+ from torch import Tensor
10
+
11
+ from bbox import BBox
12
+ from dataset.base import Base
13
+ from voc_eval import voc_eval
14
+
15
+
16
+ class VOC2007(Base):
17
+
18
+ class Annotation(object):
19
+ class Object(object):
20
+ def __init__(self, name: str, difficult: bool, bbox: BBox):
21
+ super().__init__()
22
+ self.name = name
23
+ self.difficult = difficult
24
+ self.bbox = bbox
25
+
26
+ def __repr__(self) -> str:
27
+ return 'Object[name={:s}, difficult={!s}, bbox={!s}]'.format(
28
+ self.name, self.difficult, self.bbox)
29
+
30
+ def __init__(self, filename: str, objects: List[Object]):
31
+ super().__init__()
32
+ self.filename = filename
33
+ self.objects = objects
34
+
35
+ CATEGORY_TO_LABEL_DICT = {
36
+ 'background': 0,
37
+ 'aeroplane': 1, 'bicycle': 2, 'bird': 3, 'boat': 4, 'bottle': 5,
38
+ 'bus': 6, 'car': 7, 'cat': 8, 'chair': 9, 'cow': 10,
39
+ 'diningtable': 11, 'dog': 12, 'horse': 13, 'motorbike': 14, 'person': 15,
40
+ 'pottedplant': 16, 'sheep': 17, 'sofa': 18, 'train': 19, 'tvmonitor': 20
41
+ }
42
+
43
+ LABEL_TO_CATEGORY_DICT = {v: k for k, v in CATEGORY_TO_LABEL_DICT.items()}
44
+
45
+ def __init__(self, path_to_data_dir: str, mode: Base.Mode, image_min_side: float, image_max_side: float):
46
+ super().__init__(path_to_data_dir, mode, image_min_side, image_max_side)
47
+
48
+ path_to_voc2007_dir = os.path.join(self._path_to_data_dir, 'VOCdevkit', 'VOC2007')
49
+ path_to_imagesets_main_dir = os.path.join(path_to_voc2007_dir, 'ImageSets', 'Main')
50
+ path_to_annotations_dir = os.path.join(path_to_voc2007_dir, 'Annotations')
51
+ self._path_to_jpeg_images_dir = os.path.join(path_to_voc2007_dir, 'JPEGImages')
52
+
53
+ if self._mode == VOC2007.Mode.TRAIN:
54
+ path_to_image_ids_txt = os.path.join(path_to_imagesets_main_dir, 'trainval.txt')
55
+ elif self._mode == VOC2007.Mode.EVAL:
56
+ path_to_image_ids_txt = os.path.join(path_to_imagesets_main_dir, 'test.txt')
57
+ else:
58
+ raise ValueError('invalid mode')
59
+
60
+ with open(path_to_image_ids_txt, 'r') as f:
61
+ lines = f.readlines()
62
+ self._image_ids = [line.rstrip() for line in lines]
63
+
64
+ self._image_id_to_annotation_dict = {}
65
+ self._image_ratios = []
66
+
67
+ for image_id in self._image_ids:
68
+ path_to_annotation_xml = os.path.join(path_to_annotations_dir, f'{image_id}.xml')
69
+ tree = ET.ElementTree(file=path_to_annotation_xml)
70
+ root = tree.getroot()
71
+
72
+ self._image_id_to_annotation_dict[image_id] = VOC2007.Annotation(
73
+ filename=root.find('filename').text,
74
+ objects=[VOC2007.Annotation.Object(
75
+ name=next(tag_object.iterfind('name')).text,
76
+ difficult=next(tag_object.iterfind('difficult')).text == '1',
77
+ bbox=BBox( # convert to 0-based pixel index
78
+ left=float(next(tag_object.iterfind('bndbox/xmin')).text) - 1,
79
+ top=float(next(tag_object.iterfind('bndbox/ymin')).text) - 1,
80
+ right=float(next(tag_object.iterfind('bndbox/xmax')).text) - 1,
81
+ bottom=float(next(tag_object.iterfind('bndbox/ymax')).text) - 1
82
+ )
83
+ ) for tag_object in root.iterfind('object')]
84
+ )
85
+
86
+ width = int(root.find('size/width').text)
87
+ height = int(root.find('size/height').text)
88
+ ratio = float(width / height)
89
+ self._image_ratios.append(ratio)
90
+
91
+ def __len__(self) -> int:
92
+ return len(self._image_id_to_annotation_dict)
93
+
94
+ def __getitem__(self, index: int) -> Tuple[str, Tensor, Tensor, Tensor, Tensor]:
95
+ image_id = self._image_ids[index]
96
+ annotation = self._image_id_to_annotation_dict[image_id]
97
+
98
+ bboxes = [obj.bbox.tolist() for obj in annotation.objects if not obj.difficult]
99
+ labels = [VOC2007.CATEGORY_TO_LABEL_DICT[obj.name] for obj in annotation.objects if not obj.difficult]
100
+
101
+ bboxes = torch.tensor(bboxes, dtype=torch.float)
102
+ labels = torch.tensor(labels, dtype=torch.long)
103
+
104
+ image = Image.open(os.path.join(self._path_to_jpeg_images_dir, annotation.filename))
105
+
106
+ # random flip on only training mode
107
+ if self._mode == VOC2007.Mode.TRAIN and random.random() > 0.5:
108
+ image = ImageOps.mirror(image)
109
+ bboxes[:, [0, 2]] = image.width - bboxes[:, [2, 0]] # index 0 and 2 represent `left` and `right` respectively
110
+
111
+ image, scale = VOC2007.preprocess(image, self._image_min_side, self._image_max_side)
112
+ scale = torch.tensor(scale, dtype=torch.float)
113
+ bboxes *= scale
114
+
115
+ return image_id, image, scale, bboxes, labels
116
+
117
+ def evaluate(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]) -> Tuple[float, str]:
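+ # writes PASCAL-VOC-style per-class detection files, then scores each class with voc_eval
+ # (IoU threshold 0.5, VOC07 11-point metric) and returns the mean AP plus a per-class summary string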
118
+ self._write_results(path_to_results_dir, image_ids, bboxes, classes, probs)
119
+
120
+ path_to_voc2007_dir = os.path.join(self._path_to_data_dir, 'VOCdevkit', 'VOC2007')
121
+ path_to_main_dir = os.path.join(path_to_voc2007_dir, 'ImageSets', 'Main')
122
+ path_to_annotations_dir = os.path.join(path_to_voc2007_dir, 'Annotations')
123
+
124
+ class_to_ap_dict = {}
125
+ for c in range(1, VOC2007.num_classes()):
126
+ category = VOC2007.LABEL_TO_CATEGORY_DICT[c]
127
+ try:
128
+ path_to_cache_dir = os.path.join('caches', 'voc2007')
129
+ os.makedirs(path_to_cache_dir, exist_ok=True)
130
+ _, _, ap = voc_eval(detpath=os.path.join(path_to_results_dir, 'comp3_det_test_{:s}.txt'.format(category)),
131
+ annopath=os.path.join(path_to_annotations_dir, '{:s}.xml'),
132
+ imagesetfile=os.path.join(path_to_main_dir, 'test.txt'),
133
+ classname=category,
134
+ cachedir=path_to_cache_dir,
135
+ ovthresh=0.5,
136
+ use_07_metric=True)
137
+ except IndexError:
138
+ ap = 0
139
+
140
+ class_to_ap_dict[c] = ap
141
+
142
+ mean_ap = np.mean([v for k, v in class_to_ap_dict.items()]).item()
143
+
144
+ detail = ''
145
+ for c in range(1, VOC2007.num_classes()):
146
+ detail += '{:d}: {:s} AP = {:.4f}\n'.format(c, VOC2007.LABEL_TO_CATEGORY_DICT[c], class_to_ap_dict[c])
147
+
148
+ return mean_ap, detail
149
+
150
+ def _write_results(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]):
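+ # one text file per foreground class, in the PASCAL VOC 'comp3' submission format:
+ # each line is `image_id score left top right bottom`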
151
+ class_to_txt_files_dict = {}
152
+ for c in range(1, VOC2007.num_classes()):
153
+ class_to_txt_files_dict[c] = open(os.path.join(path_to_results_dir, 'comp3_det_test_{:s}.txt'.format(VOC2007.LABEL_TO_CATEGORY_DICT[c])), 'w')
154
+
155
+ for image_id, bbox, cls, prob in zip(image_ids, bboxes, classes, probs):
156
+ class_to_txt_files_dict[cls].write('{:s} {:f} {:f} {:f} {:f} {:f}\n'.format(image_id, prob,
157
+ bbox[0], bbox[1], bbox[2], bbox[3]))
158
+
159
+ for _, f in class_to_txt_files_dict.items():
160
+ f.close()
161
+
162
+ @property
163
+ def image_ratios(self) -> List[float]:
164
+ return self._image_ratios
165
+
166
+ @staticmethod
167
+ def num_classes() -> int:
168
+ return 21
dataset/voc2007_cat_dog.py ADDED
@@ -0,0 +1,171 @@
1
+ import os
2
+ import random
3
+ import xml.etree.ElementTree as ET
4
+ from typing import List, Tuple
5
+
6
+ import numpy as np
7
+ import torch.utils.data
8
+ from PIL import Image, ImageOps
9
+ from torch import Tensor
10
+
11
+ from bbox import BBox
12
+ from dataset.base import Base
13
+ from voc_eval import voc_eval
14
+
15
+
16
+ class VOC2007CatDog(Base):
17
+
18
+ class Annotation(object):
19
+ class Object(object):
20
+ def __init__(self, name: str, difficult: bool, bbox: BBox):
21
+ super().__init__()
22
+ self.name = name
23
+ self.difficult = difficult
24
+ self.bbox = bbox
25
+
26
+ def __repr__(self) -> str:
27
+ return 'Object[name={:s}, difficult={!s}, bbox={!s}]'.format(
28
+ self.name, self.difficult, self.bbox)
29
+
30
+ def __init__(self, filename: str, objects: List[Object]):
31
+ super().__init__()
32
+ self.filename = filename
33
+ self.objects = objects
34
+
35
+ CATEGORY_TO_LABEL_DICT = {
36
+ 'background': 0,
37
+ 'cat': 1, 'dog': 2
38
+ }
39
+
40
+ LABEL_TO_CATEGORY_DICT = {v: k for k, v in CATEGORY_TO_LABEL_DICT.items()}
41
+
42
+ def __init__(self, path_to_data_dir: str, mode: Base.Mode, image_min_side: float, image_max_side: float):
43
+ super().__init__(path_to_data_dir, mode, image_min_side, image_max_side)
44
+
45
+ path_to_voc2007_dir = os.path.join(self._path_to_data_dir, 'VOCdevkit', 'VOC2007')
46
+ path_to_imagesets_main_dir = os.path.join(path_to_voc2007_dir, 'ImageSets', 'Main')
47
+ path_to_annotations_dir = os.path.join(path_to_voc2007_dir, 'Annotations')
48
+ self._path_to_jpeg_images_dir = os.path.join(path_to_voc2007_dir, 'JPEGImages')
49
+
50
+ if self._mode == VOC2007CatDog.Mode.TRAIN:
51
+ path_to_image_ids_txt = os.path.join(path_to_imagesets_main_dir, 'trainval.txt')
52
+ elif self._mode == VOC2007CatDog.Mode.EVAL:
53
+ path_to_image_ids_txt = os.path.join(path_to_imagesets_main_dir, 'test.txt')
54
+ else:
55
+ raise ValueError('invalid mode')
56
+
57
+ with open(path_to_image_ids_txt, 'r') as f:
58
+ lines = f.readlines()
59
+ image_ids = [line.rstrip() for line in lines]
60
+
61
+ self._image_id_to_annotation_dict = {}
62
+ self._image_ratios = []
63
+
64
+ for image_id in image_ids:
65
+ path_to_annotation_xml = os.path.join(path_to_annotations_dir, f'{image_id}.xml')
66
+ tree = ET.ElementTree(file=path_to_annotation_xml)
67
+ root = tree.getroot()
68
+
69
+ annotation = VOC2007CatDog.Annotation(
70
+ filename=root.find('filename').text,
71
+ objects=[VOC2007CatDog.Annotation.Object(
72
+ name=next(tag_object.iterfind('name')).text,
73
+ difficult=next(tag_object.iterfind('difficult')).text == '1',
74
+ bbox=BBox( # convert to 0-based pixel index
75
+ left=float(next(tag_object.iterfind('bndbox/xmin')).text) - 1,
76
+ top=float(next(tag_object.iterfind('bndbox/ymin')).text) - 1,
77
+ right=float(next(tag_object.iterfind('bndbox/xmax')).text) - 1,
78
+ bottom=float(next(tag_object.iterfind('bndbox/ymax')).text) - 1
79
+ )
80
+ ) for tag_object in root.iterfind('object')]
81
+ )
82
+ annotation.objects = [obj for obj in annotation.objects if obj.name in ['cat', 'dog'] and not obj.difficult]
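+ # keep only non-difficult cat/dog objects; images left with no such objects are skipped below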
83
+
84
+ if len(annotation.objects) > 0:
85
+ self._image_id_to_annotation_dict[image_id] = annotation
86
+
87
+ width = int(root.find('size/width').text)
88
+ height = int(root.find('size/height').text)
89
+ ratio = float(width / height)
90
+ self._image_ratios.append(ratio)
91
+
92
+ self._image_ids = list(self._image_id_to_annotation_dict.keys())
93
+
94
+ def __len__(self) -> int:
95
+ return len(self._image_id_to_annotation_dict)
96
+
97
+ def __getitem__(self, index: int) -> Tuple[str, Tensor, Tensor, Tensor, Tensor]:
98
+ image_id = self._image_ids[index]
99
+ annotation = self._image_id_to_annotation_dict[image_id]
100
+
101
+ bboxes = [obj.bbox.tolist() for obj in annotation.objects]
102
+ labels = [VOC2007CatDog.CATEGORY_TO_LABEL_DICT[obj.name] for obj in annotation.objects]
103
+
104
+ bboxes = torch.tensor(bboxes, dtype=torch.float)
105
+ labels = torch.tensor(labels, dtype=torch.long)
106
+
107
+ image = Image.open(os.path.join(self._path_to_jpeg_images_dir, annotation.filename))
108
+
109
+ # random horizontal flip, applied only in training mode
110
+ if self._mode == VOC2007CatDog.Mode.TRAIN and random.random() > 0.5:
111
+ image = ImageOps.mirror(image)
112
+ bboxes[:, [0, 2]] = image.width - bboxes[:, [2, 0]]  # indices 0 and 2 hold `left` and `right`, respectively
113
+
114
+ image, scale = VOC2007CatDog.preprocess(image, self._image_min_side, self._image_max_side)
115
+ scale = torch.tensor(scale, dtype=torch.float)
116
+ bboxes *= scale
117
+
118
+ return image_id, image, scale, bboxes, labels
119
+
120
+ def evaluate(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]) -> Tuple[float, str]:
121
+ self._write_results(path_to_results_dir, image_ids, bboxes, classes, probs)
122
+
123
+ path_to_voc2007_dir = os.path.join(self._path_to_data_dir, 'VOCdevkit', 'VOC2007')
124
+ path_to_main_dir = os.path.join(path_to_voc2007_dir, 'ImageSets', 'Main')
125
+ path_to_annotations_dir = os.path.join(path_to_voc2007_dir, 'Annotations')
126
+
127
+ class_to_ap_dict = {}
128
+ for c in range(1, VOC2007CatDog.num_classes()):
129
+ category = VOC2007CatDog.LABEL_TO_CATEGORY_DICT[c]
130
+ try:
131
+ path_to_cache_dir = os.path.join('caches', 'voc2007-cat-dog')
132
+ os.makedirs(path_to_cache_dir, exist_ok=True)
133
+ _, _, ap = voc_eval(detpath=os.path.join(path_to_results_dir, 'comp3_det_test_{:s}.txt'.format(category)),
134
+ annopath=os.path.join(path_to_annotations_dir, '{:s}.xml'),
135
+ imagesetfile=os.path.join(path_to_main_dir, 'test.txt'),
136
+ classname=category,
137
+ cachedir=path_to_cache_dir,
138
+ ovthresh=0.5,
139
+ use_07_metric=True)
140
+ except IndexError:
141
+ ap = 0
142
+
143
+ class_to_ap_dict[c] = ap
144
+
145
+ mean_ap = np.mean([v for k, v in class_to_ap_dict.items()]).item()
146
+
147
+ detail = ''
148
+ for c in range(1, VOC2007CatDog.num_classes()):
149
+ detail += '{:d}: {:s} AP = {:.4f}\n'.format(c, VOC2007CatDog.LABEL_TO_CATEGORY_DICT[c], class_to_ap_dict[c])
150
+
151
+ return mean_ap, detail
152
+
153
+ def _write_results(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]):
154
+ class_to_txt_files_dict = {}
155
+ for c in range(1, VOC2007CatDog.num_classes()):
156
+ class_to_txt_files_dict[c] = open(os.path.join(path_to_results_dir, 'comp3_det_test_{:s}.txt'.format(VOC2007CatDog.LABEL_TO_CATEGORY_DICT[c])), 'w')
157
+
158
+ for image_id, bbox, cls, prob in zip(image_ids, bboxes, classes, probs):
159
+ class_to_txt_files_dict[cls].write('{:s} {:f} {:f} {:f} {:f} {:f}\n'.format(image_id, prob,
160
+ bbox[0], bbox[1], bbox[2], bbox[3]))
161
+
162
+ for _, f in class_to_txt_files_dict.items():
163
+ f.close()
164
+
165
+ @property
166
+ def image_ratios(self) -> List[float]:
167
+ return self._image_ratios
168
+
169
+ @staticmethod
170
+ def num_classes() -> int:
171
+ return 3
extension/functional.py ADDED
@@ -0,0 +1,10 @@
1
+ import torch
2
+
3
+ from torch import Tensor
4
+
5
+
6
+ def beta_smooth_l1_loss(input: Tensor, target: Tensor, beta: float) -> Tensor:
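+ # smooth L1 (Huber-style) loss: quadratic for |input - target| < beta, linear beyond it,
+ # summed and averaged over all elements (the 1e-8 term guards against empty inputs)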
7
+ diff = torch.abs(input - target)
8
+ loss = torch.where(diff < beta, 0.5 * diff ** 2 / beta, diff - 0.5 * beta)
9
+ loss = loss.sum() / (input.numel() + 1e-8)
10
+ return loss
extension/lr_scheduler.py ADDED
@@ -0,0 +1,23 @@
1
+ from typing import List
2
+
3
+ from torch.optim import Optimizer
4
+ from torch.optim.lr_scheduler import MultiStepLR
5
+
6
+
7
+ class WarmUpMultiStepLR(MultiStepLR):
8
+ def __init__(self, optimizer: Optimizer, milestones: List[int], gamma: float = 0.1,
9
+ factor: float = 0.3333, num_iters: int = 500, last_epoch: int = -1):
10
+ self.factor = factor
11
+ self.num_iters = num_iters
12
+ super().__init__(optimizer, milestones, gamma, last_epoch)
13
+
14
+ def get_lr(self) -> List[float]:
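+ # linear warm-up: scale the scheduled learning rates by a factor that grows from
+ # `self.factor` to 1 over the first `num_iters` scheduler steps, then follow MultiStepLR as usual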
15
+ if self.last_epoch < self.num_iters:
16
+ alpha = self.last_epoch / self.num_iters
17
+ factor = (1 - self.factor) * alpha + self.factor
18
+ return [lr * factor for lr in super()._get_closed_form_lr()]
19
+ else:
20
+ # warm-up finished: fall back to the plain MultiStepLR schedule
21
+ return super().get_lr()
models/MobileNetSSD_deploy.caffemodel ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:761c86fbae3d8361dd454f7c740a964f62975ed32f4324b8b85994edec30f6af
3
+ size 23147564
models/MobileNetSSD_deploy.prototxt.txt ADDED
@@ -0,0 +1,1912 @@
1
+ name: "MobileNet-SSD"
2
+ input: "data"
3
+ input_shape {
4
+ dim: 1
5
+ dim: 3
6
+ dim: 300
7
+ dim: 300
8
+ }
9
+ layer {
10
+ name: "conv0"
11
+ type: "Convolution"
12
+ bottom: "data"
13
+ top: "conv0"
14
+ param {
15
+ lr_mult: 1.0
16
+ decay_mult: 1.0
17
+ }
18
+ param {
19
+ lr_mult: 2.0
20
+ decay_mult: 0.0
21
+ }
22
+ convolution_param {
23
+ num_output: 32
24
+ pad: 1
25
+ kernel_size: 3
26
+ stride: 2
27
+ weight_filler {
28
+ type: "msra"
29
+ }
30
+ bias_filler {
31
+ type: "constant"
32
+ value: 0.0
33
+ }
34
+ }
35
+ }
36
+ layer {
37
+ name: "conv0/relu"
38
+ type: "ReLU"
39
+ bottom: "conv0"
40
+ top: "conv0"
41
+ }
42
+ layer {
43
+ name: "conv1/dw"
44
+ type: "Convolution"
45
+ bottom: "conv0"
46
+ top: "conv1/dw"
47
+ param {
48
+ lr_mult: 1.0
49
+ decay_mult: 1.0
50
+ }
51
+ param {
52
+ lr_mult: 2.0
53
+ decay_mult: 0.0
54
+ }
55
+ convolution_param {
56
+ num_output: 32
57
+ pad: 1
58
+ kernel_size: 3
59
+ group: 32
60
+ engine: CAFFE
61
+ weight_filler {
62
+ type: "msra"
63
+ }
64
+ bias_filler {
65
+ type: "constant"
66
+ value: 0.0
67
+ }
68
+ }
69
+ }
70
+ layer {
71
+ name: "conv1/dw/relu"
72
+ type: "ReLU"
73
+ bottom: "conv1/dw"
74
+ top: "conv1/dw"
75
+ }
76
+ layer {
77
+ name: "conv1"
78
+ type: "Convolution"
79
+ bottom: "conv1/dw"
80
+ top: "conv1"
81
+ param {
82
+ lr_mult: 1.0
83
+ decay_mult: 1.0
84
+ }
85
+ param {
86
+ lr_mult: 2.0
87
+ decay_mult: 0.0
88
+ }
89
+ convolution_param {
90
+ num_output: 64
91
+ kernel_size: 1
92
+ weight_filler {
93
+ type: "msra"
94
+ }
95
+ bias_filler {
96
+ type: "constant"
97
+ value: 0.0
98
+ }
99
+ }
100
+ }
101
+ layer {
102
+ name: "conv1/relu"
103
+ type: "ReLU"
104
+ bottom: "conv1"
105
+ top: "conv1"
106
+ }
107
+ layer {
108
+ name: "conv2/dw"
109
+ type: "Convolution"
110
+ bottom: "conv1"
111
+ top: "conv2/dw"
112
+ param {
113
+ lr_mult: 1.0
114
+ decay_mult: 1.0
115
+ }
116
+ param {
117
+ lr_mult: 2.0
118
+ decay_mult: 0.0
119
+ }
120
+ convolution_param {
121
+ num_output: 64
122
+ pad: 1
123
+ kernel_size: 3
124
+ stride: 2
125
+ group: 64
126
+ engine: CAFFE
127
+ weight_filler {
128
+ type: "msra"
129
+ }
130
+ bias_filler {
131
+ type: "constant"
132
+ value: 0.0
133
+ }
134
+ }
135
+ }
136
+ layer {
137
+ name: "conv2/dw/relu"
138
+ type: "ReLU"
139
+ bottom: "conv2/dw"
140
+ top: "conv2/dw"
141
+ }
142
+ layer {
143
+ name: "conv2"
144
+ type: "Convolution"
145
+ bottom: "conv2/dw"
146
+ top: "conv2"
147
+ param {
148
+ lr_mult: 1.0
149
+ decay_mult: 1.0
150
+ }
151
+ param {
152
+ lr_mult: 2.0
153
+ decay_mult: 0.0
154
+ }
155
+ convolution_param {
156
+ num_output: 128
157
+ kernel_size: 1
158
+ weight_filler {
159
+ type: "msra"
160
+ }
161
+ bias_filler {
162
+ type: "constant"
163
+ value: 0.0
164
+ }
165
+ }
166
+ }
167
+ layer {
168
+ name: "conv2/relu"
169
+ type: "ReLU"
170
+ bottom: "conv2"
171
+ top: "conv2"
172
+ }
173
+ layer {
174
+ name: "conv3/dw"
175
+ type: "Convolution"
176
+ bottom: "conv2"
177
+ top: "conv3/dw"
178
+ param {
179
+ lr_mult: 1.0
180
+ decay_mult: 1.0
181
+ }
182
+ param {
183
+ lr_mult: 2.0
184
+ decay_mult: 0.0
185
+ }
186
+ convolution_param {
187
+ num_output: 128
188
+ pad: 1
189
+ kernel_size: 3
190
+ group: 128
191
+ engine: CAFFE
192
+ weight_filler {
193
+ type: "msra"
194
+ }
195
+ bias_filler {
196
+ type: "constant"
197
+ value: 0.0
198
+ }
199
+ }
200
+ }
201
+ layer {
202
+ name: "conv3/dw/relu"
203
+ type: "ReLU"
204
+ bottom: "conv3/dw"
205
+ top: "conv3/dw"
206
+ }
207
+ layer {
208
+ name: "conv3"
209
+ type: "Convolution"
210
+ bottom: "conv3/dw"
211
+ top: "conv3"
212
+ param {
213
+ lr_mult: 1.0
214
+ decay_mult: 1.0
215
+ }
216
+ param {
217
+ lr_mult: 2.0
218
+ decay_mult: 0.0
219
+ }
220
+ convolution_param {
221
+ num_output: 128
222
+ kernel_size: 1
223
+ weight_filler {
224
+ type: "msra"
225
+ }
226
+ bias_filler {
227
+ type: "constant"
228
+ value: 0.0
229
+ }
230
+ }
231
+ }
232
+ layer {
233
+ name: "conv3/relu"
234
+ type: "ReLU"
235
+ bottom: "conv3"
236
+ top: "conv3"
237
+ }
238
+ layer {
239
+ name: "conv4/dw"
240
+ type: "Convolution"
241
+ bottom: "conv3"
242
+ top: "conv4/dw"
243
+ param {
244
+ lr_mult: 1.0
245
+ decay_mult: 1.0
246
+ }
247
+ param {
248
+ lr_mult: 2.0
249
+ decay_mult: 0.0
250
+ }
251
+ convolution_param {
252
+ num_output: 128
253
+ pad: 1
254
+ kernel_size: 3
255
+ stride: 2
256
+ group: 128
257
+ engine: CAFFE
258
+ weight_filler {
259
+ type: "msra"
260
+ }
261
+ bias_filler {
262
+ type: "constant"
263
+ value: 0.0
264
+ }
265
+ }
266
+ }
267
+ layer {
268
+ name: "conv4/dw/relu"
269
+ type: "ReLU"
270
+ bottom: "conv4/dw"
271
+ top: "conv4/dw"
272
+ }
273
+ layer {
274
+ name: "conv4"
275
+ type: "Convolution"
276
+ bottom: "conv4/dw"
277
+ top: "conv4"
278
+ param {
279
+ lr_mult: 1.0
280
+ decay_mult: 1.0
281
+ }
282
+ param {
283
+ lr_mult: 2.0
284
+ decay_mult: 0.0
285
+ }
286
+ convolution_param {
287
+ num_output: 256
288
+ kernel_size: 1
289
+ weight_filler {
290
+ type: "msra"
291
+ }
292
+ bias_filler {
293
+ type: "constant"
294
+ value: 0.0
295
+ }
296
+ }
297
+ }
298
+ layer {
299
+ name: "conv4/relu"
300
+ type: "ReLU"
301
+ bottom: "conv4"
302
+ top: "conv4"
303
+ }
304
+ layer {
305
+ name: "conv5/dw"
306
+ type: "Convolution"
307
+ bottom: "conv4"
308
+ top: "conv5/dw"
309
+ param {
310
+ lr_mult: 1.0
311
+ decay_mult: 1.0
312
+ }
313
+ param {
314
+ lr_mult: 2.0
315
+ decay_mult: 0.0
316
+ }
317
+ convolution_param {
318
+ num_output: 256
319
+ pad: 1
320
+ kernel_size: 3
321
+ group: 256
322
+ engine: CAFFE
323
+ weight_filler {
324
+ type: "msra"
325
+ }
326
+ bias_filler {
327
+ type: "constant"
328
+ value: 0.0
329
+ }
330
+ }
331
+ }
332
+ layer {
333
+ name: "conv5/dw/relu"
334
+ type: "ReLU"
335
+ bottom: "conv5/dw"
336
+ top: "conv5/dw"
337
+ }
338
+ layer {
339
+ name: "conv5"
340
+ type: "Convolution"
341
+ bottom: "conv5/dw"
342
+ top: "conv5"
343
+ param {
344
+ lr_mult: 1.0
345
+ decay_mult: 1.0
346
+ }
347
+ param {
348
+ lr_mult: 2.0
349
+ decay_mult: 0.0
350
+ }
351
+ convolution_param {
352
+ num_output: 256
353
+ kernel_size: 1
354
+ weight_filler {
355
+ type: "msra"
356
+ }
357
+ bias_filler {
358
+ type: "constant"
359
+ value: 0.0
360
+ }
361
+ }
362
+ }
363
+ layer {
364
+ name: "conv5/relu"
365
+ type: "ReLU"
366
+ bottom: "conv5"
367
+ top: "conv5"
368
+ }
369
+ layer {
370
+ name: "conv6/dw"
371
+ type: "Convolution"
372
+ bottom: "conv5"
373
+ top: "conv6/dw"
374
+ param {
375
+ lr_mult: 1.0
376
+ decay_mult: 1.0
377
+ }
378
+ param {
379
+ lr_mult: 2.0
380
+ decay_mult: 0.0
381
+ }
382
+ convolution_param {
383
+ num_output: 256
384
+ pad: 1
385
+ kernel_size: 3
386
+ stride: 2
387
+ group: 256
388
+ engine: CAFFE
389
+ weight_filler {
390
+ type: "msra"
391
+ }
392
+ bias_filler {
393
+ type: "constant"
394
+ value: 0.0
395
+ }
396
+ }
397
+ }
398
+ layer {
399
+ name: "conv6/dw/relu"
400
+ type: "ReLU"
401
+ bottom: "conv6/dw"
402
+ top: "conv6/dw"
403
+ }
404
+ layer {
405
+ name: "conv6"
406
+ type: "Convolution"
407
+ bottom: "conv6/dw"
408
+ top: "conv6"
409
+ param {
410
+ lr_mult: 1.0
411
+ decay_mult: 1.0
412
+ }
413
+ param {
414
+ lr_mult: 2.0
415
+ decay_mult: 0.0
416
+ }
417
+ convolution_param {
418
+ num_output: 512
419
+ kernel_size: 1
420
+ weight_filler {
421
+ type: "msra"
422
+ }
423
+ bias_filler {
424
+ type: "constant"
425
+ value: 0.0
426
+ }
427
+ }
428
+ }
429
+ layer {
430
+ name: "conv6/relu"
431
+ type: "ReLU"
432
+ bottom: "conv6"
433
+ top: "conv6"
434
+ }
435
+ layer {
436
+ name: "conv7/dw"
437
+ type: "Convolution"
438
+ bottom: "conv6"
439
+ top: "conv7/dw"
440
+ param {
441
+ lr_mult: 1.0
442
+ decay_mult: 1.0
443
+ }
444
+ param {
445
+ lr_mult: 2.0
446
+ decay_mult: 0.0
447
+ }
448
+ convolution_param {
449
+ num_output: 512
450
+ pad: 1
451
+ kernel_size: 3
452
+ group: 512
453
+ engine: CAFFE
454
+ weight_filler {
455
+ type: "msra"
456
+ }
457
+ bias_filler {
458
+ type: "constant"
459
+ value: 0.0
460
+ }
461
+ }
462
+ }
463
+ layer {
464
+ name: "conv7/dw/relu"
465
+ type: "ReLU"
466
+ bottom: "conv7/dw"
467
+ top: "conv7/dw"
468
+ }
469
+ layer {
470
+ name: "conv7"
471
+ type: "Convolution"
472
+ bottom: "conv7/dw"
473
+ top: "conv7"
474
+ param {
475
+ lr_mult: 1.0
476
+ decay_mult: 1.0
477
+ }
478
+ param {
479
+ lr_mult: 2.0
480
+ decay_mult: 0.0
481
+ }
482
+ convolution_param {
483
+ num_output: 512
484
+ kernel_size: 1
485
+ weight_filler {
486
+ type: "msra"
487
+ }
488
+ bias_filler {
489
+ type: "constant"
490
+ value: 0.0
491
+ }
492
+ }
493
+ }
494
+ layer {
495
+ name: "conv7/relu"
496
+ type: "ReLU"
497
+ bottom: "conv7"
498
+ top: "conv7"
499
+ }
500
+ layer {
501
+ name: "conv8/dw"
502
+ type: "Convolution"
503
+ bottom: "conv7"
504
+ top: "conv8/dw"
505
+ param {
506
+ lr_mult: 1.0
507
+ decay_mult: 1.0
508
+ }
509
+ param {
510
+ lr_mult: 2.0
511
+ decay_mult: 0.0
512
+ }
513
+ convolution_param {
514
+ num_output: 512
515
+ pad: 1
516
+ kernel_size: 3
517
+ group: 512
518
+ engine: CAFFE
519
+ weight_filler {
520
+ type: "msra"
521
+ }
522
+ bias_filler {
523
+ type: "constant"
524
+ value: 0.0
525
+ }
526
+ }
527
+ }
528
+ layer {
529
+ name: "conv8/dw/relu"
530
+ type: "ReLU"
531
+ bottom: "conv8/dw"
532
+ top: "conv8/dw"
533
+ }
534
+ layer {
535
+ name: "conv8"
536
+ type: "Convolution"
537
+ bottom: "conv8/dw"
538
+ top: "conv8"
539
+ param {
540
+ lr_mult: 1.0
541
+ decay_mult: 1.0
542
+ }
543
+ param {
544
+ lr_mult: 2.0
545
+ decay_mult: 0.0
546
+ }
547
+ convolution_param {
548
+ num_output: 512
549
+ kernel_size: 1
550
+ weight_filler {
551
+ type: "msra"
552
+ }
553
+ bias_filler {
554
+ type: "constant"
555
+ value: 0.0
556
+ }
557
+ }
558
+ }
559
+ layer {
560
+ name: "conv8/relu"
561
+ type: "ReLU"
562
+ bottom: "conv8"
563
+ top: "conv8"
564
+ }
565
+ layer {
566
+ name: "conv9/dw"
567
+ type: "Convolution"
568
+ bottom: "conv8"
569
+ top: "conv9/dw"
570
+ param {
571
+ lr_mult: 1.0
572
+ decay_mult: 1.0
573
+ }
574
+ param {
575
+ lr_mult: 2.0
576
+ decay_mult: 0.0
577
+ }
578
+ convolution_param {
579
+ num_output: 512
580
+ pad: 1
581
+ kernel_size: 3
582
+ group: 512
583
+ engine: CAFFE
584
+ weight_filler {
585
+ type: "msra"
586
+ }
587
+ bias_filler {
588
+ type: "constant"
589
+ value: 0.0
590
+ }
591
+ }
592
+ }
593
+ layer {
594
+ name: "conv9/dw/relu"
595
+ type: "ReLU"
596
+ bottom: "conv9/dw"
597
+ top: "conv9/dw"
598
+ }
599
+ layer {
600
+ name: "conv9"
601
+ type: "Convolution"
602
+ bottom: "conv9/dw"
603
+ top: "conv9"
604
+ param {
605
+ lr_mult: 1.0
606
+ decay_mult: 1.0
607
+ }
608
+ param {
609
+ lr_mult: 2.0
610
+ decay_mult: 0.0
611
+ }
612
+ convolution_param {
613
+ num_output: 512
614
+ kernel_size: 1
615
+ weight_filler {
616
+ type: "msra"
617
+ }
618
+ bias_filler {
619
+ type: "constant"
620
+ value: 0.0
621
+ }
622
+ }
623
+ }
624
+ layer {
625
+ name: "conv9/relu"
626
+ type: "ReLU"
627
+ bottom: "conv9"
628
+ top: "conv9"
629
+ }
630
+ layer {
631
+ name: "conv10/dw"
632
+ type: "Convolution"
633
+ bottom: "conv9"
634
+ top: "conv10/dw"
635
+ param {
636
+ lr_mult: 1.0
637
+ decay_mult: 1.0
638
+ }
639
+ param {
640
+ lr_mult: 2.0
641
+ decay_mult: 0.0
642
+ }
643
+ convolution_param {
644
+ num_output: 512
645
+ pad: 1
646
+ kernel_size: 3
647
+ group: 512
648
+ engine: CAFFE
649
+ weight_filler {
650
+ type: "msra"
651
+ }
652
+ bias_filler {
653
+ type: "constant"
654
+ value: 0.0
655
+ }
656
+ }
657
+ }
658
+ layer {
659
+ name: "conv10/dw/relu"
660
+ type: "ReLU"
661
+ bottom: "conv10/dw"
662
+ top: "conv10/dw"
663
+ }
664
+ layer {
665
+ name: "conv10"
666
+ type: "Convolution"
667
+ bottom: "conv10/dw"
668
+ top: "conv10"
669
+ param {
670
+ lr_mult: 1.0
671
+ decay_mult: 1.0
672
+ }
673
+ param {
674
+ lr_mult: 2.0
675
+ decay_mult: 0.0
676
+ }
677
+ convolution_param {
678
+ num_output: 512
679
+ kernel_size: 1
680
+ weight_filler {
681
+ type: "msra"
682
+ }
683
+ bias_filler {
684
+ type: "constant"
685
+ value: 0.0
686
+ }
687
+ }
688
+ }
689
+ layer {
690
+ name: "conv10/relu"
691
+ type: "ReLU"
692
+ bottom: "conv10"
693
+ top: "conv10"
694
+ }
695
+ layer {
696
+ name: "conv11/dw"
697
+ type: "Convolution"
698
+ bottom: "conv10"
699
+ top: "conv11/dw"
700
+ param {
701
+ lr_mult: 1.0
702
+ decay_mult: 1.0
703
+ }
704
+ param {
705
+ lr_mult: 2.0
706
+ decay_mult: 0.0
707
+ }
708
+ convolution_param {
709
+ num_output: 512
710
+ pad: 1
711
+ kernel_size: 3
712
+ group: 512
713
+ engine: CAFFE
714
+ weight_filler {
715
+ type: "msra"
716
+ }
717
+ bias_filler {
718
+ type: "constant"
719
+ value: 0.0
720
+ }
721
+ }
722
+ }
723
+ layer {
724
+ name: "conv11/dw/relu"
725
+ type: "ReLU"
726
+ bottom: "conv11/dw"
727
+ top: "conv11/dw"
728
+ }
729
+ layer {
730
+ name: "conv11"
731
+ type: "Convolution"
732
+ bottom: "conv11/dw"
733
+ top: "conv11"
734
+ param {
735
+ lr_mult: 1.0
736
+ decay_mult: 1.0
737
+ }
738
+ param {
739
+ lr_mult: 2.0
740
+ decay_mult: 0.0
741
+ }
742
+ convolution_param {
743
+ num_output: 512
744
+ kernel_size: 1
745
+ weight_filler {
746
+ type: "msra"
747
+ }
748
+ bias_filler {
749
+ type: "constant"
750
+ value: 0.0
751
+ }
752
+ }
753
+ }
754
+ layer {
755
+ name: "conv11/relu"
756
+ type: "ReLU"
757
+ bottom: "conv11"
758
+ top: "conv11"
759
+ }
760
+ layer {
761
+ name: "conv12/dw"
762
+ type: "Convolution"
763
+ bottom: "conv11"
764
+ top: "conv12/dw"
765
+ param {
766
+ lr_mult: 1.0
767
+ decay_mult: 1.0
768
+ }
769
+ param {
770
+ lr_mult: 2.0
771
+ decay_mult: 0.0
772
+ }
773
+ convolution_param {
774
+ num_output: 512
775
+ pad: 1
776
+ kernel_size: 3
777
+ stride: 2
778
+ group: 512
779
+ engine: CAFFE
780
+ weight_filler {
781
+ type: "msra"
782
+ }
783
+ bias_filler {
784
+ type: "constant"
785
+ value: 0.0
786
+ }
787
+ }
788
+ }
789
+ layer {
790
+ name: "conv12/dw/relu"
791
+ type: "ReLU"
792
+ bottom: "conv12/dw"
793
+ top: "conv12/dw"
794
+ }
795
+ layer {
796
+ name: "conv12"
797
+ type: "Convolution"
798
+ bottom: "conv12/dw"
799
+ top: "conv12"
800
+ param {
801
+ lr_mult: 1.0
802
+ decay_mult: 1.0
803
+ }
804
+ param {
805
+ lr_mult: 2.0
806
+ decay_mult: 0.0
807
+ }
808
+ convolution_param {
809
+ num_output: 1024
810
+ kernel_size: 1
811
+ weight_filler {
812
+ type: "msra"
813
+ }
814
+ bias_filler {
815
+ type: "constant"
816
+ value: 0.0
817
+ }
818
+ }
819
+ }
820
+ layer {
821
+ name: "conv12/relu"
822
+ type: "ReLU"
823
+ bottom: "conv12"
824
+ top: "conv12"
825
+ }
826
+ layer {
827
+ name: "conv13/dw"
828
+ type: "Convolution"
829
+ bottom: "conv12"
830
+ top: "conv13/dw"
831
+ param {
832
+ lr_mult: 1.0
833
+ decay_mult: 1.0
834
+ }
835
+ param {
836
+ lr_mult: 2.0
837
+ decay_mult: 0.0
838
+ }
839
+ convolution_param {
840
+ num_output: 1024
841
+ pad: 1
842
+ kernel_size: 3
843
+ group: 1024
844
+ engine: CAFFE
845
+ weight_filler {
846
+ type: "msra"
847
+ }
848
+ bias_filler {
849
+ type: "constant"
850
+ value: 0.0
851
+ }
852
+ }
853
+ }
854
+ layer {
855
+ name: "conv13/dw/relu"
856
+ type: "ReLU"
857
+ bottom: "conv13/dw"
858
+ top: "conv13/dw"
859
+ }
860
+ layer {
861
+ name: "conv13"
862
+ type: "Convolution"
863
+ bottom: "conv13/dw"
864
+ top: "conv13"
865
+ param {
866
+ lr_mult: 1.0
867
+ decay_mult: 1.0
868
+ }
869
+ param {
870
+ lr_mult: 2.0
871
+ decay_mult: 0.0
872
+ }
873
+ convolution_param {
874
+ num_output: 1024
875
+ kernel_size: 1
876
+ weight_filler {
877
+ type: "msra"
878
+ }
879
+ bias_filler {
880
+ type: "constant"
881
+ value: 0.0
882
+ }
883
+ }
884
+ }
885
+ layer {
886
+ name: "conv13/relu"
887
+ type: "ReLU"
888
+ bottom: "conv13"
889
+ top: "conv13"
890
+ }
891
+ layer {
892
+ name: "conv14_1"
893
+ type: "Convolution"
894
+ bottom: "conv13"
895
+ top: "conv14_1"
896
+ param {
897
+ lr_mult: 1.0
898
+ decay_mult: 1.0
899
+ }
900
+ param {
901
+ lr_mult: 2.0
902
+ decay_mult: 0.0
903
+ }
904
+ convolution_param {
905
+ num_output: 256
906
+ kernel_size: 1
907
+ weight_filler {
908
+ type: "msra"
909
+ }
910
+ bias_filler {
911
+ type: "constant"
912
+ value: 0.0
913
+ }
914
+ }
915
+ }
916
+ layer {
917
+ name: "conv14_1/relu"
918
+ type: "ReLU"
919
+ bottom: "conv14_1"
920
+ top: "conv14_1"
921
+ }
922
+ layer {
923
+ name: "conv14_2"
924
+ type: "Convolution"
925
+ bottom: "conv14_1"
926
+ top: "conv14_2"
927
+ param {
928
+ lr_mult: 1.0
929
+ decay_mult: 1.0
930
+ }
931
+ param {
932
+ lr_mult: 2.0
933
+ decay_mult: 0.0
934
+ }
935
+ convolution_param {
936
+ num_output: 512
937
+ pad: 1
938
+ kernel_size: 3
939
+ stride: 2
940
+ weight_filler {
941
+ type: "msra"
942
+ }
943
+ bias_filler {
944
+ type: "constant"
945
+ value: 0.0
946
+ }
947
+ }
948
+ }
949
+ layer {
950
+ name: "conv14_2/relu"
951
+ type: "ReLU"
952
+ bottom: "conv14_2"
953
+ top: "conv14_2"
954
+ }
955
+ layer {
956
+ name: "conv15_1"
957
+ type: "Convolution"
958
+ bottom: "conv14_2"
959
+ top: "conv15_1"
960
+ param {
961
+ lr_mult: 1.0
962
+ decay_mult: 1.0
963
+ }
964
+ param {
965
+ lr_mult: 2.0
966
+ decay_mult: 0.0
967
+ }
968
+ convolution_param {
969
+ num_output: 128
970
+ kernel_size: 1
971
+ weight_filler {
972
+ type: "msra"
973
+ }
974
+ bias_filler {
975
+ type: "constant"
976
+ value: 0.0
977
+ }
978
+ }
979
+ }
980
+ layer {
981
+ name: "conv15_1/relu"
982
+ type: "ReLU"
983
+ bottom: "conv15_1"
984
+ top: "conv15_1"
985
+ }
986
+ layer {
987
+ name: "conv15_2"
988
+ type: "Convolution"
989
+ bottom: "conv15_1"
990
+ top: "conv15_2"
991
+ param {
992
+ lr_mult: 1.0
993
+ decay_mult: 1.0
994
+ }
995
+ param {
996
+ lr_mult: 2.0
997
+ decay_mult: 0.0
998
+ }
999
+ convolution_param {
1000
+ num_output: 256
1001
+ pad: 1
1002
+ kernel_size: 3
1003
+ stride: 2
1004
+ weight_filler {
1005
+ type: "msra"
1006
+ }
1007
+ bias_filler {
1008
+ type: "constant"
1009
+ value: 0.0
1010
+ }
1011
+ }
1012
+ }
1013
+ layer {
1014
+ name: "conv15_2/relu"
1015
+ type: "ReLU"
1016
+ bottom: "conv15_2"
1017
+ top: "conv15_2"
1018
+ }
1019
+ layer {
1020
+ name: "conv16_1"
1021
+ type: "Convolution"
1022
+ bottom: "conv15_2"
1023
+ top: "conv16_1"
1024
+ param {
1025
+ lr_mult: 1.0
1026
+ decay_mult: 1.0
1027
+ }
1028
+ param {
1029
+ lr_mult: 2.0
1030
+ decay_mult: 0.0
1031
+ }
1032
+ convolution_param {
1033
+ num_output: 128
1034
+ kernel_size: 1
1035
+ weight_filler {
1036
+ type: "msra"
1037
+ }
1038
+ bias_filler {
1039
+ type: "constant"
1040
+ value: 0.0
1041
+ }
1042
+ }
1043
+ }
1044
+ layer {
1045
+ name: "conv16_1/relu"
1046
+ type: "ReLU"
1047
+ bottom: "conv16_1"
1048
+ top: "conv16_1"
1049
+ }
1050
+ layer {
1051
+ name: "conv16_2"
1052
+ type: "Convolution"
1053
+ bottom: "conv16_1"
1054
+ top: "conv16_2"
1055
+ param {
1056
+ lr_mult: 1.0
1057
+ decay_mult: 1.0
1058
+ }
1059
+ param {
1060
+ lr_mult: 2.0
1061
+ decay_mult: 0.0
1062
+ }
1063
+ convolution_param {
1064
+ num_output: 256
1065
+ pad: 1
1066
+ kernel_size: 3
1067
+ stride: 2
1068
+ weight_filler {
1069
+ type: "msra"
1070
+ }
1071
+ bias_filler {
1072
+ type: "constant"
1073
+ value: 0.0
1074
+ }
1075
+ }
1076
+ }
1077
+ layer {
1078
+ name: "conv16_2/relu"
1079
+ type: "ReLU"
1080
+ bottom: "conv16_2"
1081
+ top: "conv16_2"
1082
+ }
1083
+ layer {
1084
+ name: "conv17_1"
1085
+ type: "Convolution"
1086
+ bottom: "conv16_2"
1087
+ top: "conv17_1"
1088
+ param {
1089
+ lr_mult: 1.0
1090
+ decay_mult: 1.0
1091
+ }
1092
+ param {
1093
+ lr_mult: 2.0
1094
+ decay_mult: 0.0
1095
+ }
1096
+ convolution_param {
1097
+ num_output: 64
1098
+ kernel_size: 1
1099
+ weight_filler {
1100
+ type: "msra"
1101
+ }
1102
+ bias_filler {
1103
+ type: "constant"
1104
+ value: 0.0
1105
+ }
1106
+ }
1107
+ }
1108
+ layer {
1109
+ name: "conv17_1/relu"
1110
+ type: "ReLU"
1111
+ bottom: "conv17_1"
1112
+ top: "conv17_1"
1113
+ }
1114
+ layer {
1115
+ name: "conv17_2"
1116
+ type: "Convolution"
1117
+ bottom: "conv17_1"
1118
+ top: "conv17_2"
1119
+ param {
1120
+ lr_mult: 1.0
1121
+ decay_mult: 1.0
1122
+ }
1123
+ param {
1124
+ lr_mult: 2.0
1125
+ decay_mult: 0.0
1126
+ }
1127
+ convolution_param {
1128
+ num_output: 128
1129
+ pad: 1
1130
+ kernel_size: 3
1131
+ stride: 2
1132
+ weight_filler {
1133
+ type: "msra"
1134
+ }
1135
+ bias_filler {
1136
+ type: "constant"
1137
+ value: 0.0
1138
+ }
1139
+ }
1140
+ }
1141
+ layer {
1142
+ name: "conv17_2/relu"
1143
+ type: "ReLU"
1144
+ bottom: "conv17_2"
1145
+ top: "conv17_2"
1146
+ }
1147
+ layer {
1148
+ name: "conv11_mbox_loc"
1149
+ type: "Convolution"
1150
+ bottom: "conv11"
1151
+ top: "conv11_mbox_loc"
1152
+ param {
1153
+ lr_mult: 1.0
1154
+ decay_mult: 1.0
1155
+ }
1156
+ param {
1157
+ lr_mult: 2.0
1158
+ decay_mult: 0.0
1159
+ }
1160
+ convolution_param {
1161
+ num_output: 12
1162
+ kernel_size: 1
1163
+ weight_filler {
1164
+ type: "msra"
1165
+ }
1166
+ bias_filler {
1167
+ type: "constant"
1168
+ value: 0.0
1169
+ }
1170
+ }
1171
+ }
1172
+ layer {
1173
+ name: "conv11_mbox_loc_perm"
1174
+ type: "Permute"
1175
+ bottom: "conv11_mbox_loc"
1176
+ top: "conv11_mbox_loc_perm"
1177
+ permute_param {
1178
+ order: 0
1179
+ order: 2
1180
+ order: 3
1181
+ order: 1
1182
+ }
1183
+ }
1184
+ layer {
1185
+ name: "conv11_mbox_loc_flat"
1186
+ type: "Flatten"
1187
+ bottom: "conv11_mbox_loc_perm"
1188
+ top: "conv11_mbox_loc_flat"
1189
+ flatten_param {
1190
+ axis: 1
1191
+ }
1192
+ }
1193
+ layer {
1194
+ name: "conv11_mbox_conf"
1195
+ type: "Convolution"
1196
+ bottom: "conv11"
1197
+ top: "conv11_mbox_conf"
1198
+ param {
1199
+ lr_mult: 1.0
1200
+ decay_mult: 1.0
1201
+ }
1202
+ param {
1203
+ lr_mult: 2.0
1204
+ decay_mult: 0.0
1205
+ }
1206
+ convolution_param {
1207
+ num_output: 63
1208
+ kernel_size: 1
1209
+ weight_filler {
1210
+ type: "msra"
1211
+ }
1212
+ bias_filler {
1213
+ type: "constant"
1214
+ value: 0.0
1215
+ }
1216
+ }
1217
+ }
1218
+ layer {
1219
+ name: "conv11_mbox_conf_perm"
1220
+ type: "Permute"
1221
+ bottom: "conv11_mbox_conf"
1222
+ top: "conv11_mbox_conf_perm"
1223
+ permute_param {
1224
+ order: 0
1225
+ order: 2
1226
+ order: 3
1227
+ order: 1
1228
+ }
1229
+ }
1230
+ layer {
1231
+ name: "conv11_mbox_conf_flat"
1232
+ type: "Flatten"
1233
+ bottom: "conv11_mbox_conf_perm"
1234
+ top: "conv11_mbox_conf_flat"
1235
+ flatten_param {
1236
+ axis: 1
1237
+ }
1238
+ }
1239
+ layer {
1240
+ name: "conv11_mbox_priorbox"
1241
+ type: "PriorBox"
1242
+ bottom: "conv11"
1243
+ bottom: "data"
1244
+ top: "conv11_mbox_priorbox"
1245
+ prior_box_param {
1246
+ min_size: 60.0
1247
+ aspect_ratio: 2.0
1248
+ flip: true
1249
+ clip: false
1250
+ variance: 0.1
1251
+ variance: 0.1
1252
+ variance: 0.2
1253
+ variance: 0.2
1254
+ offset: 0.5
1255
+ }
1256
+ }
1257
+ layer {
1258
+ name: "conv13_mbox_loc"
1259
+ type: "Convolution"
1260
+ bottom: "conv13"
1261
+ top: "conv13_mbox_loc"
1262
+ param {
1263
+ lr_mult: 1.0
1264
+ decay_mult: 1.0
1265
+ }
1266
+ param {
1267
+ lr_mult: 2.0
1268
+ decay_mult: 0.0
1269
+ }
1270
+ convolution_param {
1271
+ num_output: 24
1272
+ kernel_size: 1
1273
+ weight_filler {
1274
+ type: "msra"
1275
+ }
1276
+ bias_filler {
1277
+ type: "constant"
1278
+ value: 0.0
1279
+ }
1280
+ }
1281
+ }
1282
+ layer {
1283
+ name: "conv13_mbox_loc_perm"
1284
+ type: "Permute"
1285
+ bottom: "conv13_mbox_loc"
1286
+ top: "conv13_mbox_loc_perm"
1287
+ permute_param {
1288
+ order: 0
1289
+ order: 2
1290
+ order: 3
1291
+ order: 1
1292
+ }
1293
+ }
1294
+ layer {
1295
+ name: "conv13_mbox_loc_flat"
1296
+ type: "Flatten"
1297
+ bottom: "conv13_mbox_loc_perm"
1298
+ top: "conv13_mbox_loc_flat"
1299
+ flatten_param {
1300
+ axis: 1
1301
+ }
1302
+ }
1303
+ layer {
1304
+ name: "conv13_mbox_conf"
1305
+ type: "Convolution"
1306
+ bottom: "conv13"
1307
+ top: "conv13_mbox_conf"
1308
+ param {
1309
+ lr_mult: 1.0
1310
+ decay_mult: 1.0
1311
+ }
1312
+ param {
1313
+ lr_mult: 2.0
1314
+ decay_mult: 0.0
1315
+ }
1316
+ convolution_param {
1317
+ num_output: 126
1318
+ kernel_size: 1
1319
+ weight_filler {
1320
+ type: "msra"
1321
+ }
1322
+ bias_filler {
1323
+ type: "constant"
1324
+ value: 0.0
1325
+ }
1326
+ }
1327
+ }
1328
+ layer {
1329
+ name: "conv13_mbox_conf_perm"
1330
+ type: "Permute"
1331
+ bottom: "conv13_mbox_conf"
1332
+ top: "conv13_mbox_conf_perm"
1333
+ permute_param {
1334
+ order: 0
1335
+ order: 2
1336
+ order: 3
1337
+ order: 1
1338
+ }
1339
+ }
1340
+ layer {
1341
+ name: "conv13_mbox_conf_flat"
1342
+ type: "Flatten"
1343
+ bottom: "conv13_mbox_conf_perm"
1344
+ top: "conv13_mbox_conf_flat"
1345
+ flatten_param {
1346
+ axis: 1
1347
+ }
1348
+ }
1349
+ layer {
1350
+ name: "conv13_mbox_priorbox"
1351
+ type: "PriorBox"
1352
+ bottom: "conv13"
1353
+ bottom: "data"
1354
+ top: "conv13_mbox_priorbox"
1355
+ prior_box_param {
1356
+ min_size: 105.0
1357
+ max_size: 150.0
1358
+ aspect_ratio: 2.0
1359
+ aspect_ratio: 3.0
1360
+ flip: true
1361
+ clip: false
1362
+ variance: 0.1
1363
+ variance: 0.1
1364
+ variance: 0.2
1365
+ variance: 0.2
1366
+ offset: 0.5
1367
+ }
1368
+ }
1369
+ layer {
1370
+ name: "conv14_2_mbox_loc"
1371
+ type: "Convolution"
1372
+ bottom: "conv14_2"
1373
+ top: "conv14_2_mbox_loc"
1374
+ param {
1375
+ lr_mult: 1.0
1376
+ decay_mult: 1.0
1377
+ }
1378
+ param {
1379
+ lr_mult: 2.0
1380
+ decay_mult: 0.0
1381
+ }
1382
+ convolution_param {
1383
+ num_output: 24
1384
+ kernel_size: 1
1385
+ weight_filler {
1386
+ type: "msra"
1387
+ }
1388
+ bias_filler {
1389
+ type: "constant"
1390
+ value: 0.0
1391
+ }
1392
+ }
1393
+ }
1394
+ layer {
1395
+ name: "conv14_2_mbox_loc_perm"
1396
+ type: "Permute"
1397
+ bottom: "conv14_2_mbox_loc"
1398
+ top: "conv14_2_mbox_loc_perm"
1399
+ permute_param {
1400
+ order: 0
1401
+ order: 2
1402
+ order: 3
1403
+ order: 1
1404
+ }
1405
+ }
1406
+ layer {
1407
+ name: "conv14_2_mbox_loc_flat"
1408
+ type: "Flatten"
1409
+ bottom: "conv14_2_mbox_loc_perm"
1410
+ top: "conv14_2_mbox_loc_flat"
1411
+ flatten_param {
1412
+ axis: 1
1413
+ }
1414
+ }
1415
+ layer {
1416
+ name: "conv14_2_mbox_conf"
1417
+ type: "Convolution"
1418
+ bottom: "conv14_2"
1419
+ top: "conv14_2_mbox_conf"
1420
+ param {
1421
+ lr_mult: 1.0
1422
+ decay_mult: 1.0
1423
+ }
1424
+ param {
1425
+ lr_mult: 2.0
1426
+ decay_mult: 0.0
1427
+ }
1428
+ convolution_param {
1429
+ num_output: 126
1430
+ kernel_size: 1
1431
+ weight_filler {
1432
+ type: "msra"
1433
+ }
1434
+ bias_filler {
1435
+ type: "constant"
1436
+ value: 0.0
1437
+ }
1438
+ }
1439
+ }
1440
+ layer {
1441
+ name: "conv14_2_mbox_conf_perm"
1442
+ type: "Permute"
1443
+ bottom: "conv14_2_mbox_conf"
1444
+ top: "conv14_2_mbox_conf_perm"
1445
+ permute_param {
1446
+ order: 0
1447
+ order: 2
1448
+ order: 3
1449
+ order: 1
1450
+ }
1451
+ }
1452
+ layer {
1453
+ name: "conv14_2_mbox_conf_flat"
1454
+ type: "Flatten"
1455
+ bottom: "conv14_2_mbox_conf_perm"
1456
+ top: "conv14_2_mbox_conf_flat"
1457
+ flatten_param {
1458
+ axis: 1
1459
+ }
1460
+ }
1461
+ layer {
1462
+ name: "conv14_2_mbox_priorbox"
1463
+ type: "PriorBox"
1464
+ bottom: "conv14_2"
1465
+ bottom: "data"
1466
+ top: "conv14_2_mbox_priorbox"
1467
+ prior_box_param {
1468
+ min_size: 150.0
1469
+ max_size: 195.0
1470
+ aspect_ratio: 2.0
1471
+ aspect_ratio: 3.0
1472
+ flip: true
1473
+ clip: false
1474
+ variance: 0.1
1475
+ variance: 0.1
1476
+ variance: 0.2
1477
+ variance: 0.2
1478
+ offset: 0.5
1479
+ }
1480
+ }
1481
+ layer {
1482
+ name: "conv15_2_mbox_loc"
1483
+ type: "Convolution"
1484
+ bottom: "conv15_2"
1485
+ top: "conv15_2_mbox_loc"
1486
+ param {
1487
+ lr_mult: 1.0
1488
+ decay_mult: 1.0
1489
+ }
1490
+ param {
1491
+ lr_mult: 2.0
1492
+ decay_mult: 0.0
1493
+ }
1494
+ convolution_param {
1495
+ num_output: 24
1496
+ kernel_size: 1
1497
+ weight_filler {
1498
+ type: "msra"
1499
+ }
1500
+ bias_filler {
1501
+ type: "constant"
1502
+ value: 0.0
1503
+ }
1504
+ }
1505
+ }
1506
+ layer {
1507
+ name: "conv15_2_mbox_loc_perm"
1508
+ type: "Permute"
1509
+ bottom: "conv15_2_mbox_loc"
1510
+ top: "conv15_2_mbox_loc_perm"
1511
+ permute_param {
1512
+ order: 0
1513
+ order: 2
1514
+ order: 3
1515
+ order: 1
1516
+ }
1517
+ }
1518
+ layer {
1519
+ name: "conv15_2_mbox_loc_flat"
1520
+ type: "Flatten"
1521
+ bottom: "conv15_2_mbox_loc_perm"
1522
+ top: "conv15_2_mbox_loc_flat"
1523
+ flatten_param {
1524
+ axis: 1
1525
+ }
1526
+ }
1527
+ layer {
1528
+ name: "conv15_2_mbox_conf"
1529
+ type: "Convolution"
1530
+ bottom: "conv15_2"
1531
+ top: "conv15_2_mbox_conf"
1532
+ param {
1533
+ lr_mult: 1.0
1534
+ decay_mult: 1.0
1535
+ }
1536
+ param {
1537
+ lr_mult: 2.0
1538
+ decay_mult: 0.0
1539
+ }
1540
+ convolution_param {
1541
+ num_output: 126
1542
+ kernel_size: 1
1543
+ weight_filler {
1544
+ type: "msra"
1545
+ }
1546
+ bias_filler {
1547
+ type: "constant"
1548
+ value: 0.0
1549
+ }
1550
+ }
1551
+ }
1552
+ layer {
1553
+ name: "conv15_2_mbox_conf_perm"
1554
+ type: "Permute"
1555
+ bottom: "conv15_2_mbox_conf"
1556
+ top: "conv15_2_mbox_conf_perm"
1557
+ permute_param {
1558
+ order: 0
1559
+ order: 2
1560
+ order: 3
1561
+ order: 1
1562
+ }
1563
+ }
1564
+ layer {
1565
+ name: "conv15_2_mbox_conf_flat"
1566
+ type: "Flatten"
1567
+ bottom: "conv15_2_mbox_conf_perm"
1568
+ top: "conv15_2_mbox_conf_flat"
1569
+ flatten_param {
1570
+ axis: 1
1571
+ }
1572
+ }
1573
+ layer {
1574
+ name: "conv15_2_mbox_priorbox"
1575
+ type: "PriorBox"
1576
+ bottom: "conv15_2"
1577
+ bottom: "data"
1578
+ top: "conv15_2_mbox_priorbox"
1579
+ prior_box_param {
1580
+ min_size: 195.0
1581
+ max_size: 240.0
1582
+ aspect_ratio: 2.0
1583
+ aspect_ratio: 3.0
1584
+ flip: true
1585
+ clip: false
1586
+ variance: 0.1
1587
+ variance: 0.1
1588
+ variance: 0.2
1589
+ variance: 0.2
1590
+ offset: 0.5
1591
+ }
1592
+ }
1593
+ layer {
1594
+ name: "conv16_2_mbox_loc"
1595
+ type: "Convolution"
1596
+ bottom: "conv16_2"
1597
+ top: "conv16_2_mbox_loc"
1598
+ param {
1599
+ lr_mult: 1.0
1600
+ decay_mult: 1.0
1601
+ }
1602
+ param {
1603
+ lr_mult: 2.0
1604
+ decay_mult: 0.0
1605
+ }
1606
+ convolution_param {
1607
+ num_output: 24
1608
+ kernel_size: 1
1609
+ weight_filler {
1610
+ type: "msra"
1611
+ }
1612
+ bias_filler {
1613
+ type: "constant"
1614
+ value: 0.0
1615
+ }
1616
+ }
1617
+ }
1618
+ layer {
1619
+ name: "conv16_2_mbox_loc_perm"
1620
+ type: "Permute"
1621
+ bottom: "conv16_2_mbox_loc"
1622
+ top: "conv16_2_mbox_loc_perm"
1623
+ permute_param {
1624
+ order: 0
1625
+ order: 2
1626
+ order: 3
1627
+ order: 1
1628
+ }
1629
+ }
1630
+ layer {
1631
+ name: "conv16_2_mbox_loc_flat"
1632
+ type: "Flatten"
1633
+ bottom: "conv16_2_mbox_loc_perm"
1634
+ top: "conv16_2_mbox_loc_flat"
1635
+ flatten_param {
1636
+ axis: 1
1637
+ }
1638
+ }
1639
+ layer {
1640
+ name: "conv16_2_mbox_conf"
1641
+ type: "Convolution"
1642
+ bottom: "conv16_2"
1643
+ top: "conv16_2_mbox_conf"
1644
+ param {
1645
+ lr_mult: 1.0
1646
+ decay_mult: 1.0
1647
+ }
1648
+ param {
1649
+ lr_mult: 2.0
1650
+ decay_mult: 0.0
1651
+ }
1652
+ convolution_param {
1653
+ num_output: 126
1654
+ kernel_size: 1
1655
+ weight_filler {
1656
+ type: "msra"
1657
+ }
1658
+ bias_filler {
1659
+ type: "constant"
1660
+ value: 0.0
1661
+ }
1662
+ }
1663
+ }
1664
+ layer {
1665
+ name: "conv16_2_mbox_conf_perm"
1666
+ type: "Permute"
1667
+ bottom: "conv16_2_mbox_conf"
1668
+ top: "conv16_2_mbox_conf_perm"
1669
+ permute_param {
1670
+ order: 0
1671
+ order: 2
1672
+ order: 3
1673
+ order: 1
1674
+ }
1675
+ }
1676
+ layer {
1677
+ name: "conv16_2_mbox_conf_flat"
1678
+ type: "Flatten"
1679
+ bottom: "conv16_2_mbox_conf_perm"
1680
+ top: "conv16_2_mbox_conf_flat"
1681
+ flatten_param {
1682
+ axis: 1
1683
+ }
1684
+ }
1685
+ layer {
1686
+ name: "conv16_2_mbox_priorbox"
1687
+ type: "PriorBox"
1688
+ bottom: "conv16_2"
1689
+ bottom: "data"
1690
+ top: "conv16_2_mbox_priorbox"
1691
+ prior_box_param {
1692
+ min_size: 240.0
1693
+ max_size: 285.0
1694
+ aspect_ratio: 2.0
1695
+ aspect_ratio: 3.0
1696
+ flip: true
1697
+ clip: false
1698
+ variance: 0.1
1699
+ variance: 0.1
1700
+ variance: 0.2
1701
+ variance: 0.2
1702
+ offset: 0.5
1703
+ }
1704
+ }
1705
+ layer {
1706
+ name: "conv17_2_mbox_loc"
1707
+ type: "Convolution"
1708
+ bottom: "conv17_2"
1709
+ top: "conv17_2_mbox_loc"
1710
+ param {
1711
+ lr_mult: 1.0
1712
+ decay_mult: 1.0
1713
+ }
1714
+ param {
1715
+ lr_mult: 2.0
1716
+ decay_mult: 0.0
1717
+ }
1718
+ convolution_param {
1719
+ num_output: 24
1720
+ kernel_size: 1
1721
+ weight_filler {
1722
+ type: "msra"
1723
+ }
1724
+ bias_filler {
1725
+ type: "constant"
1726
+ value: 0.0
1727
+ }
1728
+ }
1729
+ }
1730
+ layer {
1731
+ name: "conv17_2_mbox_loc_perm"
1732
+ type: "Permute"
1733
+ bottom: "conv17_2_mbox_loc"
1734
+ top: "conv17_2_mbox_loc_perm"
1735
+ permute_param {
1736
+ order: 0
1737
+ order: 2
1738
+ order: 3
1739
+ order: 1
1740
+ }
1741
+ }
1742
+ layer {
1743
+ name: "conv17_2_mbox_loc_flat"
1744
+ type: "Flatten"
1745
+ bottom: "conv17_2_mbox_loc_perm"
1746
+ top: "conv17_2_mbox_loc_flat"
1747
+ flatten_param {
1748
+ axis: 1
1749
+ }
1750
+ }
1751
+ layer {
1752
+ name: "conv17_2_mbox_conf"
1753
+ type: "Convolution"
1754
+ bottom: "conv17_2"
1755
+ top: "conv17_2_mbox_conf"
1756
+ param {
1757
+ lr_mult: 1.0
1758
+ decay_mult: 1.0
1759
+ }
1760
+ param {
1761
+ lr_mult: 2.0
1762
+ decay_mult: 0.0
1763
+ }
1764
+ convolution_param {
1765
+ num_output: 126
1766
+ kernel_size: 1
1767
+ weight_filler {
1768
+ type: "msra"
1769
+ }
1770
+ bias_filler {
1771
+ type: "constant"
1772
+ value: 0.0
1773
+ }
1774
+ }
1775
+ }
1776
+ layer {
1777
+ name: "conv17_2_mbox_conf_perm"
1778
+ type: "Permute"
1779
+ bottom: "conv17_2_mbox_conf"
1780
+ top: "conv17_2_mbox_conf_perm"
1781
+ permute_param {
1782
+ order: 0
1783
+ order: 2
1784
+ order: 3
1785
+ order: 1
1786
+ }
1787
+ }
1788
+ layer {
1789
+ name: "conv17_2_mbox_conf_flat"
1790
+ type: "Flatten"
1791
+ bottom: "conv17_2_mbox_conf_perm"
1792
+ top: "conv17_2_mbox_conf_flat"
1793
+ flatten_param {
1794
+ axis: 1
1795
+ }
1796
+ }
1797
+ layer {
1798
+ name: "conv17_2_mbox_priorbox"
1799
+ type: "PriorBox"
1800
+ bottom: "conv17_2"
1801
+ bottom: "data"
1802
+ top: "conv17_2_mbox_priorbox"
1803
+ prior_box_param {
1804
+ min_size: 285.0
1805
+ max_size: 300.0
1806
+ aspect_ratio: 2.0
1807
+ aspect_ratio: 3.0
1808
+ flip: true
1809
+ clip: false
1810
+ variance: 0.1
1811
+ variance: 0.1
1812
+ variance: 0.2
1813
+ variance: 0.2
1814
+ offset: 0.5
1815
+ }
1816
+ }
1817
+ layer {
1818
+ name: "mbox_loc"
1819
+ type: "Concat"
1820
+ bottom: "conv11_mbox_loc_flat"
1821
+ bottom: "conv13_mbox_loc_flat"
1822
+ bottom: "conv14_2_mbox_loc_flat"
1823
+ bottom: "conv15_2_mbox_loc_flat"
1824
+ bottom: "conv16_2_mbox_loc_flat"
1825
+ bottom: "conv17_2_mbox_loc_flat"
1826
+ top: "mbox_loc"
1827
+ concat_param {
1828
+ axis: 1
1829
+ }
1830
+ }
1831
+ layer {
1832
+ name: "mbox_conf"
1833
+ type: "Concat"
1834
+ bottom: "conv11_mbox_conf_flat"
1835
+ bottom: "conv13_mbox_conf_flat"
1836
+ bottom: "conv14_2_mbox_conf_flat"
1837
+ bottom: "conv15_2_mbox_conf_flat"
1838
+ bottom: "conv16_2_mbox_conf_flat"
1839
+ bottom: "conv17_2_mbox_conf_flat"
1840
+ top: "mbox_conf"
1841
+ concat_param {
1842
+ axis: 1
1843
+ }
1844
+ }
1845
+ layer {
1846
+ name: "mbox_priorbox"
1847
+ type: "Concat"
1848
+ bottom: "conv11_mbox_priorbox"
1849
+ bottom: "conv13_mbox_priorbox"
1850
+ bottom: "conv14_2_mbox_priorbox"
1851
+ bottom: "conv15_2_mbox_priorbox"
1852
+ bottom: "conv16_2_mbox_priorbox"
1853
+ bottom: "conv17_2_mbox_priorbox"
1854
+ top: "mbox_priorbox"
1855
+ concat_param {
1856
+ axis: 2
1857
+ }
1858
+ }
1859
+ layer {
1860
+ name: "mbox_conf_reshape"
1861
+ type: "Reshape"
1862
+ bottom: "mbox_conf"
1863
+ top: "mbox_conf_reshape"
1864
+ reshape_param {
1865
+ shape {
1866
+ dim: 0
1867
+ dim: -1
1868
+ dim: 21
1869
+ }
1870
+ }
1871
+ }
1872
+ layer {
1873
+ name: "mbox_conf_softmax"
1874
+ type: "Softmax"
1875
+ bottom: "mbox_conf_reshape"
1876
+ top: "mbox_conf_softmax"
1877
+ softmax_param {
1878
+ axis: 2
1879
+ }
1880
+ }
1881
+ layer {
1882
+ name: "mbox_conf_flatten"
1883
+ type: "Flatten"
1884
+ bottom: "mbox_conf_softmax"
1885
+ top: "mbox_conf_flatten"
1886
+ flatten_param {
1887
+ axis: 1
1888
+ }
1889
+ }
1890
+ layer {
1891
+ name: "detection_out"
1892
+ type: "DetectionOutput"
1893
+ bottom: "mbox_loc"
1894
+ bottom: "mbox_conf_flatten"
1895
+ bottom: "mbox_priorbox"
1896
+ top: "detection_out"
1897
+ include {
1898
+ phase: TEST
1899
+ }
1900
+ detection_output_param {
1901
+ num_classes: 21
1902
+ share_location: true
1903
+ background_label_id: 0
1904
+ nms_param {
1905
+ nms_threshold: 0.45
1906
+ top_k: 100
1907
+ }
1908
+ code_type: CENTER_SIZE
1909
+ keep_top_k: 100
1910
+ confidence_threshold: 0.25
1911
+ }
1912
+ }
roi/pooler.py ADDED
@@ -0,0 +1,45 @@
1
+ from enum import Enum
2
+
3
+ import torch
4
+ from torch import Tensor
5
+ from torch.nn import functional as F
6
+
7
+ # from support.layer.roi_align import ROIAlign
8
+ from torchvision.ops import RoIAlign as ROIAlign
9
+
10
+
11
+ class Pooler(object):
12
+
13
+ class Mode(Enum):
14
+ POOLING = 'pooling'
15
+ ALIGN = 'align'
16
+
17
+ OPTIONS = ['pooling', 'align']
18
+
19
+ @staticmethod
20
+ def apply(features: Tensor, proposal_bboxes: Tensor, proposal_batch_indices: Tensor, mode: Mode) -> Tensor:
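+ # crops each proposal out of the feature map and reduces it to a fixed 7x7 feature:
+ # POOLING uses adaptive max-pooling over the cropped region, ALIGN uses torchvision's RoIAlign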
21
+ _, _, feature_map_height, feature_map_width = features.shape
22
+ scale = 1 / 16
23
+ output_size = (7 * 2, 7 * 2)
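+ # proposals are given in input-image coordinates, so they are scaled by 1/16 to match the
+ # feature-map resolution; pooling to 14x14 here, the final 2x2 max-pool below yields 7x7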
24
+
25
+ if mode == Pooler.Mode.POOLING:
26
+ pool = []
27
+ for (proposal_bbox, proposal_batch_index) in zip(proposal_bboxes, proposal_batch_indices):
28
+ start_x = max(min(round(proposal_bbox[0].item() * scale), feature_map_width - 1), 0)  # clamped to [0, feature_map_width)
29
+ start_y = max(min(round(proposal_bbox[1].item() * scale), feature_map_height - 1), 0)  # clamped to [0, feature_map_height)
30
+ end_x = max(min(round(proposal_bbox[2].item() * scale) + 1, feature_map_width), 1)  # clamped to (0, feature_map_width]
31
+ end_y = max(min(round(proposal_bbox[3].item() * scale) + 1, feature_map_height), 1)  # clamped to (0, feature_map_height]
32
+ roi_feature_map = features[proposal_batch_index, :, start_y:end_y, start_x:end_x]
33
+ pool.append(F.adaptive_max_pool2d(input=roi_feature_map, output_size=output_size))
34
+ pool = torch.stack(pool, dim=0)
35
+ elif mode == Pooler.Mode.ALIGN:
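+ # torchvision's RoIAlign accepts rois as a (K, 5) tensor: [batch_index, x1, y1, x2, y2] in input-image coordinates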
36
+ pool = ROIAlign(output_size, spatial_scale=scale, sampling_ratio=0)(
37
+ features,
38
+ torch.cat([proposal_batch_indices.view(-1, 1).float(), proposal_bboxes], dim=1)
39
+ )
40
+ else:
41
+ raise ValueError
42
+
43
+ pool = F.max_pool2d(input=pool, kernel_size=2, stride=2)
44
+ return pool
45
+
rpn/region_proposal_network.py ADDED
@@ -0,0 +1,169 @@
1
+ from typing import Tuple, List, Optional, Union
2
+
3
+ import numpy as np
4
+ import torch
5
+ from torch import nn, Tensor
6
+ from torch.nn import functional as F
7
+
8
+ from bbox import BBox
9
+ from extension.functional import beta_smooth_l1_loss
10
+ from torchvision.ops import nms
11
+
12
+
13
+ class RegionProposalNetwork(nn.Module):
14
+
15
+ def __init__(self, num_features_out: int, anchor_ratios: List[Tuple[int, int]], anchor_sizes: List[int],
16
+ pre_nms_top_n: int, post_nms_top_n: int, anchor_smooth_l1_loss_beta: float):
17
+ super().__init__()
18
+
19
+ self._features = nn.Sequential(
20
+ nn.Conv2d(in_channels=num_features_out, out_channels=512, kernel_size=3, padding=1),
21
+ nn.ReLU()
22
+ )
23
+
24
+ self._anchor_ratios = anchor_ratios
25
+ self._anchor_sizes = anchor_sizes
26
+
27
+ num_anchor_ratios = len(self._anchor_ratios)
28
+ num_anchor_sizes = len(self._anchor_sizes)
29
+ num_anchors = num_anchor_ratios * num_anchor_sizes
30
+
31
+ self._pre_nms_top_n = pre_nms_top_n
32
+ self._post_nms_top_n = post_nms_top_n
33
+ self._anchor_smooth_l1_loss_beta = anchor_smooth_l1_loss_beta
34
+
35
+ self._anchor_objectness = nn.Conv2d(in_channels=512, out_channels=num_anchors * 2, kernel_size=1)
36
+ self._anchor_transformer = nn.Conv2d(in_channels=512, out_channels=num_anchors * 4, kernel_size=1)
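+ # per anchor at each spatial position: 2 objectness logits (background/foreground) and 4 box-regression offsets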
37
+
38
+ def forward(self, features: Tensor,
39
+ anchor_bboxes: Optional[Tensor] = None, gt_bboxes_batch: Optional[Tensor] = None,
40
+ image_width: Optional[int]=None, image_height: Optional[int]=None) -> Union[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor, Tensor, Tensor]]:
41
+ batch_size = features.shape[0]
42
+
43
+ features = self._features(features)
44
+ anchor_objectnesses = self._anchor_objectness(features)
45
+ anchor_transformers = self._anchor_transformer(features)
46
+
47
+ anchor_objectnesses = anchor_objectnesses.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2)
48
+ anchor_transformers = anchor_transformers.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 4)
49
+
50
+ if not self.training:
51
+ return anchor_objectnesses, anchor_transformers
52
+ else:
53
+ # remove cross-boundary
54
+ # NOTE: len(inside_indices) is guaranteed to be a multiple of the batch size, since every image in the batch shares the same anchor boxes
55
+ inside_indices = BBox.inside(anchor_bboxes, left=0, top=0, right=image_width, bottom=image_height).nonzero().unbind(dim=1)
56
+ inside_anchor_bboxes = anchor_bboxes[inside_indices].view(batch_size, -1, anchor_bboxes.shape[2])
57
+ inside_anchor_objectnesses = anchor_objectnesses[inside_indices].view(batch_size, -1, anchor_objectnesses.shape[2])
58
+ inside_anchor_transformers = anchor_transformers[inside_indices].view(batch_size, -1, anchor_transformers.shape[2])
59
+
60
+ # find labels for each `anchor_bboxes`
61
+ labels = torch.full((batch_size, inside_anchor_bboxes.shape[1]), -1, dtype=torch.long, device=inside_anchor_bboxes.device)
62
+ ious = BBox.iou(inside_anchor_bboxes, gt_bboxes_batch)
63
+ anchor_max_ious, anchor_assignments = ious.max(dim=2)
64
+ gt_max_ious, gt_assignments = ious.max(dim=1)
65
+ anchor_additions = ((ious > 0) & (ious == gt_max_ious.unsqueeze(dim=1))).nonzero()[:, :2].unbind(dim=1)
66
+ labels[anchor_max_ious < 0.3] = 0
67
+ labels[anchor_additions] = 1
68
+ labels[anchor_max_ious >= 0.7] = 1
69
+
70
+ # select 256 x `batch_size` samples
71
+ fg_indices = (labels == 1).nonzero()
72
+ bg_indices = (labels == 0).nonzero()
73
+ fg_indices = fg_indices[torch.randperm(len(fg_indices))[:min(len(fg_indices), 256 * batch_size)]]
74
+ bg_indices = bg_indices[torch.randperm(len(bg_indices))[:256 * batch_size - len(fg_indices)]]
75
+ selected_indices = torch.cat([fg_indices, bg_indices], dim=0)
76
+ selected_indices = selected_indices[torch.randperm(len(selected_indices))].unbind(dim=1)
77
+
78
+ inside_anchor_bboxes = inside_anchor_bboxes[selected_indices]
79
+ gt_bboxes = gt_bboxes_batch[selected_indices[0], anchor_assignments[selected_indices]]
80
+ gt_anchor_objectnesses = labels[selected_indices]
81
+ gt_anchor_transformers = BBox.calc_transformer(inside_anchor_bboxes, gt_bboxes)
82
+ batch_indices = selected_indices[0]
83
+
84
+ anchor_objectness_losses, anchor_transformer_losses = self.loss(inside_anchor_objectnesses[selected_indices],
85
+ inside_anchor_transformers[selected_indices],
86
+ gt_anchor_objectnesses,
87
+ gt_anchor_transformers,
88
+ batch_size, batch_indices)
89
+
90
+ return anchor_objectnesses, anchor_transformers, anchor_objectness_losses, anchor_transformer_losses
91
+
92
+ def loss(self, anchor_objectnesses: Tensor, anchor_transformers: Tensor,
93
+ gt_anchor_objectnesses: Tensor, gt_anchor_transformers: Tensor,
94
+ batch_size: int, batch_indices: Tensor) -> Tuple[Tensor, Tensor]:
95
+ cross_entropies = torch.empty(batch_size, dtype=torch.float, device=anchor_objectnesses.device)
96
+ smooth_l1_losses = torch.empty(batch_size, dtype=torch.float, device=anchor_transformers.device)
97
+
98
+ for batch_index in range(batch_size):
99
+ selected_indices = (batch_indices == batch_index).nonzero().view(-1)
100
+
101
+ cross_entropy = F.cross_entropy(input=anchor_objectnesses[selected_indices],
102
+ target=gt_anchor_objectnesses[selected_indices])
103
+
104
+ fg_indices = gt_anchor_objectnesses[selected_indices].nonzero().view(-1)
105
+ smooth_l1_loss = beta_smooth_l1_loss(input=anchor_transformers[selected_indices][fg_indices],
106
+ target=gt_anchor_transformers[selected_indices][fg_indices],
107
+ beta=self._anchor_smooth_l1_loss_beta)
108
+
109
+ cross_entropies[batch_index] = cross_entropy
110
+ smooth_l1_losses[batch_index] = smooth_l1_loss
111
+
112
+ return cross_entropies, smooth_l1_losses
113
+
114
+ def generate_anchors(self, image_width: int, image_height: int, num_x_anchors: int, num_y_anchors: int) -> Tensor:
115
+ center_ys = np.linspace(start=0, stop=image_height, num=num_y_anchors + 2)[1:-1]
116
+ center_xs = np.linspace(start=0, stop=image_width, num=num_x_anchors + 2)[1:-1]
117
+ ratios = np.array(self._anchor_ratios)
118
+ ratios = ratios[:, 0] / ratios[:, 1]
119
+ sizes = np.array(self._anchor_sizes)
120
+
121
+ # NOTE: it's important to let `center_ys` be the major index (i.e., move horizontally and then vertically) for consistency with 2D convolution
122
+ # giving the string 'ij' returns a meshgrid with matrix indexing, i.e., with shape (#center_ys, #center_xs, #ratios, #sizes)
123
+ center_ys, center_xs, ratios, sizes = np.meshgrid(center_ys, center_xs, ratios, sizes, indexing='ij')
124
+
125
+ center_ys = center_ys.reshape(-1)
126
+ center_xs = center_xs.reshape(-1)
127
+ ratios = ratios.reshape(-1)
128
+ sizes = sizes.reshape(-1)
129
+
130
+ widths = sizes * np.sqrt(1 / ratios)
131
+ heights = sizes * np.sqrt(ratios)
132
+
133
+ center_based_anchor_bboxes = np.stack((center_xs, center_ys, widths, heights), axis=1)
134
+ center_based_anchor_bboxes = torch.from_numpy(center_based_anchor_bboxes).float()
135
+ anchor_bboxes = BBox.from_center_base(center_based_anchor_bboxes)
136
+
137
+ return anchor_bboxes
138
+
139
+ def generate_proposals(self, anchor_bboxes: Tensor, objectnesses: Tensor, transformers: Tensor, image_width: int, image_height: int) -> Tensor:
140
+ batch_size = anchor_bboxes.shape[0]
141
+
142
+ proposal_bboxes = BBox.apply_transformer(anchor_bboxes, transformers)
143
+ proposal_bboxes = BBox.clip(proposal_bboxes, left=0, top=0, right=image_width, bottom=image_height)
144
+ proposal_probs = F.softmax(objectnesses[:, :, 1], dim=-1)
145
+
146
+ _, sorted_indices = torch.sort(proposal_probs, dim=-1, descending=True)
147
+ nms_proposal_bboxes_batch = []
148
+
149
+ for batch_index in range(batch_size):
150
+ sorted_bboxes = proposal_bboxes[batch_index][sorted_indices[batch_index]][:self._pre_nms_top_n]
151
+ sorted_probs = proposal_probs[batch_index][sorted_indices[batch_index]][:self._pre_nms_top_n]
152
+ threshold = 0.7
153
+ kept_indices = nms(sorted_bboxes, sorted_probs, threshold)
154
+ nms_bboxes = sorted_bboxes[kept_indices][:self._post_nms_top_n]
155
+ nms_proposal_bboxes_batch.append(nms_bboxes)
156
+
157
+ max_nms_proposal_bboxes_length = max([len(it) for it in nms_proposal_bboxes_batch])
158
+ padded_proposal_bboxes = []
159
+
160
+ for nms_proposal_bboxes in nms_proposal_bboxes_batch:
161
+ padded_proposal_bboxes.append(
162
+ torch.cat([
163
+ nms_proposal_bboxes,
164
+ torch.zeros(max_nms_proposal_bboxes_length - len(nms_proposal_bboxes), 4).to(nms_proposal_bboxes)
165
+ ])
166
+ )
167
+
168
+ padded_proposal_bboxes = torch.stack(padded_proposal_bboxes, dim=0)
169
+ return padded_proposal_bboxes
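Taken together, the module above is driven in three steps at inference time: generate_anchors lays out a grid of anchor boxes over the image, forward scores and regresses each anchor from the backbone features, and generate_proposals decodes, clips, and NMS-filters the results into a padded batch of proposal boxes. A minimal, hypothetical usage sketch (the backbone channel count, anchor settings, image size, and feature-map size below are illustrative assumptions, not values taken from this repository's configuration):

    import torch

    from rpn.region_proposal_network import RegionProposalNetwork

    # assumed settings -- the ratio pairs and pixel sizes are placeholders
    rpn = RegionProposalNetwork(num_features_out=1024,
                                anchor_ratios=[(1, 2), (1, 1), (2, 1)],
                                anchor_sizes=[128, 256, 512],
                                pre_nms_top_n=6000, post_nms_top_n=300,
                                anchor_smooth_l1_loss_beta=1.0)
    rpn.eval()  # outside training, forward returns only objectnesses and transformers

    image_width, image_height = 800, 600
    features = torch.randn(1, 1024, 38, 50)  # assumed backbone output: (batch, C, H/16, W/16)

    # one anchor per feature-map cell, ratio, and size combination (batch dimension added)
    anchor_bboxes = rpn.generate_anchors(image_width, image_height,
                                         num_x_anchors=features.shape[3],
                                         num_y_anchors=features.shape[2]).unsqueeze(dim=0)

    with torch.no_grad():
        objectnesses, transformers = rpn(features)
        proposals = rpn.generate_proposals(anchor_bboxes, objectnesses, transformers,
                                           image_width, image_height)
    # proposals: (1, <=post_nms_top_n, 4) boxes in (left, top, right, bottom) form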