sadimanna committed
Commit d6def08 · 1 Parent(s): 451fc17

Upload 20 files

backbone/base.py ADDED
@@ -0,0 +1,29 @@
+from typing import Tuple, Type
+
+from torch import nn
+
+
+class Base(object):
+
+    OPTIONS = ['resnet18', 'resnet50', 'resnet101']
+
+    @staticmethod
+    def from_name(name: str) -> Type['Base']:
+        if name == 'resnet18':
+            from backbone.resnet18 import ResNet18
+            return ResNet18
+        elif name == 'resnet50':
+            from backbone.resnet50 import ResNet50
+            return ResNet50
+        elif name == 'resnet101':
+            from backbone.resnet101 import ResNet101
+            return ResNet101
+        else:
+            raise ValueError
+
+    def __init__(self, pretrained: bool):
+        super().__init__()
+        self._pretrained = pretrained
+
+    def features(self) -> Tuple[nn.Module, nn.Module, int, int]:
+        raise NotImplementedError
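
Editor's note — a minimal usage sketch (not part of the commit) of how the backbone registry above is typically consumed. Only `Base.from_name`, the constructor and `features()` come from the file; the call site, variable names and the remark about how the detector uses the two halves are assumptions.

    from backbone.base import Base

    backbone_class = Base.from_name('resnet50')      # one of Base.OPTIONS
    backbone = backbone_class(pretrained=True)       # load ImageNet-pretrained weights
    features, hidden, num_features_out, num_hidden_out = backbone.features()
    # `features` is the truncated ResNet up to its penultimate stage (1024 channels for
    # ResNet-50/101, 256 for ResNet-18); `hidden` is the final residual stage, presumably
    # applied per pooled RoI by the detection head that consumes this API.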
backbone/resnet101.py ADDED
@@ -0,0 +1,37 @@
+from typing import Tuple
+
+import torchvision
+from torch import nn
+
+import backbone.base
+
+
+class ResNet101(backbone.base.Base):
+
+    def __init__(self, pretrained: bool):
+        super().__init__(pretrained)
+
+    def features(self) -> Tuple[nn.Module, nn.Module, int, int]:
+        resnet101 = torchvision.models.resnet101(pretrained=self._pretrained)
+
+        # list(resnet101.children()) consists of following modules
+        #   [0] = Conv2d, [1] = BatchNorm2d, [2] = ReLU,
+        #   [3] = MaxPool2d, [4] = Sequential(Bottleneck...),
+        #   [5] = Sequential(Bottleneck...),
+        #   [6] = Sequential(Bottleneck...),
+        #   [7] = Sequential(Bottleneck...),
+        #   [8] = AvgPool2d, [9] = Linear
+        children = list(resnet101.children())
+        features = children[:-3]
+        num_features_out = 1024
+
+        hidden = children[-3]
+        num_hidden_out = 2048
+
+        for parameters in [feature.parameters() for i, feature in enumerate(features) if i <= 4]:
+            for parameter in parameters:
+                parameter.requires_grad = False
+
+        features = nn.Sequential(*features)
+
+        return features, hidden, num_features_out, num_hidden_out
backbone/resnet18.py ADDED
@@ -0,0 +1,37 @@
+from typing import Tuple
+
+import torchvision
+from torch import nn
+
+import backbone.base
+
+
+class ResNet18(backbone.base.Base):
+
+    def __init__(self, pretrained: bool):
+        super().__init__(pretrained)
+
+    def features(self) -> Tuple[nn.Module, nn.Module, int, int]:
+        resnet18 = torchvision.models.resnet18(pretrained=self._pretrained)
+
+        # list(resnet18.children()) consists of following modules
+        #   [0] = Conv2d, [1] = BatchNorm2d, [2] = ReLU,
+        #   [3] = MaxPool2d, [4] = Sequential(BasicBlock...),
+        #   [5] = Sequential(BasicBlock...),
+        #   [6] = Sequential(BasicBlock...),
+        #   [7] = Sequential(BasicBlock...),
+        #   [8] = AvgPool2d, [9] = Linear
+        children = list(resnet18.children())
+        features = children[:-3]
+        num_features_out = 256
+
+        hidden = children[-3]
+        num_hidden_out = 512
+
+        for parameters in [feature.parameters() for i, feature in enumerate(features) if i <= 4]:
+            for parameter in parameters:
+                parameter.requires_grad = False
+
+        features = nn.Sequential(*features)
+
+        return features, hidden, num_features_out, num_hidden_out
backbone/resnet50.py ADDED
@@ -0,0 +1,37 @@
+from typing import Tuple
+
+import torchvision
+from torch import nn
+
+import backbone.base
+
+
+class ResNet50(backbone.base.Base):
+
+    def __init__(self, pretrained: bool):
+        super().__init__(pretrained)
+
+    def features(self) -> Tuple[nn.Module, nn.Module, int, int]:
+        resnet50 = torchvision.models.resnet50(pretrained=self._pretrained)
+
+        # list(resnet50.children()) consists of following modules
+        #   [0] = Conv2d, [1] = BatchNorm2d, [2] = ReLU,
+        #   [3] = MaxPool2d, [4] = Sequential(Bottleneck...),
+        #   [5] = Sequential(Bottleneck...),
+        #   [6] = Sequential(Bottleneck...),
+        #   [7] = Sequential(Bottleneck...),
+        #   [8] = AvgPool2d, [9] = Linear
+        children = list(resnet50.children())
+        features = children[:-3]
+        num_features_out = 1024
+
+        hidden = children[-3]
+        num_hidden_out = 2048
+
+        for parameters in [feature.parameters() for i, feature in enumerate(features) if i <= 4]:
+            for parameter in parameters:
+                parameter.requires_grad = False
+
+        features = nn.Sequential(*features)
+
+        return features, hidden, num_features_out, num_hidden_out
config/config.py ADDED
@@ -0,0 +1,37 @@
+import ast
+from typing import Tuple, List
+
+from roi.pooler import Pooler
+
+
+class Config(object):
+
+    IMAGE_MIN_SIDE: float = 600.0
+    IMAGE_MAX_SIDE: float = 1000.0
+
+    ANCHOR_RATIOS: List[Tuple[int, int]] = [(1, 2), (1, 1), (2, 1)]
+    ANCHOR_SIZES: List[int] = [128, 256, 512]
+    POOLER_MODE: Pooler.Mode = Pooler.Mode.POOLING
+
+    @classmethod
+    def describe(cls):
+        text = '\nConfig:\n'
+        attrs = [attr for attr in dir(cls) if not callable(getattr(cls, attr)) and not attr.startswith('__')]
+        text += '\n'.join(['\t{:s} = {:s}'.format(attr, str(getattr(cls, attr))) for attr in attrs]) + '\n'
+
+        return text
+
+    @classmethod
+    def setup(cls, image_min_side: float = None, image_max_side: float = None,
+              anchor_ratios: List[Tuple[int, int]] = None, anchor_sizes: List[int] = None, pooler_mode: str = None):
+        if image_min_side is not None:
+            cls.IMAGE_MIN_SIDE = image_min_side
+        if image_max_side is not None:
+            cls.IMAGE_MAX_SIDE = image_max_side
+
+        if anchor_ratios is not None:
+            cls.ANCHOR_RATIOS = ast.literal_eval(anchor_ratios)
+        if anchor_sizes is not None:
+            cls.ANCHOR_SIZES = ast.literal_eval(anchor_sizes)
+        if pooler_mode is not None:
+            cls.POOLER_MODE = Pooler.Mode(pooler_mode)
config/eval_config.py ADDED
@@ -0,0 +1,20 @@
+from typing import List, Tuple
+
+from config.config import Config
+
+
+class EvalConfig(Config):
+
+    RPN_PRE_NMS_TOP_N: int = 6000
+    RPN_POST_NMS_TOP_N: int = 300
+
+    @classmethod
+    def setup(cls, image_min_side: float = None, image_max_side: float = None,
+              anchor_ratios: List[Tuple[int, int]] = None, anchor_sizes: List[int] = None, pooler_mode: str = None,
+              rpn_pre_nms_top_n: int = None, rpn_post_nms_top_n: int = None):
+        super().setup(image_min_side, image_max_side, anchor_ratios, anchor_sizes, pooler_mode)
+
+        if rpn_pre_nms_top_n is not None:
+            cls.RPN_PRE_NMS_TOP_N = rpn_pre_nms_top_n
+        if rpn_post_nms_top_n is not None:
+            cls.RPN_POST_NMS_TOP_N = rpn_post_nms_top_n
config/train_config.py ADDED
@@ -0,0 +1,71 @@
+import ast
+from typing import List, Tuple
+
+from config.config import Config
+
+
+class TrainConfig(Config):
+
+    RPN_PRE_NMS_TOP_N: int = 12000
+    RPN_POST_NMS_TOP_N: int = 2000
+
+    ANCHOR_SMOOTH_L1_LOSS_BETA: float = 1.0
+    PROPOSAL_SMOOTH_L1_LOSS_BETA: float = 1.0
+
+    BATCH_SIZE: int = 1
+    LEARNING_RATE: float = 0.001
+    MOMENTUM: float = 0.9
+    WEIGHT_DECAY: float = 0.0005
+    STEP_LR_SIZES: List[int] = [50000, 70000]
+    STEP_LR_GAMMA: float = 0.1
+    WARM_UP_FACTOR: float = 0.3333
+    WARM_UP_NUM_ITERS: int = 500
+
+    NUM_STEPS_TO_DISPLAY: int = 20
+    NUM_STEPS_TO_SNAPSHOT: int = 10000
+    NUM_STEPS_TO_FINISH: int = 90000
+
+    @classmethod
+    def setup(cls, image_min_side: float = None, image_max_side: float = None,
+              anchor_ratios: List[Tuple[int, int]] = None, anchor_sizes: List[int] = None, pooler_mode: str = None,
+              rpn_pre_nms_top_n: int = None, rpn_post_nms_top_n: int = None,
+              anchor_smooth_l1_loss_beta: float = None, proposal_smooth_l1_loss_beta: float = None,
+              batch_size: int = None, learning_rate: float = None, momentum: float = None, weight_decay: float = None,
+              step_lr_sizes: List[int] = None, step_lr_gamma: float = None,
+              warm_up_factor: float = None, warm_up_num_iters: int = None,
+              num_steps_to_display: int = None, num_steps_to_snapshot: int = None, num_steps_to_finish: int = None):
+        super().setup(image_min_side, image_max_side, anchor_ratios, anchor_sizes, pooler_mode)
+
+        if rpn_pre_nms_top_n is not None:
+            cls.RPN_PRE_NMS_TOP_N = rpn_pre_nms_top_n
+        if rpn_post_nms_top_n is not None:
+            cls.RPN_POST_NMS_TOP_N = rpn_post_nms_top_n
+
+        if anchor_smooth_l1_loss_beta is not None:
+            cls.ANCHOR_SMOOTH_L1_LOSS_BETA = anchor_smooth_l1_loss_beta
+        if proposal_smooth_l1_loss_beta is not None:
+            cls.PROPOSAL_SMOOTH_L1_LOSS_BETA = proposal_smooth_l1_loss_beta
+
+        if batch_size is not None:
+            cls.BATCH_SIZE = batch_size
+        if learning_rate is not None:
+            cls.LEARNING_RATE = learning_rate
+        if momentum is not None:
+            cls.MOMENTUM = momentum
+        if weight_decay is not None:
+            cls.WEIGHT_DECAY = weight_decay
+        if step_lr_sizes is not None:
+            cls.STEP_LR_SIZES = ast.literal_eval(step_lr_sizes)
+        if step_lr_gamma is not None:
+            cls.STEP_LR_GAMMA = step_lr_gamma
+        if warm_up_factor is not None:
+            cls.WARM_UP_FACTOR = warm_up_factor
+        if warm_up_num_iters is not None:
+            cls.WARM_UP_NUM_ITERS = warm_up_num_iters
+
+        if num_steps_to_display is not None:
+            cls.NUM_STEPS_TO_DISPLAY = num_steps_to_display
+        if num_steps_to_snapshot is not None:
+            cls.NUM_STEPS_TO_SNAPSHOT = num_steps_to_snapshot
+        if num_steps_to_finish is not None:
+            cls.NUM_STEPS_TO_FINISH = num_steps_to_finish
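
Editor's note — a small sketch (not part of the commit) of how `TrainConfig.setup` appears intended to be called, e.g. from a command-line entry point. The list-valued options are expected as strings and parsed with `ast.literal_eval`; the concrete values below are illustrative assumptions only.

    from config.train_config import TrainConfig

    TrainConfig.setup(image_min_side=600.0, image_max_side=1000.0,
                      anchor_sizes='[64, 128, 256, 512]',    # passed as a string, not a list
                      step_lr_sizes='[60000, 80000]',        # passed as a string, not a list
                      learning_rate=0.0025, batch_size=2)
    print(TrainConfig.describe())                            # dump the effective configuration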
dataset/base.py ADDED
@@ -0,0 +1,155 @@
+import random
+from enum import Enum
+from typing import Tuple, List, Type, Iterator
+
+import PIL
+import torch.utils.data.dataset
+import torch.utils.data.sampler
+from PIL import Image
+from torch import Tensor
+from torch.nn import functional as F
+from torchvision.transforms import transforms
+
+
+class Base(torch.utils.data.dataset.Dataset):
+
+    class Mode(Enum):
+        TRAIN = 'train'
+        EVAL = 'eval'
+
+    OPTIONS = ['voc2007', 'coco2017', 'voc2007-cat-dog', 'coco2017-person', 'coco2017-car', 'coco2017-animal']
+
+    @staticmethod
+    def from_name(name: str) -> Type['Base']:
+        if name == 'voc2007':
+            from dataset.voc2007 import VOC2007
+            return VOC2007
+        elif name == 'coco2017':
+            from dataset.coco2017 import COCO2017
+            return COCO2017
+        elif name == 'voc2007-cat-dog':
+            from dataset.voc2007_cat_dog import VOC2007CatDog
+            return VOC2007CatDog
+        elif name == 'coco2017-person':
+            from dataset.coco2017_person import COCO2017Person
+            return COCO2017Person
+        elif name == 'coco2017-car':
+            from dataset.coco2017_car import COCO2017Car
+            return COCO2017Car
+        elif name == 'coco2017-animal':
+            from dataset.coco2017_animal import COCO2017Animal
+            return COCO2017Animal
+        else:
+            raise ValueError
+
+    def __init__(self, path_to_data_dir: str, mode: Mode, image_min_side: float, image_max_side: float):
+        self._path_to_data_dir = path_to_data_dir
+        self._mode = mode
+        self._image_min_side = image_min_side
+        self._image_max_side = image_max_side
+
+    def __len__(self) -> int:
+        raise NotImplementedError
+
+    def __getitem__(self, index: int) -> Tuple[str, Tensor, Tensor, Tensor, Tensor]:
+        raise NotImplementedError
+
+    def evaluate(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]) -> Tuple[float, str]:
+        raise NotImplementedError
+
+    def _write_results(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]):
+        raise NotImplementedError
+
+    @property
+    def image_ratios(self) -> List[float]:
+        raise NotImplementedError
+
+    @staticmethod
+    def num_classes() -> int:
+        raise NotImplementedError
+
+    @staticmethod
+    def preprocess(image: PIL.Image.Image, image_min_side: float, image_max_side: float) -> Tuple[Tensor, float]:
+        # resize according to the rules:
+        #   1. scale shorter side to IMAGE_MIN_SIDE
+        #   2. after scaling, if longer side > IMAGE_MAX_SIDE, scale longer side to IMAGE_MAX_SIDE
+        scale_for_shorter_side = image_min_side / min(image.width, image.height)
+        longer_side_after_scaling = max(image.width, image.height) * scale_for_shorter_side
+        scale_for_longer_side = (image_max_side / longer_side_after_scaling) if longer_side_after_scaling > image_max_side else 1
+        scale = scale_for_shorter_side * scale_for_longer_side
+
+        transform = transforms.Compose([
+            transforms.Resize((round(image.height * scale), round(image.width * scale))),  # interpolation `BILINEAR` is applied by default
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        ])
+        image = transform(image)
+
+        return image, scale
+
+    @staticmethod
+    def padding_collate_fn(batch: List[Tuple[str, Tensor, Tensor, Tensor, Tensor]]) -> Tuple[List[str], Tensor, Tensor, Tensor, Tensor]:
+        image_id_batch, image_batch, scale_batch, bboxes_batch, labels_batch = zip(*batch)
+
+        max_image_width = max([it.shape[2] for it in image_batch])
+        max_image_height = max([it.shape[1] for it in image_batch])
+        max_bboxes_length = max([len(it) for it in bboxes_batch])
+        max_labels_length = max([len(it) for it in labels_batch])
+
+        padded_image_batch = []
+        padded_bboxes_batch = []
+        padded_labels_batch = []
+
+        for image in image_batch:
+            padded_image = F.pad(input=image, pad=(0, max_image_width - image.shape[2], 0, max_image_height - image.shape[1]))  # pad has format (left, right, top, bottom)
+            padded_image_batch.append(padded_image)
+
+        for bboxes in bboxes_batch:
+            padded_bboxes = torch.cat([bboxes, torch.zeros(max_bboxes_length - len(bboxes), 4).to(bboxes)])
+            padded_bboxes_batch.append(padded_bboxes)
+
+        for labels in labels_batch:
+            padded_labels = torch.cat([labels, torch.zeros(max_labels_length - len(labels)).to(labels)])
+            padded_labels_batch.append(padded_labels)
+
+        image_id_batch = list(image_id_batch)
+        padded_image_batch = torch.stack(padded_image_batch, dim=0)
+        scale_batch = torch.stack(scale_batch, dim=0)
+        padded_bboxes_batch = torch.stack(padded_bboxes_batch, dim=0)
+        padded_labels_batch = torch.stack(padded_labels_batch, dim=0)
+
+        return image_id_batch, padded_image_batch, scale_batch, padded_bboxes_batch, padded_labels_batch
+
+    class NearestRatioRandomSampler(torch.utils.data.sampler.Sampler):
+
+        def __init__(self, image_ratios: List[float], num_neighbors: int):
+            super().__init__(data_source=None)
+            self._image_ratios = image_ratios
+            self._num_neighbors = num_neighbors
+
+        def __len__(self) -> int:
+            return len(self._image_ratios)
+
+        def __iter__(self) -> Iterator[int]:
+            image_ratios = torch.tensor(self._image_ratios)
+            tall_indices = (image_ratios < 1).nonzero().view(-1)
+            fat_indices = (image_ratios >= 1).nonzero().view(-1)
+
+            tall_indices_length = len(tall_indices)
+            fat_indices_length = len(fat_indices)
+
+            tall_indices = tall_indices[torch.randperm(tall_indices_length)]
+            fat_indices = fat_indices[torch.randperm(fat_indices_length)]
+
+            num_tall_remainder = tall_indices_length % self._num_neighbors
+            num_fat_remainder = fat_indices_length % self._num_neighbors
+
+            tall_indices = tall_indices[:tall_indices_length - num_tall_remainder]
+            fat_indices = fat_indices[:fat_indices_length - num_fat_remainder]
+
+            tall_indices = tall_indices.view(-1, self._num_neighbors)
+            fat_indices = fat_indices.view(-1, self._num_neighbors)
+            merge_indices = torch.cat([tall_indices, fat_indices], dim=0)
+            merge_indices = merge_indices[torch.randperm(len(merge_indices))].view(-1)
+
+            return iter(merge_indices.tolist())
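
Editor's note — a minimal sketch (not part of the commit) wiring a concrete dataset into a `DataLoader` with the aspect-ratio-aware sampler and the zero-padding collate function above. The data directory and batch size are assumptions, and the sampler is referenced as a nested class of `Base`, matching the layout shown in this file.

    from torch.utils.data import DataLoader
    from dataset.base import Base as DatasetBase

    dataset_class = DatasetBase.from_name('voc2007')
    dataset = dataset_class('data', DatasetBase.Mode.TRAIN, image_min_side=600.0, image_max_side=1000.0)
    sampler = DatasetBase.NearestRatioRandomSampler(dataset.image_ratios, num_neighbors=2)
    loader = DataLoader(dataset, batch_size=2, sampler=sampler,
                        collate_fn=DatasetBase.padding_collate_fn)
    for image_ids, images, scales, bboxes, labels in loader:
        # images: (B, 3, H, W), zero-padded to the largest height/width in the batch;
        # bboxes/labels are zero-padded along the object dimension in the same way.
        break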
dataset/coco2017.py ADDED
@@ -0,0 +1,212 @@
+import json
+import os
+import pickle
+import random
+from typing import List, Tuple, Dict
+
+import torch
+import torch.utils.data.dataset
+from PIL import Image, ImageOps
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from torch import Tensor
+from torchvision.datasets import CocoDetection
+from tqdm import tqdm
+
+from bbox import BBox
+from dataset.base import Base
+from io import StringIO
+import sys
+
+
+class COCO2017(Base):
+
+    class Annotation(object):
+        class Object(object):
+            def __init__(self, bbox: BBox, label: int):
+                super().__init__()
+                self.bbox = bbox
+                self.label = label
+
+            def __repr__(self) -> str:
+                return 'Object[label={:d}, bbox={!s}]'.format(
+                    self.label, self.bbox)
+
+        def __init__(self, filename: str, objects: List[Object]):
+            super().__init__()
+            self.filename = filename
+            self.objects = objects
+
+    CATEGORY_TO_LABEL_DICT = {
+        'background': 0, 'person': 1, 'bicycle': 2, 'car': 3, 'motorcycle': 4,
+        'airplane': 5, 'bus': 6, 'train': 7, 'truck': 8, 'boat': 9,
+        'traffic light': 10, 'fire hydrant': 11, 'street sign': 12, 'stop sign': 13, 'parking meter': 14,
+        'bench': 15, 'bird': 16, 'cat': 17, 'dog': 18, 'horse': 19,
+        'sheep': 20, 'cow': 21, 'elephant': 22, 'bear': 23, 'zebra': 24,
+        'giraffe': 25, 'hat': 26, 'backpack': 27, 'umbrella': 28, 'shoe': 29,
+        'eye glasses': 30, 'handbag': 31, 'tie': 32, 'suitcase': 33, 'frisbee': 34,
+        'skis': 35, 'snowboard': 36, 'sports ball': 37, 'kite': 38, 'baseball bat': 39,
+        'baseball glove': 40, 'skateboard': 41, 'surfboard': 42, 'tennis racket': 43, 'bottle': 44,
+        'plate': 45, 'wine glass': 46, 'cup': 47, 'fork': 48, 'knife': 49,
+        'spoon': 50, 'bowl': 51, 'banana': 52, 'apple': 53, 'sandwich': 54,
+        'orange': 55, 'broccoli': 56, 'carrot': 57, 'hot dog': 58, 'pizza': 59,
+        'donut': 60, 'cake': 61, 'chair': 62, 'couch': 63, 'potted plant': 64,
+        'bed': 65, 'mirror': 66, 'dining table': 67, 'window': 68, 'desk': 69,
+        'toilet': 70, 'door': 71, 'tv': 72, 'laptop': 73, 'mouse': 74,
+        'remote': 75, 'keyboard': 76, 'cell phone': 77, 'microwave': 78, 'oven': 79,
+        'toaster': 80, 'sink': 81, 'refrigerator': 82, 'blender': 83, 'book': 84,
+        'clock': 85, 'vase': 86, 'scissors': 87, 'teddy bear': 88, 'hair drier': 89,
+        'toothbrush': 90, 'hair brush': 91
+    }
+
+    LABEL_TO_CATEGORY_DICT = {v: k for k, v in CATEGORY_TO_LABEL_DICT.items()}
+
+    def __init__(self, path_to_data_dir: str, mode: Base.Mode, image_min_side: float, image_max_side: float):
+        super().__init__(path_to_data_dir, mode, image_min_side, image_max_side)
+
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_caches_dir = os.path.join('caches', 'coco2017', f'{self._mode.value}')
+        path_to_image_ids_pickle = os.path.join(path_to_caches_dir, 'image-ids.pkl')
+        path_to_image_id_dict_pickle = os.path.join(path_to_caches_dir, 'image-id-dict.pkl')
+        path_to_image_ratios_pickle = os.path.join(path_to_caches_dir, 'image-ratios.pkl')
+
+        if self._mode == COCO2017.Mode.TRAIN:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'train2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_train2017.json')
+        elif self._mode == COCO2017.Mode.EVAL:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'val2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+        else:
+            raise ValueError('invalid mode')
+
+        coco_dataset = CocoDetection(root=path_to_jpeg_images_dir, annFile=path_to_annotation)
+
+        if os.path.exists(path_to_image_ids_pickle) and os.path.exists(path_to_image_id_dict_pickle):
+            print('loading cache files...')
+
+            with open(path_to_image_ids_pickle, 'rb') as f:
+                self._image_ids = pickle.load(f)
+
+            with open(path_to_image_id_dict_pickle, 'rb') as f:
+                self._image_id_to_annotation_dict = pickle.load(f)
+
+            with open(path_to_image_ratios_pickle, 'rb') as f:
+                self._image_ratios = pickle.load(f)
+        else:
+            print('generating cache files...')
+
+            os.makedirs(path_to_caches_dir, exist_ok=True)
+
+            self._image_ids: List[str] = []
+            self._image_id_to_annotation_dict: Dict[str, COCO2017.Annotation] = {}
+            self._image_ratios = []
+
+            for idx, (image, annotation) in enumerate(tqdm(coco_dataset)):
+                if len(annotation) > 0:
+                    image_id = str(annotation[0]['image_id'])  # all image_id in annotation are the same
+                    self._image_ids.append(image_id)
+                    self._image_id_to_annotation_dict[image_id] = COCO2017.Annotation(
+                        filename=os.path.join(path_to_jpeg_images_dir, '{:012d}.jpg'.format(int(image_id))),
+                        objects=[COCO2017.Annotation.Object(
+                            bbox=BBox(  # `ann['bbox']` is in the format [left, top, width, height]
+                                left=ann['bbox'][0],
+                                top=ann['bbox'][1],
+                                right=ann['bbox'][0] + ann['bbox'][2],
+                                bottom=ann['bbox'][1] + ann['bbox'][3]
+                            ),
+                            label=ann['category_id'])
+                            for ann in annotation]
+                    )
+
+                    ratio = float(image.width / image.height)
+                    self._image_ratios.append(ratio)
+
+            with open(path_to_image_ids_pickle, 'wb') as f:
+                pickle.dump(self._image_ids, f)
+
+            with open(path_to_image_id_dict_pickle, 'wb') as f:
+                pickle.dump(self._image_id_to_annotation_dict, f)
+
+            with open(path_to_image_ratios_pickle, 'wb') as f:
+                pickle.dump(self.image_ratios, f)
+
+    def __len__(self) -> int:
+        return len(self._image_id_to_annotation_dict)
+
+    def __getitem__(self, index: int) -> Tuple[str, Tensor, Tensor, Tensor, Tensor]:
+        image_id = self._image_ids[index]
+        annotation = self._image_id_to_annotation_dict[image_id]
+
+        bboxes = [obj.bbox.tolist() for obj in annotation.objects]
+        labels = [obj.label for obj in annotation.objects]
+
+        bboxes = torch.tensor(bboxes, dtype=torch.float)
+        labels = torch.tensor(labels, dtype=torch.long)
+
+        image = Image.open(annotation.filename).convert('RGB')  # for some grayscale images
+
+        # random flip on only training mode
+        if self._mode == COCO2017.Mode.TRAIN and random.random() > 0.5:
+            image = ImageOps.mirror(image)
+            bboxes[:, [0, 2]] = image.width - bboxes[:, [2, 0]]  # index 0 and 2 represent `left` and `right` respectively
+
+        image, scale = COCO2017.preprocess(image, self._image_min_side, self._image_max_side)
+        scale = torch.tensor(scale, dtype=torch.float)
+        bboxes *= scale
+
+        return image_id, image, scale, bboxes, labels
+
+    def evaluate(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]) -> Tuple[float, str]:
+        self._write_results(path_to_results_dir, image_ids, bboxes, classes, probs)
+
+        annType = 'bbox'
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+
+        cocoGt = COCO(path_to_annotation)
+        cocoDt = cocoGt.loadRes(os.path.join(path_to_results_dir, 'results.json'))
+
+        cocoEval = COCOeval(cocoGt, cocoDt, annType)
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+
+        original_stdout = sys.stdout
+        string_stdout = StringIO()
+        sys.stdout = string_stdout
+        cocoEval.summarize()
+        sys.stdout = original_stdout
+
+        mean_ap = cocoEval.stats[0].item()  # stats[0] records AP@[0.5:0.95]
+        detail = string_stdout.getvalue()
+
+        return mean_ap, detail
+
+    def _write_results(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]):
+        results = []
+        for image_id, bbox, cls, prob in zip(image_ids, bboxes, classes, probs):
+            results.append(
+                {
+                    'image_id': int(image_id),  # COCO evaluation requires `image_id` to be type `int`
+                    'category_id': cls,
+                    'bbox': [  # format [left, top, width, height] is expected
+                        bbox[0],
+                        bbox[1],
+                        bbox[2] - bbox[0],
+                        bbox[3] - bbox[1]
+                    ],
+                    'score': prob
+                }
+            )
+
+        with open(os.path.join(path_to_results_dir, 'results.json'), 'w') as f:
+            json.dump(results, f)
+
+    @property
+    def image_ratios(self) -> List[float]:
+        return self._image_ratios
+
+    @staticmethod
+    def num_classes() -> int:
+        return 92
dataset/coco2017_animal.py ADDED
@@ -0,0 +1,205 @@
+import json
+import os
+import pickle
+import random
+import sys
+from io import StringIO
+from typing import List, Tuple, Dict
+
+import torch
+import torch.utils.data.dataset
+from PIL import Image, ImageOps
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from torch import Tensor
+from torchvision.datasets import CocoDetection
+from tqdm import tqdm
+
+from bbox import BBox
+from dataset.base import Base
+from dataset.coco2017 import COCO2017
+
+
+class COCO2017Animal(Base):
+
+    class Annotation(object):
+        class Object(object):
+            def __init__(self, bbox: BBox, label: int):
+                super().__init__()
+                self.bbox = bbox
+                self.label = label
+
+            def __repr__(self) -> str:
+                return 'Object[label={:d}, bbox={!s}]'.format(
+                    self.label, self.bbox)
+
+        def __init__(self, filename: str, objects: List[Object]):
+            super().__init__()
+            self.filename = filename
+            self.objects = objects
+
+    CATEGORY_TO_LABEL_DICT = {
+        'background': 0,
+        'bird': 1, 'cat': 2, 'dog': 3, 'horse': 4, 'sheep': 5,
+        'cow': 6, 'elephant': 7, 'bear': 8, 'zebra': 9, 'giraffe': 10
+    }
+
+    LABEL_TO_CATEGORY_DICT = {v: k for k, v in CATEGORY_TO_LABEL_DICT.items()}
+
+    def __init__(self, path_to_data_dir: str, mode: Base.Mode, image_min_side: float, image_max_side: float):
+        super().__init__(path_to_data_dir, mode, image_min_side, image_max_side)
+
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_caches_dir = os.path.join('caches', 'coco2017-animal', f'{self._mode.value}')
+        path_to_image_ids_pickle = os.path.join(path_to_caches_dir, 'image-ids.pkl')
+        path_to_image_id_dict_pickle = os.path.join(path_to_caches_dir, 'image-id-dict.pkl')
+        path_to_image_ratios_pickle = os.path.join(path_to_caches_dir, 'image-ratios.pkl')
+
+        if self._mode == COCO2017Animal.Mode.TRAIN:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'train2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_train2017.json')
+        elif self._mode == COCO2017Animal.Mode.EVAL:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'val2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+        else:
+            raise ValueError('invalid mode')
+
+        coco_dataset = CocoDetection(root=path_to_jpeg_images_dir, annFile=path_to_annotation)
+
+        if os.path.exists(path_to_image_ids_pickle) and os.path.exists(path_to_image_id_dict_pickle):
+            print('loading cache files...')
+
+            with open(path_to_image_ids_pickle, 'rb') as f:
+                self._image_ids = pickle.load(f)
+
+            with open(path_to_image_id_dict_pickle, 'rb') as f:
+                self._image_id_to_annotation_dict = pickle.load(f)
+
+            with open(path_to_image_ratios_pickle, 'rb') as f:
+                self._image_ratios = pickle.load(f)
+        else:
+            print('generating cache files...')
+
+            os.makedirs(path_to_caches_dir, exist_ok=True)
+
+            self._image_id_to_annotation_dict: Dict[str, COCO2017Animal.Annotation] = {}
+            self._image_ratios = []
+
+            for idx, (image, annotation) in enumerate(tqdm(coco_dataset)):
+                if len(annotation) > 0:
+                    image_id = str(annotation[0]['image_id'])  # all image_id in annotation are the same
+                    annotation = COCO2017Animal.Annotation(
+                        filename=os.path.join(path_to_jpeg_images_dir, '{:012d}.jpg'.format(int(image_id))),
+                        objects=[COCO2017Animal.Annotation.Object(
+                            bbox=BBox(  # `ann['bbox']` is in the format [left, top, width, height]
+                                left=ann['bbox'][0],
+                                top=ann['bbox'][1],
+                                right=ann['bbox'][0] + ann['bbox'][2],
+                                bottom=ann['bbox'][1] + ann['bbox'][3]
+                            ),
+                            label=ann['category_id'])
+                            for ann in annotation]
+                    )
+                    annotation.objects = [obj for obj in annotation.objects
+                                          if obj.label in [COCO2017.CATEGORY_TO_LABEL_DICT[category]  # filtering label should refer to original `COCO2017` dataset
+                                                           for category in COCO2017Animal.CATEGORY_TO_LABEL_DICT.keys()][1:]]
+
+                    if len(annotation.objects) > 0:
+                        self._image_id_to_annotation_dict[image_id] = annotation
+
+                        ratio = float(image.width / image.height)
+                        self._image_ratios.append(ratio)
+
+            self._image_ids = list(self._image_id_to_annotation_dict.keys())
+
+            with open(path_to_image_ids_pickle, 'wb') as f:
+                pickle.dump(self._image_ids, f)
+
+            with open(path_to_image_id_dict_pickle, 'wb') as f:
+                pickle.dump(self._image_id_to_annotation_dict, f)
+
+            with open(path_to_image_ratios_pickle, 'wb') as f:
+                pickle.dump(self.image_ratios, f)
+
+    def __len__(self) -> int:
+        return len(self._image_id_to_annotation_dict)
+
+    def __getitem__(self, index: int) -> Tuple[str, Tensor, Tensor, Tensor, Tensor]:
+        image_id = self._image_ids[index]
+        annotation = self._image_id_to_annotation_dict[image_id]
+
+        bboxes = [obj.bbox.tolist() for obj in annotation.objects]
+        labels = [COCO2017Animal.CATEGORY_TO_LABEL_DICT[COCO2017.LABEL_TO_CATEGORY_DICT[obj.label]] for obj in annotation.objects]  # mapping from original `COCO2017` dataset
+
+        bboxes = torch.tensor(bboxes, dtype=torch.float)
+        labels = torch.tensor(labels, dtype=torch.long)
+
+        image = Image.open(annotation.filename).convert('RGB')  # for some grayscale images
+
+        # random flip on only training mode
+        if self._mode == COCO2017Animal.Mode.TRAIN and random.random() > 0.5:
+            image = ImageOps.mirror(image)
+            bboxes[:, [0, 2]] = image.width - bboxes[:, [2, 0]]  # index 0 and 2 represent `left` and `right` respectively
+
+        image, scale = COCO2017Animal.preprocess(image, self._image_min_side, self._image_max_side)
+        scale = torch.tensor(scale, dtype=torch.float)
+        bboxes *= scale
+
+        return image_id, image, scale, bboxes, labels
+
+    def evaluate(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]) -> Tuple[float, str]:
+        self._write_results(path_to_results_dir, image_ids, bboxes, classes, probs)
+
+        annType = 'bbox'
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+
+        cocoGt = COCO(path_to_annotation)
+        cocoDt = cocoGt.loadRes(os.path.join(path_to_results_dir, 'results.json'))
+
+        cocoEval = COCOeval(cocoGt, cocoDt, annType)
+        cocoEval.params.catIds = [COCO2017.CATEGORY_TO_LABEL_DICT[category]  # filtering label should refer to original `COCO2017` dataset
+                                  for category in COCO2017Animal.CATEGORY_TO_LABEL_DICT.keys()]
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+
+        original_stdout = sys.stdout
+        string_stdout = StringIO()
+        sys.stdout = string_stdout
+        cocoEval.summarize()
+        sys.stdout = original_stdout
+
+        mean_ap = cocoEval.stats[0].item()  # stats[0] records AP@[0.5:0.95]
+        detail = string_stdout.getvalue()
+
+        return mean_ap, detail
+
+    def _write_results(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]):
+        results = []
+        for image_id, bbox, cls, prob in zip(image_ids, bboxes, classes, probs):
+            results.append(
+                {
+                    'image_id': int(image_id),  # COCO evaluation requires `image_id` to be type `int`
+                    'category_id': COCO2017.CATEGORY_TO_LABEL_DICT[COCO2017Animal.LABEL_TO_CATEGORY_DICT[cls]],  # mapping to original `COCO2017` dataset
+                    'bbox': [  # format [left, top, width, height] is expected
+                        bbox[0],
+                        bbox[1],
+                        bbox[2] - bbox[0],
+                        bbox[3] - bbox[1]
+                    ],
+                    'score': prob
+                }
+            )
+
+        with open(os.path.join(path_to_results_dir, 'results.json'), 'w') as f:
+            json.dump(results, f)
+
+    @property
+    def image_ratios(self) -> List[float]:
+        return self._image_ratios
+
+    @staticmethod
+    def num_classes() -> int:
+        return 11
dataset/coco2017_car.py ADDED
@@ -0,0 +1,201 @@
+import json
+import os
+import pickle
+import random
+import sys
+from io import StringIO
+from typing import List, Tuple, Dict
+
+import torch
+import torch.utils.data.dataset
+from PIL import Image, ImageOps
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from torch import Tensor
+from torchvision.datasets import CocoDetection
+from tqdm import tqdm
+
+from bbox import BBox
+from dataset.base import Base
+from dataset.coco2017 import COCO2017
+
+
+class COCO2017Car(Base):
+
+    class Annotation(object):
+        class Object(object):
+            def __init__(self, bbox: BBox, label: int):
+                super().__init__()
+                self.bbox = bbox
+                self.label = label
+
+            def __repr__(self) -> str:
+                return 'Object[label={:d}, bbox={!s}]'.format(
+                    self.label, self.bbox)
+
+        def __init__(self, filename: str, objects: List[Object]):
+            super().__init__()
+            self.filename = filename
+            self.objects = objects
+
+    CATEGORY_TO_LABEL_DICT = {
+        'background': 0, 'car': 1
+    }
+
+    LABEL_TO_CATEGORY_DICT = {v: k for k, v in CATEGORY_TO_LABEL_DICT.items()}
+
+    def __init__(self, path_to_data_dir: str, mode: Base.Mode, image_min_side: float, image_max_side: float):
+        super().__init__(path_to_data_dir, mode, image_min_side, image_max_side)
+
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_caches_dir = os.path.join('caches', 'coco2017-car', f'{self._mode.value}')
+        path_to_image_ids_pickle = os.path.join(path_to_caches_dir, 'image-ids.pkl')
+        path_to_image_id_dict_pickle = os.path.join(path_to_caches_dir, 'image-id-dict.pkl')
+        path_to_image_ratios_pickle = os.path.join(path_to_caches_dir, 'image-ratios.pkl')
+
+        if self._mode == COCO2017Car.Mode.TRAIN:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'train2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_train2017.json')
+        elif self._mode == COCO2017Car.Mode.EVAL:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'val2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+        else:
+            raise ValueError('invalid mode')
+
+        coco_dataset = CocoDetection(root=path_to_jpeg_images_dir, annFile=path_to_annotation)
+
+        if os.path.exists(path_to_image_ids_pickle) and os.path.exists(path_to_image_id_dict_pickle):
+            print('loading cache files...')
+
+            with open(path_to_image_ids_pickle, 'rb') as f:
+                self._image_ids = pickle.load(f)
+
+            with open(path_to_image_id_dict_pickle, 'rb') as f:
+                self._image_id_to_annotation_dict = pickle.load(f)
+
+            with open(path_to_image_ratios_pickle, 'rb') as f:
+                self._image_ratios = pickle.load(f)
+        else:
+            print('generating cache files...')
+
+            os.makedirs(path_to_caches_dir, exist_ok=True)
+
+            self._image_id_to_annotation_dict: Dict[str, COCO2017Car.Annotation] = {}
+            self._image_ratios = []
+
+            for idx, (image, annotation) in enumerate(tqdm(coco_dataset)):
+                if len(annotation) > 0:
+                    image_id = str(annotation[0]['image_id'])  # all image_id in annotation are the same
+                    annotation = COCO2017Car.Annotation(
+                        filename=os.path.join(path_to_jpeg_images_dir, '{:012d}.jpg'.format(int(image_id))),
+                        objects=[COCO2017Car.Annotation.Object(
+                            bbox=BBox(  # `ann['bbox']` is in the format [left, top, width, height]
+                                left=ann['bbox'][0],
+                                top=ann['bbox'][1],
+                                right=ann['bbox'][0] + ann['bbox'][2],
+                                bottom=ann['bbox'][1] + ann['bbox'][3]
+                            ),
+                            label=ann['category_id'])
+                            for ann in annotation]
+                    )
+                    annotation.objects = [obj for obj in annotation.objects
+                                          if obj.label in [COCO2017.CATEGORY_TO_LABEL_DICT['car']]]  # filtering label should refer to original `COCO2017` dataset
+
+                    if len(annotation.objects) > 0:
+                        self._image_id_to_annotation_dict[image_id] = annotation
+
+                        ratio = float(image.width / image.height)
+                        self._image_ratios.append(ratio)
+
+            self._image_ids = list(self._image_id_to_annotation_dict.keys())
+
+            with open(path_to_image_ids_pickle, 'wb') as f:
+                pickle.dump(self._image_ids, f)
+
+            with open(path_to_image_id_dict_pickle, 'wb') as f:
+                pickle.dump(self._image_id_to_annotation_dict, f)
+
+            with open(path_to_image_ratios_pickle, 'wb') as f:
+                pickle.dump(self.image_ratios, f)
+
+    def __len__(self) -> int:
+        return len(self._image_id_to_annotation_dict)
+
+    def __getitem__(self, index: int) -> Tuple[str, Tensor, Tensor, Tensor, Tensor]:
+        image_id = self._image_ids[index]
+        annotation = self._image_id_to_annotation_dict[image_id]
+
+        bboxes = [obj.bbox.tolist() for obj in annotation.objects]
+        labels = [COCO2017Car.CATEGORY_TO_LABEL_DICT[COCO2017.LABEL_TO_CATEGORY_DICT[obj.label]] for obj in annotation.objects]  # mapping from original `COCO2017` dataset
+
+        bboxes = torch.tensor(bboxes, dtype=torch.float)
+        labels = torch.tensor(labels, dtype=torch.long)
+
+        image = Image.open(annotation.filename).convert('RGB')  # for some grayscale images
+
+        # random flip on only training mode
+        if self._mode == COCO2017Car.Mode.TRAIN and random.random() > 0.5:
+            image = ImageOps.mirror(image)
+            bboxes[:, [0, 2]] = image.width - bboxes[:, [2, 0]]  # index 0 and 2 represent `left` and `right` respectively
+
+        image, scale = COCO2017Car.preprocess(image, self._image_min_side, self._image_max_side)
+        scale = torch.tensor(scale, dtype=torch.float)
+        bboxes *= scale
+
+        return image_id, image, scale, bboxes, labels
+
+    def evaluate(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]) -> Tuple[float, str]:
+        self._write_results(path_to_results_dir, image_ids, bboxes, classes, probs)
+
+        annType = 'bbox'
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+
+        cocoGt = COCO(path_to_annotation)
+        cocoDt = cocoGt.loadRes(os.path.join(path_to_results_dir, 'results.json'))
+
+        cocoEval = COCOeval(cocoGt, cocoDt, annType)
+        cocoEval.params.catIds = COCO2017.CATEGORY_TO_LABEL_DICT['car']  # filtering label should refer to original `COCO2017` dataset
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+
+        original_stdout = sys.stdout
+        string_stdout = StringIO()
+        sys.stdout = string_stdout
+        cocoEval.summarize()
+        sys.stdout = original_stdout
+
+        mean_ap = cocoEval.stats[0].item()  # stats[0] records AP@[0.5:0.95]
+        detail = string_stdout.getvalue()
+
+        return mean_ap, detail
+
+    def _write_results(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]):
+        results = []
+        for image_id, bbox, cls, prob in zip(image_ids, bboxes, classes, probs):
+            results.append(
+                {
+                    'image_id': int(image_id),  # COCO evaluation requires `image_id` to be type `int`
+                    'category_id': COCO2017.CATEGORY_TO_LABEL_DICT[COCO2017Car.LABEL_TO_CATEGORY_DICT[cls]],  # mapping to original `COCO2017` dataset
+                    'bbox': [  # format [left, top, width, height] is expected
+                        bbox[0],
+                        bbox[1],
+                        bbox[2] - bbox[0],
+                        bbox[3] - bbox[1]
+                    ],
+                    'score': prob
+                }
+            )
+
+        with open(os.path.join(path_to_results_dir, 'results.json'), 'w') as f:
+            json.dump(results, f)
+
+    @property
+    def image_ratios(self) -> List[float]:
+        return self._image_ratios
+
+    @staticmethod
+    def num_classes() -> int:
+        return 2
dataset/coco2017_person.py ADDED
@@ -0,0 +1,201 @@
+import json
+import os
+import pickle
+import random
+import sys
+from io import StringIO
+from typing import List, Tuple, Dict
+
+import torch
+import torch.utils.data.dataset
+from PIL import Image, ImageOps
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from torch import Tensor
+from torchvision.datasets import CocoDetection
+from tqdm import tqdm
+
+from bbox import BBox
+from dataset.base import Base
+from dataset.coco2017 import COCO2017
+
+
+class COCO2017Person(Base):
+
+    class Annotation(object):
+        class Object(object):
+            def __init__(self, bbox: BBox, label: int):
+                super().__init__()
+                self.bbox = bbox
+                self.label = label
+
+            def __repr__(self) -> str:
+                return 'Object[label={:d}, bbox={!s}]'.format(
+                    self.label, self.bbox)
+
+        def __init__(self, filename: str, objects: List[Object]):
+            super().__init__()
+            self.filename = filename
+            self.objects = objects
+
+    CATEGORY_TO_LABEL_DICT = {
+        'background': 0, 'person': 1
+    }
+
+    LABEL_TO_CATEGORY_DICT = {v: k for k, v in CATEGORY_TO_LABEL_DICT.items()}
+
+    def __init__(self, path_to_data_dir: str, mode: Base.Mode, image_min_side: float, image_max_side: float):
+        super().__init__(path_to_data_dir, mode, image_min_side, image_max_side)
+
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_caches_dir = os.path.join('caches', 'coco2017-person', f'{self._mode.value}')
+        path_to_image_ids_pickle = os.path.join(path_to_caches_dir, 'image-ids.pkl')
+        path_to_image_id_dict_pickle = os.path.join(path_to_caches_dir, 'image-id-dict.pkl')
+        path_to_image_ratios_pickle = os.path.join(path_to_caches_dir, 'image-ratios.pkl')
+
+        if self._mode == COCO2017Person.Mode.TRAIN:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'train2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_train2017.json')
+        elif self._mode == COCO2017Person.Mode.EVAL:
+            path_to_jpeg_images_dir = os.path.join(path_to_coco_dir, 'val2017')
+            path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+        else:
+            raise ValueError('invalid mode')
+
+        coco_dataset = CocoDetection(root=path_to_jpeg_images_dir, annFile=path_to_annotation)
+
+        if os.path.exists(path_to_image_ids_pickle) and os.path.exists(path_to_image_id_dict_pickle):
+            print('loading cache files...')
+
+            with open(path_to_image_ids_pickle, 'rb') as f:
+                self._image_ids = pickle.load(f)
+
+            with open(path_to_image_id_dict_pickle, 'rb') as f:
+                self._image_id_to_annotation_dict = pickle.load(f)
+
+            with open(path_to_image_ratios_pickle, 'rb') as f:
+                self._image_ratios = pickle.load(f)
+        else:
+            print('generating cache files...')
+
+            os.makedirs(path_to_caches_dir, exist_ok=True)
+
+            self._image_id_to_annotation_dict: Dict[str, COCO2017Person.Annotation] = {}
+            self._image_ratios = []
+
+            for idx, (image, annotation) in enumerate(tqdm(coco_dataset)):
+                if len(annotation) > 0:
+                    image_id = str(annotation[0]['image_id'])  # all image_id in annotation are the same
+                    annotation = COCO2017Person.Annotation(
+                        filename=os.path.join(path_to_jpeg_images_dir, '{:012d}.jpg'.format(int(image_id))),
+                        objects=[COCO2017Person.Annotation.Object(
+                            bbox=BBox(  # `ann['bbox']` is in the format [left, top, width, height]
+                                left=ann['bbox'][0],
+                                top=ann['bbox'][1],
+                                right=ann['bbox'][0] + ann['bbox'][2],
+                                bottom=ann['bbox'][1] + ann['bbox'][3]
+                            ),
+                            label=ann['category_id'])
+                            for ann in annotation]
+                    )
+                    annotation.objects = [obj for obj in annotation.objects
+                                          if obj.label in [COCO2017.CATEGORY_TO_LABEL_DICT['person']]]  # filtering label should refer to original `COCO2017` dataset
+
+                    if len(annotation.objects) > 0:
+                        self._image_id_to_annotation_dict[image_id] = annotation
+
+                        ratio = float(image.width / image.height)
+                        self._image_ratios.append(ratio)
+
+            self._image_ids = list(self._image_id_to_annotation_dict.keys())
+
+            with open(path_to_image_ids_pickle, 'wb') as f:
+                pickle.dump(self._image_ids, f)
+
+            with open(path_to_image_id_dict_pickle, 'wb') as f:
+                pickle.dump(self._image_id_to_annotation_dict, f)
+
+            with open(path_to_image_ratios_pickle, 'wb') as f:
+                pickle.dump(self.image_ratios, f)
+
+    def __len__(self) -> int:
+        return len(self._image_id_to_annotation_dict)
+
+    def __getitem__(self, index: int) -> Tuple[str, Tensor, Tensor, Tensor, Tensor]:
+        image_id = self._image_ids[index]
+        annotation = self._image_id_to_annotation_dict[image_id]
+
+        bboxes = [obj.bbox.tolist() for obj in annotation.objects]
+        labels = [COCO2017Person.CATEGORY_TO_LABEL_DICT[COCO2017.LABEL_TO_CATEGORY_DICT[obj.label]] for obj in annotation.objects]  # mapping from original `COCO2017` dataset
+
+        bboxes = torch.tensor(bboxes, dtype=torch.float)
+        labels = torch.tensor(labels, dtype=torch.long)
+
+        image = Image.open(annotation.filename).convert('RGB')  # for some grayscale images
+
+        # random flip on only training mode
+        if self._mode == COCO2017Person.Mode.TRAIN and random.random() > 0.5:
+            image = ImageOps.mirror(image)
+            bboxes[:, [0, 2]] = image.width - bboxes[:, [2, 0]]  # index 0 and 2 represent `left` and `right` respectively
+
+        image, scale = COCO2017Person.preprocess(image, self._image_min_side, self._image_max_side)
+        scale = torch.tensor(scale, dtype=torch.float)
+        bboxes *= scale
+
+        return image_id, image, scale, bboxes, labels
+
+    def evaluate(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]) -> Tuple[float, str]:
+        self._write_results(path_to_results_dir, image_ids, bboxes, classes, probs)
+
+        annType = 'bbox'
+        path_to_coco_dir = os.path.join(self._path_to_data_dir, 'COCO')
+        path_to_annotations_dir = os.path.join(path_to_coco_dir, 'annotations')
+        path_to_annotation = os.path.join(path_to_annotations_dir, 'instances_val2017.json')
+
+        cocoGt = COCO(path_to_annotation)
+        cocoDt = cocoGt.loadRes(os.path.join(path_to_results_dir, 'results.json'))
+
+        cocoEval = COCOeval(cocoGt, cocoDt, annType)
+        cocoEval.params.catIds = COCO2017.CATEGORY_TO_LABEL_DICT['person']  # filtering label should refer to original `COCO2017` dataset
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+
+        original_stdout = sys.stdout
+        string_stdout = StringIO()
+        sys.stdout = string_stdout
+        cocoEval.summarize()
+        sys.stdout = original_stdout
+
+        mean_ap = cocoEval.stats[0].item()  # stats[0] records AP@[0.5:0.95]
+        detail = string_stdout.getvalue()
+
+        return mean_ap, detail
+
+    def _write_results(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]):
+        results = []
+        for image_id, bbox, cls, prob in zip(image_ids, bboxes, classes, probs):
+            results.append(
+                {
+                    'image_id': int(image_id),  # COCO evaluation requires `image_id` to be type `int`
+                    'category_id': COCO2017.CATEGORY_TO_LABEL_DICT[COCO2017Person.LABEL_TO_CATEGORY_DICT[cls]],  # mapping to original `COCO2017` dataset
+                    'bbox': [  # format [left, top, width, height] is expected
+                        bbox[0],
+                        bbox[1],
+                        bbox[2] - bbox[0],
+                        bbox[3] - bbox[1]
+                    ],
+                    'score': prob
+                }
+            )
+
+        with open(os.path.join(path_to_results_dir, 'results.json'), 'w') as f:
+            json.dump(results, f)
+
+    @property
+    def image_ratios(self) -> List[float]:
+        return self._image_ratios
+
+    @staticmethod
+    def num_classes() -> int:
+        return 2
dataset/voc2007.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ import xml.etree.ElementTree as ET
4
+ from typing import List, Tuple
5
+
6
+ import numpy as np
7
+ import torch.utils.data
8
+ from PIL import Image, ImageOps
9
+ from torch import Tensor
10
+
11
+ from bbox import BBox
12
+ from dataset.base import Base
13
+ from voc_eval import voc_eval
14
+
15
+
16
+ class VOC2007(Base):
17
+
18
+ class Annotation(object):
19
+ class Object(object):
20
+ def __init__(self, name: str, difficult: bool, bbox: BBox):
21
+ super().__init__()
22
+ self.name = name
23
+ self.difficult = difficult
24
+ self.bbox = bbox
25
+
26
+ def __repr__(self) -> str:
27
+ return 'Object[name={:s}, difficult={!s}, bbox={!s}]'.format(
28
+ self.name, self.difficult, self.bbox)
29
+
30
+ def __init__(self, filename: str, objects: List[Object]):
31
+ super().__init__()
32
+ self.filename = filename
33
+ self.objects = objects
34
+
35
+ CATEGORY_TO_LABEL_DICT = {
36
+ 'background': 0,
37
+ 'aeroplane': 1, 'bicycle': 2, 'bird': 3, 'boat': 4, 'bottle': 5,
38
+ 'bus': 6, 'car': 7, 'cat': 8, 'chair': 9, 'cow': 10,
39
+ 'diningtable': 11, 'dog': 12, 'horse': 13, 'motorbike': 14, 'person': 15,
40
+ 'pottedplant': 16, 'sheep': 17, 'sofa': 18, 'train': 19, 'tvmonitor': 20
41
+ }
42
+
43
+ LABEL_TO_CATEGORY_DICT = {v: k for k, v in CATEGORY_TO_LABEL_DICT.items()}
44
+
45
+ def __init__(self, path_to_data_dir: str, mode: Base.Mode, image_min_side: float, image_max_side: float):
46
+ super().__init__(path_to_data_dir, mode, image_min_side, image_max_side)
47
+
48
+ path_to_voc2007_dir = os.path.join(self._path_to_data_dir, 'VOCdevkit', 'VOC2007')
49
+ path_to_imagesets_main_dir = os.path.join(path_to_voc2007_dir, 'ImageSets', 'Main')
50
+ path_to_annotations_dir = os.path.join(path_to_voc2007_dir, 'Annotations')
51
+ self._path_to_jpeg_images_dir = os.path.join(path_to_voc2007_dir, 'JPEGImages')
52
+
53
+ if self._mode == VOC2007.Mode.TRAIN:
54
+ path_to_image_ids_txt = os.path.join(path_to_imagesets_main_dir, 'trainval.txt')
55
+ elif self._mode == VOC2007.Mode.EVAL:
56
+ path_to_image_ids_txt = os.path.join(path_to_imagesets_main_dir, 'test.txt')
57
+ else:
58
+ raise ValueError('invalid mode')
59
+
60
+ with open(path_to_image_ids_txt, 'r') as f:
61
+ lines = f.readlines()
62
+ self._image_ids = [line.rstrip() for line in lines]
63
+
64
+ self._image_id_to_annotation_dict = {}
65
+ self._image_ratios = []
66
+
67
+ for image_id in self._image_ids:
68
+ path_to_annotation_xml = os.path.join(path_to_annotations_dir, f'{image_id}.xml')
69
+ tree = ET.ElementTree(file=path_to_annotation_xml)
70
+ root = tree.getroot()
71
+
72
+ self._image_id_to_annotation_dict[image_id] = VOC2007.Annotation(
73
+ filename=root.find('filename').text,
74
+ objects=[VOC2007.Annotation.Object(
75
+ name=next(tag_object.iterfind('name')).text,
76
+ difficult=next(tag_object.iterfind('difficult')).text == '1',
77
+ bbox=BBox( # convert to 0-based pixel index
78
+ left=float(next(tag_object.iterfind('bndbox/xmin')).text) - 1,
79
+ top=float(next(tag_object.iterfind('bndbox/ymin')).text) - 1,
80
+ right=float(next(tag_object.iterfind('bndbox/xmax')).text) - 1,
81
+ bottom=float(next(tag_object.iterfind('bndbox/ymax')).text) - 1
82
+ )
83
+ ) for tag_object in root.iterfind('object')]
84
+ )
85
+
86
+ width = int(root.find('size/width').text)
87
+ height = int(root.find('size/height').text)
88
+ ratio = float(width / height)
89
+ self._image_ratios.append(ratio)
90
+
91
+ def __len__(self) -> int:
92
+ return len(self._image_id_to_annotation_dict)
93
+
94
+ def __getitem__(self, index: int) -> Tuple[str, Tensor, Tensor, Tensor, Tensor]:
95
+ image_id = self._image_ids[index]
96
+ annotation = self._image_id_to_annotation_dict[image_id]
97
+
98
+ bboxes = [obj.bbox.tolist() for obj in annotation.objects if not obj.difficult]
99
+ labels = [VOC2007.CATEGORY_TO_LABEL_DICT[obj.name] for obj in annotation.objects if not obj.difficult]
100
+
101
+ bboxes = torch.tensor(bboxes, dtype=torch.float)
102
+ labels = torch.tensor(labels, dtype=torch.long)
103
+
104
+ image = Image.open(os.path.join(self._path_to_jpeg_images_dir, annotation.filename))
105
+
106
+ # random flip on only training mode
107
+ if self._mode == VOC2007.Mode.TRAIN and random.random() > 0.5:
108
+ image = ImageOps.mirror(image)
109
+ bboxes[:, [0, 2]] = image.width - bboxes[:, [2, 0]] # index 0 and 2 represent `left` and `right` respectively
110
+
111
+ image, scale = VOC2007.preprocess(image, self._image_min_side, self._image_max_side)
112
+ scale = torch.tensor(scale, dtype=torch.float)
113
+ bboxes *= scale
114
+
115
+ return image_id, image, scale, bboxes, labels
116
+
117
+ def evaluate(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]) -> Tuple[float, str]:
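+ # writes PASCAL-VOC-style per-class detection files, then scores each class with voc_eval
+ # (IoU threshold 0.5, VOC07 11-point metric) and returns the mean AP plus a per-class summary string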
118
+ self._write_results(path_to_results_dir, image_ids, bboxes, classes, probs)
119
+
120
+ path_to_voc2007_dir = os.path.join(self._path_to_data_dir, 'VOCdevkit', 'VOC2007')
121
+ path_to_main_dir = os.path.join(path_to_voc2007_dir, 'ImageSets', 'Main')
122
+ path_to_annotations_dir = os.path.join(path_to_voc2007_dir, 'Annotations')
123
+
124
+ class_to_ap_dict = {}
125
+ for c in range(1, VOC2007.num_classes()):
126
+ category = VOC2007.LABEL_TO_CATEGORY_DICT[c]
127
+ try:
128
+ path_to_cache_dir = os.path.join('caches', 'voc2007')
129
+ os.makedirs(path_to_cache_dir, exist_ok=True)
130
+ _, _, ap = voc_eval(detpath=os.path.join(path_to_results_dir, 'comp3_det_test_{:s}.txt'.format(category)),
131
+ annopath=os.path.join(path_to_annotations_dir, '{:s}.xml'),
132
+ imagesetfile=os.path.join(path_to_main_dir, 'test.txt'),
133
+ classname=category,
134
+ cachedir=path_to_cache_dir,
135
+ ovthresh=0.5,
136
+ use_07_metric=True)
137
+ except IndexError:
138
+ ap = 0
139
+
140
+ class_to_ap_dict[c] = ap
141
+
142
+ mean_ap = np.mean([v for k, v in class_to_ap_dict.items()]).item()
143
+
144
+ detail = ''
145
+ for c in range(1, VOC2007.num_classes()):
146
+ detail += '{:d}: {:s} AP = {:.4f}\n'.format(c, VOC2007.LABEL_TO_CATEGORY_DICT[c], class_to_ap_dict[c])
147
+
148
+ return mean_ap, detail
149
+
150
+ def _write_results(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]):
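+ # one text file per foreground class, in the PASCAL VOC 'comp3' submission format:
+ # each line is `image_id score left top right bottom`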
151
+ class_to_txt_files_dict = {}
152
+ for c in range(1, VOC2007.num_classes()):
153
+ class_to_txt_files_dict[c] = open(os.path.join(path_to_results_dir, 'comp3_det_test_{:s}.txt'.format(VOC2007.LABEL_TO_CATEGORY_DICT[c])), 'w')
154
+
155
+ for image_id, bbox, cls, prob in zip(image_ids, bboxes, classes, probs):
156
+ class_to_txt_files_dict[cls].write('{:s} {:f} {:f} {:f} {:f} {:f}\n'.format(image_id, prob,
157
+ bbox[0], bbox[1], bbox[2], bbox[3]))
158
+
159
+ for _, f in class_to_txt_files_dict.items():
160
+ f.close()
161
+
162
+ @property
163
+ def image_ratios(self) -> List[float]:
164
+ return self._image_ratios
165
+
166
+ @staticmethod
167
+ def num_classes() -> int:
168
+ return 21
dataset/voc2007_cat_dog.py ADDED
@@ -0,0 +1,171 @@
1
+ import os
2
+ import random
3
+ import xml.etree.ElementTree as ET
4
+ from typing import List, Tuple
5
+
6
+ import numpy as np
7
+ import torch.utils.data
8
+ from PIL import Image, ImageOps
9
+ from torch import Tensor
10
+
11
+ from bbox import BBox
12
+ from dataset.base import Base
13
+ from voc_eval import voc_eval
14
+
15
+
16
+ class VOC2007CatDog(Base):
17
+
18
+ class Annotation(object):
19
+ class Object(object):
20
+ def __init__(self, name: str, difficult: bool, bbox: BBox):
21
+ super().__init__()
22
+ self.name = name
23
+ self.difficult = difficult
24
+ self.bbox = bbox
25
+
26
+ def __repr__(self) -> str:
27
+ return 'Object[name={:s}, difficult={!s}, bbox={!s}]'.format(
28
+ self.name, self.difficult, self.bbox)
29
+
30
+ def __init__(self, filename: str, objects: List[Object]):
31
+ super().__init__()
32
+ self.filename = filename
33
+ self.objects = objects
34
+
35
+ CATEGORY_TO_LABEL_DICT = {
36
+ 'background': 0,
37
+ 'cat': 1, 'dog': 2
38
+ }
39
+
40
+ LABEL_TO_CATEGORY_DICT = {v: k for k, v in CATEGORY_TO_LABEL_DICT.items()}
41
+
42
+ def __init__(self, path_to_data_dir: str, mode: Base.Mode, image_min_side: float, image_max_side: float):
43
+ super().__init__(path_to_data_dir, mode, image_min_side, image_max_side)
44
+
45
+ path_to_voc2007_dir = os.path.join(self._path_to_data_dir, 'VOCdevkit', 'VOC2007')
46
+ path_to_imagesets_main_dir = os.path.join(path_to_voc2007_dir, 'ImageSets', 'Main')
47
+ path_to_annotations_dir = os.path.join(path_to_voc2007_dir, 'Annotations')
48
+ self._path_to_jpeg_images_dir = os.path.join(path_to_voc2007_dir, 'JPEGImages')
49
+
50
+ if self._mode == VOC2007CatDog.Mode.TRAIN:
51
+ path_to_image_ids_txt = os.path.join(path_to_imagesets_main_dir, 'trainval.txt')
52
+ elif self._mode == VOC2007CatDog.Mode.EVAL:
53
+ path_to_image_ids_txt = os.path.join(path_to_imagesets_main_dir, 'test.txt')
54
+ else:
55
+ raise ValueError('invalid mode')
56
+
57
+ with open(path_to_image_ids_txt, 'r') as f:
58
+ lines = f.readlines()
59
+ image_ids = [line.rstrip() for line in lines]
60
+
61
+ self._image_id_to_annotation_dict = {}
62
+ self._image_ratios = []
63
+
64
+ for image_id in image_ids:
65
+ path_to_annotation_xml = os.path.join(path_to_annotations_dir, f'{image_id}.xml')
66
+ tree = ET.ElementTree(file=path_to_annotation_xml)
67
+ root = tree.getroot()
68
+
69
+ annotation = VOC2007CatDog.Annotation(
70
+ filename=root.find('filename').text,
71
+ objects=[VOC2007CatDog.Annotation.Object(
72
+ name=next(tag_object.iterfind('name')).text,
73
+ difficult=next(tag_object.iterfind('difficult')).text == '1',
74
+ bbox=BBox( # convert to 0-based pixel index
75
+ left=float(next(tag_object.iterfind('bndbox/xmin')).text) - 1,
76
+ top=float(next(tag_object.iterfind('bndbox/ymin')).text) - 1,
77
+ right=float(next(tag_object.iterfind('bndbox/xmax')).text) - 1,
78
+ bottom=float(next(tag_object.iterfind('bndbox/ymax')).text) - 1
79
+ )
80
+ ) for tag_object in root.iterfind('object')]
81
+ )
82
+ annotation.objects = [obj for obj in annotation.objects if obj.name in ['cat', 'dog'] and not obj.difficult]
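+ # keep only non-difficult cat/dog objects; images left with no such objects are skipped below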
83
+
84
+ if len(annotation.objects) > 0:
85
+ self._image_id_to_annotation_dict[image_id] = annotation
86
+
87
+ width = int(root.find('size/width').text)
88
+ height = int(root.find('size/height').text)
89
+ ratio = float(width / height)
90
+ self._image_ratios.append(ratio)
91
+
92
+ self._image_ids = list(self._image_id_to_annotation_dict.keys())
93
+
94
+ def __len__(self) -> int:
95
+ return len(self._image_id_to_annotation_dict)
96
+
97
+ def __getitem__(self, index: int) -> Tuple[str, Tensor, Tensor, Tensor, Tensor]:
98
+ image_id = self._image_ids[index]
99
+ annotation = self._image_id_to_annotation_dict[image_id]
100
+
101
+ bboxes = [obj.bbox.tolist() for obj in annotation.objects]
102
+ labels = [VOC2007CatDog.CATEGORY_TO_LABEL_DICT[obj.name] for obj in annotation.objects]
103
+
104
+ bboxes = torch.tensor(bboxes, dtype=torch.float)
105
+ labels = torch.tensor(labels, dtype=torch.long)
106
+
107
+ image = Image.open(os.path.join(self._path_to_jpeg_images_dir, annotation.filename))
108
+
109
+ # random horizontal flip, applied only in training mode
110
+ if self._mode == VOC2007CatDog.Mode.TRAIN and random.random() > 0.5:
111
+ image = ImageOps.mirror(image)
112
+ bboxes[:, [0, 2]] = image.width - bboxes[:, [2, 0]]  # indices 0 and 2 hold `left` and `right`, respectively
113
+
114
+ image, scale = VOC2007CatDog.preprocess(image, self._image_min_side, self._image_max_side)
115
+ scale = torch.tensor(scale, dtype=torch.float)
116
+ bboxes *= scale
117
+
118
+ return image_id, image, scale, bboxes, labels
119
+
120
+ def evaluate(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]) -> Tuple[float, str]:
121
+ self._write_results(path_to_results_dir, image_ids, bboxes, classes, probs)
122
+
123
+ path_to_voc2007_dir = os.path.join(self._path_to_data_dir, 'VOCdevkit', 'VOC2007')
124
+ path_to_main_dir = os.path.join(path_to_voc2007_dir, 'ImageSets', 'Main')
125
+ path_to_annotations_dir = os.path.join(path_to_voc2007_dir, 'Annotations')
126
+
127
+ class_to_ap_dict = {}
128
+ for c in range(1, VOC2007CatDog.num_classes()):
129
+ category = VOC2007CatDog.LABEL_TO_CATEGORY_DICT[c]
130
+ try:
131
+ path_to_cache_dir = os.path.join('caches', 'voc2007-cat-dog')
132
+ os.makedirs(path_to_cache_dir, exist_ok=True)
133
+ _, _, ap = voc_eval(detpath=os.path.join(path_to_results_dir, 'comp3_det_test_{:s}.txt'.format(category)),
134
+ annopath=os.path.join(path_to_annotations_dir, '{:s}.xml'),
135
+ imagesetfile=os.path.join(path_to_main_dir, 'test.txt'),
136
+ classname=category,
137
+ cachedir=path_to_cache_dir,
138
+ ovthresh=0.5,
139
+ use_07_metric=True)
140
+ except IndexError:
141
+ ap = 0
142
+
143
+ class_to_ap_dict[c] = ap
144
+
145
+ mean_ap = np.mean([v for k, v in class_to_ap_dict.items()]).item()
146
+
147
+ detail = ''
148
+ for c in range(1, VOC2007CatDog.num_classes()):
149
+ detail += '{:d}: {:s} AP = {:.4f}\n'.format(c, VOC2007CatDog.LABEL_TO_CATEGORY_DICT[c], class_to_ap_dict[c])
150
+
151
+ return mean_ap, detail
152
+
153
+ def _write_results(self, path_to_results_dir: str, image_ids: List[str], bboxes: List[List[float]], classes: List[int], probs: List[float]):
154
+ class_to_txt_files_dict = {}
155
+ for c in range(1, VOC2007CatDog.num_classes()):
156
+ class_to_txt_files_dict[c] = open(os.path.join(path_to_results_dir, 'comp3_det_test_{:s}.txt'.format(VOC2007CatDog.LABEL_TO_CATEGORY_DICT[c])), 'w')
157
+
158
+ for image_id, bbox, cls, prob in zip(image_ids, bboxes, classes, probs):
159
+ class_to_txt_files_dict[cls].write('{:s} {:f} {:f} {:f} {:f} {:f}\n'.format(image_id, prob,
160
+ bbox[0], bbox[1], bbox[2], bbox[3]))
161
+
162
+ for _, f in class_to_txt_files_dict.items():
163
+ f.close()
164
+
165
+ @property
166
+ def image_ratios(self) -> List[float]:
167
+ return self._image_ratios
168
+
169
+ @staticmethod
170
+ def num_classes() -> int:
171
+ return 3
extension/functional.py ADDED
@@ -0,0 +1,10 @@
1
+ import torch
2
+
3
+ from torch import Tensor
4
+
5
+
6
+ def beta_smooth_l1_loss(input: Tensor, target: Tensor, beta: float) -> Tensor:
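+ # smooth L1 (Huber-style) loss: quadratic for |input - target| < beta, linear beyond it,
+ # summed and averaged over all elements (the 1e-8 term guards against empty inputs)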
7
+ diff = torch.abs(input - target)
8
+ loss = torch.where(diff < beta, 0.5 * diff ** 2 / beta, diff - 0.5 * beta)
9
+ loss = loss.sum() / (input.numel() + 1e-8)
10
+ return loss
extension/lr_scheduler.py ADDED
@@ -0,0 +1,23 @@
1
+ from typing import List
2
+
3
+ from torch.optim import Optimizer
4
+ from torch.optim.lr_scheduler import MultiStepLR
5
+
6
+
7
+ class WarmUpMultiStepLR(MultiStepLR):
8
+ def __init__(self, optimizer: Optimizer, milestones: List[int], gamma: float = 0.1,
9
+ factor: float = 0.3333, num_iters: int = 500, last_epoch: int = -1):
10
+ self.factor = factor
11
+ self.num_iters = num_iters
12
+ super().__init__(optimizer, milestones, gamma, last_epoch)
13
+
14
+ def get_lr(self) -> List[float]:
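+ # linear warm-up: scale the scheduled learning rates by a factor that grows from
+ # `self.factor` to 1 over the first `num_iters` scheduler steps, then follow MultiStepLR as usual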
15
+ if self.last_epoch < self.num_iters:
16
+ alpha = self.last_epoch / self.num_iters
17
+ factor = (1 - self.factor) * alpha + self.factor
18
+ return [lr * factor for lr in super()._get_closed_form_lr()]
19
+ else:
20
+ # warm-up finished: fall back to the plain MultiStepLR schedule
21
+ return super().get_lr()
models/MobileNetSSD_deploy.caffemodel ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:761c86fbae3d8361dd454f7c740a964f62975ed32f4324b8b85994edec30f6af
3
+ size 23147564
models/MobileNetSSD_deploy.prototxt.txt ADDED
@@ -0,0 +1,1912 @@
1
+ name: "MobileNet-SSD"
2
+ input: "data"
3
+ input_shape {
4
+ dim: 1
5
+ dim: 3
6
+ dim: 300
7
+ dim: 300
8
+ }
9
+ layer {
10
+ name: "conv0"
11
+ type: "Convolution"
12
+ bottom: "data"
13
+ top: "conv0"
14
+ param {
15
+ lr_mult: 1.0
16
+ decay_mult: 1.0
17
+ }
18
+ param {
19
+ lr_mult: 2.0
20
+ decay_mult: 0.0
21
+ }
22
+ convolution_param {
23
+ num_output: 32
24
+ pad: 1
25
+ kernel_size: 3
26
+ stride: 2
27
+ weight_filler {
28
+ type: "msra"
29
+ }
30
+ bias_filler {
31
+ type: "constant"
32
+ value: 0.0
33
+ }
34
+ }
35
+ }
36
+ layer {
37
+ name: "conv0/relu"
38
+ type: "ReLU"
39
+ bottom: "conv0"
40
+ top: "conv0"
41
+ }
42
+ layer {
43
+ name: "conv1/dw"
44
+ type: "Convolution"
45
+ bottom: "conv0"
46
+ top: "conv1/dw"
47
+ param {
48
+ lr_mult: 1.0
49
+ decay_mult: 1.0
50
+ }
51
+ param {
52
+ lr_mult: 2.0
53
+ decay_mult: 0.0
54
+ }
55
+ convolution_param {
56
+ num_output: 32
57
+ pad: 1
58
+ kernel_size: 3
59
+ group: 32
60
+ engine: CAFFE
61
+ weight_filler {
62
+ type: "msra"
63
+ }
64
+ bias_filler {
65
+ type: "constant"
66
+ value: 0.0
67
+ }
68
+ }
69
+ }
70
+ layer {
71
+ name: "conv1/dw/relu"
72
+ type: "ReLU"
73
+ bottom: "conv1/dw"
74
+ top: "conv1/dw"
75
+ }
76
+ layer {
77
+ name: "conv1"
78
+ type: "Convolution"
79
+ bottom: "conv1/dw"
80
+ top: "conv1"
81
+ param {
82
+ lr_mult: 1.0
83
+ decay_mult: 1.0
84
+ }
85
+ param {
86
+ lr_mult: 2.0
87
+ decay_mult: 0.0
88
+ }
89
+ convolution_param {
90
+ num_output: 64
91
+ kernel_size: 1
92
+ weight_filler {
93
+ type: "msra"
94
+ }
95
+ bias_filler {
96
+ type: "constant"
97
+ value: 0.0
98
+ }
99
+ }
100
+ }
101
+ layer {
102
+ name: "conv1/relu"
103
+ type: "ReLU"
104
+ bottom: "conv1"
105
+ top: "conv1"
106
+ }
107
+ layer {
108
+ name: "conv2/dw"
109
+ type: "Convolution"
110
+ bottom: "conv1"
111
+ top: "conv2/dw"
112
+ param {
113
+ lr_mult: 1.0
114
+ decay_mult: 1.0
115
+ }
116
+ param {
117
+ lr_mult: 2.0
118
+ decay_mult: 0.0
119
+ }
120
+ convolution_param {
121
+ num_output: 64
122
+ pad: 1
123
+ kernel_size: 3
124
+ stride: 2
125
+ group: 64
126
+ engine: CAFFE
127
+ weight_filler {
128
+ type: "msra"
129
+ }
130
+ bias_filler {
131
+ type: "constant"
132
+ value: 0.0
133
+ }
134
+ }
135
+ }
136
+ layer {
137
+ name: "conv2/dw/relu"
138
+ type: "ReLU"
139
+ bottom: "conv2/dw"
140
+ top: "conv2/dw"
141
+ }
142
+ layer {
143
+ name: "conv2"
144
+ type: "Convolution"
145
+ bottom: "conv2/dw"
146
+ top: "conv2"
147
+ param {
148
+ lr_mult: 1.0
149
+ decay_mult: 1.0
150
+ }
151
+ param {
152
+ lr_mult: 2.0
153
+ decay_mult: 0.0
154
+ }
155
+ convolution_param {
156
+ num_output: 128
157
+ kernel_size: 1
158
+ weight_filler {
159
+ type: "msra"
160
+ }
161
+ bias_filler {
162
+ type: "constant"
163
+ value: 0.0
164
+ }
165
+ }
166
+ }
167
+ layer {
168
+ name: "conv2/relu"
169
+ type: "ReLU"
170
+ bottom: "conv2"
171
+ top: "conv2"
172
+ }
173
+ layer {
174
+ name: "conv3/dw"
175
+ type: "Convolution"
176
+ bottom: "conv2"
177
+ top: "conv3/dw"
178
+ param {
179
+ lr_mult: 1.0
180
+ decay_mult: 1.0
181
+ }
182
+ param {
183
+ lr_mult: 2.0
184
+ decay_mult: 0.0
185
+ }
186
+ convolution_param {
187
+ num_output: 128
188
+ pad: 1
189
+ kernel_size: 3
190
+ group: 128
191
+ engine: CAFFE
192
+ weight_filler {
193
+ type: "msra"
194
+ }
195
+ bias_filler {
196
+ type: "constant"
197
+ value: 0.0
198
+ }
199
+ }
200
+ }
201
+ layer {
202
+ name: "conv3/dw/relu"
203
+ type: "ReLU"
204
+ bottom: "conv3/dw"
205
+ top: "conv3/dw"
206
+ }
207
+ layer {
208
+ name: "conv3"
209
+ type: "Convolution"
210
+ bottom: "conv3/dw"
211
+ top: "conv3"
212
+ param {
213
+ lr_mult: 1.0
214
+ decay_mult: 1.0
215
+ }
216
+ param {
217
+ lr_mult: 2.0
218
+ decay_mult: 0.0
219
+ }
220
+ convolution_param {
221
+ num_output: 128
222
+ kernel_size: 1
223
+ weight_filler {
224
+ type: "msra"
225
+ }
226
+ bias_filler {
227
+ type: "constant"
228
+ value: 0.0
229
+ }
230
+ }
231
+ }
232
+ layer {
233
+ name: "conv3/relu"
234
+ type: "ReLU"
235
+ bottom: "conv3"
236
+ top: "conv3"
237
+ }
238
+ layer {
239
+ name: "conv4/dw"
240
+ type: "Convolution"
241
+ bottom: "conv3"
242
+ top: "conv4/dw"
243
+ param {
244
+ lr_mult: 1.0
245
+ decay_mult: 1.0
246
+ }
247
+ param {
248
+ lr_mult: 2.0
249
+ decay_mult: 0.0
250
+ }
251
+ convolution_param {
252
+ num_output: 128
253
+ pad: 1
254
+ kernel_size: 3
255
+ stride: 2
256
+ group: 128
257
+ engine: CAFFE
258
+ weight_filler {
259
+ type: "msra"
260
+ }
261
+ bias_filler {
262
+ type: "constant"
263
+ value: 0.0
264
+ }
265
+ }
266
+ }
267
+ layer {
268
+ name: "conv4/dw/relu"
269
+ type: "ReLU"
270
+ bottom: "conv4/dw"
271
+ top: "conv4/dw"
272
+ }
273
+ layer {
274
+ name: "conv4"
275
+ type: "Convolution"
276
+ bottom: "conv4/dw"
277
+ top: "conv4"
278
+ param {
279
+ lr_mult: 1.0
280
+ decay_mult: 1.0
281
+ }
282
+ param {
283
+ lr_mult: 2.0
284
+ decay_mult: 0.0
285
+ }
286
+ convolution_param {
287
+ num_output: 256
288
+ kernel_size: 1
289
+ weight_filler {
290
+ type: "msra"
291
+ }
292
+ bias_filler {
293
+ type: "constant"
294
+ value: 0.0
295
+ }
296
+ }
297
+ }
298
+ layer {
299
+ name: "conv4/relu"
300
+ type: "ReLU"
301
+ bottom: "conv4"
302
+ top: "conv4"
303
+ }
304
+ layer {
305
+ name: "conv5/dw"
306
+ type: "Convolution"
307
+ bottom: "conv4"
308
+ top: "conv5/dw"
309
+ param {
310
+ lr_mult: 1.0
311
+ decay_mult: 1.0
312
+ }
313
+ param {
314
+ lr_mult: 2.0
315
+ decay_mult: 0.0
316
+ }
317
+ convolution_param {
318
+ num_output: 256
319
+ pad: 1
320
+ kernel_size: 3
321
+ group: 256
322
+ engine: CAFFE
323
+ weight_filler {
324
+ type: "msra"
325
+ }
326
+ bias_filler {
327
+ type: "constant"
328
+ value: 0.0
329
+ }
330
+ }
331
+ }
332
+ layer {
333
+ name: "conv5/dw/relu"
334
+ type: "ReLU"
335
+ bottom: "conv5/dw"
336
+ top: "conv5/dw"
337
+ }
338
+ layer {
339
+ name: "conv5"
340
+ type: "Convolution"
341
+ bottom: "conv5/dw"
342
+ top: "conv5"
343
+ param {
344
+ lr_mult: 1.0
345
+ decay_mult: 1.0
346
+ }
347
+ param {
348
+ lr_mult: 2.0
349
+ decay_mult: 0.0
350
+ }
351
+ convolution_param {
352
+ num_output: 256
353
+ kernel_size: 1
354
+ weight_filler {
355
+ type: "msra"
356
+ }
357
+ bias_filler {
358
+ type: "constant"
359
+ value: 0.0
360
+ }
361
+ }
362
+ }
363
+ layer {
364
+ name: "conv5/relu"
365
+ type: "ReLU"
366
+ bottom: "conv5"
367
+ top: "conv5"
368
+ }
369
+ layer {
370
+ name: "conv6/dw"
371
+ type: "Convolution"
372
+ bottom: "conv5"
373
+ top: "conv6/dw"
374
+ param {
375
+ lr_mult: 1.0
376
+ decay_mult: 1.0
377
+ }
378
+ param {
379
+ lr_mult: 2.0
380
+ decay_mult: 0.0
381
+ }
382
+ convolution_param {
383
+ num_output: 256
384
+ pad: 1
385
+ kernel_size: 3
386
+ stride: 2
387
+ group: 256
388
+ engine: CAFFE
389
+ weight_filler {
390
+ type: "msra"
391
+ }
392
+ bias_filler {
393
+ type: "constant"
394
+ value: 0.0
395
+ }
396
+ }
397
+ }
398
+ layer {
399
+ name: "conv6/dw/relu"
400
+ type: "ReLU"
401
+ bottom: "conv6/dw"
402
+ top: "conv6/dw"
403
+ }
404
+ layer {
405
+ name: "conv6"
406
+ type: "Convolution"
407
+ bottom: "conv6/dw"
408
+ top: "conv6"
409
+ param {
410
+ lr_mult: 1.0
411
+ decay_mult: 1.0
412
+ }
413
+ param {
414
+ lr_mult: 2.0
415
+ decay_mult: 0.0
416
+ }
417
+ convolution_param {
418
+ num_output: 512
419
+ kernel_size: 1
420
+ weight_filler {
421
+ type: "msra"
422
+ }
423
+ bias_filler {
424
+ type: "constant"
425
+ value: 0.0
426
+ }
427
+ }
428
+ }
429
+ layer {
430
+ name: "conv6/relu"
431
+ type: "ReLU"
432
+ bottom: "conv6"
433
+ top: "conv6"
434
+ }
435
+ layer {
436
+ name: "conv7/dw"
437
+ type: "Convolution"
438
+ bottom: "conv6"
439
+ top: "conv7/dw"
440
+ param {
441
+ lr_mult: 1.0
442
+ decay_mult: 1.0
443
+ }
444
+ param {
445
+ lr_mult: 2.0
446
+ decay_mult: 0.0
447
+ }
448
+ convolution_param {
449
+ num_output: 512
450
+ pad: 1
451
+ kernel_size: 3
452
+ group: 512
453
+ engine: CAFFE
454
+ weight_filler {
455
+ type: "msra"
456
+ }
457
+ bias_filler {
458
+ type: "constant"
459
+ value: 0.0
460
+ }
461
+ }
462
+ }
463
+ layer {
464
+ name: "conv7/dw/relu"
465
+ type: "ReLU"
466
+ bottom: "conv7/dw"
467
+ top: "conv7/dw"
468
+ }
469
+ layer {
470
+ name: "conv7"
471
+ type: "Convolution"
472
+ bottom: "conv7/dw"
473
+ top: "conv7"
474
+ param {
475
+ lr_mult: 1.0
476
+ decay_mult: 1.0
477
+ }
478
+ param {
479
+ lr_mult: 2.0
480
+ decay_mult: 0.0
481
+ }
482
+ convolution_param {
483
+ num_output: 512
484
+ kernel_size: 1
485
+ weight_filler {
486
+ type: "msra"
487
+ }
488
+ bias_filler {
489
+ type: "constant"
490
+ value: 0.0
491
+ }
492
+ }
493
+ }
494
+ layer {
495
+ name: "conv7/relu"
496
+ type: "ReLU"
497
+ bottom: "conv7"
498
+ top: "conv7"
499
+ }
500
+ layer {
501
+ name: "conv8/dw"
502
+ type: "Convolution"
503
+ bottom: "conv7"
504
+ top: "conv8/dw"
505
+ param {
506
+ lr_mult: 1.0
507
+ decay_mult: 1.0
508
+ }
509
+ param {
510
+ lr_mult: 2.0
511
+ decay_mult: 0.0
512
+ }
513
+ convolution_param {
514
+ num_output: 512
515
+ pad: 1
516
+ kernel_size: 3
517
+ group: 512
518
+ engine: CAFFE
519
+ weight_filler {
520
+ type: "msra"
521
+ }
522
+ bias_filler {
523
+ type: "constant"
524
+ value: 0.0
525
+ }
526
+ }
527
+ }
528
+ layer {
529
+ name: "conv8/dw/relu"
530
+ type: "ReLU"
531
+ bottom: "conv8/dw"
532
+ top: "conv8/dw"
533
+ }
534
+ layer {
535
+ name: "conv8"
536
+ type: "Convolution"
537
+ bottom: "conv8/dw"
538
+ top: "conv8"
539
+ param {
540
+ lr_mult: 1.0
541
+ decay_mult: 1.0
542
+ }
543
+ param {
544
+ lr_mult: 2.0
545
+ decay_mult: 0.0
546
+ }
547
+ convolution_param {
548
+ num_output: 512
549
+ kernel_size: 1
550
+ weight_filler {
551
+ type: "msra"
552
+ }
553
+ bias_filler {
554
+ type: "constant"
555
+ value: 0.0
556
+ }
557
+ }
558
+ }
559
+ layer {
560
+ name: "conv8/relu"
561
+ type: "ReLU"
562
+ bottom: "conv8"
563
+ top: "conv8"
564
+ }
565
+ layer {
566
+ name: "conv9/dw"
567
+ type: "Convolution"
568
+ bottom: "conv8"
569
+ top: "conv9/dw"
570
+ param {
571
+ lr_mult: 1.0
572
+ decay_mult: 1.0
573
+ }
574
+ param {
575
+ lr_mult: 2.0
576
+ decay_mult: 0.0
577
+ }
578
+ convolution_param {
579
+ num_output: 512
580
+ pad: 1
581
+ kernel_size: 3
582
+ group: 512
583
+ engine: CAFFE
584
+ weight_filler {
585
+ type: "msra"
586
+ }
587
+ bias_filler {
588
+ type: "constant"
589
+ value: 0.0
590
+ }
591
+ }
592
+ }
593
+ layer {
594
+ name: "conv9/dw/relu"
595
+ type: "ReLU"
596
+ bottom: "conv9/dw"
597
+ top: "conv9/dw"
598
+ }
599
+ layer {
600
+ name: "conv9"
601
+ type: "Convolution"
602
+ bottom: "conv9/dw"
603
+ top: "conv9"
604
+ param {
605
+ lr_mult: 1.0
606
+ decay_mult: 1.0
607
+ }
608
+ param {
609
+ lr_mult: 2.0
610
+ decay_mult: 0.0
611
+ }
612
+ convolution_param {
613
+ num_output: 512
614
+ kernel_size: 1
615
+ weight_filler {
616
+ type: "msra"
617
+ }
618
+ bias_filler {
619
+ type: "constant"
620
+ value: 0.0
621
+ }
622
+ }
623
+ }
624
+ layer {
625
+ name: "conv9/relu"
626
+ type: "ReLU"
627
+ bottom: "conv9"
628
+ top: "conv9"
629
+ }
630
+ layer {
631
+ name: "conv10/dw"
632
+ type: "Convolution"
633
+ bottom: "conv9"
634
+ top: "conv10/dw"
635
+ param {
636
+ lr_mult: 1.0
637
+ decay_mult: 1.0
638
+ }
639
+ param {
640
+ lr_mult: 2.0
641
+ decay_mult: 0.0
642
+ }
643
+ convolution_param {
644
+ num_output: 512
645
+ pad: 1
646
+ kernel_size: 3
647
+ group: 512
648
+ engine: CAFFE
649
+ weight_filler {
650
+ type: "msra"
651
+ }
652
+ bias_filler {
653
+ type: "constant"
654
+ value: 0.0
655
+ }
656
+ }
657
+ }
658
+ layer {
659
+ name: "conv10/dw/relu"
660
+ type: "ReLU"
661
+ bottom: "conv10/dw"
662
+ top: "conv10/dw"
663
+ }
664
+ layer {
665
+ name: "conv10"
666
+ type: "Convolution"
667
+ bottom: "conv10/dw"
668
+ top: "conv10"
669
+ param {
670
+ lr_mult: 1.0
671
+ decay_mult: 1.0
672
+ }
673
+ param {
674
+ lr_mult: 2.0
675
+ decay_mult: 0.0
676
+ }
677
+ convolution_param {
678
+ num_output: 512
679
+ kernel_size: 1
680
+ weight_filler {
681
+ type: "msra"
682
+ }
683
+ bias_filler {
684
+ type: "constant"
685
+ value: 0.0
686
+ }
687
+ }
688
+ }
689
+ layer {
690
+ name: "conv10/relu"
691
+ type: "ReLU"
692
+ bottom: "conv10"
693
+ top: "conv10"
694
+ }
695
+ layer {
696
+ name: "conv11/dw"
697
+ type: "Convolution"
698
+ bottom: "conv10"
699
+ top: "conv11/dw"
700
+ param {
701
+ lr_mult: 1.0
702
+ decay_mult: 1.0
703
+ }
704
+ param {
705
+ lr_mult: 2.0
706
+ decay_mult: 0.0
707
+ }
708
+ convolution_param {
709
+ num_output: 512
710
+ pad: 1
711
+ kernel_size: 3
712
+ group: 512
713
+ engine: CAFFE
714
+ weight_filler {
715
+ type: "msra"
716
+ }
717
+ bias_filler {
718
+ type: "constant"
719
+ value: 0.0
720
+ }
721
+ }
722
+ }
723
+ layer {
724
+ name: "conv11/dw/relu"
725
+ type: "ReLU"
726
+ bottom: "conv11/dw"
727
+ top: "conv11/dw"
728
+ }
729
+ layer {
730
+ name: "conv11"
731
+ type: "Convolution"
732
+ bottom: "conv11/dw"
733
+ top: "conv11"
734
+ param {
735
+ lr_mult: 1.0
736
+ decay_mult: 1.0
737
+ }
738
+ param {
739
+ lr_mult: 2.0
740
+ decay_mult: 0.0
741
+ }
742
+ convolution_param {
743
+ num_output: 512
744
+ kernel_size: 1
745
+ weight_filler {
746
+ type: "msra"
747
+ }
748
+ bias_filler {
749
+ type: "constant"
750
+ value: 0.0
751
+ }
752
+ }
753
+ }
754
+ layer {
755
+ name: "conv11/relu"
756
+ type: "ReLU"
757
+ bottom: "conv11"
758
+ top: "conv11"
759
+ }
760
+ layer {
761
+ name: "conv12/dw"
762
+ type: "Convolution"
763
+ bottom: "conv11"
764
+ top: "conv12/dw"
765
+ param {
766
+ lr_mult: 1.0
767
+ decay_mult: 1.0
768
+ }
769
+ param {
770
+ lr_mult: 2.0
771
+ decay_mult: 0.0
772
+ }
773
+ convolution_param {
774
+ num_output: 512
775
+ pad: 1
776
+ kernel_size: 3
777
+ stride: 2
778
+ group: 512
779
+ engine: CAFFE
780
+ weight_filler {
781
+ type: "msra"
782
+ }
783
+ bias_filler {
784
+ type: "constant"
785
+ value: 0.0
786
+ }
787
+ }
788
+ }
789
+ layer {
790
+ name: "conv12/dw/relu"
791
+ type: "ReLU"
792
+ bottom: "conv12/dw"
793
+ top: "conv12/dw"
794
+ }
795
+ layer {
796
+ name: "conv12"
797
+ type: "Convolution"
798
+ bottom: "conv12/dw"
799
+ top: "conv12"
800
+ param {
801
+ lr_mult: 1.0
802
+ decay_mult: 1.0
803
+ }
804
+ param {
805
+ lr_mult: 2.0
806
+ decay_mult: 0.0
807
+ }
808
+ convolution_param {
809
+ num_output: 1024
810
+ kernel_size: 1
811
+ weight_filler {
812
+ type: "msra"
813
+ }
814
+ bias_filler {
815
+ type: "constant"
816
+ value: 0.0
817
+ }
818
+ }
819
+ }
820
+ layer {
821
+ name: "conv12/relu"
822
+ type: "ReLU"
823
+ bottom: "conv12"
824
+ top: "conv12"
825
+ }
826
+ layer {
827
+ name: "conv13/dw"
828
+ type: "Convolution"
829
+ bottom: "conv12"
830
+ top: "conv13/dw"
831
+ param {
832
+ lr_mult: 1.0
833
+ decay_mult: 1.0
834
+ }
835
+ param {
836
+ lr_mult: 2.0
837
+ decay_mult: 0.0
838
+ }
839
+ convolution_param {
840
+ num_output: 1024
841
+ pad: 1
842
+ kernel_size: 3
843
+ group: 1024
844
+ engine: CAFFE
845
+ weight_filler {
846
+ type: "msra"
847
+ }
848
+ bias_filler {
849
+ type: "constant"
850
+ value: 0.0
851
+ }
852
+ }
853
+ }
854
+ layer {
855
+ name: "conv13/dw/relu"
856
+ type: "ReLU"
857
+ bottom: "conv13/dw"
858
+ top: "conv13/dw"
859
+ }
860
+ layer {
861
+ name: "conv13"
862
+ type: "Convolution"
863
+ bottom: "conv13/dw"
864
+ top: "conv13"
865
+ param {
866
+ lr_mult: 1.0
867
+ decay_mult: 1.0
868
+ }
869
+ param {
870
+ lr_mult: 2.0
871
+ decay_mult: 0.0
872
+ }
873
+ convolution_param {
874
+ num_output: 1024
875
+ kernel_size: 1
876
+ weight_filler {
877
+ type: "msra"
878
+ }
879
+ bias_filler {
880
+ type: "constant"
881
+ value: 0.0
882
+ }
883
+ }
884
+ }
885
+ layer {
886
+ name: "conv13/relu"
887
+ type: "ReLU"
888
+ bottom: "conv13"
889
+ top: "conv13"
890
+ }
891
+ layer {
892
+ name: "conv14_1"
893
+ type: "Convolution"
894
+ bottom: "conv13"
895
+ top: "conv14_1"
896
+ param {
897
+ lr_mult: 1.0
898
+ decay_mult: 1.0
899
+ }
900
+ param {
901
+ lr_mult: 2.0
902
+ decay_mult: 0.0
903
+ }
904
+ convolution_param {
905
+ num_output: 256
906
+ kernel_size: 1
907
+ weight_filler {
908
+ type: "msra"
909
+ }
910
+ bias_filler {
911
+ type: "constant"
912
+ value: 0.0
913
+ }
914
+ }
915
+ }
916
+ layer {
917
+ name: "conv14_1/relu"
918
+ type: "ReLU"
919
+ bottom: "conv14_1"
920
+ top: "conv14_1"
921
+ }
922
+ layer {
923
+ name: "conv14_2"
924
+ type: "Convolution"
925
+ bottom: "conv14_1"
926
+ top: "conv14_2"
927
+ param {
928
+ lr_mult: 1.0
929
+ decay_mult: 1.0
930
+ }
931
+ param {
932
+ lr_mult: 2.0
933
+ decay_mult: 0.0
934
+ }
935
+ convolution_param {
936
+ num_output: 512
937
+ pad: 1
938
+ kernel_size: 3
939
+ stride: 2
940
+ weight_filler {
941
+ type: "msra"
942
+ }
943
+ bias_filler {
944
+ type: "constant"
945
+ value: 0.0
946
+ }
947
+ }
948
+ }
949
+ layer {
950
+ name: "conv14_2/relu"
951
+ type: "ReLU"
952
+ bottom: "conv14_2"
953
+ top: "conv14_2"
954
+ }
955
+ layer {
956
+ name: "conv15_1"
957
+ type: "Convolution"
958
+ bottom: "conv14_2"
959
+ top: "conv15_1"
960
+ param {
961
+ lr_mult: 1.0
962
+ decay_mult: 1.0
963
+ }
964
+ param {
965
+ lr_mult: 2.0
966
+ decay_mult: 0.0
967
+ }
968
+ convolution_param {
969
+ num_output: 128
970
+ kernel_size: 1
971
+ weight_filler {
972
+ type: "msra"
973
+ }
974
+ bias_filler {
975
+ type: "constant"
976
+ value: 0.0
977
+ }
978
+ }
979
+ }
980
+ layer {
981
+ name: "conv15_1/relu"
982
+ type: "ReLU"
983
+ bottom: "conv15_1"
984
+ top: "conv15_1"
985
+ }
986
+ layer {
987
+ name: "conv15_2"
988
+ type: "Convolution"
989
+ bottom: "conv15_1"
990
+ top: "conv15_2"
991
+ param {
992
+ lr_mult: 1.0
993
+ decay_mult: 1.0
994
+ }
995
+ param {
996
+ lr_mult: 2.0
997
+ decay_mult: 0.0
998
+ }
999
+ convolution_param {
1000
+ num_output: 256
1001
+ pad: 1
1002
+ kernel_size: 3
1003
+ stride: 2
1004
+ weight_filler {
1005
+ type: "msra"
1006
+ }
1007
+ bias_filler {
1008
+ type: "constant"
1009
+ value: 0.0
1010
+ }
1011
+ }
1012
+ }
1013
+ layer {
1014
+ name: "conv15_2/relu"
1015
+ type: "ReLU"
1016
+ bottom: "conv15_2"
1017
+ top: "conv15_2"
1018
+ }
1019
+ layer {
1020
+ name: "conv16_1"
1021
+ type: "Convolution"
1022
+ bottom: "conv15_2"
1023
+ top: "conv16_1"
1024
+ param {
1025
+ lr_mult: 1.0
1026
+ decay_mult: 1.0
1027
+ }
1028
+ param {
1029
+ lr_mult: 2.0
1030
+ decay_mult: 0.0
1031
+ }
1032
+ convolution_param {
1033
+ num_output: 128
1034
+ kernel_size: 1
1035
+ weight_filler {
1036
+ type: "msra"
1037
+ }
1038
+ bias_filler {
1039
+ type: "constant"
1040
+ value: 0.0
1041
+ }
1042
+ }
1043
+ }
1044
+ layer {
1045
+ name: "conv16_1/relu"
1046
+ type: "ReLU"
1047
+ bottom: "conv16_1"
1048
+ top: "conv16_1"
1049
+ }
1050
+ layer {
1051
+ name: "conv16_2"
1052
+ type: "Convolution"
1053
+ bottom: "conv16_1"
1054
+ top: "conv16_2"
1055
+ param {
1056
+ lr_mult: 1.0
1057
+ decay_mult: 1.0
1058
+ }
1059
+ param {
1060
+ lr_mult: 2.0
1061
+ decay_mult: 0.0
1062
+ }
1063
+ convolution_param {
1064
+ num_output: 256
1065
+ pad: 1
1066
+ kernel_size: 3
1067
+ stride: 2
1068
+ weight_filler {
1069
+ type: "msra"
1070
+ }
1071
+ bias_filler {
1072
+ type: "constant"
1073
+ value: 0.0
1074
+ }
1075
+ }
1076
+ }
1077
+ layer {
1078
+ name: "conv16_2/relu"
1079
+ type: "ReLU"
1080
+ bottom: "conv16_2"
1081
+ top: "conv16_2"
1082
+ }
1083
+ layer {
1084
+ name: "conv17_1"
1085
+ type: "Convolution"
1086
+ bottom: "conv16_2"
1087
+ top: "conv17_1"
1088
+ param {
1089
+ lr_mult: 1.0
1090
+ decay_mult: 1.0
1091
+ }
1092
+ param {
1093
+ lr_mult: 2.0
1094
+ decay_mult: 0.0
1095
+ }
1096
+ convolution_param {
1097
+ num_output: 64
1098
+ kernel_size: 1
1099
+ weight_filler {
1100
+ type: "msra"
1101
+ }
1102
+ bias_filler {
1103
+ type: "constant"
1104
+ value: 0.0
1105
+ }
1106
+ }
1107
+ }
1108
+ layer {
1109
+ name: "conv17_1/relu"
1110
+ type: "ReLU"
1111
+ bottom: "conv17_1"
1112
+ top: "conv17_1"
1113
+ }
1114
+ layer {
1115
+ name: "conv17_2"
1116
+ type: "Convolution"
1117
+ bottom: "conv17_1"
1118
+ top: "conv17_2"
1119
+ param {
1120
+ lr_mult: 1.0
1121
+ decay_mult: 1.0
1122
+ }
1123
+ param {
1124
+ lr_mult: 2.0
1125
+ decay_mult: 0.0
1126
+ }
1127
+ convolution_param {
1128
+ num_output: 128
1129
+ pad: 1
1130
+ kernel_size: 3
1131
+ stride: 2
1132
+ weight_filler {
1133
+ type: "msra"
1134
+ }
1135
+ bias_filler {
1136
+ type: "constant"
1137
+ value: 0.0
1138
+ }
1139
+ }
1140
+ }
1141
+ layer {
1142
+ name: "conv17_2/relu"
1143
+ type: "ReLU"
1144
+ bottom: "conv17_2"
1145
+ top: "conv17_2"
1146
+ }
1147
+ layer {
1148
+ name: "conv11_mbox_loc"
1149
+ type: "Convolution"
1150
+ bottom: "conv11"
1151
+ top: "conv11_mbox_loc"
1152
+ param {
1153
+ lr_mult: 1.0
1154
+ decay_mult: 1.0
1155
+ }
1156
+ param {
1157
+ lr_mult: 2.0
1158
+ decay_mult: 0.0
1159
+ }
1160
+ convolution_param {
1161
+ num_output: 12
1162
+ kernel_size: 1
1163
+ weight_filler {
1164
+ type: "msra"
1165
+ }
1166
+ bias_filler {
1167
+ type: "constant"
1168
+ value: 0.0
1169
+ }
1170
+ }
1171
+ }
1172
+ layer {
1173
+ name: "conv11_mbox_loc_perm"
1174
+ type: "Permute"
1175
+ bottom: "conv11_mbox_loc"
1176
+ top: "conv11_mbox_loc_perm"
1177
+ permute_param {
1178
+ order: 0
1179
+ order: 2
1180
+ order: 3
1181
+ order: 1
1182
+ }
1183
+ }
1184
+ layer {
1185
+ name: "conv11_mbox_loc_flat"
1186
+ type: "Flatten"
1187
+ bottom: "conv11_mbox_loc_perm"
1188
+ top: "conv11_mbox_loc_flat"
1189
+ flatten_param {
1190
+ axis: 1
1191
+ }
1192
+ }
1193
+ layer {
1194
+ name: "conv11_mbox_conf"
1195
+ type: "Convolution"
1196
+ bottom: "conv11"
1197
+ top: "conv11_mbox_conf"
1198
+ param {
1199
+ lr_mult: 1.0
1200
+ decay_mult: 1.0
1201
+ }
1202
+ param {
1203
+ lr_mult: 2.0
1204
+ decay_mult: 0.0
1205
+ }
1206
+ convolution_param {
1207
+ num_output: 63
1208
+ kernel_size: 1
1209
+ weight_filler {
1210
+ type: "msra"
1211
+ }
1212
+ bias_filler {
1213
+ type: "constant"
1214
+ value: 0.0
1215
+ }
1216
+ }
1217
+ }
1218
+ layer {
1219
+ name: "conv11_mbox_conf_perm"
1220
+ type: "Permute"
1221
+ bottom: "conv11_mbox_conf"
1222
+ top: "conv11_mbox_conf_perm"
1223
+ permute_param {
1224
+ order: 0
1225
+ order: 2
1226
+ order: 3
1227
+ order: 1
1228
+ }
1229
+ }
1230
+ layer {
1231
+ name: "conv11_mbox_conf_flat"
1232
+ type: "Flatten"
1233
+ bottom: "conv11_mbox_conf_perm"
1234
+ top: "conv11_mbox_conf_flat"
1235
+ flatten_param {
1236
+ axis: 1
1237
+ }
1238
+ }
1239
+ layer {
1240
+ name: "conv11_mbox_priorbox"
1241
+ type: "PriorBox"
1242
+ bottom: "conv11"
1243
+ bottom: "data"
1244
+ top: "conv11_mbox_priorbox"
1245
+ prior_box_param {
1246
+ min_size: 60.0
1247
+ aspect_ratio: 2.0
1248
+ flip: true
1249
+ clip: false
1250
+ variance: 0.1
1251
+ variance: 0.1
1252
+ variance: 0.2
1253
+ variance: 0.2
1254
+ offset: 0.5
1255
+ }
1256
+ }
1257
+ layer {
1258
+ name: "conv13_mbox_loc"
1259
+ type: "Convolution"
1260
+ bottom: "conv13"
1261
+ top: "conv13_mbox_loc"
1262
+ param {
1263
+ lr_mult: 1.0
1264
+ decay_mult: 1.0
1265
+ }
1266
+ param {
1267
+ lr_mult: 2.0
1268
+ decay_mult: 0.0
1269
+ }
1270
+ convolution_param {
1271
+ num_output: 24
1272
+ kernel_size: 1
1273
+ weight_filler {
1274
+ type: "msra"
1275
+ }
1276
+ bias_filler {
1277
+ type: "constant"
1278
+ value: 0.0
1279
+ }
1280
+ }
1281
+ }
1282
+ layer {
1283
+ name: "conv13_mbox_loc_perm"
1284
+ type: "Permute"
1285
+ bottom: "conv13_mbox_loc"
1286
+ top: "conv13_mbox_loc_perm"
1287
+ permute_param {
1288
+ order: 0
1289
+ order: 2
1290
+ order: 3
1291
+ order: 1
1292
+ }
1293
+ }
1294
+ layer {
1295
+ name: "conv13_mbox_loc_flat"
1296
+ type: "Flatten"
1297
+ bottom: "conv13_mbox_loc_perm"
1298
+ top: "conv13_mbox_loc_flat"
1299
+ flatten_param {
1300
+ axis: 1
1301
+ }
1302
+ }
1303
+ layer {
1304
+ name: "conv13_mbox_conf"
1305
+ type: "Convolution"
1306
+ bottom: "conv13"
1307
+ top: "conv13_mbox_conf"
1308
+ param {
1309
+ lr_mult: 1.0
1310
+ decay_mult: 1.0
1311
+ }
1312
+ param {
1313
+ lr_mult: 2.0
1314
+ decay_mult: 0.0
1315
+ }
1316
+ convolution_param {
1317
+ num_output: 126
1318
+ kernel_size: 1
1319
+ weight_filler {
1320
+ type: "msra"
1321
+ }
1322
+ bias_filler {
1323
+ type: "constant"
1324
+ value: 0.0
1325
+ }
1326
+ }
1327
+ }
1328
+ layer {
1329
+ name: "conv13_mbox_conf_perm"
1330
+ type: "Permute"
1331
+ bottom: "conv13_mbox_conf"
1332
+ top: "conv13_mbox_conf_perm"
1333
+ permute_param {
1334
+ order: 0
1335
+ order: 2
1336
+ order: 3
1337
+ order: 1
1338
+ }
1339
+ }
1340
+ layer {
1341
+ name: "conv13_mbox_conf_flat"
1342
+ type: "Flatten"
1343
+ bottom: "conv13_mbox_conf_perm"
1344
+ top: "conv13_mbox_conf_flat"
1345
+ flatten_param {
1346
+ axis: 1
1347
+ }
1348
+ }
1349
+ layer {
1350
+ name: "conv13_mbox_priorbox"
1351
+ type: "PriorBox"
1352
+ bottom: "conv13"
1353
+ bottom: "data"
1354
+ top: "conv13_mbox_priorbox"
1355
+ prior_box_param {
1356
+ min_size: 105.0
1357
+ max_size: 150.0
1358
+ aspect_ratio: 2.0
1359
+ aspect_ratio: 3.0
1360
+ flip: true
1361
+ clip: false
1362
+ variance: 0.1
1363
+ variance: 0.1
1364
+ variance: 0.2
1365
+ variance: 0.2
1366
+ offset: 0.5
1367
+ }
1368
+ }
1369
+ layer {
1370
+ name: "conv14_2_mbox_loc"
1371
+ type: "Convolution"
1372
+ bottom: "conv14_2"
1373
+ top: "conv14_2_mbox_loc"
1374
+ param {
1375
+ lr_mult: 1.0
1376
+ decay_mult: 1.0
1377
+ }
1378
+ param {
1379
+ lr_mult: 2.0
1380
+ decay_mult: 0.0
1381
+ }
1382
+ convolution_param {
1383
+ num_output: 24
1384
+ kernel_size: 1
1385
+ weight_filler {
1386
+ type: "msra"
1387
+ }
1388
+ bias_filler {
1389
+ type: "constant"
1390
+ value: 0.0
1391
+ }
1392
+ }
1393
+ }
1394
+ layer {
1395
+ name: "conv14_2_mbox_loc_perm"
1396
+ type: "Permute"
1397
+ bottom: "conv14_2_mbox_loc"
1398
+ top: "conv14_2_mbox_loc_perm"
1399
+ permute_param {
1400
+ order: 0
1401
+ order: 2
1402
+ order: 3
1403
+ order: 1
1404
+ }
1405
+ }
1406
+ layer {
1407
+ name: "conv14_2_mbox_loc_flat"
1408
+ type: "Flatten"
1409
+ bottom: "conv14_2_mbox_loc_perm"
1410
+ top: "conv14_2_mbox_loc_flat"
1411
+ flatten_param {
1412
+ axis: 1
1413
+ }
1414
+ }
1415
+ layer {
1416
+ name: "conv14_2_mbox_conf"
1417
+ type: "Convolution"
1418
+ bottom: "conv14_2"
1419
+ top: "conv14_2_mbox_conf"
1420
+ param {
1421
+ lr_mult: 1.0
1422
+ decay_mult: 1.0
1423
+ }
1424
+ param {
1425
+ lr_mult: 2.0
1426
+ decay_mult: 0.0
1427
+ }
1428
+ convolution_param {
1429
+ num_output: 126
1430
+ kernel_size: 1
1431
+ weight_filler {
1432
+ type: "msra"
1433
+ }
1434
+ bias_filler {
1435
+ type: "constant"
1436
+ value: 0.0
1437
+ }
1438
+ }
1439
+ }
1440
+ layer {
1441
+ name: "conv14_2_mbox_conf_perm"
1442
+ type: "Permute"
1443
+ bottom: "conv14_2_mbox_conf"
1444
+ top: "conv14_2_mbox_conf_perm"
1445
+ permute_param {
1446
+ order: 0
1447
+ order: 2
1448
+ order: 3
1449
+ order: 1
1450
+ }
1451
+ }
1452
+ layer {
1453
+ name: "conv14_2_mbox_conf_flat"
1454
+ type: "Flatten"
1455
+ bottom: "conv14_2_mbox_conf_perm"
1456
+ top: "conv14_2_mbox_conf_flat"
1457
+ flatten_param {
1458
+ axis: 1
1459
+ }
1460
+ }
1461
+ layer {
1462
+ name: "conv14_2_mbox_priorbox"
1463
+ type: "PriorBox"
1464
+ bottom: "conv14_2"
1465
+ bottom: "data"
1466
+ top: "conv14_2_mbox_priorbox"
1467
+ prior_box_param {
1468
+ min_size: 150.0
1469
+ max_size: 195.0
1470
+ aspect_ratio: 2.0
1471
+ aspect_ratio: 3.0
1472
+ flip: true
1473
+ clip: false
1474
+ variance: 0.1
1475
+ variance: 0.1
1476
+ variance: 0.2
1477
+ variance: 0.2
1478
+ offset: 0.5
1479
+ }
1480
+ }
1481
+ layer {
1482
+ name: "conv15_2_mbox_loc"
1483
+ type: "Convolution"
1484
+ bottom: "conv15_2"
1485
+ top: "conv15_2_mbox_loc"
1486
+ param {
1487
+ lr_mult: 1.0
1488
+ decay_mult: 1.0
1489
+ }
1490
+ param {
1491
+ lr_mult: 2.0
1492
+ decay_mult: 0.0
1493
+ }
1494
+ convolution_param {
1495
+ num_output: 24
1496
+ kernel_size: 1
1497
+ weight_filler {
1498
+ type: "msra"
1499
+ }
1500
+ bias_filler {
1501
+ type: "constant"
1502
+ value: 0.0
1503
+ }
1504
+ }
1505
+ }
1506
+ layer {
1507
+ name: "conv15_2_mbox_loc_perm"
1508
+ type: "Permute"
1509
+ bottom: "conv15_2_mbox_loc"
1510
+ top: "conv15_2_mbox_loc_perm"
1511
+ permute_param {
1512
+ order: 0
1513
+ order: 2
1514
+ order: 3
1515
+ order: 1
1516
+ }
1517
+ }
1518
+ layer {
1519
+ name: "conv15_2_mbox_loc_flat"
1520
+ type: "Flatten"
1521
+ bottom: "conv15_2_mbox_loc_perm"
1522
+ top: "conv15_2_mbox_loc_flat"
1523
+ flatten_param {
1524
+ axis: 1
1525
+ }
1526
+ }
1527
+ layer {
1528
+ name: "conv15_2_mbox_conf"
1529
+ type: "Convolution"
1530
+ bottom: "conv15_2"
1531
+ top: "conv15_2_mbox_conf"
1532
+ param {
1533
+ lr_mult: 1.0
1534
+ decay_mult: 1.0
1535
+ }
1536
+ param {
1537
+ lr_mult: 2.0
1538
+ decay_mult: 0.0
1539
+ }
1540
+ convolution_param {
1541
+ num_output: 126
1542
+ kernel_size: 1
1543
+ weight_filler {
1544
+ type: "msra"
1545
+ }
1546
+ bias_filler {
1547
+ type: "constant"
1548
+ value: 0.0
1549
+ }
1550
+ }
1551
+ }
1552
+ layer {
1553
+ name: "conv15_2_mbox_conf_perm"
1554
+ type: "Permute"
1555
+ bottom: "conv15_2_mbox_conf"
1556
+ top: "conv15_2_mbox_conf_perm"
1557
+ permute_param {
1558
+ order: 0
1559
+ order: 2
1560
+ order: 3
1561
+ order: 1
1562
+ }
1563
+ }
1564
+ layer {
1565
+ name: "conv15_2_mbox_conf_flat"
1566
+ type: "Flatten"
1567
+ bottom: "conv15_2_mbox_conf_perm"
1568
+ top: "conv15_2_mbox_conf_flat"
1569
+ flatten_param {
1570
+ axis: 1
1571
+ }
1572
+ }
1573
+ layer {
1574
+ name: "conv15_2_mbox_priorbox"
1575
+ type: "PriorBox"
1576
+ bottom: "conv15_2"
1577
+ bottom: "data"
1578
+ top: "conv15_2_mbox_priorbox"
1579
+ prior_box_param {
1580
+ min_size: 195.0
1581
+ max_size: 240.0
1582
+ aspect_ratio: 2.0
1583
+ aspect_ratio: 3.0
1584
+ flip: true
1585
+ clip: false
1586
+ variance: 0.1
1587
+ variance: 0.1
1588
+ variance: 0.2
1589
+ variance: 0.2
1590
+ offset: 0.5
1591
+ }
1592
+ }
1593
+ layer {
1594
+ name: "conv16_2_mbox_loc"
1595
+ type: "Convolution"
1596
+ bottom: "conv16_2"
1597
+ top: "conv16_2_mbox_loc"
1598
+ param {
1599
+ lr_mult: 1.0
1600
+ decay_mult: 1.0
1601
+ }
1602
+ param {
1603
+ lr_mult: 2.0
1604
+ decay_mult: 0.0
1605
+ }
1606
+ convolution_param {
1607
+ num_output: 24
1608
+ kernel_size: 1
1609
+ weight_filler {
1610
+ type: "msra"
1611
+ }
1612
+ bias_filler {
1613
+ type: "constant"
1614
+ value: 0.0
1615
+ }
1616
+ }
1617
+ }
1618
+ layer {
1619
+ name: "conv16_2_mbox_loc_perm"
1620
+ type: "Permute"
1621
+ bottom: "conv16_2_mbox_loc"
1622
+ top: "conv16_2_mbox_loc_perm"
1623
+ permute_param {
1624
+ order: 0
1625
+ order: 2
1626
+ order: 3
1627
+ order: 1
1628
+ }
1629
+ }
1630
+ layer {
1631
+ name: "conv16_2_mbox_loc_flat"
1632
+ type: "Flatten"
1633
+ bottom: "conv16_2_mbox_loc_perm"
1634
+ top: "conv16_2_mbox_loc_flat"
1635
+ flatten_param {
1636
+ axis: 1
1637
+ }
1638
+ }
1639
+ layer {
1640
+ name: "conv16_2_mbox_conf"
1641
+ type: "Convolution"
1642
+ bottom: "conv16_2"
1643
+ top: "conv16_2_mbox_conf"
1644
+ param {
1645
+ lr_mult: 1.0
1646
+ decay_mult: 1.0
1647
+ }
1648
+ param {
1649
+ lr_mult: 2.0
1650
+ decay_mult: 0.0
1651
+ }
1652
+ convolution_param {
1653
+ num_output: 126
1654
+ kernel_size: 1
1655
+ weight_filler {
1656
+ type: "msra"
1657
+ }
1658
+ bias_filler {
1659
+ type: "constant"
1660
+ value: 0.0
1661
+ }
1662
+ }
1663
+ }
1664
+ layer {
1665
+ name: "conv16_2_mbox_conf_perm"
1666
+ type: "Permute"
1667
+ bottom: "conv16_2_mbox_conf"
1668
+ top: "conv16_2_mbox_conf_perm"
1669
+ permute_param {
1670
+ order: 0
1671
+ order: 2
1672
+ order: 3
1673
+ order: 1
1674
+ }
1675
+ }
1676
+ layer {
1677
+ name: "conv16_2_mbox_conf_flat"
1678
+ type: "Flatten"
1679
+ bottom: "conv16_2_mbox_conf_perm"
1680
+ top: "conv16_2_mbox_conf_flat"
1681
+ flatten_param {
1682
+ axis: 1
1683
+ }
1684
+ }
1685
+ layer {
1686
+ name: "conv16_2_mbox_priorbox"
1687
+ type: "PriorBox"
1688
+ bottom: "conv16_2"
1689
+ bottom: "data"
1690
+ top: "conv16_2_mbox_priorbox"
1691
+ prior_box_param {
1692
+ min_size: 240.0
1693
+ max_size: 285.0
1694
+ aspect_ratio: 2.0
1695
+ aspect_ratio: 3.0
1696
+ flip: true
1697
+ clip: false
1698
+ variance: 0.1
1699
+ variance: 0.1
1700
+ variance: 0.2
1701
+ variance: 0.2
1702
+ offset: 0.5
1703
+ }
1704
+ }
1705
+ layer {
1706
+ name: "conv17_2_mbox_loc"
1707
+ type: "Convolution"
1708
+ bottom: "conv17_2"
1709
+ top: "conv17_2_mbox_loc"
1710
+ param {
1711
+ lr_mult: 1.0
1712
+ decay_mult: 1.0
1713
+ }
1714
+ param {
1715
+ lr_mult: 2.0
1716
+ decay_mult: 0.0
1717
+ }
1718
+ convolution_param {
1719
+ num_output: 24
1720
+ kernel_size: 1
1721
+ weight_filler {
1722
+ type: "msra"
1723
+ }
1724
+ bias_filler {
1725
+ type: "constant"
1726
+ value: 0.0
1727
+ }
1728
+ }
1729
+ }
1730
+ layer {
1731
+ name: "conv17_2_mbox_loc_perm"
1732
+ type: "Permute"
1733
+ bottom: "conv17_2_mbox_loc"
1734
+ top: "conv17_2_mbox_loc_perm"
1735
+ permute_param {
1736
+ order: 0
1737
+ order: 2
1738
+ order: 3
1739
+ order: 1
1740
+ }
1741
+ }
1742
+ layer {
1743
+ name: "conv17_2_mbox_loc_flat"
1744
+ type: "Flatten"
1745
+ bottom: "conv17_2_mbox_loc_perm"
1746
+ top: "conv17_2_mbox_loc_flat"
1747
+ flatten_param {
1748
+ axis: 1
1749
+ }
1750
+ }
1751
+ layer {
1752
+ name: "conv17_2_mbox_conf"
1753
+ type: "Convolution"
1754
+ bottom: "conv17_2"
1755
+ top: "conv17_2_mbox_conf"
1756
+ param {
1757
+ lr_mult: 1.0
1758
+ decay_mult: 1.0
1759
+ }
1760
+ param {
1761
+ lr_mult: 2.0
1762
+ decay_mult: 0.0
1763
+ }
1764
+ convolution_param {
1765
+ num_output: 126
1766
+ kernel_size: 1
1767
+ weight_filler {
1768
+ type: "msra"
1769
+ }
1770
+ bias_filler {
1771
+ type: "constant"
1772
+ value: 0.0
1773
+ }
1774
+ }
1775
+ }
1776
+ layer {
1777
+ name: "conv17_2_mbox_conf_perm"
1778
+ type: "Permute"
1779
+ bottom: "conv17_2_mbox_conf"
1780
+ top: "conv17_2_mbox_conf_perm"
1781
+ permute_param {
1782
+ order: 0
1783
+ order: 2
1784
+ order: 3
1785
+ order: 1
1786
+ }
1787
+ }
1788
+ layer {
1789
+ name: "conv17_2_mbox_conf_flat"
1790
+ type: "Flatten"
1791
+ bottom: "conv17_2_mbox_conf_perm"
1792
+ top: "conv17_2_mbox_conf_flat"
1793
+ flatten_param {
1794
+ axis: 1
1795
+ }
1796
+ }
1797
+ layer {
1798
+ name: "conv17_2_mbox_priorbox"
1799
+ type: "PriorBox"
1800
+ bottom: "conv17_2"
1801
+ bottom: "data"
1802
+ top: "conv17_2_mbox_priorbox"
1803
+ prior_box_param {
1804
+ min_size: 285.0
1805
+ max_size: 300.0
1806
+ aspect_ratio: 2.0
1807
+ aspect_ratio: 3.0
1808
+ flip: true
1809
+ clip: false
1810
+ variance: 0.1
1811
+ variance: 0.1
1812
+ variance: 0.2
1813
+ variance: 0.2
1814
+ offset: 0.5
1815
+ }
1816
+ }
1817
+ layer {
1818
+ name: "mbox_loc"
1819
+ type: "Concat"
1820
+ bottom: "conv11_mbox_loc_flat"
1821
+ bottom: "conv13_mbox_loc_flat"
1822
+ bottom: "conv14_2_mbox_loc_flat"
1823
+ bottom: "conv15_2_mbox_loc_flat"
1824
+ bottom: "conv16_2_mbox_loc_flat"
1825
+ bottom: "conv17_2_mbox_loc_flat"
1826
+ top: "mbox_loc"
1827
+ concat_param {
1828
+ axis: 1
1829
+ }
1830
+ }
1831
+ layer {
1832
+ name: "mbox_conf"
1833
+ type: "Concat"
1834
+ bottom: "conv11_mbox_conf_flat"
1835
+ bottom: "conv13_mbox_conf_flat"
1836
+ bottom: "conv14_2_mbox_conf_flat"
1837
+ bottom: "conv15_2_mbox_conf_flat"
1838
+ bottom: "conv16_2_mbox_conf_flat"
1839
+ bottom: "conv17_2_mbox_conf_flat"
1840
+ top: "mbox_conf"
1841
+ concat_param {
1842
+ axis: 1
1843
+ }
1844
+ }
1845
+ layer {
1846
+ name: "mbox_priorbox"
1847
+ type: "Concat"
1848
+ bottom: "conv11_mbox_priorbox"
1849
+ bottom: "conv13_mbox_priorbox"
1850
+ bottom: "conv14_2_mbox_priorbox"
1851
+ bottom: "conv15_2_mbox_priorbox"
1852
+ bottom: "conv16_2_mbox_priorbox"
1853
+ bottom: "conv17_2_mbox_priorbox"
1854
+ top: "mbox_priorbox"
1855
+ concat_param {
1856
+ axis: 2
1857
+ }
1858
+ }
1859
+ layer {
1860
+ name: "mbox_conf_reshape"
1861
+ type: "Reshape"
1862
+ bottom: "mbox_conf"
1863
+ top: "mbox_conf_reshape"
1864
+ reshape_param {
1865
+ shape {
1866
+ dim: 0
1867
+ dim: -1
1868
+ dim: 21
1869
+ }
1870
+ }
1871
+ }
1872
+ layer {
1873
+ name: "mbox_conf_softmax"
1874
+ type: "Softmax"
1875
+ bottom: "mbox_conf_reshape"
1876
+ top: "mbox_conf_softmax"
1877
+ softmax_param {
1878
+ axis: 2
1879
+ }
1880
+ }
1881
+ layer {
1882
+ name: "mbox_conf_flatten"
1883
+ type: "Flatten"
1884
+ bottom: "mbox_conf_softmax"
1885
+ top: "mbox_conf_flatten"
1886
+ flatten_param {
1887
+ axis: 1
1888
+ }
1889
+ }
1890
+ layer {
1891
+ name: "detection_out"
1892
+ type: "DetectionOutput"
1893
+ bottom: "mbox_loc"
1894
+ bottom: "mbox_conf_flatten"
1895
+ bottom: "mbox_priorbox"
1896
+ top: "detection_out"
1897
+ include {
1898
+ phase: TEST
1899
+ }
1900
+ detection_output_param {
1901
+ num_classes: 21
1902
+ share_location: true
1903
+ background_label_id: 0
1904
+ nms_param {
1905
+ nms_threshold: 0.45
1906
+ top_k: 100
1907
+ }
1908
+ code_type: CENTER_SIZE
1909
+ keep_top_k: 100
1910
+ confidence_threshold: 0.25
1911
+ }
1912
+ }
roi/pooler.py ADDED
@@ -0,0 +1,45 @@
1
+ from enum import Enum
2
+
3
+ import torch
4
+ from torch import Tensor
5
+ from torch.nn import functional as F
6
+
7
+ # from support.layer.roi_align import ROIAlign
8
+ from torchvision.ops import RoIAlign as ROIAlign
9
+
10
+
11
+ class Pooler(object):
12
+
13
+ class Mode(Enum):
14
+ POOLING = 'pooling'
15
+ ALIGN = 'align'
16
+
17
+ OPTIONS = ['pooling', 'align']
18
+
19
+ @staticmethod
20
+ def apply(features: Tensor, proposal_bboxes: Tensor, proposal_batch_indices: Tensor, mode: Mode) -> Tensor:
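+ # crops each proposal out of the feature map and reduces it to a fixed 7x7 feature:
+ # POOLING uses adaptive max-pooling over the cropped region, ALIGN uses torchvision's RoIAlign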
21
+ _, _, feature_map_height, feature_map_width = features.shape
22
+ scale = 1 / 16
23
+ output_size = (7 * 2, 7 * 2)
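+ # proposals are given in input-image coordinates, so they are scaled by 1/16 to match the
+ # feature-map resolution; pooling to 14x14 here, the final 2x2 max-pool below yields 7x7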
24
+
25
+ if mode == Pooler.Mode.POOLING:
26
+ pool = []
27
+ for (proposal_bbox, proposal_batch_index) in zip(proposal_bboxes, proposal_batch_indices):
28
+ start_x = max(min(round(proposal_bbox[0].item() * scale), feature_map_width - 1), 0)  # clamped to [0, feature_map_width)
29
+ start_y = max(min(round(proposal_bbox[1].item() * scale), feature_map_height - 1), 0)  # clamped to [0, feature_map_height)
30
+ end_x = max(min(round(proposal_bbox[2].item() * scale) + 1, feature_map_width), 1)  # clamped to (0, feature_map_width]
31
+ end_y = max(min(round(proposal_bbox[3].item() * scale) + 1, feature_map_height), 1)  # clamped to (0, feature_map_height]
32
+ roi_feature_map = features[proposal_batch_index, :, start_y:end_y, start_x:end_x]
33
+ pool.append(F.adaptive_max_pool2d(input=roi_feature_map, output_size=output_size))
34
+ pool = torch.stack(pool, dim=0)
35
+ elif mode == Pooler.Mode.ALIGN:
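+ # torchvision's RoIAlign accepts rois as a (K, 5) tensor: [batch_index, x1, y1, x2, y2] in input-image coordinates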
36
+ pool = ROIAlign(output_size, spatial_scale=scale, sampling_ratio=0)(
37
+ features,
38
+ torch.cat([proposal_batch_indices.view(-1, 1).float(), proposal_bboxes], dim=1)
39
+ )
40
+ else:
41
+ raise ValueError
42
+
43
+ pool = F.max_pool2d(input=pool, kernel_size=2, stride=2)
44
+ return pool
45
+
rpn/region_proposal_network.py ADDED
@@ -0,0 +1,169 @@
1
+ from typing import Tuple, List, Optional, Union
2
+
3
+ import numpy as np
4
+ import torch
5
+ from torch import nn, Tensor
6
+ from torch.nn import functional as F
7
+
8
+ from bbox import BBox
9
+ from extension.functional import beta_smooth_l1_loss
10
+ from torchvision.ops import nms
11
+
12
+
13
+ class RegionProposalNetwork(nn.Module):
14
+
15
+ def __init__(self, num_features_out: int, anchor_ratios: List[Tuple[int, int]], anchor_sizes: List[int],
16
+ pre_nms_top_n: int, post_nms_top_n: int, anchor_smooth_l1_loss_beta: float):
17
+ super().__init__()
18
+
19
+ self._features = nn.Sequential(
20
+ nn.Conv2d(in_channels=num_features_out, out_channels=512, kernel_size=3, padding=1),
21
+ nn.ReLU()
22
+ )
23
+
24
+ self._anchor_ratios = anchor_ratios
25
+ self._anchor_sizes = anchor_sizes
26
+
27
+ num_anchor_ratios = len(self._anchor_ratios)
28
+ num_anchor_sizes = len(self._anchor_sizes)
29
+ num_anchors = num_anchor_ratios * num_anchor_sizes
30
+
31
+ self._pre_nms_top_n = pre_nms_top_n
32
+ self._post_nms_top_n = post_nms_top_n
33
+ self._anchor_smooth_l1_loss_beta = anchor_smooth_l1_loss_beta
34
+
35
+ self._anchor_objectness = nn.Conv2d(in_channels=512, out_channels=num_anchors * 2, kernel_size=1)
36
+ self._anchor_transformer = nn.Conv2d(in_channels=512, out_channels=num_anchors * 4, kernel_size=1)
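+ # per anchor at each spatial position: 2 objectness logits (background/foreground) and 4 box-regression offsets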
37
+
38
+ def forward(self, features: Tensor,
39
+ anchor_bboxes: Optional[Tensor] = None, gt_bboxes_batch: Optional[Tensor] = None,
40
+ image_width: Optional[int]=None, image_height: Optional[int]=None) -> Union[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor, Tensor, Tensor]]:
41
+ batch_size = features.shape[0]
42
+
43
+ features = self._features(features)
44
+ anchor_objectnesses = self._anchor_objectness(features)
45
+ anchor_transformers = self._anchor_transformer(features)
46
+
47
+ anchor_objectnesses = anchor_objectnesses.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2)
48
+ anchor_transformers = anchor_transformers.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 4)
49
+
50
+ if not self.training:
51
+ return anchor_objectnesses, anchor_transformers
52
+ else:
53
+ # remove cross-boundary
54
+ # NOTE: len(inside_indices) is guaranteed to be a multiple of the batch size, since every image in the batch shares the same anchor boxes
55
+ inside_indices = BBox.inside(anchor_bboxes, left=0, top=0, right=image_width, bottom=image_height).nonzero().unbind(dim=1)
56
+ inside_anchor_bboxes = anchor_bboxes[inside_indices].view(batch_size, -1, anchor_bboxes.shape[2])
57
+ inside_anchor_objectnesses = anchor_objectnesses[inside_indices].view(batch_size, -1, anchor_objectnesses.shape[2])
58
+ inside_anchor_transformers = anchor_transformers[inside_indices].view(batch_size, -1, anchor_transformers.shape[2])
59
+
60
+ # find labels for each `anchor_bboxes`
61
+ labels = torch.full((batch_size, inside_anchor_bboxes.shape[1]), -1, dtype=torch.long, device=inside_anchor_bboxes.device)
62
+ ious = BBox.iou(inside_anchor_bboxes, gt_bboxes_batch)
63
+ anchor_max_ious, anchor_assignments = ious.max(dim=2)
64
+ gt_max_ious, gt_assignments = ious.max(dim=1)
65
+ anchor_additions = ((ious > 0) & (ious == gt_max_ious.unsqueeze(dim=1))).nonzero()[:, :2].unbind(dim=1)
66
+ labels[anchor_max_ious < 0.3] = 0
67
+ labels[anchor_additions] = 1
68
+ labels[anchor_max_ious >= 0.7] = 1
69
+
70
+ # select 256 x `batch_size` samples
71
+ fg_indices = (labels == 1).nonzero()
72
+ bg_indices = (labels == 0).nonzero()
73
+ fg_indices = fg_indices[torch.randperm(len(fg_indices))[:min(len(fg_indices), 256 * batch_size)]]
74
+ bg_indices = bg_indices[torch.randperm(len(bg_indices))[:256 * batch_size - len(fg_indices)]]
75
+ selected_indices = torch.cat([fg_indices, bg_indices], dim=0)
76
+ selected_indices = selected_indices[torch.randperm(len(selected_indices))].unbind(dim=1)
77
+
78
+ inside_anchor_bboxes = inside_anchor_bboxes[selected_indices]
79
+ gt_bboxes = gt_bboxes_batch[selected_indices[0], anchor_assignments[selected_indices]]
80
+ gt_anchor_objectnesses = labels[selected_indices]
81
+ gt_anchor_transformers = BBox.calc_transformer(inside_anchor_bboxes, gt_bboxes)
82
+ batch_indices = selected_indices[0]
83
+
84
+ anchor_objectness_losses, anchor_transformer_losses = self.loss(inside_anchor_objectnesses[selected_indices],
85
+ inside_anchor_transformers[selected_indices],
86
+ gt_anchor_objectnesses,
87
+ gt_anchor_transformers,
88
+ batch_size, batch_indices)
89
+
90
+ return anchor_objectnesses, anchor_transformers, anchor_objectness_losses, anchor_transformer_losses
91
+
92
+ def loss(self, anchor_objectnesses: Tensor, anchor_transformers: Tensor,
93
+ gt_anchor_objectnesses: Tensor, gt_anchor_transformers: Tensor,
94
+ batch_size: int, batch_indices: Tensor) -> Tuple[Tensor, Tensor]:
95
+ cross_entropies = torch.empty(batch_size, dtype=torch.float, device=anchor_objectnesses.device)
96
+ smooth_l1_losses = torch.empty(batch_size, dtype=torch.float, device=anchor_transformers.device)
97
+
98
+ for batch_index in range(batch_size):
99
+ selected_indices = (batch_indices == batch_index).nonzero().view(-1)
100
+
101
+ cross_entropy = F.cross_entropy(input=anchor_objectnesses[selected_indices],
102
+ target=gt_anchor_objectnesses[selected_indices])
103
+
104
+ fg_indices = gt_anchor_objectnesses[selected_indices].nonzero().view(-1)
105
+ smooth_l1_loss = beta_smooth_l1_loss(input=anchor_transformers[selected_indices][fg_indices],
106
+ target=gt_anchor_transformers[selected_indices][fg_indices],
107
+ beta=self._anchor_smooth_l1_loss_beta)
108
+
109
+ cross_entropies[batch_index] = cross_entropy
110
+ smooth_l1_losses[batch_index] = smooth_l1_loss
111
+
112
+ return cross_entropies, smooth_l1_losses
113
+
114
+ def generate_anchors(self, image_width: int, image_height: int, num_x_anchors: int, num_y_anchors: int) -> Tensor:
115
+ center_ys = np.linspace(start=0, stop=image_height, num=num_y_anchors + 2)[1:-1]
116
+ center_xs = np.linspace(start=0, stop=image_width, num=num_x_anchors + 2)[1:-1]
117
+ ratios = np.array(self._anchor_ratios)
118
+ ratios = ratios[:, 0] / ratios[:, 1]
119
+ sizes = np.array(self._anchor_sizes)
120
+
121
+ # NOTE: it's important to let `center_ys` be the major index (i.e., move horizontally and then vertically) for consistency with 2D convolution
122
+ # giving the string 'ij' returns a meshgrid with matrix indexing, i.e., with shape (#center_ys, #center_xs, #ratios, #sizes)
123
+ center_ys, center_xs, ratios, sizes = np.meshgrid(center_ys, center_xs, ratios, sizes, indexing='ij')
124
+
125
+ center_ys = center_ys.reshape(-1)
126
+ center_xs = center_xs.reshape(-1)
127
+ ratios = ratios.reshape(-1)
128
+ sizes = sizes.reshape(-1)
129
+
130
+ widths = sizes * np.sqrt(1 / ratios)
131
+ heights = sizes * np.sqrt(ratios)
132
+
133
+ center_based_anchor_bboxes = np.stack((center_xs, center_ys, widths, heights), axis=1)
134
+ center_based_anchor_bboxes = torch.from_numpy(center_based_anchor_bboxes).float()
135
+ anchor_bboxes = BBox.from_center_base(center_based_anchor_bboxes)
136
+
137
+ return anchor_bboxes
138
+
139
+ def generate_proposals(self, anchor_bboxes: Tensor, objectnesses: Tensor, transformers: Tensor, image_width: int, image_height: int) -> Tensor:
140
+ batch_size = anchor_bboxes.shape[0]
141
+
142
+ proposal_bboxes = BBox.apply_transformer(anchor_bboxes, transformers)
143
+ proposal_bboxes = BBox.clip(proposal_bboxes, left=0, top=0, right=image_width, bottom=image_height)
144
+ proposal_probs = F.softmax(objectnesses[:, :, 1], dim=-1)
145
+
146
+ _, sorted_indices = torch.sort(proposal_probs, dim=-1, descending=True)
147
+ nms_proposal_bboxes_batch = []
148
+
149
+ for batch_index in range(batch_size):
150
+ sorted_bboxes = proposal_bboxes[batch_index][sorted_indices[batch_index]][:self._pre_nms_top_n]
151
+ sorted_probs = proposal_probs[batch_index][sorted_indices[batch_index]][:self._pre_nms_top_n]
152
+ threshold = 0.7
153
+ kept_indices = nms(sorted_bboxes, sorted_probs, threshold)
154
+ nms_bboxes = sorted_bboxes[kept_indices][:self._post_nms_top_n]
155
+ nms_proposal_bboxes_batch.append(nms_bboxes)
156
+
157
+ max_nms_proposal_bboxes_length = max([len(it) for it in nms_proposal_bboxes_batch])
158
+ padded_proposal_bboxes = []
159
+
160
+ for nms_proposal_bboxes in nms_proposal_bboxes_batch:
161
+ padded_proposal_bboxes.append(
162
+ torch.cat([
163
+ nms_proposal_bboxes,
164
+ torch.zeros(max_nms_proposal_bboxes_length - len(nms_proposal_bboxes), 4).to(nms_proposal_bboxes)
165
+ ])
166
+ )
167
+
168
+ padded_proposal_bboxes = torch.stack(padded_proposal_bboxes, dim=0)
169
+ return padded_proposal_bboxes
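Taken together, the module above is driven in three steps at inference time: generate_anchors lays out a grid of anchor boxes over the image, forward scores and regresses each anchor from the backbone features, and generate_proposals decodes, clips, and NMS-filters the results into a padded batch of proposal boxes. A minimal, hypothetical usage sketch (the backbone channel count, anchor settings, image size, and feature-map size below are illustrative assumptions, not values taken from this repository's configuration):

    import torch

    from rpn.region_proposal_network import RegionProposalNetwork

    # assumed settings -- the ratio pairs and pixel sizes are placeholders
    rpn = RegionProposalNetwork(num_features_out=1024,
                                anchor_ratios=[(1, 2), (1, 1), (2, 1)],
                                anchor_sizes=[128, 256, 512],
                                pre_nms_top_n=6000, post_nms_top_n=300,
                                anchor_smooth_l1_loss_beta=1.0)
    rpn.eval()  # outside training, forward returns only objectnesses and transformers

    image_width, image_height = 800, 600
    features = torch.randn(1, 1024, 38, 50)  # assumed backbone output: (batch, C, H/16, W/16)

    # one anchor per feature-map cell, ratio, and size combination (batch dimension added)
    anchor_bboxes = rpn.generate_anchors(image_width, image_height,
                                         num_x_anchors=features.shape[3],
                                         num_y_anchors=features.shape[2]).unsqueeze(dim=0)

    with torch.no_grad():
        objectnesses, transformers = rpn(features)
        proposals = rpn.generate_proposals(anchor_bboxes, objectnesses, transformers,
                                           image_width, image_height)
    # proposals: (1, <=post_nms_top_n, 4) boxes in (left, top, right, bottom) form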