""" |
|
Transforms and data augmentation for sequence level images, bboxes and masks. |
|
""" |
|
import random

import PIL
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as F

from util.box_ops import box_xyxy_to_cxcywh, box_iou
from util.misc import interpolate
import numpy as np
from numpy import random as rand
from PIL import Image
import cv2


class Check(object):
    """Validates targets after cropping: flags instances whose boxes or masks
    have become empty and records a per-instance ``valid`` mask."""

    def __init__(self):
        pass

    def __call__(self, img, target):
        fields = ["labels"]
        if "boxes" in target:
            fields.append("boxes")
        if "masks" in target:
            fields.append("masks")

        if "boxes" in target or "masks" in target:
            if "boxes" in target:
                cropped_boxes = target['boxes'].reshape(-1, 2, 2)
                # A box survives the crop only if its bottom-right corner is
                # strictly beyond its top-left corner in both dimensions.
                keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
            else:
                keep = target['masks'].flatten(1).any(1)

            if False in keep:
                for k in range(len(keep)):
                    if not keep[k] and "boxes" in target:
                        # Shrink degenerate boxes to near-zero coordinates so
                        # they cannot contribute meaningfully to the loss.
                        target['boxes'][k] = target['boxes'][k] // 1000.0

            target['valid'] = keep.to(torch.int32)

        return img, target


def bbox_overlaps(bboxes1, bboxes2, mode='iou', eps=1e-6):
    """Computes pairwise IoU ('iou') or intersection-over-foreground ('iof')
    between two sets of boxes in absolute (x1, y1, x2, y2) format."""
    assert mode in ['iou', 'iof']
    bboxes1 = bboxes1.astype(np.float32)
    bboxes2 = bboxes2.astype(np.float32)
    rows = bboxes1.shape[0]
    cols = bboxes2.shape[0]
    ious = np.zeros((rows, cols), dtype=np.float32)
    if rows * cols == 0:
        return ious
    exchange = False
    # Iterate over the smaller set so the Python loop stays short.
    if bboxes1.shape[0] > bboxes2.shape[0]:
        bboxes1, bboxes2 = bboxes2, bboxes1
        ious = np.zeros((cols, rows), dtype=np.float32)
        exchange = True
    area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
    area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])
    for i in range(bboxes1.shape[0]):
        x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0])
        y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1])
        x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2])
        y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3])
        overlap = np.maximum(x_end - x_start, 0) * np.maximum(y_end - y_start, 0)
        if mode == 'iou':
            union = area1[i] + area2 - overlap
        else:
            union = area1[i] if not exchange else area2
        union = np.maximum(union, eps)
        ious[i, :] = overlap / union
    if exchange:
        ious = ious.T
    return ious
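
# A minimal usage sketch for ``bbox_overlaps`` (illustrative values, not from
# the original file):
#
#   a = np.array([[0., 0., 10., 10.]])
#   b = np.array([[5., 5., 15., 15.]])
#   bbox_overlaps(a, b)
#   # intersection = 5 * 5 = 25, union = 100 + 100 - 25 = 175,
#   # so the result is approximately array([[0.1429]], dtype=float32).
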

def crop(clip, target, region):
    """Crops every frame in the clip to ``region`` = (top, left, height, width)
    and updates boxes, areas and masks accordingly."""
    cropped_image = []
    for image in clip:
        cropped_image.append(F.crop(image, *region))

    target = target.copy()
    i, j, h, w = region

    target["size"] = torch.tensor([h, w])

    fields = ["labels", "area", "iscrowd"]

    if "boxes" in target:
        boxes = target["boxes"]
        max_size = torch.as_tensor([w, h], dtype=torch.float32)
        # Shift boxes into crop coordinates, then clamp them to the crop window.
        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
        cropped_boxes = cropped_boxes.clamp(min=0)
        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
        target["boxes"] = cropped_boxes.reshape(-1, 4)
        target["area"] = area
        fields.append("boxes")

    if "masks" in target:
        target['masks'] = target['masks'][:, i:i + h, j:j + w]
        fields.append("masks")

    return cropped_image, target


def hflip(clip, target):
    flipped_image = []
    for image in clip:
        flipped_image.append(F.hflip(image))

    w, h = clip[0].size

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        # (x1, y1, x2, y2) -> (w - x2, y1, w - x1, y2)
        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
        target["boxes"] = boxes

    if "masks" in target:
        target['masks'] = target['masks'].flip(-1)

    return flipped_image, target


def vflip(clip, target):
    flipped_image = []
    for image in clip:
        flipped_image.append(F.vflip(image))
    w, h = clip[0].size
    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        # (x1, y1, x2, y2) -> (x1, h - y2, x2, h - y1)
        boxes = boxes[:, [0, 3, 2, 1]] * torch.as_tensor([1, -1, 1, -1]) + torch.as_tensor([0, h, 0, h])
        target["boxes"] = boxes

    if "masks" in target:
        target['masks'] = target['masks'].flip(1)

    return flipped_image, target


def resize(clip, target, size, max_size=None):

    def get_size_with_aspect_ratio(image_size, size, max_size=None):
        w, h = image_size
        if max_size is not None:
            min_original_size = float(min((w, h)))
            max_original_size = float(max((w, h)))
            # Shrink the requested size so the longer side stays within max_size.
            if max_original_size / min_original_size * size > max_size:
                size = int(round(max_size * min_original_size / max_original_size))

        if (w <= h and w == size) or (h <= w and h == size):
            return (h, w)

        if w < h:
            ow = size
            oh = int(size * h / w)
        else:
            oh = size
            ow = int(size * w / h)

        return (oh, ow)

    def get_size(image_size, size, max_size=None):
        if isinstance(size, (list, tuple)):
            # A (w, h) pair is reversed into the (h, w) order F.resize expects.
            return size[::-1]
        else:
            return get_size_with_aspect_ratio(image_size, size, max_size)

    size = get_size(clip[0].size, size, max_size)
    rescaled_image = []
    for image in clip:
        rescaled_image.append(F.resize(image, size))

    if target is None:
        return rescaled_image, None

    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image[0].size, clip[0].size))
    ratio_width, ratio_height = ratios

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
        target["boxes"] = scaled_boxes

    if "area" in target:
        area = target["area"]
        scaled_area = area * (ratio_width * ratio_height)
        target["area"] = scaled_area

    h, w = size
    target["size"] = torch.tensor([h, w])

    if "masks" in target:
        if target['masks'].shape[0] > 0:
            target['masks'] = interpolate(
                target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5
        else:
            target['masks'] = torch.zeros((target['masks'].shape[0], h, w))
    return rescaled_image, target
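
# Size semantics (as implemented above): an int selects the shorter side while
# preserving aspect ratio, optionally capped by ``max_size``; a (w, h) pair
# resizes to that exact resolution. Illustrative calls:
#
#   resize(clip, target, 480, max_size=800)  # shorter side -> 480, longer <= 800
#   resize(clip, target, (640, 360))         # exact 640x360 (w, h) resize
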

def pad(clip, target, padding):
    # Pads only on the right and bottom, so box coordinates are unaffected.
    padded_image = []
    for image in clip:
        padded_image.append(F.pad(image, (0, 0, padding[0], padding[1])))
    if target is None:
        return padded_image, None
    target = target.copy()

    target["size"] = torch.tensor(padded_image[0].size[::-1])
    if "masks" in target:
        target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))
    return padded_image, target


class RandomCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        # ``img`` is a clip (list of frames); sample crop params from the first frame.
        region = T.RandomCrop.get_params(img[0], self.size)
        return crop(img, target, region)


class RandomSizeCrop(object):
    def __init__(self, min_size: int, max_size: int):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, img: list, target: dict):
        w = random.randint(self.min_size, min(img[0].width, self.max_size))
        h = random.randint(self.min_size, min(img[0].height, self.max_size))
        region = T.RandomCrop.get_params(img[0], [h, w])
        return crop(img, target, region)


class CenterCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        # ``img`` is a clip; all frames share the size of the first one.
        image_width, image_height = img[0].size
        crop_height, crop_width = self.size
        crop_top = int(round((image_height - crop_height) / 2.))
        crop_left = int(round((image_width - crop_width) / 2.))
        return crop(img, target, (crop_top, crop_left, crop_height, crop_width))


class MinIoURandomCrop(object):
    """Random crop constrained so that every remaining box keeps at least a
    sampled minimum IoU with the crop window (SSD-style augmentation).

    Note: this transform operates on a single PIL image, not a clip, and it
    does not update masks.
    """

    def __init__(self, min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3):
        self.min_ious = min_ious
        # Mode 1 keeps the image unchanged; mode 0 places no IoU constraint.
        self.sample_mode = (1, *min_ious, 0)
        self.min_crop_size = min_crop_size

    def __call__(self, img, target):
        w, h = img.size
        while True:
            mode = random.choice(self.sample_mode)
            self.mode = mode
            if mode == 1:
                return img, target
            min_iou = mode
            boxes = target['boxes'].numpy()
            labels = target['labels']

            for i in range(50):
                new_w = rand.uniform(self.min_crop_size * w, w)
                new_h = rand.uniform(self.min_crop_size * h, h)
                # Keep the crop aspect ratio within [0.5, 2].
                if new_h / new_w < 0.5 or new_h / new_w > 2:
                    continue
                left = rand.uniform(w - new_w)
                top = rand.uniform(h - new_h)
                patch = np.array((int(left), int(top), int(left + new_w), int(top + new_h)))
                if patch[2] == patch[0] or patch[3] == patch[1]:
                    continue
                overlaps = bbox_overlaps(patch.reshape(-1, 4), boxes.reshape(-1, 4)).reshape(-1)
                if len(overlaps) > 0 and overlaps.min() < min_iou:
                    continue

                if len(overlaps) > 0:
                    # Only accept crops that contain the center of every box.
                    def is_center_of_bboxes_in_patch(boxes, patch):
                        center = (boxes[:, :2] + boxes[:, 2:]) / 2
                        mask = ((center[:, 0] > patch[0]) * (center[:, 1] > patch[1]) *
                                (center[:, 0] < patch[2]) * (center[:, 1] < patch[3]))
                        return mask
                    mask = is_center_of_bboxes_in_patch(boxes, patch)
                    if False in mask:
                        continue

                    # Clip boxes to the patch and shift them into crop coordinates.
                    boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:])
                    boxes[:, :2] = boxes[:, :2].clip(min=patch[:2])
                    boxes -= np.tile(patch[:2], 2)
                    target['boxes'] = torch.tensor(boxes)

                img = np.asarray(img)[patch[1]:patch[3], patch[0]:patch[2]]
                img = Image.fromarray(img)
                width, height = img.size
                target['orig_size'] = torch.tensor([height, width])
                target['size'] = torch.tensor([height, width])
                return img, target
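
# Illustrative use (parameter values are examples, not prescribed by this file):
#
#   aug = MinIoURandomCrop(min_ious=(0.3, 0.5, 0.7), min_crop_size=0.3)
#   img, target = aug(img, target)  # single PIL image; 'boxes' must be in target
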

class RandomContrast(object):
    def __init__(self, lower=0.5, upper=1.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "contrast upper must be >= lower."
        assert self.lower >= 0, "contrast lower must be non-negative."

    def __call__(self, image, target):
        # Expects a float numpy image; applied with probability 0.5.
        if rand.randint(2):
            alpha = rand.uniform(self.lower, self.upper)
            image *= alpha
        return image, target


class RandomBrightness(object):
    def __init__(self, delta=32):
        assert delta >= 0.0
        assert delta <= 255.0
        self.delta = delta

    def __call__(self, image, target):
        if rand.randint(2):
            delta = rand.uniform(-self.delta, self.delta)
            image += delta
        return image, target


class RandomSaturation(object):
    def __init__(self, lower=0.5, upper=1.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "saturation upper must be >= lower."
        assert self.lower >= 0, "saturation lower must be non-negative."

    def __call__(self, image, target):
        # Expects an HSV image; scales the saturation channel.
        if rand.randint(2):
            image[:, :, 1] *= rand.uniform(self.lower, self.upper)
        return image, target


class RandomHue(object):
    def __init__(self, delta=18.0):
        assert 0.0 <= delta <= 360.0
        self.delta = delta

    def __call__(self, image, target):
        # Expects an HSV image; shifts hue and wraps it back into [0, 360).
        if rand.randint(2):
            image[:, :, 0] += rand.uniform(-self.delta, self.delta)
            image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0
            image[:, :, 0][image[:, :, 0] < 0.0] += 360.0
        return image, target


class RandomLightingNoise(object):
    def __init__(self):
        # All permutations of the three color channels.
        self.perms = ((0, 1, 2), (0, 2, 1),
                      (1, 0, 2), (1, 2, 0),
                      (2, 0, 1), (2, 1, 0))

    def __call__(self, image, target):
        if rand.randint(2):
            swap = self.perms[rand.randint(len(self.perms))]
            shuffle = SwapChannels(swap)
            image = shuffle(image)
        return image, target


class ConvertColor(object):
    def __init__(self, current='BGR', transform='HSV'):
        self.transform = transform
        self.current = current

    def __call__(self, image, target):
        if self.current == 'BGR' and self.transform == 'HSV':
            image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        elif self.current == 'HSV' and self.transform == 'BGR':
            image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR)
        else:
            raise NotImplementedError
        return image, target


class SwapChannels(object):
    def __init__(self, swaps):
        self.swaps = swaps

    def __call__(self, image):
        # Reorder the channel axis according to ``swaps``.
        image = image[:, :, self.swaps]
        return image


class PhotometricDistort(object):
    def __init__(self):
        self.pd = [
            RandomContrast(),
            ConvertColor(transform='HSV'),
            RandomSaturation(),
            RandomHue(),
            ConvertColor(current='HSV', transform='BGR'),
            RandomContrast()
        ]
        self.rand_brightness = RandomBrightness()
        self.rand_light_noise = RandomLightingNoise()

    def __call__(self, clip, target):
        imgs = []
        for img in clip:
            img = np.asarray(img).astype('float32')
            img, target = self.rand_brightness(img, target)
            # Randomly apply contrast either before or after the HSV distortions.
            if rand.randint(2):
                distort = Compose(self.pd[:-1])
            else:
                distort = Compose(self.pd[1:])
            img, target = distort(img, target)
            img, target = self.rand_light_noise(img, target)
            imgs.append(Image.fromarray(img.astype('uint8')))
        return imgs, target
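
# Note: distortion parameters are sampled independently for every frame of the
# clip, so the photometric jitter is not temporally consistent across frames.
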

class Expand(object):
    def __init__(self, mean):
        self.mean = mean

    def __call__(self, clip, target):
        # Applied with probability 0.5. Requires 'boxes' and 'masks' in ``target``.
        if rand.randint(2):
            return clip, target
        imgs = []
        masks = []
        image = np.asarray(clip[0]).astype('float32')
        height, width, depth = image.shape
        # Place the clip at a random offset on a mean-filled canvas up to 4x larger.
        ratio = rand.uniform(1, 4)
        left = rand.uniform(0, width * ratio - width)
        top = rand.uniform(0, height * ratio - height)
        for i in range(len(clip)):
            image = np.asarray(clip[i]).astype('float32')
            expand_image = np.zeros((int(height * ratio), int(width * ratio), depth), dtype=image.dtype)
            expand_image[:, :, :] = self.mean
            expand_image[int(top):int(top + height), int(left):int(left + width)] = image
            imgs.append(Image.fromarray(expand_image.astype('uint8')))
            expand_mask = torch.zeros((int(height * ratio), int(width * ratio)), dtype=torch.uint8)
            expand_mask[int(top):int(top + height), int(left):int(left + width)] = target['masks'][i]
            masks.append(expand_mask)
        boxes = target['boxes'].numpy()
        boxes[:, :2] += (int(left), int(top))
        boxes[:, 2:] += (int(left), int(top))
        target['boxes'] = torch.tensor(boxes)
        target['masks'] = torch.stack(masks)
        return imgs, target
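
# Illustrative use (the mean is an example fill value, not prescribed here):
#
#   expand = Expand(mean=(104, 117, 123))
#   clip, target = expand(clip, target)
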

class RandomHorizontalFlip(object):
    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            # Referring captions must be updated to match the flip; guard the
            # lookup so targets without captions still work.
            if 'caption' in target:
                caption = target['caption']
                target['caption'] = caption.replace('left', '@').replace('right', 'left').replace('@', 'right')
            return hflip(img, target)
        return img, target


class RandomVerticalFlip(object):
    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return vflip(img, target)
        return img, target


class RandomResize(object):
    def __init__(self, sizes, max_size=None):
        assert isinstance(sizes, (list, tuple))
        self.sizes = sizes
        self.max_size = max_size

    def __call__(self, img, target=None):
        size = random.choice(self.sizes)
        return resize(img, target, size, self.max_size)


class RandomPad(object):
    def __init__(self, max_pad):
        self.max_pad = max_pad

    def __call__(self, img, target):
        pad_x = random.randint(0, self.max_pad)
        pad_y = random.randint(0, self.max_pad)
        return pad(img, target, (pad_x, pad_y))


class RandomSelect(object):
    """
    Randomly selects between transforms1 and transforms2,
    with probability p for transforms1 and (1 - p) for transforms2.
    """
    def __init__(self, transforms1, transforms2, p=0.5):
        self.transforms1 = transforms1
        self.transforms2 = transforms2
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return self.transforms1(img, target)
        return self.transforms2(img, target)


class ToTensor(object):
    def __call__(self, clip, target):
        img = []
        for im in clip:
            img.append(F.to_tensor(im))
        return img, target


class RandomErasing(object):
    def __init__(self, *args, **kwargs):
        self.eraser = T.RandomErasing(*args, **kwargs)

    def __call__(self, img, target):
        return self.eraser(img), target


class Normalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, clip, target=None):
        image = []
        for im in clip:
            image.append(F.normalize(im, mean=self.mean, std=self.std))
        if target is None:
            return image, None
        target = target.copy()
        h, w = image[0].shape[-2:]
        if "boxes" in target:
            boxes = target["boxes"]
            # Convert absolute (x1, y1, x2, y2) boxes to normalized (cx, cy, w, h).
            boxes = box_xyxy_to_cxcywh(boxes)
            boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
            target["boxes"] = boxes
        return image, target
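
# For example, a box [10., 20., 50., 60.] in a 100x100 frame becomes
# [0.3, 0.4, 0.4, 0.4]: center (30, 40) and size (40, 40), each divided by
# the image dimension.
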

class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for t in self.transforms:
            image, target = t(image, target)
        return image, target

    def __repr__(self):
        format_string = self.__class__.__name__ + "("
        for t in self.transforms:
            format_string += "\n"
            format_string += "    {0}".format(t)
        format_string += "\n)"
        return format_string
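

if __name__ == "__main__":
    # Usage sketch / smoke test (not part of the original module). The scale
    # values, normalization statistics and dummy target below are common
    # defaults chosen for illustration, not values prescribed by this file.
    dummy_clip = [Image.new("RGB", (640, 480)) for _ in range(2)]
    dummy_target = {
        "boxes": torch.tensor([[100., 100., 300., 250.]]),
        "labels": torch.tensor([1]),
        "caption": "a person on the left",
        "area": torch.tensor([150. * 200.]),
        "iscrowd": torch.tensor([0]),
    }
    pipeline = Compose([
        RandomHorizontalFlip(),
        RandomResize([480, 512, 544], max_size=800),
        ToTensor(),
        Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    out_clip, out_target = pipeline(dummy_clip, dummy_target)
    print(len(out_clip), out_clip[0].shape, out_target["boxes"])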