""" |
|
Transforms and data augmentation for sequence level images, bboxes and masks. |
|
""" |
|
import random

import PIL
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as F

from util.box_ops import box_xyxy_to_cxcywh, box_iou
from util.misc import interpolate
import numpy as np
from numpy import random as rand
from PIL import Image
import cv2


class Check(object):
    """Validates targets after cropping: flags instances whose boxes or masks
    have become empty and records a per-instance ``valid`` mask."""

    def __init__(self):
        pass

    def __call__(self, img, target):
        fields = ["labels"]
        if "boxes" in target:
            fields.append("boxes")
        if "masks" in target:
            fields.append("masks")

        if "boxes" in target or "masks" in target:
            if "boxes" in target:
                cropped_boxes = target['boxes'].reshape(-1, 2, 2)
                # A box survives the crop only if its bottom-right corner is
                # strictly beyond its top-left corner in both dimensions.
                keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
            else:
                keep = target['masks'].flatten(1).any(1)

            if False in keep:
                for k in range(len(keep)):
                    if not keep[k] and "boxes" in target:
                        # Shrink degenerate boxes to near-zero coordinates so
                        # they cannot contribute meaningfully to the loss.
                        target['boxes'][k] = target['boxes'][k] // 1000.0

            target['valid'] = keep.to(torch.int32)

        return img, target


def bbox_overlaps(bboxes1, bboxes2, mode='iou', eps=1e-6):
    """Computes pairwise IoU ('iou') or intersection-over-foreground ('iof')
    between two sets of boxes in absolute (x1, y1, x2, y2) format."""
    assert mode in ['iou', 'iof']
    bboxes1 = bboxes1.astype(np.float32)
    bboxes2 = bboxes2.astype(np.float32)
    rows = bboxes1.shape[0]
    cols = bboxes2.shape[0]
    ious = np.zeros((rows, cols), dtype=np.float32)
    if rows * cols == 0:
        return ious
    exchange = False
    # Iterate over the smaller set so the Python loop stays short.
    if bboxes1.shape[0] > bboxes2.shape[0]:
        bboxes1, bboxes2 = bboxes2, bboxes1
        ious = np.zeros((cols, rows), dtype=np.float32)
        exchange = True
    area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
    area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])
    for i in range(bboxes1.shape[0]):
        x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0])
        y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1])
        x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2])
        y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3])
        overlap = np.maximum(x_end - x_start, 0) * np.maximum(y_end - y_start, 0)
        if mode == 'iou':
            union = area1[i] + area2 - overlap
        else:
            union = area1[i] if not exchange else area2
        union = np.maximum(union, eps)
        ious[i, :] = overlap / union
    if exchange:
        ious = ious.T
    return ious
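
# A minimal usage sketch for ``bbox_overlaps`` (illustrative values, not from
# the original file):
#
#   a = np.array([[0., 0., 10., 10.]])
#   b = np.array([[5., 5., 15., 15.]])
#   bbox_overlaps(a, b)
#   # intersection = 5 * 5 = 25, union = 100 + 100 - 25 = 175,
#   # so the result is approximately array([[0.1429]], dtype=float32).
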

def crop(clip, target, region):
    """Crops every frame in the clip to ``region`` = (top, left, height, width)
    and updates boxes, areas and masks accordingly."""
    cropped_image = []
    for image in clip:
        cropped_image.append(F.crop(image, *region))

    target = target.copy()
    i, j, h, w = region

    target["size"] = torch.tensor([h, w])

    fields = ["labels", "area", "iscrowd"]

    if "boxes" in target:
        boxes = target["boxes"]
        max_size = torch.as_tensor([w, h], dtype=torch.float32)
        # Shift boxes into crop coordinates, then clamp them to the crop window.
        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
        cropped_boxes = cropped_boxes.clamp(min=0)
        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
        target["boxes"] = cropped_boxes.reshape(-1, 4)
        target["area"] = area
        fields.append("boxes")

    if "masks" in target:
        target['masks'] = target['masks'][:, i:i + h, j:j + w]
        fields.append("masks")

    return cropped_image, target


def hflip(clip, target):
    flipped_image = []
    for image in clip:
        flipped_image.append(F.hflip(image))

    w, h = clip[0].size

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        # (x1, y1, x2, y2) -> (w - x2, y1, w - x1, y2)
        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
        target["boxes"] = boxes

    if "masks" in target:
        target['masks'] = target['masks'].flip(-1)

    return flipped_image, target


def vflip(clip, target):
    flipped_image = []
    for image in clip:
        flipped_image.append(F.vflip(image))
    w, h = clip[0].size
    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        # (x1, y1, x2, y2) -> (x1, h - y2, x2, h - y1)
        boxes = boxes[:, [0, 3, 2, 1]] * torch.as_tensor([1, -1, 1, -1]) + torch.as_tensor([0, h, 0, h])
        target["boxes"] = boxes

    if "masks" in target:
        target['masks'] = target['masks'].flip(1)

    return flipped_image, target


def resize(clip, target, size, max_size=None):

    def get_size_with_aspect_ratio(image_size, size, max_size=None):
        w, h = image_size
        if max_size is not None:
            min_original_size = float(min((w, h)))
            max_original_size = float(max((w, h)))
            # Shrink the requested size so the longer side stays within max_size.
            if max_original_size / min_original_size * size > max_size:
                size = int(round(max_size * min_original_size / max_original_size))

        if (w <= h and w == size) or (h <= w and h == size):
            return (h, w)

        if w < h:
            ow = size
            oh = int(size * h / w)
        else:
            oh = size
            ow = int(size * w / h)

        return (oh, ow)

    def get_size(image_size, size, max_size=None):
        if isinstance(size, (list, tuple)):
            # A (w, h) pair is reversed into the (h, w) order F.resize expects.
            return size[::-1]
        else:
            return get_size_with_aspect_ratio(image_size, size, max_size)

    size = get_size(clip[0].size, size, max_size)
    rescaled_image = []
    for image in clip:
        rescaled_image.append(F.resize(image, size))

    if target is None:
        return rescaled_image, None

    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image[0].size, clip[0].size))
    ratio_width, ratio_height = ratios

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
        target["boxes"] = scaled_boxes

    if "area" in target:
        area = target["area"]
        scaled_area = area * (ratio_width * ratio_height)
        target["area"] = scaled_area

    h, w = size
    target["size"] = torch.tensor([h, w])

    if "masks" in target:
        if target['masks'].shape[0] > 0:
            target['masks'] = interpolate(
                target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5
        else:
            target['masks'] = torch.zeros((target['masks'].shape[0], h, w))
    return rescaled_image, target
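
# Size semantics (as implemented above): an int selects the shorter side while
# preserving aspect ratio, optionally capped by ``max_size``; a (w, h) pair
# resizes to that exact resolution. Illustrative calls:
#
#   resize(clip, target, 480, max_size=800)  # shorter side -> 480, longer <= 800
#   resize(clip, target, (640, 360))         # exact 640x360 (w, h) resize
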

def pad(clip, target, padding):
    # Pads only on the right and bottom, so box coordinates are unaffected.
    padded_image = []
    for image in clip:
        padded_image.append(F.pad(image, (0, 0, padding[0], padding[1])))
    if target is None:
        return padded_image, None
    target = target.copy()

    target["size"] = torch.tensor(padded_image[0].size[::-1])
    if "masks" in target:
        target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))
    return padded_image, target


class RandomCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        # ``img`` is a clip (list of frames); sample crop params from the first frame.
        region = T.RandomCrop.get_params(img[0], self.size)
        return crop(img, target, region)


class RandomSizeCrop(object):
    def __init__(self, min_size: int, max_size: int):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, img: list, target: dict):
        w = random.randint(self.min_size, min(img[0].width, self.max_size))
        h = random.randint(self.min_size, min(img[0].height, self.max_size))
        region = T.RandomCrop.get_params(img[0], [h, w])
        return crop(img, target, region)


class CenterCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        # ``img`` is a clip; all frames share the size of the first one.
        image_width, image_height = img[0].size
        crop_height, crop_width = self.size
        crop_top = int(round((image_height - crop_height) / 2.))
        crop_left = int(round((image_width - crop_width) / 2.))
        return crop(img, target, (crop_top, crop_left, crop_height, crop_width))


class MinIoURandomCrop(object):
    """Random crop constrained so that every remaining box keeps at least a
    sampled minimum IoU with the crop window (SSD-style augmentation).

    Note: this transform operates on a single PIL image, not a clip, and it
    does not update masks.
    """

    def __init__(self, min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3):
        self.min_ious = min_ious
        # Mode 1 keeps the image unchanged; mode 0 places no IoU constraint.
        self.sample_mode = (1, *min_ious, 0)
        self.min_crop_size = min_crop_size

    def __call__(self, img, target):
        w, h = img.size
        while True:
            mode = random.choice(self.sample_mode)
            self.mode = mode
            if mode == 1:
                return img, target
            min_iou = mode
            boxes = target['boxes'].numpy()
            labels = target['labels']

            for i in range(50):
                new_w = rand.uniform(self.min_crop_size * w, w)
                new_h = rand.uniform(self.min_crop_size * h, h)
                # Keep the crop aspect ratio within [0.5, 2].
                if new_h / new_w < 0.5 or new_h / new_w > 2:
                    continue
                left = rand.uniform(w - new_w)
                top = rand.uniform(h - new_h)
                patch = np.array((int(left), int(top), int(left + new_w), int(top + new_h)))
                if patch[2] == patch[0] or patch[3] == patch[1]:
                    continue
                overlaps = bbox_overlaps(patch.reshape(-1, 4), boxes.reshape(-1, 4)).reshape(-1)
                if len(overlaps) > 0 and overlaps.min() < min_iou:
                    continue

                if len(overlaps) > 0:
                    # Only accept crops that contain the center of every box.
                    def is_center_of_bboxes_in_patch(boxes, patch):
                        center = (boxes[:, :2] + boxes[:, 2:]) / 2
                        mask = ((center[:, 0] > patch[0]) * (center[:, 1] > patch[1]) *
                                (center[:, 0] < patch[2]) * (center[:, 1] < patch[3]))
                        return mask
                    mask = is_center_of_bboxes_in_patch(boxes, patch)
                    if False in mask:
                        continue

                    # Clip boxes to the patch and shift them into crop coordinates.
                    boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:])
                    boxes[:, :2] = boxes[:, :2].clip(min=patch[:2])
                    boxes -= np.tile(patch[:2], 2)
                    target['boxes'] = torch.tensor(boxes)

                img = np.asarray(img)[patch[1]:patch[3], patch[0]:patch[2]]
                img = Image.fromarray(img)
                width, height = img.size
                target['orig_size'] = torch.tensor([height, width])
                target['size'] = torch.tensor([height, width])
                return img, target
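
# Illustrative use (parameter values are examples, not prescribed by this file):
#
#   aug = MinIoURandomCrop(min_ious=(0.3, 0.5, 0.7), min_crop_size=0.3)
#   img, target = aug(img, target)  # single PIL image; 'boxes' must be in target
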

class RandomContrast(object):
    def __init__(self, lower=0.5, upper=1.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "contrast upper must be >= lower."
        assert self.lower >= 0, "contrast lower must be non-negative."

    def __call__(self, image, target):
        # Expects a float numpy image; applied with probability 0.5.
        if rand.randint(2):
            alpha = rand.uniform(self.lower, self.upper)
            image *= alpha
        return image, target


class RandomBrightness(object):
    def __init__(self, delta=32):
        assert delta >= 0.0
        assert delta <= 255.0
        self.delta = delta

    def __call__(self, image, target):
        if rand.randint(2):
            delta = rand.uniform(-self.delta, self.delta)
            image += delta
        return image, target


class RandomSaturation(object):
    def __init__(self, lower=0.5, upper=1.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "saturation upper must be >= lower."
        assert self.lower >= 0, "saturation lower must be non-negative."

    def __call__(self, image, target):
        # Expects an HSV image; scales the saturation channel.
        if rand.randint(2):
            image[:, :, 1] *= rand.uniform(self.lower, self.upper)
        return image, target


class RandomHue(object):
    def __init__(self, delta=18.0):
        assert 0.0 <= delta <= 360.0
        self.delta = delta

    def __call__(self, image, target):
        # Expects an HSV image; shifts hue and wraps it back into [0, 360).
        if rand.randint(2):
            image[:, :, 0] += rand.uniform(-self.delta, self.delta)
            image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0
            image[:, :, 0][image[:, :, 0] < 0.0] += 360.0
        return image, target


class RandomLightingNoise(object):
    def __init__(self):
        # All permutations of the three color channels.
        self.perms = ((0, 1, 2), (0, 2, 1),
                      (1, 0, 2), (1, 2, 0),
                      (2, 0, 1), (2, 1, 0))

    def __call__(self, image, target):
        if rand.randint(2):
            swap = self.perms[rand.randint(len(self.perms))]
            shuffle = SwapChannels(swap)
            image = shuffle(image)
        return image, target


class ConvertColor(object):
    def __init__(self, current='BGR', transform='HSV'):
        self.transform = transform
        self.current = current

    def __call__(self, image, target):
        if self.current == 'BGR' and self.transform == 'HSV':
            image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        elif self.current == 'HSV' and self.transform == 'BGR':
            image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR)
        else:
            raise NotImplementedError
        return image, target


class SwapChannels(object):
    def __init__(self, swaps):
        self.swaps = swaps

    def __call__(self, image):
        # Reorder the channel axis according to ``swaps``.
        image = image[:, :, self.swaps]
        return image


class PhotometricDistort(object):
    def __init__(self):
        self.pd = [
            RandomContrast(),
            ConvertColor(transform='HSV'),
            RandomSaturation(),
            RandomHue(),
            ConvertColor(current='HSV', transform='BGR'),
            RandomContrast()
        ]
        self.rand_brightness = RandomBrightness()
        self.rand_light_noise = RandomLightingNoise()

    def __call__(self, clip, target):
        imgs = []
        for img in clip:
            img = np.asarray(img).astype('float32')
            img, target = self.rand_brightness(img, target)
            # Randomly apply contrast either before or after the HSV distortions.
            if rand.randint(2):
                distort = Compose(self.pd[:-1])
            else:
                distort = Compose(self.pd[1:])
            img, target = distort(img, target)
            img, target = self.rand_light_noise(img, target)
            imgs.append(Image.fromarray(img.astype('uint8')))
        return imgs, target
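
# Note: distortion parameters are sampled independently for every frame of the
# clip, so the photometric jitter is not temporally consistent across frames.
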

class Expand(object):
    def __init__(self, mean):
        self.mean = mean

    def __call__(self, clip, target):
        # Applied with probability 0.5. Requires 'boxes' and 'masks' in ``target``.
        if rand.randint(2):
            return clip, target
        imgs = []
        masks = []
        image = np.asarray(clip[0]).astype('float32')
        height, width, depth = image.shape
        # Place the clip at a random offset on a mean-filled canvas up to 4x larger.
        ratio = rand.uniform(1, 4)
        left = rand.uniform(0, width * ratio - width)
        top = rand.uniform(0, height * ratio - height)
        for i in range(len(clip)):
            image = np.asarray(clip[i]).astype('float32')
            expand_image = np.zeros((int(height * ratio), int(width * ratio), depth), dtype=image.dtype)
            expand_image[:, :, :] = self.mean
            expand_image[int(top):int(top + height), int(left):int(left + width)] = image
            imgs.append(Image.fromarray(expand_image.astype('uint8')))
            expand_mask = torch.zeros((int(height * ratio), int(width * ratio)), dtype=torch.uint8)
            expand_mask[int(top):int(top + height), int(left):int(left + width)] = target['masks'][i]
            masks.append(expand_mask)
        boxes = target['boxes'].numpy()
        boxes[:, :2] += (int(left), int(top))
        boxes[:, 2:] += (int(left), int(top))
        target['boxes'] = torch.tensor(boxes)
        target['masks'] = torch.stack(masks)
        return imgs, target
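
# Illustrative use (the mean is an example fill value, not prescribed here):
#
#   expand = Expand(mean=(104, 117, 123))
#   clip, target = expand(clip, target)
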

class RandomHorizontalFlip(object):
    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            # Referring captions must be updated to match the flip; guard the
            # lookup so targets without captions still work.
            if 'caption' in target:
                caption = target['caption']
                target['caption'] = caption.replace('left', '@').replace('right', 'left').replace('@', 'right')
            return hflip(img, target)
        return img, target


class RandomVerticalFlip(object):
    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return vflip(img, target)
        return img, target


class RandomResize(object):
    def __init__(self, sizes, max_size=None):
        assert isinstance(sizes, (list, tuple))
        self.sizes = sizes
        self.max_size = max_size

    def __call__(self, img, target=None):
        size = random.choice(self.sizes)
        return resize(img, target, size, self.max_size)


class RandomPad(object):
    def __init__(self, max_pad):
        self.max_pad = max_pad

    def __call__(self, img, target):
        pad_x = random.randint(0, self.max_pad)
        pad_y = random.randint(0, self.max_pad)
        return pad(img, target, (pad_x, pad_y))


class RandomSelect(object):
    """
    Randomly selects between transforms1 and transforms2,
    with probability p for transforms1 and (1 - p) for transforms2.
    """
    def __init__(self, transforms1, transforms2, p=0.5):
        self.transforms1 = transforms1
        self.transforms2 = transforms2
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return self.transforms1(img, target)
        return self.transforms2(img, target)


class ToTensor(object):
    def __call__(self, clip, target):
        img = []
        for im in clip:
            img.append(F.to_tensor(im))
        return img, target


class RandomErasing(object):
    def __init__(self, *args, **kwargs):
        self.eraser = T.RandomErasing(*args, **kwargs)

    def __call__(self, img, target):
        return self.eraser(img), target


class Normalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, clip, target=None):
        image = []
        for im in clip:
            image.append(F.normalize(im, mean=self.mean, std=self.std))
        if target is None:
            return image, None
        target = target.copy()
        h, w = image[0].shape[-2:]
        if "boxes" in target:
            boxes = target["boxes"]
            # Convert absolute (x1, y1, x2, y2) boxes to normalized (cx, cy, w, h).
            boxes = box_xyxy_to_cxcywh(boxes)
            boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
            target["boxes"] = boxes
        return image, target
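
# For example, a box [10., 20., 50., 60.] in a 100x100 frame becomes
# [0.3, 0.4, 0.4, 0.4]: center (30, 40) and size (40, 40), each divided by
# the image dimension.
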

class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for t in self.transforms:
            image, target = t(image, target)
        return image, target

    def __repr__(self):
        format_string = self.__class__.__name__ + "("
        for t in self.transforms:
            format_string += "\n"
            format_string += "    {0}".format(t)
        format_string += "\n)"
        return format_string
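

if __name__ == "__main__":
    # Usage sketch / smoke test (not part of the original module). The scale
    # values, normalization statistics and dummy target below are common
    # defaults chosen for illustration, not values prescribed by this file.
    dummy_clip = [Image.new("RGB", (640, 480)) for _ in range(2)]
    dummy_target = {
        "boxes": torch.tensor([[100., 100., 300., 250.]]),
        "labels": torch.tensor([1]),
        "caption": "a person on the left",
        "area": torch.tensor([150. * 200.]),
        "iscrowd": torch.tensor([0]),
    }
    pipeline = Compose([
        RandomHorizontalFlip(),
        RandomResize([480, 512, 544], max_size=800),
        ToTensor(),
        Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    out_clip, out_target = pipeline(dummy_clip, dummy_target)
    print(len(out_clip), out_clip[0].shape, out_target["boxes"])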