# This code was taken from: https://github.com/assafshocher/resizer by Assaf Shocher | |
import numpy as np | |
import torch | |
from math import pi | |
from torch import nn | |
class Resizer(nn.Module): | |
def __init__(self, in_shape, scale_factor=None, output_shape=None, kernel=None, antialiasing=True): | |
super(Resizer, self).__init__() | |
# First standardize values and fill missing arguments (if needed) by deriving scale from output shape or vice versa | |
scale_factor, output_shape = self.fix_scale_and_size(in_shape, output_shape, scale_factor) | |
# Choose interpolation method, each method has the matching kernel size | |
method, kernel_width = { | |
"cubic": (cubic, 4.0), | |
"lanczos2": (lanczos2, 4.0), | |
"lanczos3": (lanczos3, 6.0), | |
"box": (box, 1.0), | |
"linear": (linear, 2.0), | |
None: (cubic, 4.0) # set default interpolation method as cubic | |
}.get(kernel) | |
# Antialiasing is only used when downscaling | |
antialiasing *= (np.any(np.array(scale_factor) < 1)) | |
# Sort indices of dimensions according to scale of each dimension. since we are going dim by dim this is efficient | |
sorted_dims = np.argsort(np.array(scale_factor)) | |
self.sorted_dims = [int(dim) for dim in sorted_dims if scale_factor[dim] != 1] | |
# Iterate over dimensions to calculate local weights for resizing and resize each time in one direction | |
field_of_view_list = [] | |
weights_list = [] | |
for dim in self.sorted_dims: | |
# for each coordinate (along 1 dim), calculate which coordinates in the input image affect its result and the | |
# weights that multiply the values there to get its result. | |
weights, field_of_view = self.contributions(in_shape[dim], output_shape[dim], scale_factor[dim], method, | |
kernel_width, antialiasing) | |
# convert to torch tensor | |
weights = torch.tensor(weights.T, dtype=torch.float32) | |
# We add singleton dimensions to the weight matrix so we can multiply it with the big tensor we get for | |
# tmp_im[field_of_view.T], (bsxfun style) | |
weights_list.append( | |
nn.Parameter(torch.reshape(weights, list(weights.shape) + (len(scale_factor) - 1) * [1]), | |
requires_grad=False)) | |
field_of_view_list.append( | |
nn.Parameter(torch.tensor(field_of_view.T.astype(np.int32), dtype=torch.long), requires_grad=False)) | |
self.field_of_view = nn.ParameterList(field_of_view_list) | |
self.weights = nn.ParameterList(weights_list) | |
def forward(self, in_tensor): | |
x = in_tensor | |
# Use the affecting position values and the set of weights to calculate the result of resizing along this 1 dim | |
for dim, fov, w in zip(self.sorted_dims, self.field_of_view, self.weights): | |
# To be able to act on each dim, we swap so that dim 0 is the wanted dim to resize | |
x = torch.transpose(x, dim, 0) | |
# This is a bit of a complicated multiplication: x[field_of_view.T] is a tensor of order image_dims+1. | |
# for each pixel in the output-image it matches the positions the influence it from the input image (along 1 dim | |
# only, this is why it only adds 1 dim to 5the shape). We then multiply, for each pixel, its set of positions with | |
# the matching set of weights. we do this by this big tensor element-wise multiplication (MATLAB bsxfun style: | |
# matching dims are multiplied element-wise while singletons mean that the matching dim is all multiplied by the | |
# same number | |
x = torch.sum(x[fov] * w, dim=0) | |
# Finally we swap back the axes to the original order | |
x = torch.transpose(x, dim, 0) | |
return x | |
def fix_scale_and_size(self, input_shape, output_shape, scale_factor): | |
# First fixing the scale-factor (if given) to be standardized the function expects (a list of scale factors in the | |
# same size as the number of input dimensions) | |
if scale_factor is not None: | |
# By default, if scale-factor is a scalar we assume 2d resizing and duplicate it. | |
if np.isscalar(scale_factor) and len(input_shape) > 1: | |
scale_factor = [scale_factor, scale_factor] | |
# We extend the size of scale-factor list to the size of the input by assigning 1 to all the unspecified scales | |
scale_factor = list(scale_factor) | |
scale_factor = [1] * (len(input_shape) - len(scale_factor)) + scale_factor | |
# Fixing output-shape (if given): extending it to the size of the input-shape, by assigning the original input-size | |
# to all the unspecified dimensions | |
if output_shape is not None: | |
output_shape = list(input_shape[len(output_shape):]) + list(np.uint(np.array(output_shape))) | |
# Dealing with the case of non-give scale-factor, calculating according to output-shape. note that this is | |
# sub-optimal, because there can be different scales to the same output-shape. | |
if scale_factor is None: | |
scale_factor = 1.0 * np.array(output_shape) / np.array(input_shape) | |
# Dealing with missing output-shape. calculating according to scale-factor | |
if output_shape is None: | |
output_shape = np.uint(np.ceil(np.array(input_shape) * np.array(scale_factor))) | |
return scale_factor, output_shape | |
def contributions(self, in_length, out_length, scale, kernel, kernel_width, antialiasing): | |
# This function calculates a set of 'filters' and a set of field_of_view that will later on be applied | |
# such that each position from the field_of_view will be multiplied with a matching filter from the | |
# 'weights' based on the interpolation method and the distance of the sub-pixel location from the pixel centers | |
# around it. This is only done for one dimension of the image. | |
# When anti-aliasing is activated (default and only for downscaling) the receptive field is stretched to size of | |
# 1/sf. this means filtering is more 'low-pass filter'. | |
fixed_kernel = (lambda arg: scale * kernel(scale * arg)) if antialiasing else kernel | |
kernel_width *= 1.0 / scale if antialiasing else 1.0 | |
# These are the coordinates of the output image | |
out_coordinates = np.arange(1, out_length + 1) | |
# since both scale-factor and output size can be provided simulatneously, perserving the center of the image requires shifting | |
# the output coordinates. the deviation is because out_length doesn't necesary equal in_length*scale. | |
# to keep the center we need to subtract half of this deivation so that we get equal margins for boths sides and center is preserved. | |
shifted_out_coordinates = out_coordinates - (out_length - in_length * scale) / 2 | |
# These are the matching positions of the output-coordinates on the input image coordinates. | |
# Best explained by example: say we have 4 horizontal pixels for HR and we downscale by SF=2 and get 2 pixels: | |
# [1,2,3,4] -> [1,2]. Remember each pixel number is the middle of the pixel. | |
# The scaling is done between the distances and not pixel numbers (the right boundary of pixel 4 is transformed to | |
# the right boundary of pixel 2. pixel 1 in the small image matches the boundary between pixels 1 and 2 in the big | |
# one and not to pixel 2. This means the position is not just multiplication of the old pos by scale-factor). | |
# So if we measure distance from the left border, middle of pixel 1 is at distance d=0.5, border between 1 and 2 is | |
# at d=1, and so on (d = p - 0.5). we calculate (d_new = d_old / sf) which means: | |
# (p_new-0.5 = (p_old-0.5) / sf) -> p_new = p_old/sf + 0.5 * (1-1/sf) | |
match_coordinates = shifted_out_coordinates / scale + 0.5 * (1 - 1 / scale) | |
# This is the left boundary to start multiplying the filter from, it depends on the size of the filter | |
left_boundary = np.floor(match_coordinates - kernel_width / 2) | |
# Kernel width needs to be enlarged because when covering has sub-pixel borders, it must 'see' the pixel centers | |
# of the pixels it only covered a part from. So we add one pixel at each side to consider (weights can zeroize them) | |
expanded_kernel_width = np.ceil(kernel_width) + 2 | |
# Determine a set of field_of_view for each each output position, these are the pixels in the input image | |
# that the pixel in the output image 'sees'. We get a matrix whos horizontal dim is the output pixels (big) and the | |
# vertical dim is the pixels it 'sees' (kernel_size + 2) | |
field_of_view = np.squeeze( | |
np.int16(np.expand_dims(left_boundary, axis=1) + np.arange(expanded_kernel_width) - 1)) | |
# Assign weight to each pixel in the field of view. A matrix whos horizontal dim is the output pixels and the | |
# vertical dim is a list of weights matching to the pixel in the field of view (that are specified in | |
# 'field_of_view') | |
weights = fixed_kernel(1.0 * np.expand_dims(match_coordinates, axis=1) - field_of_view - 1) | |
# Normalize weights to sum up to 1. be careful from dividing by 0 | |
sum_weights = np.sum(weights, axis=1) | |
sum_weights[sum_weights == 0] = 1.0 | |
weights = 1.0 * weights / np.expand_dims(sum_weights, axis=1) | |
# We use this mirror structure as a trick for reflection padding at the boundaries | |
mirror = np.uint(np.concatenate((np.arange(in_length), np.arange(in_length - 1, -1, step=-1)))) | |
field_of_view = mirror[np.mod(field_of_view, mirror.shape[0])] | |
# Get rid of weights and pixel positions that are of zero weight | |
non_zero_out_pixels = np.nonzero(np.any(weights, axis=0)) | |
weights = np.squeeze(weights[:, non_zero_out_pixels]) | |
field_of_view = np.squeeze(field_of_view[:, non_zero_out_pixels]) | |
# Final products are the relative positions and the matching weights, both are output_size X fixed_kernel_size | |
return weights, field_of_view | |
# These next functions are all interpolation methods. x is the distance from the left pixel center | |
def cubic(x): | |
absx = np.abs(x) | |
absx2 = absx ** 2 | |
absx3 = absx ** 3 | |
return ((1.5 * absx3 - 2.5 * absx2 + 1) * (absx <= 1) + | |
(-0.5 * absx3 + 2.5 * absx2 - 4 * absx + 2) * ((1 < absx) & (absx <= 2))) | |
def lanczos2(x): | |
return (((np.sin(pi * x) * np.sin(pi * x / 2) + np.finfo(np.float32).eps) / | |
((pi ** 2 * x ** 2 / 2) + np.finfo(np.float32).eps)) | |
* (abs(x) < 2)) | |
def box(x): | |
return ((-0.5 <= x) & (x < 0.5)) * 1.0 | |
def lanczos3(x): | |
return (((np.sin(pi * x) * np.sin(pi * x / 3) + np.finfo(np.float32).eps) / | |
((pi ** 2 * x ** 2 / 3) + np.finfo(np.float32).eps)) | |
* (abs(x) < 3)) | |
def linear(x): | |
return (x + 1) * ((-1 <= x) & (x < 0)) + (1 - x) * ((0 <= x) & (x <= 1)) |