diff --git a/README.md b/README.md
index fceaeabdc6e1aa2cddf90406b076d11663641a35..3b3e4807e53713ab24d7d9d65ab6046b0f7145dc 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,112 @@
-# ALIKE
+# ALIKE: Accurate and Lightweight Keypoint Detection and Descriptor Extraction
-The code will be released after the paper has been accepted.
+ALIKE applies a differentiable keypoint detection module to detect accurate sub-pixel keypoints. The network runs at 95 frames per second on 640x480 images on an NVIDIA Titan RTX GPU while achieving performance comparable to the state of the art, which makes ALIKE well suited to real-time applications on resource-limited platforms/devices. Technical details are described in [this paper](https://arxiv.org/pdf/2112.02906.pdf).
+
+> ```
+> Xiaoming Zhao, Xingming Wu, Jinyu Miao, Weihai Chen, Peter C. Y. Chen, Zhengguo Li, "ALIKE: Accurate and Lightweight Keypoint
+> Detection and Descriptor Extraction," IEEE Transactions on Multimedia, 2022.
+> ```
+
+![](./assets/alike.png)
+
+
+If you use ALIKE in an academic work, please cite:
+
+```
+@article{Zhao2022ALIKE,
+    title={ALIKE: Accurate and Lightweight Keypoint Detection and Descriptor Extraction},
+    author={Xiaoming Zhao and Xingming Wu and Jinyu Miao and Weihai Chen and Peter C. Y. Chen and Zhengguo Li},
+    journal={IEEE Transactions on Multimedia},
+    year={2022}
+}
+```
+
+
+
+## 1. Prerequisites
+
+The required packages are listed in `requirements.txt`:
+
+```shell
+pip install -r requirements.txt
+```
+
+
+
+## 2. Models
+
+The off-the-shelf weights of the four ALIKE model variants are provided in `models/`.
+
+
+
+## 3. Run demo
+
+```shell
+$ python demo.py -h
+usage: demo.py [-h] [--model {alike-t,alike-s,alike-n,alike-l}]
+               [--device DEVICE] [--top_k TOP_K] [--scores_th SCORES_TH]
+               [--n_limit N_LIMIT] [--no_display] [--no_sub_pixel]
+               input
+
+ALike Demo.
+
+positional arguments:
+  input                 Image directory or movie file or "camera0" (for
+                        webcam0).
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --model {alike-t,alike-s,alike-n,alike-l}
+                        The model configuration
+  --device DEVICE       Running device (default: cuda).
+  --top_k TOP_K         Detect top K keypoints. -1 for threshold based mode,
+                        >0 for top K mode. (default: -1)
+  --scores_th SCORES_TH
+                        Detector score threshold (default: 0.2).
+  --n_limit N_LIMIT     Maximum number of keypoints to be detected (default:
+                        5000).
+  --no_display          Do not display images to screen. Useful if running
+                        remotely (default: False).
+  --no_sub_pixel        Do not detect sub-pixel keypoints (default: False).
+```
+
+
+
+## 4. Examples
+
+### KITTI example
+```shell
+python demo.py assets/kitti
+```
+![](./assets/kitti.gif)
+
+### TUM example
+```shell
+python demo.py assets/tum
+```
+![](./assets/tum.gif)
+
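+### Python usage
+
+ALIKE can also be called directly from Python. The snippet below is a minimal sketch using the same interface as `demo.py`; it assumes a CUDA device and an RGB input image (any image under `assets/` works):
+
+```python
+import cv2
+from alike import ALike, configs
+
+# threshold mode (top_k=-1): keep keypoints with score > scores_th, up to n_limit
+model = ALike(**configs['alike-t'], device='cuda', top_k=-1, scores_th=0.2, n_limit=5000)
+
+img = cv2.cvtColor(cv2.imread('assets/tum/1311868169.163498.png'), cv2.COLOR_BGR2RGB)
+pred = model(img, sub_pixel=True)
+print(pred['keypoints'].shape)    # Nx2 keypoints in pixel coordinates
+print(pred['descriptors'].shape)  # NxD descriptors (D=64 for alike-t)
+print(pred['time'])               # inference time in seconds
+```
+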
+## 5. Efficiency and performance
+
+| Models | Parameters | GFLOPs (640x480) | MHA@3 on HPatches | mAA(10°) on [IMW2020-test](https://www.cs.ubc.ca/research/image-matching-challenge/2021/leaderboard) (Stereo) |
+|:---:|:---:|:---:|:---:|:---:|
+| D2-Net(MS) | 7653KB | 889.40 | 38.33% | 12.27% |
+| LF-Net(MS) | 2642KB | 24.37 | 57.78% | 23.44% |
+| SuperPoint | 1301KB | 26.11 | 70.19% | 28.97% |
+| R2D2(MS) | 484KB | 464.55 | 71.48% | 39.02% |
+| ASLFeat(MS) | 823KB | 77.58 | 73.52% | 33.65% |
+| DISK | 1092KB | 98.97 | 70.56% | 51.22% |
+| ALike-N | 318KB | 7.909 | 75.74% | 47.18% |
+| ALike-L | 653KB | 19.685 | 76.85% | 49.58% |
+
+### Evaluation on HPatches
+
+- Download [hpatches-sequences-release](https://hpatches.github.io/) and put it into `hseq/hpatches-sequences-release`.
+- Remove the unreliable sequences, as in D2-Net.
+- Run the following command to evaluate the performance:
+  ```shell
+  python hseq/eval.py
+  ```
+
+
+For more details, please refer to the [paper](https://arxiv.org/abs/2112.02906).
\ No newline at end of file
diff --git a/alike.py b/alike.py
new file mode 100644
index 0000000000000000000000000000000000000000..303616d52581efce0ae0eb86af70f5ea8984909d
--- /dev/null
+++ b/alike.py
@@ -0,0 +1,143 @@
+import logging
+import os
+import cv2
+import torch
+from copy import deepcopy
+import torch.nn.functional as F
+from torchvision.transforms import ToTensor
+import math
+
+from alnet import ALNet
+from soft_detect import DKD
+import time
+
+configs = {
+    'alike-t': {'c1': 8, 'c2': 16, 'c3': 32, 'c4': 64, 'dim': 64, 'single_head': True, 'radius': 2,
+                'model_path': os.path.join(os.path.split(__file__)[0], 'models', 'alike-t.pth')},
+    'alike-s': {'c1': 8, 'c2': 16, 'c3': 48, 'c4': 96, 'dim': 96, 'single_head': True, 'radius': 2,
+                'model_path': os.path.join(os.path.split(__file__)[0], 'models', 'alike-s.pth')},
+    'alike-n': {'c1': 16, 'c2': 32, 'c3': 64, 'c4': 128, 'dim': 128, 'single_head': True, 'radius': 2,
+                'model_path': os.path.join(os.path.split(__file__)[0], 'models', 'alike-n.pth')},
+    'alike-l': {'c1': 32, 'c2': 64, 'c3': 128, 'c4': 128, 'dim': 128, 'single_head': False, 'radius': 2,
+                'model_path': os.path.join(os.path.split(__file__)[0], 'models', 'alike-l.pth')},
+}
+
+
+class ALike(ALNet):
+    def __init__(self,
+                 # ================================== feature encoder
+                 c1: int = 32, c2: int = 64, c3: int = 128, c4: int = 128, dim: int = 128,
+                 single_head: bool = False,
+                 # ================================== detect parameters
+                 radius: int = 2,
+                 top_k: int = 500, scores_th: float = 0.5,
+                 n_limit: int = 5000,
+                 device: str = 'cpu',
+                 model_path: str = ''
+                 ):
+        super().__init__(c1, c2, c3, c4, dim, single_head)
+        self.radius = radius
+        self.top_k = top_k
+        self.n_limit = n_limit
+        self.scores_th = scores_th
+        self.dkd = DKD(radius=self.radius, top_k=self.top_k,
+                       scores_th=self.scores_th, n_limit=self.n_limit)
+        self.device = device
+
+        if model_path != '':
+            state_dict = torch.load(model_path, map_location=self.device)
+            self.load_state_dict(state_dict)
+            self.to(self.device)
+            self.eval()
+            logging.info(f'Loaded model parameters from {model_path}')
+            logging.info(
+                f"Number of model parameters: {sum(p.numel() for p in self.parameters() if p.requires_grad) / 1e3}KB")
+
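+    # The encoder downsamples by a factor of 2^5 overall, so extract_dense_map()
+    # below zero-pads any input whose height or width is not a multiple of 32 and
+    # crops the dense maps back to the original size afterwards; for example
+    # (sizes chosen purely for illustration), a 600x400 input is padded to 608x416.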
+    def extract_dense_map(self, image, ret_dict=False):
+        # ====================================================
+        # check image size: it should be an integer multiple of 2^5;
+        # if it is not, pad the image with zeros
+        device = image.device
+        b, c, h, w = image.shape
+        h_ = math.ceil(h / 32) * 32 if h % 32 != 0 else h
+        w_ = math.ceil(w / 32) * 32 if w % 32 != 0 else w
+        if h_ != h:
+            h_padding = torch.zeros(b, c, h_ - h, w, device=device)
+            image = torch.cat([image, h_padding], dim=2)
+        if w_ != w:
+            w_padding = torch.zeros(b, c, h_, w_ - w, device=device)
+            image = torch.cat([image, w_padding], dim=3)
+        # ====================================================
+
+        scores_map, descriptor_map = super().forward(image)
+
+        # ====================================================
+        if h_ != h or w_ != w:
+            descriptor_map = descriptor_map[:, :, :h, :w]
+            scores_map = scores_map[:, :, :h, :w]  # Bx1xHxW
+        # ====================================================
+
+        # BxCxHxW
+        descriptor_map = torch.nn.functional.normalize(descriptor_map, p=2, dim=1)
+
+        if ret_dict:
+            return {'descriptor_map': descriptor_map, 'scores_map': scores_map, }
+        else:
+            return descriptor_map, scores_map
+
+    def forward(self, img, image_size_max=99999, sort=False, sub_pixel=False):
+        """
+        :param img: np.array HxWx3, RGB
+        :param image_size_max: maximum image size; larger images are resized down
+        :param sort: sort keypoints by scores
+        :param sub_pixel: whether to use sub-pixel accuracy
+        :return: a dictionary with 'keypoints', 'descriptors', 'scores', and 'time'
+        """
+        H, W, three = img.shape
+        assert three == 3, "input image shape should be [HxWx3]"
+
+        # ==================== image size constraint
+        image = deepcopy(img)
+        max_hw = max(H, W)
+        if max_hw > image_size_max:
+            ratio = float(image_size_max / max_hw)
+            image = cv2.resize(image, dsize=None, fx=ratio, fy=ratio)
+
+        # ==================== convert image to tensor
+        image = torch.from_numpy(image).to(self.device).to(torch.float32).permute(2, 0, 1)[None] / 255.0
+
+        # ==================== extract keypoints
+        start = time.time()
+
+        with torch.no_grad():
+            descriptor_map, scores_map = self.extract_dense_map(image)
+            keypoints, descriptors, scores, _ = self.dkd(scores_map, descriptor_map,
+                                                         sub_pixel=sub_pixel)
+            keypoints, descriptors, scores = keypoints[0], descriptors[0], scores[0]
+            keypoints = (keypoints + 1) / 2 * keypoints.new_tensor([[W - 1, H - 1]])
+
+        if sort:
+            indices = torch.argsort(scores, descending=True)
+            keypoints = keypoints[indices]
+            descriptors = descriptors[indices]
+            scores = scores[indices]
+
+        end = time.time()
+
+        return {'keypoints': keypoints.cpu().numpy(),
+                'descriptors': descriptors.cpu().numpy(),
+                'scores': scores.cpu().numpy(),
+                'scores_map': scores_map.cpu().numpy(),
+                'time': end - start, }
+
+
+if __name__ == '__main__':
+    import numpy as np
+    from thop import profile
+
+    net = ALike(c1=32, c2=64, c3=128, c4=128, dim=128, single_head=False)
+
+    image = np.random.random((640, 480, 3)).astype(np.float32)
+    flops, params = profile(net, inputs=(image, 9999, False), verbose=False)
+    print('{:<30}  {:<8} GFLOPs'.format('Computational complexity: ', flops / 1e9))
+    print('{:<30}  {:<8} KB'.format('Number of parameters: ', params / 1e3))
diff --git a/alnet.py b/alnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..53127063233660c7b96aa15e89aa4a8a1a340dd1
--- /dev/null
+++ b/alnet.py
@@ -0,0 +1,164 @@
+import torch
+from torch import nn
+from torchvision.models import resnet
+from typing import Optional, Callable
+
+
+class ConvBlock(nn.Module):
+    def __init__(self, in_channels, out_channels,
+                 gate: Optional[Callable[..., nn.Module]] = None,
+                 norm_layer: Optional[Callable[..., nn.Module]] = 
None): + super().__init__() + if gate is None: + self.gate = nn.ReLU(inplace=True) + else: + self.gate = gate + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self.conv1 = resnet.conv3x3(in_channels, out_channels) + self.bn1 = norm_layer(out_channels) + self.conv2 = resnet.conv3x3(out_channels, out_channels) + self.bn2 = norm_layer(out_channels) + + def forward(self, x): + x = self.gate(self.bn1(self.conv1(x))) # B x in_channels x H x W + x = self.gate(self.bn2(self.conv2(x))) # B x out_channels x H x W + return x + + +# copied from torchvision\models\resnet.py#27->BasicBlock +class ResBlock(nn.Module): + expansion: int = 1 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + gate: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None + ) -> None: + super(ResBlock, self).__init__() + if gate is None: + self.gate = nn.ReLU(inplace=True) + else: + self.gate = gate + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError('ResBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in ResBlock") + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = resnet.conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.conv2 = resnet.conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x: torch.Tensor) -> torch.Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.gate(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.gate(out) + + return out + + +class ALNet(nn.Module): + def __init__(self, c1: int = 32, c2: int = 64, c3: int = 128, c4: int = 128, dim: int = 128, + single_head: bool = True, + ): + super().__init__() + + self.gate = nn.ReLU(inplace=True) + + self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2) + self.pool4 = nn.MaxPool2d(kernel_size=4, stride=4) + + self.block1 = ConvBlock(3, c1, self.gate, nn.BatchNorm2d) + + self.block2 = ResBlock(inplanes=c1, planes=c2, stride=1, + downsample=nn.Conv2d(c1, c2, 1), + gate=self.gate, + norm_layer=nn.BatchNorm2d) + self.block3 = ResBlock(inplanes=c2, planes=c3, stride=1, + downsample=nn.Conv2d(c2, c3, 1), + gate=self.gate, + norm_layer=nn.BatchNorm2d) + self.block4 = ResBlock(inplanes=c3, planes=c4, stride=1, + downsample=nn.Conv2d(c3, c4, 1), + gate=self.gate, + norm_layer=nn.BatchNorm2d) + + # ================================== feature aggregation + self.conv1 = resnet.conv1x1(c1, dim // 4) + self.conv2 = resnet.conv1x1(c2, dim // 4) + self.conv3 = resnet.conv1x1(c3, dim // 4) + self.conv4 = resnet.conv1x1(dim, dim // 4) + self.upsample2 = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) + self.upsample4 = nn.Upsample(scale_factor=4, mode='bilinear', align_corners=True) + self.upsample8 = nn.Upsample(scale_factor=8, mode='bilinear', align_corners=True) + self.upsample32 = nn.Upsample(scale_factor=32, mode='bilinear', align_corners=True) + + # ================================== detector and descriptor head + self.single_head = single_head + if not self.single_head: + self.convhead1 = resnet.conv1x1(dim, dim) + self.convhead2 = resnet.conv1x1(dim, dim + 1) + + def 
forward(self, image):
+        # ================================== feature encoder
+        x1 = self.block1(image)  # B x c1 x H x W
+        x2 = self.pool2(x1)
+        x2 = self.block2(x2)  # B x c2 x H/2 x W/2
+        x3 = self.pool4(x2)
+        x3 = self.block3(x3)  # B x c3 x H/8 x W/8
+        x4 = self.pool4(x3)
+        x4 = self.block4(x4)  # B x dim x H/32 x W/32
+
+        # ================================== feature aggregation
+        x1 = self.gate(self.conv1(x1))  # B x dim//4 x H x W
+        x2 = self.gate(self.conv2(x2))  # B x dim//4 x H//2 x W//2
+        x3 = self.gate(self.conv3(x3))  # B x dim//4 x H//8 x W//8
+        x4 = self.gate(self.conv4(x4))  # B x dim//4 x H//32 x W//32
+        x2_up = self.upsample2(x2)  # B x dim//4 x H x W
+        x3_up = self.upsample8(x3)  # B x dim//4 x H x W
+        x4_up = self.upsample32(x4)  # B x dim//4 x H x W
+        x1234 = torch.cat([x1, x2_up, x3_up, x4_up], dim=1)
+
+        # ================================== detector and descriptor head
+        if not self.single_head:
+            x1234 = self.gate(self.convhead1(x1234))
+        x = self.convhead2(x1234)  # B x dim+1 x H x W
+
+        descriptor_map = x[:, :-1, :, :]
+        scores_map = torch.sigmoid(x[:, -1, :, :]).unsqueeze(1)
+
+        return scores_map, descriptor_map
+
+
+if __name__ == '__main__':
+    from thop import profile
+
+    net = ALNet(c1=16, c2=32, c3=64, c4=128, dim=128, single_head=True)
+
+    image = torch.randn(1, 3, 640, 480)
+    flops, params = profile(net, inputs=(image,), verbose=False)
+    print('{:<30}  {:<8} GFLOPs'.format('Computational complexity: ', flops / 1e9))
+    print('{:<30}  {:<8} KB'.format('Number of parameters: ', params / 1e3))
diff --git a/assets/alike.png b/assets/alike.png
new file mode 100644
index 0000000000000000000000000000000000000000..94364a48fff257b44fbfcbc2176a6a0814f80a1a
Binary files /dev/null and b/assets/alike.png differ
diff --git a/assets/kitti/000100.png b/assets/kitti/000100.png
new file mode 100644
index 0000000000000000000000000000000000000000..19d0a44a1f18f0d9a854b039377aab215b893227
Binary files /dev/null and b/assets/kitti/000100.png differ
diff --git a/assets/kitti/000101.png b/assets/kitti/000101.png
new file mode 100644
index 0000000000000000000000000000000000000000..a18a2f4b98aef265b145966f1764a889eceaf6d5
Binary files /dev/null and b/assets/kitti/000101.png differ
diff --git a/assets/kitti/000102.png b/assets/kitti/000102.png
new file mode 100644
index 0000000000000000000000000000000000000000..345f100489e662cffbbb83cb29ecd41bea3cc282
Binary files /dev/null and b/assets/kitti/000102.png differ
diff --git a/assets/kitti/000103.png b/assets/kitti/000103.png
new file mode 100644
index 0000000000000000000000000000000000000000..53da3b8a24dfa7128bda19700a2abfc8fb1873dc
Binary files /dev/null and b/assets/kitti/000103.png differ
diff --git a/assets/kitti/000104.png b/assets/kitti/000104.png
new file mode 100644
index 0000000000000000000000000000000000000000..fab7af64da011e5dbe74a035e63cd67303e37dbf
Binary files /dev/null and b/assets/kitti/000104.png differ
diff --git a/assets/kitti/000105.png b/assets/kitti/000105.png
new file mode 100644
index 0000000000000000000000000000000000000000..ec04a7a01ea9998362ee895d8aac51c32ec184f4
Binary files /dev/null and b/assets/kitti/000105.png differ
diff --git a/assets/kitti/000106.png b/assets/kitti/000106.png
new file mode 100644
index 0000000000000000000000000000000000000000..798e9b4beda76535137800a5dd590e7c22afee4a
Binary files /dev/null and b/assets/kitti/000106.png differ
diff --git a/assets/kitti/000107.png b/assets/kitti/000107.png
new file mode 100644
index 0000000000000000000000000000000000000000..18e9f86a411ae11ae55e493c9656a75c34043cac
Binary files /dev/null and b/assets/kitti/000107.png differ diff --git a/assets/kitti/000108.png b/assets/kitti/000108.png new file mode 100644 index 0000000000000000000000000000000000000000..8772966f3a781a1d9eafd9ef304c4504377ebc92 Binary files /dev/null and b/assets/kitti/000108.png differ diff --git a/assets/kitti/000109.png b/assets/kitti/000109.png new file mode 100644 index 0000000000000000000000000000000000000000..42b039f108b2c6bb688039ab3bd202d1d8463867 Binary files /dev/null and b/assets/kitti/000109.png differ diff --git a/assets/kitti/000110.png b/assets/kitti/000110.png new file mode 100644 index 0000000000000000000000000000000000000000..da7b96aa73a5f54154ff1a1d328ee4206d740af7 Binary files /dev/null and b/assets/kitti/000110.png differ diff --git a/assets/kitti/000111.png b/assets/kitti/000111.png new file mode 100644 index 0000000000000000000000000000000000000000..76419d0da9c6fbd5385a162976a4fdbd9c87cc3e Binary files /dev/null and b/assets/kitti/000111.png differ diff --git a/assets/kitti/000112.png b/assets/kitti/000112.png new file mode 100644 index 0000000000000000000000000000000000000000..ab4017ca8c0918fdbffbd087ce8d5addfa7e6a1d Binary files /dev/null and b/assets/kitti/000112.png differ diff --git a/assets/kitti/000113.png b/assets/kitti/000113.png new file mode 100644 index 0000000000000000000000000000000000000000..6639a15922174a31091d9f5cf1194f475d1ace08 Binary files /dev/null and b/assets/kitti/000113.png differ diff --git a/assets/kitti/000114.png b/assets/kitti/000114.png new file mode 100644 index 0000000000000000000000000000000000000000..a9c8f445d7201f30adf71edc8d71e0335e42b843 Binary files /dev/null and b/assets/kitti/000114.png differ diff --git a/assets/kitti/000115.png b/assets/kitti/000115.png new file mode 100644 index 0000000000000000000000000000000000000000..9f015a03ee99165d127e3f7cd58f7c43ad50b65e Binary files /dev/null and b/assets/kitti/000115.png differ diff --git a/assets/kitti/000116.png b/assets/kitti/000116.png new file mode 100644 index 0000000000000000000000000000000000000000..940aa8f31ca48673248fc524905ab1be0388b6f4 Binary files /dev/null and b/assets/kitti/000116.png differ diff --git a/assets/kitti/000117.png b/assets/kitti/000117.png new file mode 100644 index 0000000000000000000000000000000000000000..61f70a25b867cd318a17ddc1665c2fc80fe1d16c Binary files /dev/null and b/assets/kitti/000117.png differ diff --git a/assets/kitti/000118.png b/assets/kitti/000118.png new file mode 100644 index 0000000000000000000000000000000000000000..b023d595c3589228c3bd8df3dbfa5b03bb5eb1de Binary files /dev/null and b/assets/kitti/000118.png differ diff --git a/assets/kitti/000119.png b/assets/kitti/000119.png new file mode 100644 index 0000000000000000000000000000000000000000..486acd171cfcafe98a6ef215e1f4b8f7a27ead91 Binary files /dev/null and b/assets/kitti/000119.png differ diff --git a/assets/tum/1311868169.163498.png b/assets/tum/1311868169.163498.png new file mode 100644 index 0000000000000000000000000000000000000000..2c064fbf709450baccd763a704b5a2c9af743fb2 Binary files /dev/null and b/assets/tum/1311868169.163498.png differ diff --git a/assets/tum/1311868169.263274.png b/assets/tum/1311868169.263274.png new file mode 100644 index 0000000000000000000000000000000000000000..084bd5909efaead02c7729b326db806512e6b06f Binary files /dev/null and b/assets/tum/1311868169.263274.png differ diff --git a/assets/tum/1311868169.363470.png b/assets/tum/1311868169.363470.png new file mode 100644 index 
0000000000000000000000000000000000000000..f5a7bd456efc2d40f69899307409a50569e48dd7 Binary files /dev/null and b/assets/tum/1311868169.363470.png differ diff --git a/assets/tum/1311868169.463229.png b/assets/tum/1311868169.463229.png new file mode 100644 index 0000000000000000000000000000000000000000..0535f4ae2349c105bb99e9602b26029d1faaa235 Binary files /dev/null and b/assets/tum/1311868169.463229.png differ diff --git a/assets/tum/1311868169.563501.png b/assets/tum/1311868169.563501.png new file mode 100644 index 0000000000000000000000000000000000000000..af2098d4e362e2b19fac8f49215b90746c4a9385 Binary files /dev/null and b/assets/tum/1311868169.563501.png differ diff --git a/assets/tum/1311868169.663240.png b/assets/tum/1311868169.663240.png new file mode 100644 index 0000000000000000000000000000000000000000..615f78fc5eeac7b614c7834f20a7c05b73ed41fa Binary files /dev/null and b/assets/tum/1311868169.663240.png differ diff --git a/assets/tum/1311868169.763417.png b/assets/tum/1311868169.763417.png new file mode 100644 index 0000000000000000000000000000000000000000..bad1586fec3e30cb924b48dc4dd9a8153cd8983c Binary files /dev/null and b/assets/tum/1311868169.763417.png differ diff --git a/assets/tum/1311868169.863396.png b/assets/tum/1311868169.863396.png new file mode 100644 index 0000000000000000000000000000000000000000..c3295810b9eceadc189e7e6cbb8922274161a1ef Binary files /dev/null and b/assets/tum/1311868169.863396.png differ diff --git a/assets/tum/1311868169.963415.png b/assets/tum/1311868169.963415.png new file mode 100644 index 0000000000000000000000000000000000000000..b29e7a4fc960c6b14f1d2f6085e82dee96a20b68 Binary files /dev/null and b/assets/tum/1311868169.963415.png differ diff --git a/assets/tum/1311868170.063469.png b/assets/tum/1311868170.063469.png new file mode 100644 index 0000000000000000000000000000000000000000..bd0a32025db87f074dc98e8d78b918f90cb265e0 Binary files /dev/null and b/assets/tum/1311868170.063469.png differ diff --git a/assets/tum/1311868170.163416.png b/assets/tum/1311868170.163416.png new file mode 100644 index 0000000000000000000000000000000000000000..ccfb3cc678acfac47882df9c0f2e7943d28747c8 Binary files /dev/null and b/assets/tum/1311868170.163416.png differ diff --git a/assets/tum/1311868170.263521.png b/assets/tum/1311868170.263521.png new file mode 100644 index 0000000000000000000000000000000000000000..f0ee140003de557d471a560da6ff4c3ed2dfab6f Binary files /dev/null and b/assets/tum/1311868170.263521.png differ diff --git a/assets/tum/1311868170.363400.png b/assets/tum/1311868170.363400.png new file mode 100644 index 0000000000000000000000000000000000000000..5152abd646dff6c93a397040778caf8592f890e4 Binary files /dev/null and b/assets/tum/1311868170.363400.png differ diff --git a/assets/tum/1311868170.463383.png b/assets/tum/1311868170.463383.png new file mode 100644 index 0000000000000000000000000000000000000000..a8e2ee44e1ef95c0f15020164af59ea1e6c86003 Binary files /dev/null and b/assets/tum/1311868170.463383.png differ diff --git a/assets/tum/1311868170.563345.png b/assets/tum/1311868170.563345.png new file mode 100644 index 0000000000000000000000000000000000000000..4554920801b73344dd1da7f103bc3e43711caa11 Binary files /dev/null and b/assets/tum/1311868170.563345.png differ diff --git a/assets/tum/1311868170.663430.png b/assets/tum/1311868170.663430.png new file mode 100644 index 0000000000000000000000000000000000000000..a1bc9b263ee9847241042e75bb9d34a6562b6514 Binary files /dev/null and b/assets/tum/1311868170.663430.png differ diff --git 
a/assets/tum/1311868170.763453.png b/assets/tum/1311868170.763453.png
new file mode 100644
index 0000000000000000000000000000000000000000..24d1d05a41c63f94fba7b5898af5585cfe424ae5
Binary files /dev/null and b/assets/tum/1311868170.763453.png differ
diff --git a/assets/tum/1311868170.863446.png b/assets/tum/1311868170.863446.png
new file mode 100644
index 0000000000000000000000000000000000000000..116ccaa5219cd70d4efc5d182dfeca234c642016
Binary files /dev/null and b/assets/tum/1311868170.863446.png differ
diff --git a/assets/tum/1311868170.963440.png b/assets/tum/1311868170.963440.png
new file mode 100644
index 0000000000000000000000000000000000000000..08e856cc5ab8dd706d8a3580108ab635cc94daeb
Binary files /dev/null and b/assets/tum/1311868170.963440.png differ
diff --git a/assets/tum/1311868171.063438.png b/assets/tum/1311868171.063438.png
new file mode 100644
index 0000000000000000000000000000000000000000..9a727edb42e9fd08fd3f8bf1cfbb605d7d6ee426
Binary files /dev/null and b/assets/tum/1311868171.063438.png differ
diff --git a/demo.py b/demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c217ac724e81fe2fd94d0a0290f530270362d74
--- /dev/null
+++ b/demo.py
@@ -0,0 +1,167 @@
+import copy
+import os
+import cv2
+import glob
+import logging
+import argparse
+import numpy as np
+from tqdm import tqdm
+from alike import ALike, configs
+
+
+class ImageLoader(object):
+    def __init__(self, filepath: str):
+        self.N = 3000
+        if filepath.startswith('camera'):
+            camera = int(filepath[6:])
+            self.cap = cv2.VideoCapture(camera)
+            if not self.cap.isOpened():
+                raise IOError(f"Can't open camera {camera}!")
+            logging.info(f'Opened camera {camera}')
+            self.mode = 'camera'
+        elif os.path.exists(filepath):
+            if os.path.isfile(filepath):
+                self.cap = cv2.VideoCapture(filepath)
+                if not self.cap.isOpened():
+                    raise IOError(f"Can't open video {filepath}!")
+                rate = self.cap.get(cv2.CAP_PROP_FPS)
+                self.N = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1
+                duration = self.N / rate
+                logging.info(f'Opened video {filepath}')
+                logging.info(f'Frames: {self.N}, FPS: {rate}, Duration: {duration}s')
+                self.mode = 'video'
+            else:
+                self.images = glob.glob(os.path.join(filepath, '*.png')) + \
+                              glob.glob(os.path.join(filepath, '*.jpg')) + \
+                              glob.glob(os.path.join(filepath, '*.ppm'))
+                self.images.sort()
+                self.N = len(self.images)
+                logging.info(f'Loading {self.N} images')
+                self.mode = 'images'
+        else:
+            raise IOError(f'Invalid filepath (cameraX / image directory / video file): {filepath}')
+
+    def __getitem__(self, item):
+        if self.mode == 'camera' or self.mode == 'video':
+            if item > self.N:
+                return None
+            ret, img = self.cap.read()
+            if not ret:
+                raise IOError("Can't read image from camera")
+            if self.mode == 'video':
+                self.cap.set(cv2.CAP_PROP_POS_FRAMES, item)
+        elif self.mode == 'images':
+            filename = self.images[item]
+            img = cv2.imread(filename)
+            if img is None:
+                raise IOError(f'Error reading image {filename}')
+
+        return img
+
+    def __len__(self):
+        return self.N
+
+
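+# SimpleTracker matches each frame against the previous one with mutual nearest
+# neighbours on the descriptor similarity matrix: similarities below 0.9 are
+# zeroed out first, and a pair (i, j) is kept only if j is the nearest neighbour
+# of i and i is the nearest neighbour of j (see mnn_matcher below).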
+class SimpleTracker(object):
+    def __init__(self):
+        self.pts_prev = None
+        self.desc_prev = None
+
+    def update(self, img, pts, desc):
+        N_matches = 0
+        if self.pts_prev is None:
+            self.pts_prev = pts
+            self.desc_prev = desc
+
+            # first frame: just draw the detected keypoints
+            out = copy.deepcopy(img)
+            for pt1 in pts:
+                p1 = (int(round(pt1[0])), int(round(pt1[1])))
+                cv2.circle(out, p1, 1, (0, 0, 255), -1, lineType=16)
+        else:
+            matches = self.mnn_matcher(self.desc_prev, desc)
+            mpts1, mpts2 = self.pts_prev[matches[:, 0]], pts[matches[:, 1]]
+            N_matches = len(matches)
+
+            # draw the frame-to-frame match tracks
+            out = copy.deepcopy(img)
+            for pt1, pt2 in zip(mpts1, mpts2):
+                p1 = (int(round(pt1[0])), int(round(pt1[1])))
+                p2 = (int(round(pt2[0])), int(round(pt2[1])))
+                cv2.line(out, p1, p2, (0, 255, 0), lineType=16)
+                cv2.circle(out, p2, 1, (0, 0, 255), -1, lineType=16)
+
+        self.pts_prev = pts
+        self.desc_prev = desc
+
+        return out, N_matches
+
+    def mnn_matcher(self, desc1, desc2):
+        sim = desc1 @ desc2.transpose()
+        sim[sim < 0.9] = 0
+        nn12 = np.argmax(sim, axis=1)
+        nn21 = np.argmax(sim, axis=0)
+        ids1 = np.arange(0, sim.shape[0])
+        mask = (ids1 == nn21[nn12])
+        matches = np.stack([ids1[mask], nn12[mask]])
+        return matches.transpose()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='ALike Demo.')
+    parser.add_argument('input', type=str, default='',
+                        help='Image directory or movie file or "camera0" (for webcam0).')
+    parser.add_argument('--model', choices=['alike-t', 'alike-s', 'alike-n', 'alike-l'], default="alike-t",
+                        help="The model configuration")
+    parser.add_argument('--device', type=str, default='cuda', help="Running device (default: cuda).")
+    parser.add_argument('--top_k', type=int, default=-1,
+                        help='Detect top K keypoints. -1 for threshold based mode, >0 for top K mode. (default: -1)')
+    parser.add_argument('--scores_th', type=float, default=0.2,
+                        help='Detector score threshold (default: 0.2).')
+    parser.add_argument('--n_limit', type=int, default=5000,
+                        help='Maximum number of keypoints to be detected (default: 5000).')
+    parser.add_argument('--no_display', action='store_true',
+                        help='Do not display images to screen. Useful if running remotely (default: False).')
+    parser.add_argument('--no_sub_pixel', action='store_true',
+                        help='Do not detect sub-pixel keypoints (default: False).')
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.INFO)
+
+    image_loader = ImageLoader(args.input)
+    model = ALike(**configs[args.model],
+                  device=args.device,
+                  top_k=args.top_k,
+                  scores_th=args.scores_th,
+                  n_limit=args.n_limit)
+    tracker = SimpleTracker()
+
+    if not args.no_display:
+        logging.info("Press 'q' to stop!")
+        cv2.namedWindow(args.model)
+
+    runtime = []
+    progress_bar = tqdm(image_loader)
+    for img in progress_bar:
+        if img is None:
+            break
+
+        pred = model(img, sub_pixel=not args.no_sub_pixel)
+        kpts = pred['keypoints']
+        desc = pred['descriptors']
+        runtime.append(pred['time'])
+
+        out, N_matches = tracker.update(img, kpts, desc)
+
+        ave_fps = (1. 
/ np.stack(runtime)).mean() + status = f"Fps:{ave_fps:.1f}, Keypoints/Matches: {len(kpts)}/{N_matches}" + progress_bar.set_description(status) + + if not args.no_display: + cv2.setWindowTitle(args.model, args.model + ': ' + status) + cv2.imshow(args.model, out) + if cv2.waitKey(1) == ord('q'): + break + + logging.info('Finished!') + if not args.no_display: + logging.info('Press any key to exit!') + cv2.waitKey() diff --git a/hseq/cache/alike-l-ms.npy b/hseq/cache/alike-l-ms.npy new file mode 100644 index 0000000000000000000000000000000000000000..1aaa69e3bc085f76b48085ec517373bb43828cae Binary files /dev/null and b/hseq/cache/alike-l-ms.npy differ diff --git a/hseq/cache/alike-l.npy b/hseq/cache/alike-l.npy new file mode 100644 index 0000000000000000000000000000000000000000..b5ed534e8c811ba25bcaec01af1965d0d72efe86 Binary files /dev/null and b/hseq/cache/alike-l.npy differ diff --git a/hseq/cache/alike-n-ms.npy b/hseq/cache/alike-n-ms.npy new file mode 100644 index 0000000000000000000000000000000000000000..e1e454c5c302ab06bbe094ea940a9a4299db336b Binary files /dev/null and b/hseq/cache/alike-n-ms.npy differ diff --git a/hseq/cache/alike-n.npy b/hseq/cache/alike-n.npy new file mode 100644 index 0000000000000000000000000000000000000000..f820d6bfc6e6d05f9ca3600245107b00addcf639 Binary files /dev/null and b/hseq/cache/alike-n.npy differ diff --git a/hseq/cache/aslfeat.npy b/hseq/cache/aslfeat.npy new file mode 100644 index 0000000000000000000000000000000000000000..1343ccf83a9c2b8509f5b7ea1ef23d924d60f711 Binary files /dev/null and b/hseq/cache/aslfeat.npy differ diff --git a/hseq/cache/d2.npy b/hseq/cache/d2.npy new file mode 100644 index 0000000000000000000000000000000000000000..451d8d8a95dc29349e0a327872043fc43da27a70 Binary files /dev/null and b/hseq/cache/d2.npy differ diff --git a/hseq/cache/disk.npy b/hseq/cache/disk.npy new file mode 100644 index 0000000000000000000000000000000000000000..2c1a1f62bdae9cb4f6ccee8b2129f0ef234fab42 Binary files /dev/null and b/hseq/cache/disk.npy differ diff --git a/hseq/cache/lfnet.npy b/hseq/cache/lfnet.npy new file mode 100644 index 0000000000000000000000000000000000000000..79f1ef28f1e4992a6763ecc2147f7e3788021fb7 Binary files /dev/null and b/hseq/cache/lfnet.npy differ diff --git a/hseq/cache/r2d2.npy b/hseq/cache/r2d2.npy new file mode 100644 index 0000000000000000000000000000000000000000..3fb85f5544b51d7b8839d1d20717fe4dcc207c6a Binary files /dev/null and b/hseq/cache/r2d2.npy differ diff --git a/hseq/cache/superpoint.npy b/hseq/cache/superpoint.npy new file mode 100644 index 0000000000000000000000000000000000000000..89f27dc78d6ac1b69b3b8ffafdf2b69ff13aeba5 Binary files /dev/null and b/hseq/cache/superpoint.npy differ diff --git a/hseq/eval.py b/hseq/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..8ce64613110635576935129ebebc583fc388ac8a --- /dev/null +++ b/hseq/eval.py @@ -0,0 +1,162 @@ +import cv2 +import os +from tqdm import tqdm +import torch +import numpy as np +from extract import extract_method + +use_cuda = torch.cuda.is_available() +device = torch.device('cuda' if use_cuda else 'cpu') + +methods = ['d2', 'lfnet', 'superpoint', 'r2d2', 'aslfeat', 'disk', + 'alike-n', 'alike-l', 'alike-n-ms', 'alike-l-ms'] +names = ['D2-Net(MS)', 'LF-Net(MS)', 'SuperPoint', 'R2D2(MS)', 'ASLFeat(MS)', 'DISK', + 'ALike-N', 'ALike-L', 'ALike-N(MS)', 'ALike-L(MS)'] + +top_k = None +n_i = 52 +n_v = 56 +cache_dir = 'hseq/cache' +dataset_path = 'hseq/hpatches-sequences-release' + + +def generate_read_function(method, extension='ppm'): + def 
read_function(seq_name, im_idx): + aux = np.load(os.path.join(dataset_path, seq_name, '%d.%s.%s' % (im_idx, extension, method))) + if top_k is None: + return aux['keypoints'], aux['descriptors'] + else: + assert ('scores' in aux) + ids = np.argsort(aux['scores'])[-top_k:] + return aux['keypoints'][ids, :], aux['descriptors'][ids, :] + + return read_function + + +def mnn_matcher(descriptors_a, descriptors_b): + device = descriptors_a.device + sim = descriptors_a @ descriptors_b.t() + nn12 = torch.max(sim, dim=1)[1] + nn21 = torch.max(sim, dim=0)[1] + ids1 = torch.arange(0, sim.shape[0], device=device) + mask = (ids1 == nn21[nn12]) + matches = torch.stack([ids1[mask], nn12[mask]]) + return matches.t().data.cpu().numpy() + + +def homo_trans(coord, H): + kpt_num = coord.shape[0] + homo_coord = np.concatenate((coord, np.ones((kpt_num, 1))), axis=-1) + proj_coord = np.matmul(H, homo_coord.T).T + proj_coord = proj_coord / proj_coord[:, 2][..., None] + proj_coord = proj_coord[:, 0:2] + return proj_coord + + +def benchmark_features(read_feats): + lim = [1, 5] + rng = np.arange(lim[0], lim[1] + 1) + + seq_names = sorted(os.listdir(dataset_path)) + + n_feats = [] + n_matches = [] + seq_type = [] + i_err = {thr: 0 for thr in rng} + v_err = {thr: 0 for thr in rng} + + i_err_homo = {thr: 0 for thr in rng} + v_err_homo = {thr: 0 for thr in rng} + + for seq_idx, seq_name in tqdm(enumerate(seq_names), total=len(seq_names)): + keypoints_a, descriptors_a = read_feats(seq_name, 1) + n_feats.append(keypoints_a.shape[0]) + + # =========== compute homography + ref_img = cv2.imread(os.path.join(dataset_path, seq_name, '1.ppm')) + ref_img_shape = ref_img.shape + + for im_idx in range(2, 7): + keypoints_b, descriptors_b = read_feats(seq_name, im_idx) + n_feats.append(keypoints_b.shape[0]) + + matches = mnn_matcher( + torch.from_numpy(descriptors_a).to(device=device), + torch.from_numpy(descriptors_b).to(device=device) + ) + + homography = np.loadtxt(os.path.join(dataset_path, seq_name, "H_1_" + str(im_idx))) + + pos_a = keypoints_a[matches[:, 0], : 2] + pos_a_h = np.concatenate([pos_a, np.ones([matches.shape[0], 1])], axis=1) + pos_b_proj_h = np.transpose(np.dot(homography, np.transpose(pos_a_h))) + pos_b_proj = pos_b_proj_h[:, : 2] / pos_b_proj_h[:, 2:] + + pos_b = keypoints_b[matches[:, 1], : 2] + + dist = np.sqrt(np.sum((pos_b - pos_b_proj) ** 2, axis=1)) + + n_matches.append(matches.shape[0]) + seq_type.append(seq_name[0]) + + if dist.shape[0] == 0: + dist = np.array([float("inf")]) + + for thr in rng: + if seq_name[0] == 'i': + i_err[thr] += np.mean(dist <= thr) + else: + v_err[thr] += np.mean(dist <= thr) + + # =========== compute homography + gt_homo = homography + pred_homo, _ = cv2.findHomography(keypoints_a[matches[:, 0], : 2], keypoints_b[matches[:, 1], : 2], + cv2.RANSAC) + if pred_homo is None: + homo_dist = np.array([float("inf")]) + else: + corners = np.array([[0, 0], + [ref_img_shape[1] - 1, 0], + [0, ref_img_shape[0] - 1], + [ref_img_shape[1] - 1, ref_img_shape[0] - 1]]) + real_warped_corners = homo_trans(corners, gt_homo) + warped_corners = homo_trans(corners, pred_homo) + homo_dist = np.mean(np.linalg.norm(real_warped_corners - warped_corners, axis=1)) + + for thr in rng: + if seq_name[0] == 'i': + i_err_homo[thr] += np.mean(homo_dist <= thr) + else: + v_err_homo[thr] += np.mean(homo_dist <= thr) + + seq_type = np.array(seq_type) + n_feats = np.array(n_feats) + n_matches = np.array(n_matches) + + return i_err, v_err, i_err_homo, v_err_homo, [seq_type, n_feats, n_matches] + + +if __name__ == 
'__main__':
+    errors = {}
+    for method in methods:
+        output_file = os.path.join(cache_dir, method + '.npy')
+        read_function = generate_read_function(method)
+        if os.path.exists(output_file):
+            errors[method] = np.load(output_file, allow_pickle=True)
+        else:
+            extract_method(method)
+            errors[method] = benchmark_features(read_function)
+            np.save(output_file, errors[method])
+
+    for name, method in zip(names, methods):
+        i_err, v_err, i_err_hom, v_err_hom, _ = errors[method]
+
+        print(f"====={name}=====")
+        print(f"MMA@1 MMA@2 MMA@3 MHA@1 MHA@2 MHA@3: ", end='')
+        for thr in range(1, 4):
+            err = (i_err[thr] + v_err[thr]) / ((n_i + n_v) * 5)
+            print(f"{err * 100:.2f}%", end=' ')
+        for thr in range(1, 4):
+            err_hom = (i_err_hom[thr] + v_err_hom[thr]) / ((n_i + n_v) * 5)
+            print(f"{err_hom * 100:.2f}%", end=' ')
+        print('')
diff --git a/hseq/extract.py b/hseq/extract.py
new file mode 100644
index 0000000000000000000000000000000000000000..1342e40dd2d0e1d1986e90f995c95b17972ec4e1
--- /dev/null
+++ b/hseq/extract.py
@@ -0,0 +1,159 @@
+import os
+import sys
+import cv2
+from pathlib import Path
+import numpy as np
+import torch
+import torch.utils.data as data
+from tqdm import tqdm
+from copy import deepcopy
+from torchvision.transforms import ToTensor
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+from alike import ALike, configs
+
+dataset_root = 'hseq/hpatches-sequences-release'
+use_cuda = torch.cuda.is_available()
+device = 'cuda' if use_cuda else 'cpu'
+methods = ['alike-n', 'alike-l', 'alike-n-ms', 'alike-l-ms']
+
+
+class HPatchesDataset(data.Dataset):
+    def __init__(self, root: str = dataset_root, alteration: str = 'all'):
+        """
+        Args:
+            root: dataset root path
+            alteration: 'all', 'i' for illumination, or 'v' for viewpoint
+        """
+        assert (Path(root).exists()), f"Dataset root path {root} does not exist!"
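+        # Each sequence folder is named i_* (illumination) or v_* (viewpoint) and
+        # contains six images 1.ppm..6.ppm plus homographies H_1_2..H_1_6 mapping
+        # image 1 onto images 2..6; __getitem__ below relies on this layout.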
+ self.root = root + + # get all image file name + self.image0_list = [] + self.image1_list = [] + self.homographies = [] + folders = [x for x in Path(self.root).iterdir() if x.is_dir()] + self.seqs = [] + for folder in folders: + if alteration == 'i' and folder.stem[0] != 'i': + continue + if alteration == 'v' and folder.stem[0] != 'v': + continue + + self.seqs.append(folder) + + self.len = len(self.seqs) + assert (self.len > 0), f'Can not find PatchDataset in path {self.root}' + + def __getitem__(self, item): + folder = self.seqs[item] + + imgs = [] + homos = [] + for i in range(1, 7): + img = cv2.imread(str(folder / f'{i}.ppm'), cv2.IMREAD_COLOR) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # HxWxC + imgs.append(img) + + if i != 1: + homo = np.loadtxt(str(folder / f'H_1_{i}')).astype('float32') + homos.append(homo) + + return imgs, homos, folder.stem + + def __len__(self): + return self.len + + def name(self): + return self.__class__ + + +def extract_multiscale(model, img, scale_f=2 ** 0.5, + min_scale=1., max_scale=1., + min_size=0., max_size=99999., + image_size_max=99999, + n_k=0, sort=False): + H_, W_, three = img.shape + assert three == 3, "input image shape should be [HxWx3]" + + old_bm = torch.backends.cudnn.benchmark + torch.backends.cudnn.benchmark = False # speedup + + # ==================== image size constraint + image = deepcopy(img) + max_hw = max(H_, W_) + if max_hw > image_size_max: + ratio = float(image_size_max / max_hw) + image = cv2.resize(image, dsize=None, fx=ratio, fy=ratio) + + # ==================== convert image to tensor + H, W, three = image.shape + image = ToTensor()(image).unsqueeze(0) + image = image.to(device) + + s = 1.0 # current scale factor + keypoints, descriptors, scores, scores_maps, descriptor_maps = [], [], [], [], [] + while s + 0.001 >= max(min_scale, min_size / max(H, W)): + if s - 0.001 <= min(max_scale, max_size / max(H, W)): + nh, nw = image.shape[2:] + + # extract descriptors + with torch.no_grad(): + descriptor_map, scores_map = model.extract_dense_map(image) + keypoints_, descriptors_, scores_, _ = model.dkd(scores_map, descriptor_map) + + keypoints.append(keypoints_[0]) + descriptors.append(descriptors_[0]) + scores.append(scores_[0]) + + s /= scale_f + + # down-scale the image for next iteration + nh, nw = round(H * s), round(W * s) + image = torch.nn.functional.interpolate(image, (nh, nw), mode='bilinear', align_corners=False) + + # restore value + torch.backends.cudnn.benchmark = old_bm + + keypoints = torch.cat(keypoints) + descriptors = torch.cat(descriptors) + scores = torch.cat(scores) + keypoints = (keypoints + 1) / 2 * keypoints.new_tensor([[W_ - 1, H_ - 1]]) + + if sort or 0 < n_k < len(keypoints): + indices = torch.argsort(scores, descending=True) + keypoints = keypoints[indices] + descriptors = descriptors[indices] + scores = scores[indices] + + if 0 < n_k < len(keypoints): + keypoints = keypoints[0:n_k] + descriptors = descriptors[0:n_k] + scores = scores[0:n_k] + + return {'keypoints': keypoints, 'descriptors': descriptors, 'scores': scores} + + +def extract_method(m): + hpatches = HPatchesDataset(root=dataset_root, alteration='all') + model = m[:7] + min_scale = 0.3 if m[8:] == 'ms' else 1.0 + + model = ALike(**configs[model], device=device, top_k=0, scores_th=0.2, n_limit=5000) + + progbar = tqdm(hpatches, desc='Extracting for {}'.format(m)) + for imgs, homos, seq_name in progbar: + for i in range(1, 7): + img = imgs[i - 1] + pred = extract_multiscale(model, img, min_scale=min_scale, max_scale=1, sort=False, n_k=5000) + 
kpts, descs, scores = pred['keypoints'], pred['descriptors'], pred['scores'] + + with open(os.path.join(dataset_root, seq_name, f'{i}.ppm.{m}'), 'wb') as f: + np.savez(f, keypoints=kpts.cpu().numpy(), + scores=scores.cpu().numpy(), + descriptors=descs.cpu().numpy()) + + +if __name__ == '__main__': + for method in methods: + extract_method(method) diff --git a/matlab/createfigure.m b/matlab/createfigure.m new file mode 100644 index 0000000000000000000000000000000000000000..038090c7e570aeaed25bd4dfaffb71134d707082 --- /dev/null +++ b/matlab/createfigure.m @@ -0,0 +1,75 @@ +function createfigure(X1, YMatrix1, Y1, l1, l2, l3) +%CREATEFIGURE(X1, YMatrix1, Y1) +% X1: vector of x data +% YMATRIX1: matrix of y data +% Y1: vector of y data + +% Auto-generated by MATLAB on 29-Oct-2021 15:42:14 + +% Create figure +figure1 = figure; + +% Create axes +axes1 = axes('Parent',figure1); +hold(axes1,'on'); + +% Create multiple lines using matrix input to plot +plot1 = plot(X1,YMatrix1,'Parent',axes1,'LineWidth',1); +set(plot1(1),'LineStyle','-.','Color',[1 0 0]); +set(plot1(2),'Color',[0 1 0]); +set(plot1(3),'LineStyle','--',... + 'Color',[0.87058824300766 0.490196079015732 0]); + +% Uncomment the following line to preserve the X-limits of the axes +% xlim(axes1,[-1.1 1.1]); +% Uncomment the following line to preserve the Y-limits of the axes +ylim(axes1,[0 2.2]); +box(axes1,'on'); +hold(axes1,'off'); +% Set the remaining axes properties +set(axes1,'XColor',[0 0 0],'YColor',[0 0 0],'YTick',[0 0.5 1 1.5 2 2.5]); +% Create axes +axes2 = axes('Parent',figure1); +hold(axes2,'on'); +colororder([0.494 0.184 0.556;0.466 0.674 0.188;0.301 0.745 0.933;0.635 0.078 0.184;0 0.447 0.741;0.85 0.325 0.098;0.929 0.694 0.125]); + +% Create plot +plot(X1,Y1,'Parent',axes2,'LineWidth',1,'LineStyle',':','Color',[0 0 1]); + +% Uncomment the following line to preserve the X-limits of the axes +% xlim(axes2,[-1.1 1.1]); +% Uncomment the following line to preserve the Y-limits of the axes +ylim(axes2,[0 1.6]); +hold(axes2,'off'); +% Set the remaining axes properties +set(axes2,'Color','none','HitTest','off','XColor',[0 0 0],'YAxisLocation',... + 'right','YColor',[0 0 0],'YTick',[0 0.5 1 1.5]); +% Create textbox +annotation(figure1,'textbox',... + [0.255427607968038,0.605539475745798,0.304947448327989,0.235148519909872],... + 'Color',[0.8 0 0],... + 'String',{sprintf('peak loss=%.4f',l1)},... + 'EdgeColor','none'); + +% Create textbox +annotation(figure1,'textbox',... + [0.631790371410027,0.083530640355914,0.178879315581032,0.235148519909871],... + 'Color',[0 0 1],... + 'String',{'keypoint'},... + 'EdgeColor','none'); + +% Create textbox +annotation(figure1,'textbox',... + [0.59663112557549,0.640686239621974,0.318247136419826,0.22093023731067],... + 'Color',[0 0.498039215803146 0],... + 'String',{sprintf('peak loss=%.4f',l2)},... + 'EdgeColor','none'); + +% Create textbox +annotation(figure1,'textbox',... + [0.595423071596731,0.415858983920567,0.318247136419826,0.235148519909871],... + 'Color',[0.87058824300766 0.490196079015732 0],... + 'String',{sprintf('peak loss=%.4f',l3)},... + 'FitBoxToText','off',... 
+ 'EdgeColor','none'); + diff --git a/matlab/peakloss_rect.m b/matlab/peakloss_rect.m new file mode 100644 index 0000000000000000000000000000000000000000..fa0d811c126aec1d6f6868352d89be69ea351577 --- /dev/null +++ b/matlab/peakloss_rect.m @@ -0,0 +1,19 @@ +clear; +close all; + +x = -1:0.01:1; + +p0 = 0.5; +p1 = -0.5; + +d = abs(x - p0); + +c0 = 2 .* (x>=-0.75 & x <= -0.25); +c1 = 2 .* (x>=0.25 & x <= 0.75); +c2 = 1.25 .* (x>=0.1 & x <= 0.9); + +peak_loss0 = sum(d.*c0) / length(x) +peak_loss1 = sum(d.*c1) / length(x) +peak_loss2 = sum(d.*c2) / length(x) + +createfigure(x, [c0;c1;c2], d, peak_loss0,peak_loss1, peak_loss2); \ No newline at end of file diff --git a/models/alike-s.pth b/models/alike-s.pth new file mode 100644 index 0000000000000000000000000000000000000000..98afc79d8ab654c5a073d4b75b6aafc894015572 Binary files /dev/null and b/models/alike-s.pth differ diff --git a/models/alike-t.pth b/models/alike-t.pth new file mode 100644 index 0000000000000000000000000000000000000000..e91c9dbe8e2831298270a2f79f471165df816ad2 Binary files /dev/null and b/models/alike-t.pth differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..14ca745ea1572bda6b2bd7c4eb88bb026b566781 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +opencv-python~=4.5.1.48 +numpy~=1.19.5 +tqdm~=4.60.0 +torch~=1.8.0 +torchvision~=0.9.0 +thop~=0.0.31-2005241907 \ No newline at end of file diff --git a/soft_detect.py b/soft_detect.py new file mode 100644 index 0000000000000000000000000000000000000000..2d23cd13b8a7db9b0398fdc1b235564222d30c90 --- /dev/null +++ b/soft_detect.py @@ -0,0 +1,194 @@ +import torch +from torch import nn +import torch.nn.functional as F + + +# coordinates system +# ------------------------------> [ x: range=-1.0~1.0; w: range=0~W ] +# | ----------------------------- +# | | | +# | | | +# | | | +# | | image | +# | | | +# | | | +# | | | +# | |---------------------------| +# v +# [ y: range=-1.0~1.0; h: range=0~H ] + +def simple_nms(scores, nms_radius: int): + """ Fast Non-maximum suppression to remove nearby points """ + assert (nms_radius >= 0) + + def max_pool(x): + return torch.nn.functional.max_pool2d( + x, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius) + + zeros = torch.zeros_like(scores) + max_mask = scores == max_pool(scores) + + for _ in range(2): + supp_mask = max_pool(max_mask.float()) > 0 + supp_scores = torch.where(supp_mask, zeros, scores) + new_max_mask = supp_scores == max_pool(supp_scores) + max_mask = max_mask | (new_max_mask & (~supp_mask)) + return torch.where(max_mask, scores, zeros) + + +def sample_descriptor(descriptor_map, kpts, bilinear_interp=False): + """ + :param descriptor_map: BxCxHxW + :param kpts: list, len=B, each is Nx2 (keypoints) [h,w] + :param bilinear_interp: bool, whether to use bilinear interpolation + :return: descriptors: list, len=B, each is NxD + """ + batch_size, channel, height, width = descriptor_map.shape + + descriptors = [] + for index in range(batch_size): + kptsi = kpts[index] # Nx2,(x,y) + + if bilinear_interp: + descriptors_ = torch.nn.functional.grid_sample(descriptor_map[index].unsqueeze(0), kptsi.view(1, 1, -1, 2), + mode='bilinear', align_corners=True)[0, :, 0, :] # CxN + else: + kptsi = (kptsi + 1) / 2 * kptsi.new_tensor([[width - 1, height - 1]]) + kptsi = kptsi.long() + descriptors_ = descriptor_map[index, :, kptsi[:, 1], kptsi[:, 0]] # CxN + + descriptors_ = torch.nn.functional.normalize(descriptors_, p=2, dim=0) + descriptors.append(descriptors_.t()) + + 
return descriptors + + +class DKD(nn.Module): + def __init__(self, radius=2, top_k=0, scores_th=0.2, n_limit=20000): + """ + Args: + radius: soft detection radius, kernel size is (2 * radius + 1) + top_k: top_k > 0: return top k keypoints + scores_th: top_k <= 0 threshold mode: scores_th > 0: return keypoints with scores>scores_th + else: return keypoints with scores > scores.mean() + n_limit: max number of keypoint in threshold mode + """ + super().__init__() + self.radius = radius + self.top_k = top_k + self.scores_th = scores_th + self.n_limit = n_limit + self.kernel_size = 2 * self.radius + 1 + self.temperature = 0.1 # tuned temperature + self.unfold = nn.Unfold(kernel_size=self.kernel_size, padding=self.radius) + + # local xy grid + x = torch.linspace(-self.radius, self.radius, self.kernel_size) + # (kernel_size*kernel_size) x 2 : (w,h) + self.hw_grid = torch.stack(torch.meshgrid([x, x])).view(2, -1).t()[:, [1, 0]] + + def detect_keypoints(self, scores_map, sub_pixel=True): + b, c, h, w = scores_map.shape + scores_nograd = scores_map.detach() + # nms_scores = simple_nms(scores_nograd, self.radius) + nms_scores = simple_nms(scores_nograd, 2) + + # remove border + nms_scores[:, :, :self.radius + 1, :] = 0 + nms_scores[:, :, :, :self.radius + 1] = 0 + nms_scores[:, :, h - self.radius:, :] = 0 + nms_scores[:, :, :, w - self.radius:] = 0 + + # detect keypoints without grad + if self.top_k > 0: + topk = torch.topk(nms_scores.view(b, -1), self.top_k) + indices_keypoints = topk.indices # B x top_k + else: + if self.scores_th > 0: + masks = nms_scores > self.scores_th + if masks.sum() == 0: + th = scores_nograd.reshape(b, -1).mean(dim=1) # th = self.scores_th + masks = nms_scores > th.reshape(b, 1, 1, 1) + else: + th = scores_nograd.reshape(b, -1).mean(dim=1) # th = self.scores_th + masks = nms_scores > th.reshape(b, 1, 1, 1) + masks = masks.reshape(b, -1) + + indices_keypoints = [] # list, B x (any size) + scores_view = scores_nograd.reshape(b, -1) + for mask, scores in zip(masks, scores_view): + indices = mask.nonzero(as_tuple=False)[:, 0] + if len(indices) > self.n_limit: + kpts_sc = scores[indices] + sort_idx = kpts_sc.sort(descending=True)[1] + sel_idx = sort_idx[:self.n_limit] + indices = indices[sel_idx] + indices_keypoints.append(indices) + + keypoints = [] + scoredispersitys = [] + kptscores = [] + if sub_pixel: + # detect soft keypoints with grad backpropagation + patches = self.unfold(scores_map) # B x (kernel**2) x (H*W) + self.hw_grid = self.hw_grid.to(patches) # to device + for b_idx in range(b): + patch = patches[b_idx].t() # (H*W) x (kernel**2) + indices_kpt = indices_keypoints[b_idx] # one dimension vector, say its size is M + patch_scores = patch[indices_kpt] # M x (kernel**2) + + # max is detached to prevent undesired backprop loops in the graph + max_v = patch_scores.max(dim=1).values.detach()[:, None] + x_exp = ((patch_scores - max_v) / self.temperature).exp() # M * (kernel**2), in [0, 1] + + # \frac{ \sum{(i,j) \times \exp(x/T)} }{ \sum{\exp(x/T)} } + xy_residual = x_exp @ self.hw_grid / x_exp.sum(dim=1)[:, None] # Soft-argmax, Mx2 + + hw_grid_dist2 = torch.norm((self.hw_grid[None, :, :] - xy_residual[:, None, :]) / self.radius, + dim=-1) ** 2 + scoredispersity = (x_exp * hw_grid_dist2).sum(dim=1) / x_exp.sum(dim=1) + + # compute result keypoints + keypoints_xy_nms = torch.stack([indices_kpt % w, indices_kpt // w], dim=1) # Mx2 + keypoints_xy = keypoints_xy_nms + xy_residual + keypoints_xy = keypoints_xy / keypoints_xy.new_tensor( + [w - 1, h - 1]) * 2 - 1 # (w,h) -> 
(-1~1,-1~1)
+
+                kptscore = torch.nn.functional.grid_sample(scores_map[b_idx].unsqueeze(0),
+                                                           keypoints_xy.view(1, 1, -1, 2),
+                                                           mode='bilinear', align_corners=True)[0, 0, 0, :]  # N
+
+                keypoints.append(keypoints_xy)
+                scoredispersitys.append(scoredispersity)
+                kptscores.append(kptscore)
+        else:
+            for b_idx in range(b):
+                indices_kpt = indices_keypoints[b_idx]  # one dimension vector, say its size is M
+                keypoints_xy_nms = torch.stack([indices_kpt % w, indices_kpt // w], dim=1)  # Mx2
+                keypoints_xy = keypoints_xy_nms / keypoints_xy_nms.new_tensor(
+                    [w - 1, h - 1]) * 2 - 1  # (w,h) -> (-1~1,-1~1)
+                kptscore = torch.nn.functional.grid_sample(scores_map[b_idx].unsqueeze(0),
+                                                           keypoints_xy.view(1, 1, -1, 2),
+                                                           mode='bilinear', align_corners=True)[0, 0, 0, :]  # N
+                keypoints.append(keypoints_xy)
+                scoredispersitys.append(None)
+                kptscores.append(kptscore)
+
+        return keypoints, scoredispersitys, kptscores
+
+    def forward(self, scores_map, descriptor_map, sub_pixel=False):
+        """
+        :param scores_map: Bx1xHxW
+        :param descriptor_map: BxCxHxW
+        :param sub_pixel: whether to use sub-pixel keypoint detection
+        :return: keypoints: list of Mx2 tensors, positions normalised to -1.0~1.0;
+                 descriptors: list of MxD tensors;
+                 kptscores: list of M tensors;
+                 scoredispersitys: list of M tensors (None entries if sub_pixel=False)
+        """
+        keypoints, scoredispersitys, kptscores = self.detect_keypoints(scores_map,
+                                                                       sub_pixel)
+
+        descriptors = sample_descriptor(descriptor_map, keypoints, sub_pixel)
+
+        return keypoints, descriptors, kptscores, scoredispersitys
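+
+
+if __name__ == '__main__':
+    # Minimal sanity check of DKD in isolation, in the style of the __main__
+    # blocks of alnet.py and alike.py. The random maps below are stand-ins for
+    # the Bx1xHxW score map and BxCxHxW descriptor map produced by the network.
+    torch.manual_seed(0)
+    scores_map = torch.rand(1, 1, 480, 640)
+    descriptor_map = F.normalize(torch.randn(1, 128, 480, 640), p=2, dim=1)
+
+    dkd = DKD(radius=2, top_k=500, scores_th=0.2, n_limit=5000)
+    keypoints, descriptors, kptscores, scoredispersitys = dkd(scores_map, descriptor_map, sub_pixel=True)
+
+    # keypoints are normalised to -1.0~1.0; ALike.forward rescales them to pixels
+    print(f'{len(keypoints[0])} keypoints, descriptor dim: {descriptors[0].shape[1]}')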